From aa63fef21cf9c0d1fd5708aca2c17171cee83fc0 Mon Sep 17 00:00:00 2001
From: koldo <koldo@f0d560ea-af0d-0410-9eb7-867de7ffcac7>
Date: Fri, 23 Oct 2020 09:15:00 +0000
Subject: [PATCH] Eigen: Updated to 3.3.8

git-svn-id: svn://ultimatepp.org/upp/trunk@15292 f0d560ea-af0d-0410-9eb7-867de7ffcac7
---
 uppsrc/plugin/Eigen/Eigen.h                   |    2 +
 uppsrc/plugin/Eigen/Eigen/Core                |  358 +-
 uppsrc/plugin/Eigen/Eigen/Geometry            |    6 +-
 uppsrc/plugin/Eigen/Eigen/KLUSupport          |   41 -
 uppsrc/plugin/Eigen/Eigen/OrderingMethods     |    3 +
 uppsrc/plugin/Eigen/Eigen/PaStiXSupport       |    1 -
 uppsrc/plugin/Eigen/Eigen/Sparse              |    2 +
 uppsrc/plugin/Eigen/Eigen/SparseCholesky      |    8 +
 uppsrc/plugin/Eigen/Eigen/SparseLU            |    4 -
 uppsrc/plugin/Eigen/Eigen/src/Cholesky/LDLT.h |   59 +-
 uppsrc/plugin/Eigen/Eigen/src/Cholesky/LLT.h  |   56 +-
 .../Eigen/src/CholmodSupport/CholmodSupport.h |  137 +-
 .../Eigen/Eigen/src/Core/ArithmeticSequence.h |  413 --
 uppsrc/plugin/Eigen/Eigen/src/Core/Array.h    |  100 +-
 .../plugin/Eigen/Eigen/src/Core/ArrayBase.h   |    2 +-
 .../Eigen/Eigen/src/Core/ArrayWrapper.h       |    2 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Assign.h   |    2 +-
 .../Eigen/Eigen/src/Core/AssignEvaluator.h    |   63 +-
 .../plugin/Eigen/Eigen/src/Core/Assign_MKL.h  |   20 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Block.h    |   60 +-
 .../Eigen/Eigen/src/Core/BooleanRedux.h       |   56 +-
 .../Eigen/Eigen/src/Core/CommaInitializer.h   |   10 +-
 .../Eigen/Eigen/src/Core/CoreEvaluators.h     |  386 +-
 .../Eigen/Eigen/src/Core/CoreIterators.h      |    5 -
 .../Eigen/Eigen/src/Core/CwiseBinaryOp.h      |   29 +-
 .../Eigen/Eigen/src/Core/CwiseNullaryOp.h     |   82 +-
 .../Eigen/Eigen/src/Core/CwiseUnaryView.h     |    2 +-
 .../plugin/Eigen/Eigen/src/Core/DenseBase.h   |   87 +-
 .../Eigen/Eigen/src/Core/DenseCoeffsBase.h    |   12 +-
 .../Eigen/Eigen/src/Core/DenseStorage.h       |   60 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Diagonal.h |   10 +-
 .../Eigen/Eigen/src/Core/DiagonalMatrix.h     |   52 +-
 .../Eigen/Eigen/src/Core/DiagonalProduct.h    |    2 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Dot.h      |   16 +-
 .../plugin/Eigen/Eigen/src/Core/EigenBase.h   |    3 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Fuzzy.h    |    6 +-
 .../Eigen/Eigen/src/Core/GeneralProduct.h     |   24 +-
 .../Eigen/Eigen/src/Core/GenericPacketMath.h  |  374 +-
 .../Eigen/Eigen/src/Core/GlobalFunctions.h    |   68 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/IO.h       |   47 +-
 .../plugin/Eigen/Eigen/src/Core/IndexedView.h |  207 -
 uppsrc/plugin/Eigen/Eigen/src/Core/Inverse.h  |    7 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Map.h      |    6 +-
 .../Eigen/Eigen/src/Core/MathFunctions.h      |  541 +--
 .../Eigen/Eigen/src/Core/MathFunctionsImpl.h  |   39 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Matrix.h   |  138 +-
 .../plugin/Eigen/Eigen/src/Core/MatrixBase.h  |   21 +-
 .../plugin/Eigen/Eigen/src/Core/NestByValue.h |   71 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/NoAlias.h  |    7 +-
 .../plugin/Eigen/Eigen/src/Core/NumTraits.h   |   47 +-
 .../Eigen/src/Core/PartialReduxEvaluator.h    |  232 --
 .../Eigen/Eigen/src/Core/PermutationMatrix.h  |   34 +-
 .../Eigen/Eigen/src/Core/PlainObjectBase.h    |  148 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Product.h  |   19 +-
 .../Eigen/Eigen/src/Core/ProductEvaluators.h  |  164 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Random.h   |    2 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Redux.h    |  330 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Ref.h      |    2 -
 .../plugin/Eigen/Eigen/src/Core/Replicate.h   |    4 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Reshaped.h |  453 ---
 .../Eigen/Eigen/src/Core/ReturnByValue.h      |    2 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Reverse.h  |   16 +-
 .../Eigen/Eigen/src/Core/SelfAdjointView.h    |   17 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Solve.h    |    4 +-
 .../Eigen/Eigen/src/Core/SolveTriangular.h    |    2 +-
 .../plugin/Eigen/Eigen/src/Core/SolverBase.h  |   44 +-
 .../plugin/Eigen/Eigen/src/Core/StableNorm.h  |  117 +-
 .../Eigen/Eigen/src/Core/StlIterators.h       |  331 --
 uppsrc/plugin/Eigen/Eigen/src/Core/Swap.h     |    9 +-
 .../plugin/Eigen/Eigen/src/Core/Transpose.h   |  109 +-
 .../Eigen/Eigen/src/Core/Transpositions.h     |   41 +-
 .../Eigen/Eigen/src/Core/TriangularMatrix.h   |   48 +-
 .../plugin/Eigen/Eigen/src/Core/VectorBlock.h |   10 +-
 .../Eigen/Eigen/src/Core/VectorwiseOp.h       |  203 +-
 uppsrc/plugin/Eigen/Eigen/src/Core/Visitor.h  |   36 -
 .../Eigen/Eigen/src/Core/arch/AVX/Complex.h   |   94 +-
 .../Eigen/src/Core/arch/AVX/MathFunctions.h   |  353 +-
 .../Eigen/src/Core/arch/AVX/PacketMath.h      |  719 +---
 .../Eigen/src/Core/arch/AVX/TypeCasting.h     |   40 +-
 .../Eigen/src/Core/arch/AVX512/Complex.h      |  447 ---
 .../src/Core/arch/AVX512/MathFunctions.h      |  153 +-
 .../Eigen/src/Core/arch/AVX512/PacketMath.h   |  834 ++--
 .../Eigen/src/Core/arch/AVX512/TypeCasting.h  |   47 -
 .../Eigen/src/Core/arch/AltiVec/Complex.h     |   76 +-
 .../src/Core/arch/AltiVec/MathFunctions.h     |  270 +-
 .../Eigen/src/Core/arch/AltiVec/PacketMath.h  | 1579 ++------
 .../Eigen/Eigen/src/Core/arch/CUDA/Complex.h  |    6 +-
 .../src/Core/arch/{Default => CUDA}/Half.h    |  158 +-
 .../Core/arch/{GPU => CUDA}/MathFunctions.h   |   20 +-
 .../Eigen/src/Core/arch/CUDA/PacketMath.h     |  333 ++
 .../Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 1124 ++++++
 .../Eigen/src/Core/arch/CUDA/TypeCasting.h    |  212 ++
 .../arch/Default/GenericPacketMathFunctions.h |  655 ----
 .../Default/GenericPacketMathFunctionsFwd.h   |   69 -
 .../Eigen/src/Core/arch/Default/Settings.h    |    2 +-
 .../Eigen/src/Core/arch/Default/TypeCasting.h |   77 -
 .../Eigen/src/Core/arch/GPU/PacketMath.h      | 1786 ---------
 .../Eigen/src/Core/arch/GPU/TypeCasting.h     |   80 -
 .../src/Core/arch/HIP/hcc/math_constants.h    |   23 -
 .../Eigen/Eigen/src/Core/arch/MSA/Complex.h   |  720 ----
 .../Eigen/src/Core/arch/MSA/MathFunctions.h   |  387 --
 .../Eigen/src/Core/arch/MSA/PacketMath.h      | 1237 ------
 .../Eigen/Eigen/src/Core/arch/NEON/Complex.h  |  505 +--
 .../Eigen/src/Core/arch/NEON/MathFunctions.h  |   88 +-
 .../Eigen/src/Core/arch/NEON/PacketMath.h     | 3358 ++---------------
 .../Eigen/src/Core/arch/NEON/TypeCasting.h    |  278 --
 .../Eigen/Eigen/src/Core/arch/SSE/Complex.h   |   81 +-
 .../Eigen/src/Core/arch/SSE/MathFunctions.h   |  460 ++-
 .../Eigen/src/Core/arch/SSE/PacketMath.h      |  796 ++--
 .../Eigen/src/Core/arch/SSE/TypeCasting.h     |   58 -
 .../Eigen/src/Core/arch/SYCL/InteropHeaders.h |  229 --
 .../Eigen/src/Core/arch/SYCL/MathFunctions.h  |  289 --
 .../Eigen/src/Core/arch/SYCL/PacketMath.h     |  670 ----
 .../src/Core/arch/SYCL/SyclMemoryModel.h      |  694 ----
 .../Eigen/src/Core/arch/SYCL/TypeCasting.h    |   85 -
 .../Eigen/src/Core/arch/ZVector/Complex.h     |  401 +-
 .../src/Core/arch/ZVector/MathFunctions.h     |  112 +-
 .../Eigen/src/Core/arch/ZVector/PacketMath.h  |  885 ++---
 .../src/Core/functors/AssignmentFunctors.h    |   13 +-
 .../Eigen/src/Core/functors/BinaryFunctors.h  |   93 +-
 .../Eigen/src/Core/functors/NullaryFunctors.h |   55 +-
 .../Eigen/src/Core/functors/UnaryFunctors.h   |  298 +-
 .../Core/products/GeneralBlockPanelKernel.h   | 1613 +++-----
 .../src/Core/products/GeneralMatrixMatrix.h   |   32 +-
 .../products/GeneralMatrixMatrixTriangular.h  |    8 +-
 .../GeneralMatrixMatrixTriangular_BLAS.h      |    2 +-
 .../src/Core/products/GeneralMatrixVector.h   |  865 +++--
 .../Eigen/src/Core/products/Parallelizer.h    |   36 +-
 .../Core/products/SelfadjointMatrixMatrix.h   |   29 +-
 .../Core/products/SelfadjointMatrixVector.h   |   14 +-
 .../src/Core/products/SelfadjointProduct.h    |    2 +-
 .../Core/products/SelfadjointRank2Update.h    |    5 +-
 .../Core/products/TriangularMatrixMatrix.h    |    6 +-
 .../Core/products/TriangularSolverMatrix.h    |    4 +-
 .../Core/products/TriangularSolverVector.h    |   21 +-
 .../Eigen/Eigen/src/Core/util/BlasUtil.h      |   77 +-
 .../src/Core/util/ConfigureVectorization.h    |  483 ---
 .../Eigen/Eigen/src/Core/util/Constants.h     |   15 +-
 .../src/Core/util/DisableStupidWarnings.h     |   13 +-
 .../Eigen/src/Core/util/ForwardDeclarations.h |   25 +-
 .../Eigen/src/Core/util/IndexedViewHelper.h   |  186 -
 .../Eigen/src/Core/util/IntegralConstant.h    |  272 --
 .../Eigen/Eigen/src/Core/util/MKL_support.h   |    9 +-
 .../plugin/Eigen/Eigen/src/Core/util/Macros.h |  724 ++--
 .../plugin/Eigen/Eigen/src/Core/util/Memory.h |  215 +-
 .../plugin/Eigen/Eigen/src/Core/util/Meta.h   |  248 +-
 .../Eigen/src/Core/util/ReshapedHelper.h      |   51 -
 .../Eigen/Eigen/src/Core/util/StaticAssert.h  |   10 +-
 .../Eigen/Eigen/src/Core/util/SymbolicIndex.h |  293 --
 .../Eigen/Eigen/src/Core/util/XprHelper.h     |   38 +-
 .../src/Eigenvalues/ComplexEigenSolver.h      |    2 +-
 .../Eigen/src/Eigenvalues/ComplexSchur.h      |    2 +-
 .../Eigen/Eigen/src/Eigenvalues/EigenSolver.h |    4 +-
 .../GeneralizedSelfAdjointEigenSolver.h       |    2 +-
 .../src/Eigenvalues/HessenbergDecomposition.h |    2 +-
 .../src/Eigenvalues/MatrixBaseEigenvalues.h   |    4 +-
 .../Eigen/Eigen/src/Eigenvalues/RealQZ.h      |   15 +-
 .../Eigen/Eigen/src/Eigenvalues/RealSchur.h   |   15 +-
 .../src/Eigenvalues/SelfAdjointEigenSolver.h  |   32 +-
 .../src/Eigenvalues/Tridiagonalization.h      |    9 +-
 .../Eigen/Eigen/src/Geometry/AlignedBox.h     |    2 +-
 .../Eigen/Eigen/src/Geometry/Hyperplane.h     |    2 +-
 .../Eigen/Eigen/src/Geometry/OrthoMethods.h   |    5 +-
 .../Eigen/src/Geometry/ParametrizedLine.h     |   39 +-
 .../Eigen/Eigen/src/Geometry/Quaternion.h     |   17 +-
 .../plugin/Eigen/Eigen/src/Geometry/Scaling.h |   26 +-
 .../Eigen/Eigen/src/Geometry/Transform.h      |   62 +-
 .../Eigen/Eigen/src/Geometry/Translation.h    |   12 +-
 .../Eigen/src/Geometry/arch/Geometry_SSE.h    |   51 +-
 .../Eigen/src/Householder/BlockHouseholder.h  |   11 +-
 .../Eigen/Eigen/src/Householder/Householder.h |   12 +-
 .../src/Householder/HouseholderSequence.h     |  147 +-
 .../src/IterativeLinearSolvers/BiCGSTAB.h     |   30 +-
 .../ConjugateGradient.h                       |   27 +-
 .../IncompleteCholesky.h                      |   12 +-
 .../IterativeLinearSolvers/IncompleteLUT.h    |   13 +-
 .../IterativeSolverBase.h                     |   56 +-
 .../LeastSquareConjugateGradient.h            |   22 +-
 .../IterativeLinearSolvers/SolveWithGuess.h   |    2 +-
 uppsrc/plugin/Eigen/Eigen/src/Jacobi/Jacobi.h |   39 +-
 .../Eigen/Eigen/src/KLUSupport/KLUSupport.h   |  358 --
 .../plugin/Eigen/Eigen/src/LU/Determinant.h   |   54 +-
 uppsrc/plugin/Eigen/Eigen/src/LU/FullPivLU.h  |   66 +-
 .../plugin/Eigen/Eigen/src/LU/InverseImpl.h   |    2 -
 .../plugin/Eigen/Eigen/src/LU/PartialPivLU.h  |  111 +-
 .../Eigen/Eigen/src/OrderingMethods/Amd.h     |   24 +-
 .../Eigen/src/OrderingMethods/Eigen_Colamd.h  |  574 ++-
 .../Eigen/src/OrderingMethods/Ordering.h      |   16 +-
 .../Eigen/src/PaStiXSupport/PaStiXSupport.h   |    2 +-
 .../Eigen/src/PardisoSupport/PardisoSupport.h |   19 +-
 .../Eigen/Eigen/src/QR/ColPivHouseholderQR.h  |   61 +-
 .../src/QR/CompleteOrthogonalDecomposition.h  |  127 +-
 .../Eigen/Eigen/src/QR/FullPivHouseholderQR.h |   81 +-
 .../plugin/Eigen/Eigen/src/QR/HouseholderQR.h |   71 +-
 .../src/SPQRSupport/SuiteSparseQRSupport.h    |   28 +-
 uppsrc/plugin/Eigen/Eigen/src/SVD/BDCSVD.h    |  155 +-
 uppsrc/plugin/Eigen/Eigen/src/SVD/JacobiSVD.h |    3 +-
 uppsrc/plugin/Eigen/Eigen/src/SVD/SVDBase.h   |   69 +-
 .../Eigen/src/SVD/UpperBidiagonalization.h    |    6 +-
 .../src/SparseCholesky/SimplicialCholesky.h   |   14 +-
 .../SparseCholesky/SimplicialCholesky_impl.h  |   47 +-
 .../Eigen/src/SparseCore/CompressedStorage.h  |   16 -
 .../Eigen/Eigen/src/SparseCore/SparseAssign.h |  108 +-
 .../Eigen/Eigen/src/SparseCore/SparseBlock.h  |   72 +-
 .../src/SparseCore/SparseCompressedBase.h     |   51 +-
 .../src/SparseCore/SparseCwiseBinaryOp.h      |   14 +-
 .../Eigen/src/SparseCore/SparseDenseProduct.h |   38 +-
 .../Eigen/Eigen/src/SparseCore/SparseMatrix.h |  126 +-
 .../Eigen/src/SparseCore/SparseMatrixBase.h   |   17 +-
 .../Eigen/src/SparseCore/SparseProduct.h      |    2 +-
 .../Eigen/Eigen/src/SparseCore/SparseRef.h    |   14 +-
 .../src/SparseCore/SparseSelfAdjointView.h    |    4 +-
 .../Eigen/Eigen/src/SparseCore/SparseUtil.h   |    8 -
 .../Eigen/Eigen/src/SparseCore/SparseVector.h |    2 +-
 .../Eigen/Eigen/src/SparseLU/SparseLU.h       |   16 +-
 .../Eigen/src/SparseLU/SparseLU_Memory.h      |    2 +-
 .../src/SparseLU/SparseLU_SupernodalMatrix.h  |    4 +-
 .../Eigen/src/SparseLU/SparseLU_column_dfs.h  |    4 +-
 .../Eigen/src/SparseLU/SparseLU_gemm_kernel.h |    2 +-
 .../Eigen/src/SparseLU/SparseLU_panel_bmod.h  |    2 +-
 .../Eigen/Eigen/src/SparseQR/SparseQR.h       |   25 +-
 .../Eigen/Eigen/src/StlSupport/StdDeque.h     |   10 +-
 .../Eigen/Eigen/src/StlSupport/StdList.h      |    4 +-
 .../Eigen/Eigen/src/StlSupport/StdVector.h    |    4 +-
 .../Eigen/src/SuperLUSupport/SuperLUSupport.h |    2 +-
 .../Eigen/src/UmfPackSupport/UmfPackSupport.h |  214 +-
 uppsrc/plugin/Eigen/Eigen/src/misc/lapacke.h  |    9 +-
 .../Eigen/src/plugins/ArrayCwiseBinaryOps.h   |   28 +-
 .../Eigen/src/plugins/ArrayCwiseUnaryOps.h    |  114 +-
 .../Eigen/Eigen/src/plugins/BlockMethods.h    |  869 ++---
 .../Eigen/src/plugins/CommonCwiseUnaryOps.h   |   57 -
 .../Eigen/src/plugins/IndexedViewMethods.h    |  262 --
 .../Eigen/Eigen/src/plugins/ReshapedMethods.h |  149 -
 .../plugin/Eigen/srcdoc.tpp/Eigen_en-us.tpp   |    5 +-
 .../plugin/Eigen/unsupported/CMakeLists.txt   |    9 +
 .../Eigen/unsupported/Eigen/AdolcForward      |    2 +-
 .../Eigen/unsupported/Eigen/AlignedVector3    |   12 +-
 .../Eigen/unsupported/Eigen/ArpackSupport     |    8 +-
 .../plugin/Eigen/unsupported/Eigen/AutoDiff   |    6 -
 uppsrc/plugin/Eigen/unsupported/Eigen/BVH     |    6 +-
 .../Eigen/unsupported/Eigen/CMakeLists.txt    |   32 +
 .../Eigen/unsupported/Eigen/CXX11/Tensor      |   61 +-
 .../unsupported/Eigen/CXX11/TensorSymmetry    |    6 +-
 .../Eigen/unsupported/Eigen/CXX11/ThreadPool  |   18 +-
 .../Eigen/CXX11/src/Tensor/README.md          |  217 +-
 .../Eigen/CXX11/src/Tensor/Tensor.h           |   29 +-
 .../Eigen/CXX11/src/Tensor/TensorArgMax.h     |   70 +-
 .../Eigen/CXX11/src/Tensor/TensorAssign.h     |   90 +-
 .../Eigen/CXX11/src/Tensor/TensorBase.h       |  205 +-
 .../Eigen/CXX11/src/Tensor/TensorBlock.h      | 1559 --------
 .../CXX11/src/Tensor/TensorBroadcasting.h     |  766 +---
 .../Eigen/CXX11/src/Tensor/TensorChipping.h   |  218 +-
 .../CXX11/src/Tensor/TensorConcatenation.h    |   57 +-
 .../CXX11/src/Tensor/TensorContraction.h      |  609 +--
 .../src/Tensor/TensorContractionBlocking.h    |   39 +-
 .../CXX11/src/Tensor/TensorContractionCuda.h  | 1393 ++++++-
 .../CXX11/src/Tensor/TensorContractionGpu.h   | 1413 -------
 .../src/Tensor/TensorContractionMapper.h      |  246 +-
 .../CXX11/src/Tensor/TensorContractionSycl.h  | 1650 --------
 .../src/Tensor/TensorContractionThreadPool.h  | 1576 +++-----
 .../Eigen/CXX11/src/Tensor/TensorConversion.h |  234 +-
 .../CXX11/src/Tensor/TensorConvolution.h      |  172 +-
 .../CXX11/src/Tensor/TensorConvolutionSycl.h  |  544 ---
 .../Eigen/CXX11/src/Tensor/TensorCostModel.h  |   10 +-
 .../Eigen/CXX11/src/Tensor/TensorCustomOp.h   |   82 +-
 .../Eigen/CXX11/src/Tensor/TensorDevice.h     |   67 -
 .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h |  337 +-
 .../CXX11/src/Tensor/TensorDeviceDefault.h    |   33 +-
 .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h  |  360 --
 .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 1078 +-----
 .../CXX11/src/Tensor/TensorDeviceThreadPool.h |  347 +-
 .../Eigen/CXX11/src/Tensor/TensorDimensions.h |  160 +-
 .../Eigen/CXX11/src/Tensor/TensorEvalTo.h     |  105 +-
 .../Eigen/CXX11/src/Tensor/TensorEvaluator.h  |  605 +--
 .../Eigen/CXX11/src/Tensor/TensorExecutor.h   |  699 +---
 .../Eigen/CXX11/src/Tensor/TensorExpr.h       |   23 +-
 .../Eigen/CXX11/src/Tensor/TensorFFT.h        |   82 +-
 .../Eigen/CXX11/src/Tensor/TensorFixedSize.h  |    9 +-
 .../Eigen/CXX11/src/Tensor/TensorForcedEval.h |  189 +-
 .../src/Tensor/TensorForwardDeclarations.h    |   90 +-
 .../Eigen/CXX11/src/Tensor/TensorFunctors.h   |  138 +-
 .../Eigen/CXX11/src/Tensor/TensorGenerator.h  |  145 +-
 .../src/Tensor/TensorGpuHipCudaDefines.h      |   93 -
 .../src/Tensor/TensorGpuHipCudaUndefines.h    |   40 -
 .../Eigen/CXX11/src/Tensor/TensorImagePatch.h |  142 +-
 .../Eigen/CXX11/src/Tensor/TensorIndexList.h  |  203 +-
 .../Eigen/CXX11/src/Tensor/TensorInflation.h  |   24 +-
 .../CXX11/src/Tensor/TensorInitializer.h      |    4 +-
 .../Eigen/CXX11/src/Tensor/TensorIntDiv.h     |   28 +-
 .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h |   30 +-
 .../Eigen/CXX11/src/Tensor/TensorMacros.h     |   41 +-
 .../Eigen/CXX11/src/Tensor/TensorMap.h        |   92 +-
 .../Eigen/CXX11/src/Tensor/TensorMeta.h       |  123 +-
 .../Eigen/CXX11/src/Tensor/TensorMorphing.h   |  479 +--
 .../Eigen/CXX11/src/Tensor/TensorPadding.h    |  371 +-
 .../Eigen/CXX11/src/Tensor/TensorPatch.h      |   28 +-
 .../Eigen/CXX11/src/Tensor/TensorRandom.h     |  154 +-
 .../Eigen/CXX11/src/Tensor/TensorReduction.h  |  386 +-
 .../CXX11/src/Tensor/TensorReductionCuda.h    |  750 +++-
 .../CXX11/src/Tensor/TensorReductionGpu.h     |  967 -----
 .../CXX11/src/Tensor/TensorReductionSycl.h    |  746 +---
 .../Eigen/CXX11/src/Tensor/TensorRef.h        |   31 +-
 .../Eigen/CXX11/src/Tensor/TensorReverse.h    |  220 +-
 .../Eigen/CXX11/src/Tensor/TensorScan.h       |  526 +--
 .../Eigen/CXX11/src/Tensor/TensorScanSycl.h   |  512 ---
 .../Eigen/CXX11/src/Tensor/TensorShuffling.h  |  289 +-
 .../Eigen/CXX11/src/Tensor/TensorStriding.h   |   38 +-
 .../Eigen/CXX11/src/Tensor/TensorSycl.h       |   82 +
 .../TensorSyclConvertToDeviceExpression.h     |  121 +
 .../src/Tensor/TensorSyclExprConstructor.h    |  239 ++
 .../src/Tensor/TensorSyclExtractAccessor.h    |  204 +
 .../src/Tensor/TensorSyclExtractFunctors.h    |  177 +
 .../CXX11/src/Tensor/TensorSyclLeafCount.h    |  114 +
 .../src/Tensor/TensorSyclPlaceHolderExpr.h    |  181 +
 .../Eigen/CXX11/src/Tensor/TensorSyclRun.h    |   70 +
 .../Eigen/CXX11/src/Tensor/TensorSyclTuple.h  |  237 ++
 .../Eigen/CXX11/src/Tensor/TensorTrace.h      |  303 --
 .../Eigen/CXX11/src/Tensor/TensorTraits.h     |   44 +-
 .../Eigen/CXX11/src/Tensor/TensorUInt128.h    |    1 -
 .../CXX11/src/Tensor/TensorVolumePatch.h      |   59 +-
 .../TensorSymmetry/util/TemplateGroupTheory.h |    2 +-
 .../Eigen/CXX11/src/ThreadPool/Barrier.h      |   67 -
 .../Eigen/CXX11/src/ThreadPool/EventCount.h   |  196 +-
 .../src/ThreadPool/NonBlockingThreadPool.h    |  398 +-
 .../Eigen/CXX11/src/ThreadPool/RunQueue.h     |   98 +-
 .../CXX11/src/ThreadPool/SimpleThreadPool.h   |  154 +
 .../Eigen/CXX11/src/ThreadPool/ThreadCancel.h |   23 -
 .../CXX11/src/ThreadPool/ThreadEnvironment.h  |    2 -
 .../Eigen/CXX11/src/ThreadPool/ThreadLocal.h  |  289 +-
 .../src/ThreadPool/ThreadPoolInterface.h      |   15 -
 .../Eigen/CXX11/src/util/CXX11Meta.h          |   93 +-
 .../Eigen/CXX11/src/util/CXX11Workarounds.h   |    6 +-
 .../Eigen/CXX11/src/util/EmulateArray.h       |   54 +-
 .../Eigen/CXX11/src/util/EmulateCXX11Meta.h   |  311 ++
 .../Eigen/CXX11/src/util/MaxSizeVector.h      |   51 +-
 .../Eigen/unsupported/Eigen/EulerAngles       |    8 +-
 uppsrc/plugin/Eigen/unsupported/Eigen/FFT     |    7 +-
 .../Eigen/unsupported/Eigen/IterativeSolvers  |   10 +-
 .../unsupported/Eigen/LevenbergMarquardt      |   16 +-
 .../Eigen/unsupported/Eigen/MPRealSupport     |    6 +-
 .../Eigen/unsupported/Eigen/MatrixFunctions   |   10 +-
 .../Eigen/unsupported/Eigen/MoreVectorization |    2 +-
 .../unsupported/Eigen/NonLinearOptimization   |   44 +-
 .../Eigen/unsupported/Eigen/NumericalDiff     |    2 +-
 .../Eigen/unsupported/Eigen/OpenGLSupport     |    4 +-
 .../Eigen/unsupported/Eigen/Polynomials       |    8 +-
 uppsrc/plugin/Eigen/unsupported/Eigen/Skyline |    6 +-
 .../Eigen/unsupported/Eigen/SpecialFunctions  |   29 +-
 uppsrc/plugin/Eigen/unsupported/Eigen/Splines |    4 -
 .../Eigen/src/AutoDiff/AutoDiffScalar.h       |   30 +-
 .../Eigen/unsupported/Eigen/src/BVH/KdBVH.h   |    2 +-
 .../ArpackSelfAdjointEigenSolver.h            |    4 +-
 .../Eigen/src/EulerAngles/CMakeLists.txt      |    4 +-
 .../Eigen/src/EulerAngles/EulerAngles.h       |  261 +-
 .../Eigen/src/EulerAngles/EulerSystem.h       |  197 +-
 .../Eigen/src/FFT/ei_kissfft_impl.h           |    4 +-
 .../IterativeSolvers/ConstrainedConjGrad.h    |    4 +-
 .../Eigen/src/IterativeSolvers/DGMRES.h       |   61 +-
 .../Eigen/src/IterativeSolvers/GMRES.h        |   36 +-
 .../Eigen/src/IterativeSolvers/MINRES.h       |   38 +-
 .../Eigen/src/IterativeSolvers/Scaling.h      |    6 -
 .../Eigen/src/LevenbergMarquardt/LMqrsolv.h   |    2 +-
 .../LevenbergMarquardt/LevenbergMarquardt.h   |    6 +-
 .../src/MatrixFunctions/MatrixExponential.h   |    5 +-
 .../src/MatrixFunctions/MatrixFunction.h      |   29 +-
 .../src/MatrixFunctions/MatrixLogarithm.h     |   20 +-
 .../Eigen/src/MatrixFunctions/MatrixPower.h   |   20 +-
 .../src/MatrixFunctions/MatrixSquareRoot.h    |   12 +-
 .../Eigen/src/NonLinearOptimization/qrsolv.h  |    2 +-
 .../Eigen/src/NonLinearOptimization/r1updt.h  |    2 +-
 .../Eigen/src/Polynomials/Companion.h         |    4 +-
 .../Eigen/src/Skyline/SkylineInplaceLU.h      |    4 +-
 .../Eigen/src/Skyline/SkylineMatrix.h         |   18 +-
 .../Eigen/src/Skyline/SkylineMatrixBase.h     |    2 +-
 .../Eigen/src/Skyline/SkylineStorage.h        |    2 +-
 .../src/SparseExtra/DynamicSparseMatrix.h     |    8 +-
 .../Eigen/src/SparseExtra/MarketIO.h          |   91 +-
 .../Eigen/src/SparseExtra/RandomSetter.h      |    6 +-
 .../BesselFunctionsArrayAPI.h                 |  286 --
 .../BesselFunctionsFunctors.h                 |  357 --
 .../SpecialFunctions/BesselFunctionsHalf.h    |   66 -
 .../SpecialFunctions/BesselFunctionsImpl.h    | 1959 ----------
 .../BesselFunctionsPacketMath.h               |  130 -
 .../SpecialFunctions/HipVectorCompatibility.h |   67 -
 .../SpecialFunctionsArrayAPI.h                |   55 +-
 .../SpecialFunctionsFunctors.h                |  140 +-
 .../SpecialFunctions/SpecialFunctionsHalf.h   |   11 -
 .../SpecialFunctions/SpecialFunctionsImpl.h   | 1028 ++---
 .../SpecialFunctionsPacketMath.h              |   23 +-
 .../arch/CUDA/CudaSpecialFunctions.h          |  165 +
 .../arch/GPU/GpuSpecialFunctions.h            |  369 --
 .../unsupported/Eigen/src/Splines/Spline.h    |    2 +-
 .../Eigen/src/Splines/SplineFitting.h         |   11 +-
 .../unsupported/Eigen/src/Splines/SplineFwd.h |    2 +-
 uppsrc/plugin/Eigen/unsupported/README.txt    |    2 +-
 .../Eigen/unsupported/bench/bench_svd.cpp     |  123 +
 .../Eigen/unsupported/doc/CMakeLists.txt      |    4 +
 .../plugin/Eigen/unsupported/doc/Overview.dox |   28 +
 .../unsupported/doc/eigendoxy_layout.xml.in   |  177 +
 .../unsupported/doc/examples/BVH_Example.cpp  |   50 +
 .../unsupported/doc/examples/CMakeLists.txt   |   20 +
 .../unsupported/doc/examples/EulerAngles.cpp  |   46 +
 .../Eigen/unsupported/doc/examples/FFT.cpp    |  118 +
 .../doc/examples/MatrixExponential.cpp        |   16 +
 .../doc/examples/MatrixFunction.cpp           |   23 +
 .../doc/examples/MatrixLogarithm.cpp          |   15 +
 .../unsupported/doc/examples/MatrixPower.cpp  |   16 +
 .../doc/examples/MatrixPower_optimal.cpp      |   17 +
 .../unsupported/doc/examples/MatrixSine.cpp   |   20 +
 .../unsupported/doc/examples/MatrixSinh.cpp   |   20 +
 .../doc/examples/MatrixSquareRoot.cpp         |   16 +
 .../doc/examples/PolynomialSolver1.cpp        |   53 +
 .../doc/examples/PolynomialUtils1.cpp         |   20 +
 .../unsupported/doc/snippets/CMakeLists.txt   |   26 +
 uppsrc/plugin/Eigen/unsupported/test/BVH.cpp  |  222 ++
 .../Eigen/unsupported/test/CMakeLists.txt     |  263 ++
 .../Eigen/unsupported/test/EulerAngles.cpp    |  208 +
 uppsrc/plugin/Eigen/unsupported/test/FFT.cpp  |    2 +
 uppsrc/plugin/Eigen/unsupported/test/FFTW.cpp |  262 ++
 .../test/NonLinearOptimization.cpp            | 1849 +++++++++
 .../Eigen/unsupported/test/NumericalDiff.cpp  |  114 +
 .../Eigen/unsupported/test/alignedvector3.cpp |   84 +
 .../Eigen/unsupported/test/autodiff.cpp       |  387 ++
 .../unsupported/test/autodiff_scalar.cpp      |  101 +
 .../unsupported/test/cxx11_eventcount.cpp     |  142 +
 .../Eigen/unsupported/test/cxx11_meta.cpp     |  357 ++
 .../test/cxx11_non_blocking_thread_pool.cpp   |  107 +
 .../Eigen/unsupported/test/cxx11_runqueue.cpp |  235 ++
 .../unsupported/test/cxx11_tensor_argmax.cpp  |  294 ++
 .../test/cxx11_tensor_argmax_cuda.cu          |  251 ++
 .../unsupported/test/cxx11_tensor_assign.cpp  |  370 ++
 .../test/cxx11_tensor_broadcast_sycl.cpp      |   74 +
 .../test/cxx11_tensor_broadcasting.cpp        |  194 +
 .../test/cxx11_tensor_cast_float16_cuda.cu    |   79 +
 .../unsupported/test/cxx11_tensor_casts.cpp   |  115 +
 .../test/cxx11_tensor_chipping.cpp            |  425 +++
 .../test/cxx11_tensor_comparisons.cpp         |   84 +
 .../test/cxx11_tensor_complex_cuda.cu         |  150 +
 .../cxx11_tensor_complex_cwise_ops_cuda.cu    |   94 +
 .../test/cxx11_tensor_concatenation.cpp       |  137 +
 .../unsupported/test/cxx11_tensor_const.cpp   |   62 +
 .../test/cxx11_tensor_contract_cuda.cu        |  213 ++
 .../test/cxx11_tensor_contraction.cpp         |  545 +++
 .../test/cxx11_tensor_convolution.cpp         |  149 +
 .../unsupported/test/cxx11_tensor_cuda.cu     | 1284 +++++++
 .../test/cxx11_tensor_custom_index.cpp        |  100 +
 .../test/cxx11_tensor_custom_op.cpp           |  111 +
 .../unsupported/test/cxx11_tensor_device.cu   |  387 ++
 .../test/cxx11_tensor_device_sycl.cpp         |   31 +
 .../test/cxx11_tensor_dimension.cpp           |   69 +
 .../unsupported/test/cxx11_tensor_empty.cpp   |   40 +
 .../unsupported/test/cxx11_tensor_expr.cpp    |  314 ++
 .../unsupported/test/cxx11_tensor_fft.cpp     |  273 ++
 .../test/cxx11_tensor_fixed_size.cpp          |  261 ++
 .../test/cxx11_tensor_forced_eval.cpp         |   79 +
 .../test/cxx11_tensor_forced_eval_sycl.cpp    |   70 +
 .../test/cxx11_tensor_generator.cpp           |   91 +
 .../unsupported/test/cxx11_tensor_ifft.cpp    |  154 +
 .../test/cxx11_tensor_image_patch.cpp         |  757 ++++
 .../test/cxx11_tensor_index_list.cpp          |  386 ++
 .../test/cxx11_tensor_inflation.cpp           |   81 +
 .../unsupported/test/cxx11_tensor_intdiv.cpp  |  147 +
 .../unsupported/test/cxx11_tensor_io.cpp      |  136 +
 .../test/cxx11_tensor_layout_swap.cpp         |   61 +
 .../unsupported/test/cxx11_tensor_lvalue.cpp  |   42 +
 .../unsupported/test/cxx11_tensor_map.cpp     |  277 ++
 .../unsupported/test/cxx11_tensor_math.cpp    |   46 +
 .../test/cxx11_tensor_mixed_indices.cpp       |   53 +
 .../test/cxx11_tensor_morphing.cpp            |  485 +++
 .../test/cxx11_tensor_notification.cpp        |   81 +
 .../test/cxx11_tensor_of_complex.cpp          |  103 +
 .../test/cxx11_tensor_of_const_values.cpp     |  105 +
 .../test/cxx11_tensor_of_float16_cuda.cu      |  491 +++
 .../test/cxx11_tensor_of_strings.cpp          |  152 +
 .../unsupported/test/cxx11_tensor_padding.cpp |   93 +
 .../unsupported/test/cxx11_tensor_patch.cpp   |  172 +
 .../unsupported/test/cxx11_tensor_random.cpp  |   78 +
 .../test/cxx11_tensor_random_cuda.cu          |   85 +
 .../test/cxx11_tensor_reduction.cpp           |  508 +++
 .../test/cxx11_tensor_reduction_cuda.cu       |  154 +
 .../test/cxx11_tensor_reduction_sycl.cpp      |  138 +
 .../unsupported/test/cxx11_tensor_ref.cpp     |  248 ++
 .../unsupported/test/cxx11_tensor_reverse.cpp |  190 +
 .../test/cxx11_tensor_roundings.cpp           |   62 +
 .../unsupported/test/cxx11_tensor_scan.cpp    |  110 +
 .../test/cxx11_tensor_scan_cuda.cu            |   76 +
 .../test/cxx11_tensor_shuffling.cpp           |  228 ++
 .../unsupported/test/cxx11_tensor_simple.cpp  |  327 ++
 .../test/cxx11_tensor_striding.cpp            |  119 +
 .../unsupported/test/cxx11_tensor_sugar.cpp   |   81 +
 .../unsupported/test/cxx11_tensor_sycl.cpp    |  159 +
 .../test/cxx11_tensor_symmetry.cpp            |  818 ++++
 .../test/cxx11_tensor_thread_pool.cpp         |  373 ++
 .../unsupported/test/cxx11_tensor_uint128.cpp |  160 +
 .../test/cxx11_tensor_volume_patch.cpp        |  112 +
 .../plugin/Eigen/unsupported/test/dgmres.cpp  |   31 +
 .../Eigen/unsupported/test/forward_adolc.cpp  |  141 +
 .../plugin/Eigen/unsupported/test/gmres.cpp   |   31 +
 .../unsupported/test/kronecker_product.cpp    |  252 ++
 .../unsupported/test/levenberg_marquardt.cpp  | 1477 ++++++++
 .../unsupported/test/matrix_exponential.cpp   |  141 +
 .../unsupported/test/matrix_function.cpp      |  227 ++
 .../Eigen/unsupported/test/matrix_functions.h |   67 +
 .../Eigen/unsupported/test/matrix_power.cpp   |  204 +
 .../unsupported/test/matrix_square_root.cpp   |   31 +
 .../plugin/Eigen/unsupported/test/minres.cpp  |   44 +
 .../Eigen/unsupported/test/mpreal/mpreal.h    | 3104 +++++++++++++++
 .../Eigen/unsupported/test/mpreal_support.cpp |   65 +
 .../Eigen/unsupported/test/openglsupport.cpp  |  333 ++
 .../unsupported/test/polynomialsolver.cpp     |  232 ++
 .../unsupported/test/polynomialutils.cpp      |  113 +
 .../Eigen/unsupported/test/sparse_extra.cpp   |  147 +
 .../unsupported/test/special_functions.cpp    |  345 ++
 .../plugin/Eigen/unsupported/test/splines.cpp |  281 ++
 514 files changed, 44279 insertions(+), 50937 deletions(-)
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/KLUSupport
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/ArithmeticSequence.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/IndexedView.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/PartialReduxEvaluator.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/Reshaped.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/StlIterators.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/Complex.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h
 rename uppsrc/plugin/Eigen/Eigen/src/Core/arch/{Default => CUDA}/Half.h (79%)
 rename uppsrc/plugin/Eigen/Eigen/src/Core/arch/{GPU => CUDA}/MathFunctions.h (82%)
 create mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMath.h
 create mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
 create mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/TypeCasting.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/PacketMath.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/TypeCasting.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/Complex.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/MathFunctions.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/PacketMath.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/TypeCasting.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/PacketMath.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/util/ConfigureVectorization.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/util/IndexedViewHelper.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/util/IntegralConstant.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/util/ReshapedHelper.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/util/SymbolicIndex.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/KLUSupport/KLUSupport.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/plugins/IndexedViewMethods.h
 delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/plugins/ReshapedMethods.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/CMakeLists.txt
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CMakeLists.txt
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
 delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/bench/bench_svd.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/CMakeLists.txt
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/Overview.dox
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/eigendoxy_layout.xml.in
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/BVH_Example.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/CMakeLists.txt
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/EulerAngles.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/FFT.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixExponential.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixFunction.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixLogarithm.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixPower.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixPower_optimal.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSine.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSinh.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSquareRoot.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/PolynomialSolver1.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/PolynomialUtils1.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/snippets/CMakeLists.txt
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/BVH.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/CMakeLists.txt
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/EulerAngles.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/FFT.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/FFTW.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/NonLinearOptimization.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/NumericalDiff.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/alignedvector3.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/autodiff.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/autodiff_scalar.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_eventcount.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_meta.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_runqueue.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_argmax.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_argmax_cuda.cu
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_assign.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_broadcasting.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_casts.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_chipping.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_comparisons.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_complex_cuda.cu
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_concatenation.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_const.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_contract_cuda.cu
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_contraction.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_convolution.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_cuda.cu
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_custom_index.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_custom_op.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_device.cu
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_device_sycl.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_dimension.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_empty.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_expr.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_fft.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_fixed_size.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_forced_eval.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_generator.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_ifft.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_image_patch.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_index_list.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_inflation.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_intdiv.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_io.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_layout_swap.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_lvalue.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_map.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_math.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_mixed_indices.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_morphing.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_notification.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_complex.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_const_values.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_strings.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_padding.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_patch.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_random.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_random_cuda.cu
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction_cuda.cu
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_ref.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reverse.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_roundings.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_scan.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_scan_cuda.cu
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_shuffling.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_simple.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_striding.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_sugar.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_sycl.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_symmetry.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_thread_pool.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_uint128.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_volume_patch.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/dgmres.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/forward_adolc.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/gmres.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/kronecker_product.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/levenberg_marquardt.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/matrix_exponential.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/matrix_function.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/matrix_functions.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/matrix_power.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/matrix_square_root.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/minres.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/mpreal/mpreal.h
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/mpreal_support.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/openglsupport.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/polynomialsolver.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/polynomialutils.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/sparse_extra.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/special_functions.cpp
 create mode 100644 uppsrc/plugin/Eigen/unsupported/test/splines.cpp

diff --git a/uppsrc/plugin/Eigen/Eigen.h b/uppsrc/plugin/Eigen/Eigen.h
index 2c0c1ba91..bc6cc86cf 100644
--- a/uppsrc/plugin/Eigen/Eigen.h
+++ b/uppsrc/plugin/Eigen/Eigen.h
@@ -9,6 +9,8 @@
 
 #ifndef _DEBUG
 #define EIGEN_NO_DEBUG
+#else
+#define EIGEN_INITIALIZE_MATRICES_BY_NAN 
 #endif
 
 #define eigen_assert(x) ASSERT(x)
diff --git a/uppsrc/plugin/Eigen/Eigen/Core b/uppsrc/plugin/Eigen/Eigen/Core
index 688361d46..ac7c5b300 100644
--- a/uppsrc/plugin/Eigen/Eigen/Core
+++ b/uppsrc/plugin/Eigen/Eigen/Core
@@ -14,26 +14,79 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"
 
-// then include this file where all our macros are defined. It's really important to do it first because
-// it's where we do all the compiler/OS/arch detections and define most defaults.
-#include "src/Core/util/Macros.h"
-
-// This detects SSE/AVX/NEON/etc. and configure alignment settings
-#include "src/Core/util/ConfigureVectorization.h"
-
-// We need cuda_runtime.h/hip_runtime.h to ensure that
-// the EIGEN_USING_STD_MATH macro works properly on the device side
-#if defined(EIGEN_CUDACC)
-  #include <cuda_runtime.h>
-#elif defined(EIGEN_HIPCC)
-  #include <hip/hip_runtime.h>
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+  #define EIGEN_CUDACC __CUDACC__
 #endif
 
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+  #define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+#define EIGEN_CUDACC_VER  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+#define EIGEN_CUDACC_VER __CUDACC_VER__
+#else
+#define EIGEN_CUDACC_VER 0
+#endif
+
+// Handle NVCC/CUDA/SYCL
+#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
+  // Do not try asserts on CUDA and SYCL!
+  #ifndef EIGEN_NO_DEBUG
+  #define EIGEN_NO_DEBUG
+  #endif
+
+  #ifdef EIGEN_INTERNAL_DEBUGGING
+  #undef EIGEN_INTERNAL_DEBUGGING
+  #endif
+
+  #ifdef EIGEN_EXCEPTIONS
+  #undef EIGEN_EXCEPTIONS
+  #endif
+
+  // All functions callable from CUDA code must be qualified with __device__
+  #ifdef __CUDACC__
+    // Do not try to vectorize on CUDA and SYCL!
+    #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+    #endif
+
+    #define EIGEN_DEVICE_FUNC __host__ __device__
+    // We need cuda_runtime.h to ensure that the EIGEN_USING_STD_MATH macro
+    // works properly on the device side
+    #include <cuda_runtime.h>
+  #else
+    #define EIGEN_DEVICE_FUNC
+  #endif
+
+#else
+  #define EIGEN_DEVICE_FUNC
+
+#endif
+
+// When compiling CUDA device code with NVCC, pull in math functions from the
+// global namespace.  In host mode, and when device code is compiled with clang, use the
+// std versions.
+#if defined(__CUDA_ARCH__) && defined(__NVCC__)
+  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
+#else
+  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
+#endif
+
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL)
+  #define EIGEN_EXCEPTIONS
+#endif
 
 #ifdef EIGEN_EXCEPTIONS
   #include <new>
 #endif
 
+// then include this file where all our macros are defined. It's really important to do it first because
+// it's where we do all the alignment settings (platform detection and honoring the user's will if he
+// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
+#include "src/Core/util/Macros.h"
+
 // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
 // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
 #if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6)
@@ -46,9 +99,163 @@
 // and inclusion of their respective header files
 #include "src/Core/util/MKL_support.h"
 
+// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
+// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
+#if EIGEN_MAX_ALIGN_BYTES==0
+  #ifndef EIGEN_DONT_VECTORIZE
+    #define EIGEN_DONT_VECTORIZE
+  #endif
+#endif
 
-#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
-  #define EIGEN_HAS_GPU_FP16
+#if EIGEN_COMP_MSVC
+  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
+  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
+    // Remember that usage of defined() in a #define is undefined by the standard.
+    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
+    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
+      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
+    #endif
+  #endif
+#else
+  // Remember that usage of defined() in a #define is undefined by the standard
+  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
+    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
+  #endif
+#endif
+
+#ifndef EIGEN_DONT_VECTORIZE
+
+  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
+
+    // Defines symbols for compile-time detection of which instructions are
+    // used.
+    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_SSE
+    #define EIGEN_VECTORIZE_SSE2
+
+    // Detect sse3/ssse3/sse4:
+    // gcc and icc define __SSE3__, ...
+    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
+    // want to force the use of those instructions with msvc.
+    #ifdef __SSE3__
+      #define EIGEN_VECTORIZE_SSE3
+    #endif
+    #ifdef __SSSE3__
+      #define EIGEN_VECTORIZE_SSSE3
+    #endif
+    #ifdef __SSE4_1__
+      #define EIGEN_VECTORIZE_SSE4_1
+    #endif
+    #ifdef __SSE4_2__
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX__
+      #define EIGEN_VECTORIZE_AVX
+      #define EIGEN_VECTORIZE_SSE3
+      #define EIGEN_VECTORIZE_SSSE3
+      #define EIGEN_VECTORIZE_SSE4_1
+      #define EIGEN_VECTORIZE_SSE4_2
+    #endif
+    #ifdef __AVX2__
+      #define EIGEN_VECTORIZE_AVX2
+    #endif
+    #ifdef __FMA__
+      #define EIGEN_VECTORIZE_FMA
+    #endif
+    #if defined(__AVX512F__) && defined(EIGEN_ENABLE_AVX512)
+      #define EIGEN_VECTORIZE_AVX512
+      #define EIGEN_VECTORIZE_AVX2
+      #define EIGEN_VECTORIZE_AVX
+      #define EIGEN_VECTORIZE_FMA
+      #ifdef __AVX512DQ__
+        #define EIGEN_VECTORIZE_AVX512DQ
+      #endif
+      #ifdef __AVX512ER__
+        #define EIGEN_VECTORIZE_AVX512ER
+      #endif
+    #endif
+
+    // include files
+
+    // This extern "C" works around a MINGW-w64 compilation issue
+    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
+    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
+    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
+    // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
+    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
+    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
+    extern "C" {
+      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
+      // Doing so triggers some issues with ICC. However, old gcc versions seem not to have this file, thus:
+      #if EIGEN_COMP_ICC >= 1110
+        #include <immintrin.h>
+      #else
+        #include <mmintrin.h>
+        #include <emmintrin.h>
+        #include <xmmintrin.h>
+        #ifdef  EIGEN_VECTORIZE_SSE3
+        #include <pmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSSE3
+        #include <tmmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_1
+        #include <smmintrin.h>
+        #endif
+        #ifdef EIGEN_VECTORIZE_SSE4_2
+        #include <nmmintrin.h>
+        #endif
+        #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
+        #include <immintrin.h>
+        #endif
+      #endif
+    } // end extern "C"
+  #elif defined __VSX__
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_VSX
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
+  #elif defined __ALTIVEC__
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_ALTIVEC
+    #include <altivec.h>
+    // We need to #undef all these ugly tokens defined in <altivec.h>
+    // => use __vector instead of vector
+    #undef bool
+    #undef vector
+    #undef pixel
+  #elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_NEON
+    #include <arm_neon.h>
+  #elif (defined __s390x__ && defined __VEC__)
+    #define EIGEN_VECTORIZE
+    #define EIGEN_VECTORIZE_ZVECTOR
+    #include <vecintrin.h>
+  #endif
+#endif
+
+#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG)
+  // We can use the optimized fp16 to float and float to fp16 conversion routines
+  #define EIGEN_HAS_FP16_C
+#endif
+
+#if defined __CUDACC__
+  #define EIGEN_VECTORIZE_CUDA
+  #include <vector_types.h>
+  #if EIGEN_CUDACC_VER >= 70500
+    #define EIGEN_HAS_CUDA_FP16
+  #endif
+#endif
+
+#if defined EIGEN_HAS_CUDA_FP16
+  #include <host_defines.h>
+  #include <cuda_fp16.h>
 #endif
 
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
@@ -83,10 +290,6 @@
 // for min/max:
 #include <algorithm>
 
-#if EIGEN_HAS_CXX11
-#include <array>
-#endif
-
 // for std::is_nothrow_move_assignable
 #ifdef EIGEN_INCLUDE_TYPE_TRAITS
 #include <type_traits>
@@ -102,25 +305,38 @@
   #include <intrin.h>
 #endif
 
-#if defined(EIGEN_USE_SYCL)
-  #undef min
-  #undef max
-  #undef isnan
-  #undef isinf
-  #undef isfinite
-  #include <SYCL/sycl.hpp>
-  #include <map>
-  #include <memory>
-  #include <utility>
-  #include <thread>
-  #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0
-  #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16
-  #endif
-  #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM1
-  #define EIGEN_SYCL_LOCAL_THREAD_DIM1 16
-  #endif
-#endif
+/** \brief Namespace containing all symbols from the %Eigen library. */
+namespace Eigen {
 
+inline static const char *SimdInstructionSetsInUse(void) {
+#if defined(EIGEN_VECTORIZE_AVX512)
+  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_AVX)
+  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_1)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
+#elif defined(EIGEN_VECTORIZE_SSSE3)
+  return "SSE, SSE2, SSE3, SSSE3";
+#elif defined(EIGEN_VECTORIZE_SSE3)
+  return "SSE, SSE2, SSE3";
+#elif defined(EIGEN_VECTORIZE_SSE2)
+  return "SSE, SSE2";
+#elif defined(EIGEN_VECTORIZE_ALTIVEC)
+  return "AltiVec";
+#elif defined(EIGEN_VECTORIZE_VSX)
+  return "VSX";
+#elif defined(EIGEN_VECTORIZE_NEON)
+  return "ARM NEON";
+#elif defined(EIGEN_VECTORIZE_ZVECTOR)
+  return "S390X ZVECTOR";
+#else
+  return "None";
+#endif
+}
+
+} // end namespace Eigen
 
 #if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT
 // This will generate an error message:
@@ -129,7 +345,7 @@
 
 namespace Eigen {
 
-// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
+// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
 // ensure QNX/QCC support
 using std::size_t;
 // gcc 4.6.0 wants std:: for ptrdiff_t
@@ -153,85 +369,60 @@ using std::ptrdiff_t;
 #include "src/Core/util/StaticAssert.h"
 #include "src/Core/util/XprHelper.h"
 #include "src/Core/util/Memory.h"
-#include "src/Core/util/IntegralConstant.h"
-#include "src/Core/util/SymbolicIndex.h"
 
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
 #include "src/Core/GenericPacketMath.h"
 #include "src/Core/MathFunctionsImpl.h"
 #include "src/Core/arch/Default/ConjHelper.h"
-// Generic half float support
-#include "src/Core/arch/Default/Half.h"
-#include "src/Core/arch/Default/TypeCasting.h"
-#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
 
 #if defined EIGEN_VECTORIZE_AVX512
   #include "src/Core/arch/SSE/PacketMath.h"
-  #include "src/Core/arch/SSE/TypeCasting.h"
-  #include "src/Core/arch/SSE/Complex.h"
-  #include "src/Core/arch/AVX/PacketMath.h"
-  #include "src/Core/arch/AVX/TypeCasting.h"
-  #include "src/Core/arch/AVX/Complex.h"
-  #include "src/Core/arch/AVX512/PacketMath.h"
-  #include "src/Core/arch/AVX512/TypeCasting.h"
-  #include "src/Core/arch/AVX512/Complex.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
   #include "src/Core/arch/AVX/MathFunctions.h"
+  #include "src/Core/arch/AVX512/PacketMath.h"
   #include "src/Core/arch/AVX512/MathFunctions.h"
 #elif defined EIGEN_VECTORIZE_AVX
   // Use AVX for floats and doubles, SSE for integers
   #include "src/Core/arch/SSE/PacketMath.h"
-  #include "src/Core/arch/SSE/TypeCasting.h"
   #include "src/Core/arch/SSE/Complex.h"
-  #include "src/Core/arch/AVX/PacketMath.h"
-  #include "src/Core/arch/AVX/TypeCasting.h"
-  #include "src/Core/arch/AVX/Complex.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
+  #include "src/Core/arch/AVX/PacketMath.h"
   #include "src/Core/arch/AVX/MathFunctions.h"
+  #include "src/Core/arch/AVX/Complex.h"
+  #include "src/Core/arch/AVX/TypeCasting.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
 #elif defined EIGEN_VECTORIZE_SSE
   #include "src/Core/arch/SSE/PacketMath.h"
-  #include "src/Core/arch/SSE/TypeCasting.h"
   #include "src/Core/arch/SSE/MathFunctions.h"
   #include "src/Core/arch/SSE/Complex.h"
+  #include "src/Core/arch/SSE/TypeCasting.h"
 #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
   #include "src/Core/arch/AltiVec/PacketMath.h"
   #include "src/Core/arch/AltiVec/MathFunctions.h"
   #include "src/Core/arch/AltiVec/Complex.h"
 #elif defined EIGEN_VECTORIZE_NEON
   #include "src/Core/arch/NEON/PacketMath.h"
-  #include "src/Core/arch/NEON/TypeCasting.h"
   #include "src/Core/arch/NEON/MathFunctions.h"
   #include "src/Core/arch/NEON/Complex.h"
 #elif defined EIGEN_VECTORIZE_ZVECTOR
   #include "src/Core/arch/ZVector/PacketMath.h"
   #include "src/Core/arch/ZVector/MathFunctions.h"
   #include "src/Core/arch/ZVector/Complex.h"
-#elif defined EIGEN_VECTORIZE_MSA
-  #include "src/Core/arch/MSA/PacketMath.h"
-  #include "src/Core/arch/MSA/MathFunctions.h"
-  #include "src/Core/arch/MSA/Complex.h"
 #endif
 
-#if defined EIGEN_VECTORIZE_GPU
-  #include "src/Core/arch/GPU/PacketMath.h"
-  #include "src/Core/arch/GPU/MathFunctions.h"
-  #include "src/Core/arch/GPU/TypeCasting.h"
-#endif
+// Half float support
+#include "src/Core/arch/CUDA/Half.h"
+#include "src/Core/arch/CUDA/PacketMathHalf.h"
+#include "src/Core/arch/CUDA/TypeCasting.h"
 
-#if defined(EIGEN_USE_SYCL)
-  #include "src/Core/arch/SYCL/SyclMemoryModel.h"
-  #include "src/Core/arch/SYCL/InteropHeaders.h"
-#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
-  #include "src/Core/arch/SYCL/PacketMath.h"
-  #include "src/Core/arch/SYCL/MathFunctions.h"
-  #include "src/Core/arch/SYCL/TypeCasting.h"
-#endif
+#if defined EIGEN_VECTORIZE_CUDA
+  #include "src/Core/arch/CUDA/PacketMath.h"
+  #include "src/Core/arch/CUDA/MathFunctions.h"
 #endif
 
 #include "src/Core/arch/Default/Settings.h"
-// This file provides generic implementations valid for scalar as well
-#include "src/Core/arch/Default/GenericPacketMathFunctions.h"
 
 #include "src/Core/functors/TernaryFunctors.h"
 #include "src/Core/functors/BinaryFunctors.h"
@@ -242,16 +433,9 @@ using std::ptrdiff_t;
 
 // Specialized functors to enable the processing of complex numbers
 // on CUDA devices
-#ifdef EIGEN_CUDACC
 #include "src/Core/arch/CUDA/Complex.h"
-#endif
 
-#include "src/Core/util/IndexedViewHelper.h"
-#include "src/Core/util/ReshapedHelper.h"
-#include "src/Core/ArithmeticSequence.h"
-#ifndef EIGEN_NO_IO
-  #include "src/Core/IO.h"
-#endif
+#include "src/Core/IO.h"
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
@@ -292,8 +476,6 @@ using std::ptrdiff_t;
 #include "src/Core/Ref.h"
 #include "src/Core/Block.h"
 #include "src/Core/VectorBlock.h"
-#include "src/Core/IndexedView.h"
-#include "src/Core/Reshaped.h"
 #include "src/Core/Transpose.h"
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/Diagonal.h"
@@ -333,12 +515,10 @@ using std::ptrdiff_t;
 #include "src/Core/BooleanRedux.h"
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
-#include "src/Core/PartialReduxEvaluator.h"
 #include "src/Core/Random.h"
 #include "src/Core/Replicate.h"
 #include "src/Core/Reverse.h"
 #include "src/Core/ArrayWrapper.h"
-#include "src/Core/StlIterators.h"
 
 #ifdef EIGEN_USE_BLAS
 #include "src/Core/products/GeneralMatrixMatrix_BLAS.h"
diff --git a/uppsrc/plugin/Eigen/Eigen/Geometry b/uppsrc/plugin/Eigen/Eigen/Geometry
index 16b4bd6e1..da88c03bb 100644
--- a/uppsrc/plugin/Eigen/Eigen/Geometry
+++ b/uppsrc/plugin/Eigen/Eigen/Geometry
@@ -49,8 +49,9 @@
 #include "src/Geometry/AlignedBox.h"
 #include "src/Geometry/Umeyama.h"
 
-// Use the SSE optimized version whenever possible.
-#if defined EIGEN_VECTORIZE_SSE
+// Use the SSE optimized version whenever possible. At the moment the
+// SSE version doesn't compile when AVX is enabled
+#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX
 #include "src/Geometry/arch/Geometry_SSE.h"
 #endif
 
@@ -58,3 +59,4 @@
 
 #endif // EIGEN_GEOMETRY_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
+
diff --git a/uppsrc/plugin/Eigen/Eigen/KLUSupport b/uppsrc/plugin/Eigen/Eigen/KLUSupport
deleted file mode 100644
index b23d90535..000000000
--- a/uppsrc/plugin/Eigen/Eigen/KLUSupport
+++ /dev/null
@@ -1,41 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_KLUSUPPORT_MODULE_H
-#define EIGEN_KLUSUPPORT_MODULE_H
-
-#include <Eigen/SparseCore>
-
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
-
-extern "C" {
-#include <btf.h>
-#include <klu.h>
-   }
-
-/** \ingroup Support_modules
-  * \defgroup KLUSupport_Module KLUSupport module
-  *
-  * This module provides an interface to the KLU library which is part of the <a href="http://www.suitesparse.com">suitesparse</a> package.
-  * It provides the following factorization class:
-  * - class KLU: a sparse LU factorization, well-suited for circuit simulation.
-  *
-  * \code
-  * #include <Eigen/KLUSupport>
-  * \endcode
-  *
-  * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies.
-  * The dependencies depend on how umfpack has been compiled.
-  * For a cmake based project, you can use our FindKLU.cmake module to help you in this task.
-  *
-  */
-
-#include "src/KLUSupport/KLUSupport.h"
-
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
-
-#endif // EIGEN_KLUSUPPORT_MODULE_H
diff --git a/uppsrc/plugin/Eigen/Eigen/OrderingMethods b/uppsrc/plugin/Eigen/Eigen/OrderingMethods
index 29691a62b..d8ea36193 100644
--- a/uppsrc/plugin/Eigen/Eigen/OrderingMethods
+++ b/uppsrc/plugin/Eigen/Eigen/OrderingMethods
@@ -63,7 +63,10 @@
   * \endcode
   */
 
+#ifndef EIGEN_MPL2_ONLY
 #include "src/OrderingMethods/Amd.h"
+#endif
+
 #include "src/OrderingMethods/Ordering.h"
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/uppsrc/plugin/Eigen/Eigen/PaStiXSupport b/uppsrc/plugin/Eigen/Eigen/PaStiXSupport
index 234619acc..de3a63b4d 100644
--- a/uppsrc/plugin/Eigen/Eigen/PaStiXSupport
+++ b/uppsrc/plugin/Eigen/Eigen/PaStiXSupport
@@ -36,7 +36,6 @@ extern "C" {
   * \endcode
   *
   * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
-  * This wrapper resuires PaStiX version 5.x compiled without MPI support.
   * The dependencies depend on how PaSTiX has been compiled.
   * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
   *
diff --git a/uppsrc/plugin/Eigen/Eigen/Sparse b/uppsrc/plugin/Eigen/Eigen/Sparse
index a2ef7a665..136e681a1 100644
--- a/uppsrc/plugin/Eigen/Eigen/Sparse
+++ b/uppsrc/plugin/Eigen/Eigen/Sparse
@@ -25,7 +25,9 @@
 
 #include "SparseCore"
 #include "OrderingMethods"
+#ifndef EIGEN_MPL2_ONLY
 #include "SparseCholesky"
+#endif
 #include "SparseLU"
 #include "SparseQR"
 #include "IterativeLinearSolvers"
diff --git a/uppsrc/plugin/Eigen/Eigen/SparseCholesky b/uppsrc/plugin/Eigen/Eigen/SparseCholesky
index d2b1f1276..b6a320c40 100644
--- a/uppsrc/plugin/Eigen/Eigen/SparseCholesky
+++ b/uppsrc/plugin/Eigen/Eigen/SparseCholesky
@@ -30,8 +30,16 @@
   * \endcode
   */
 
+#ifdef EIGEN_MPL2_ONLY
+#error The SparseCholesky module has nothing to offer in MPL2 only mode
+#endif
+
 #include "src/SparseCholesky/SimplicialCholesky.h"
+
+#ifndef EIGEN_MPL2_ONLY
 #include "src/SparseCholesky/SimplicialCholesky_impl.h"
+#endif
+
 #include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SPARSECHOLESKY_MODULE_H
diff --git a/uppsrc/plugin/Eigen/Eigen/SparseLU b/uppsrc/plugin/Eigen/Eigen/SparseLU
index 37c4a5c5a..38b38b531 100644
--- a/uppsrc/plugin/Eigen/Eigen/SparseLU
+++ b/uppsrc/plugin/Eigen/Eigen/SparseLU
@@ -23,8 +23,6 @@
 // Ordering interface
 #include "OrderingMethods"
 
-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "src/SparseLU/SparseLU_gemm_kernel.h"
 
 #include "src/SparseLU/SparseLU_Structs.h"
@@ -45,6 +43,4 @@
 #include "src/SparseLU/SparseLU_Utils.h"
 #include "src/SparseLU/SparseLU.h"
 
-#include "src/Core/util/ReenableStupidWarnings.h"
-
 #endif // EIGEN_SPARSELU_MODULE_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LDLT.h b/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LDLT.h
index 67e97ffb8..15ccf24f1 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LDLT.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LDLT.h
@@ -16,15 +16,6 @@
 namespace Eigen {
 
 namespace internal {
-  template<typename _MatrixType, int _UpLo> struct traits<LDLT<_MatrixType, _UpLo> >
-   : traits<_MatrixType>
-  {
-    typedef MatrixXpr XprKind;
-    typedef SolverStorage StorageKind;
-    typedef int StorageIndex;
-    enum { Flags = 0 };
-  };
-
   template<typename MatrixType, int UpLo> struct LDLT_Traits;
 
   // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
@@ -57,19 +48,20 @@ namespace internal {
   * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
   */
 template<typename _MatrixType, int _UpLo> class LDLT
-        : public SolverBase<LDLT<_MatrixType, _UpLo> >
 {
   public:
     typedef _MatrixType MatrixType;
-    typedef SolverBase<LDLT> Base;
-    friend class SolverBase<LDLT>;
-
-    EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT)
     enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
       UpLo = _UpLo
     };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;
 
     typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
@@ -188,7 +180,6 @@ template<typename _MatrixType, int _UpLo> class LDLT
       return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
     }
 
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
       *
       * This function also supports in-place solves using the syntax <tt>x = decompositionObject.solve(x)</tt> .
@@ -206,8 +197,13 @@ template<typename _MatrixType, int _UpLo> class LDLT
       */
     template<typename Rhs>
     inline const Solve<LDLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const;
-    #endif
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "LDLT is not initialized.");
+      eigen_assert(m_matrix.rows()==b.rows()
+                && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
+      return Solve<LDLT, Rhs>(*this, b.derived());
+    }
 
     template<typename Derived>
     bool solveInPlace(MatrixBase<Derived> &bAndX) const;
@@ -251,7 +247,7 @@ template<typename _MatrixType, int _UpLo> class LDLT
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the factorization failed because of a zero pivot.
       */
     ComputationInfo info() const
@@ -262,10 +258,8 @@ template<typename _MatrixType, int _UpLo> class LDLT
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
-
-    template<bool Conjugate, typename RhsType, typename DstType>
-    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
@@ -566,22 +560,14 @@ template<typename _MatrixType, int _UpLo>
 template<typename RhsType, typename DstType>
 void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  _solve_impl_transposed<true>(rhs, dst);
-}
-
-template<typename _MatrixType,int _UpLo>
-template<bool Conjugate, typename RhsType, typename DstType>
-void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
-{
+  eigen_assert(rhs.rows() == rows());
   // dst = P b
   dst = m_transpositions * rhs;
 
   // dst = L^-1 (P b)
-  // dst = L^-*T (P b)
-  matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
+  matrixL().solveInPlace(dst);
 
-  // dst = D^-* (L^-1 P b)
-  // dst = D^-1 (L^-*T P b)
+  // dst = D^-1 (L^-1 P b)
   // more precisely, use pseudo-inverse of D (see bug 241)
   using std::abs;
   const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
@@ -593,6 +579,7 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType
   // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
   // Using numeric_limits::min() gives us more robustness to denormals.
   RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
+
   for (Index i = 0; i < vecD.size(); ++i)
   {
     if(abs(vecD(i)) > tolerance)
@@ -601,12 +588,10 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType
       dst.row(i).setZero();
   }
 
-  // dst = L^-* (D^-* L^-1 P b)
-  // dst = L^-T (D^-1 L^-*T P b)
-  matrixL().transpose().template conjugateIf<Conjugate>().solveInPlace(dst);
+  // dst = L^-T (D^-1 L^-1 P b)
+  matrixU().solveInPlace(dst);
 
-  // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b
-  // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b
+  // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
   dst = m_transpositions.transpose() * dst;
 }
 #endif
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LLT.h b/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LLT.h
index 5876966e6..e1624d21b 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LLT.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LLT.h
@@ -13,16 +13,6 @@
 namespace Eigen {
 
 namespace internal{
-
-template<typename _MatrixType, int _UpLo> struct traits<LLT<_MatrixType, _UpLo> >
- : traits<_MatrixType>
-{
-  typedef MatrixXpr XprKind;
-  typedef SolverStorage StorageKind;
-  typedef int StorageIndex;
-  enum { Flags = 0 };
-};
-
 template<typename MatrixType, int UpLo> struct LLT_Traits;
 }
 
@@ -64,17 +54,18 @@ template<typename MatrixType, int UpLo> struct LLT_Traits;
   * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
   */
 template<typename _MatrixType, int _UpLo> class LLT
-        : public SolverBase<LLT<_MatrixType, _UpLo> >
 {
   public:
     typedef _MatrixType MatrixType;
-    typedef SolverBase<LLT> Base;
-    friend class SolverBase<LLT>;
-
-    EIGEN_GENERIC_PUBLIC_INTERFACE(LLT)
     enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+    typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
+    typedef typename MatrixType::StorageIndex StorageIndex;
 
     enum {
       PacketSize = internal::packet_traits<Scalar>::size,
@@ -109,7 +100,7 @@ template<typename _MatrixType, int _UpLo> class LLT
       compute(matrix.derived());
     }
 
-    /** \brief Constructs a LLT factorization from a given matrix
+    /** \brief Constructs a LLT factorization from a given matrix
       *
       * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
       * \c MatrixType is a Eigen::Ref.
@@ -138,7 +129,6 @@ template<typename _MatrixType, int _UpLo> class LLT
       return Traits::getL(m_matrix);
     }
 
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
       *
       * Since this LLT class assumes anyway that the matrix A is invertible, the solution
@@ -151,8 +141,13 @@ template<typename _MatrixType, int _UpLo> class LLT
       */
     template<typename Rhs>
     inline const Solve<LLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const;
-    #endif
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "LLT is not initialized.");
+      eigen_assert(m_matrix.rows()==b.rows()
+                && "LLT::solve(): invalid number of rows of the right hand side matrix b");
+      return Solve<LLT, Rhs>(*this, b.derived());
+    }
 
     template<typename Derived>
     void solveInPlace(const MatrixBase<Derived> &bAndX) const;
@@ -185,7 +180,7 @@ template<typename _MatrixType, int _UpLo> class LLT
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears not to be positive definite.
       */
     ComputationInfo info() const
@@ -205,14 +200,12 @@ template<typename _MatrixType, int _UpLo> class LLT
     inline Index cols() const { return m_matrix.cols(); }
 
     template<typename VectorType>
-    LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
+    LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
-
-    template<bool Conjugate, typename RhsType, typename DstType>
-    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
@@ -466,7 +459,7 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const EigenBase<InputType>
   */
 template<typename _MatrixType, int _UpLo>
 template<typename VectorType>
-LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
+LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
   eigen_assert(v.size()==m_matrix.cols());
@@ -484,17 +477,8 @@ template<typename _MatrixType,int _UpLo>
 template<typename RhsType, typename DstType>
 void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
-  _solve_impl_transposed<true>(rhs, dst);
-}
-
-template<typename _MatrixType,int _UpLo>
-template<bool Conjugate, typename RhsType, typename DstType>
-void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
-{
-    dst = rhs;
-
-    matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
-    matrixU().template conjugateIf<!Conjugate>().solveInPlace(dst);
+  dst = rhs;
+  solveInPlace(dst);
 }
 #endif
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/CholmodSupport/CholmodSupport.h b/uppsrc/plugin/Eigen/Eigen/src/CholmodSupport/CholmodSupport.h
index adaf52858..571972023 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_CHOLMODSUPPORT_H
 #define EIGEN_CHOLMODSUPPORT_H
 
-namespace Eigen {
+namespace Eigen { 
 
 namespace internal {
 
@@ -32,7 +32,7 @@ template<> struct cholmod_configure_matrix<std::complex<double> > {
   }
 };
 
-// Other scalar types are not yet supported by Cholmod
+// Other scalar types are not yet supported by Cholmod
 // template<> struct cholmod_configure_matrix<float> {
 //   template<typename CholmodType>
 //   static void run(CholmodType& mat) {
@@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
 
   res.dtype   = 0;
   res.stype   = -1;
-
+  
   if (internal::is_same<_StorageIndex,int>::value)
   {
     res.itype = CHOLMOD_INT;
   }
-  else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value)
+  else if (internal::is_same<_StorageIndex,long>::value)
   {
     res.itype = CHOLMOD_LONG;
   }
@@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_StorageIndex> >
 
   // setup res.xtype
   internal::cholmod_configure_matrix<_Scalar>::run(res);
-
+  
   res.stype = 0;
-
+  
   return res;
 }
 
@@ -121,12 +121,9 @@ template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
 cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
 {
   cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<_Scalar,_Options,_Index> >(mat.matrix().const_cast_derived()));
-
+  
   if(UpLo==Upper) res.stype =  1;
   if(UpLo==Lower) res.stype = -1;
-  // swap stype for rowmajor matrices (only works for real matrices)
-  EIGEN_STATIC_ASSERT((_Options & RowMajorBit) == 0 || NumTraits<_Scalar>::IsComplex == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
-  if(_Options & RowMajorBit) res.stype *=-1;
 
   return res;
 }
@@ -162,44 +159,6 @@ MappedSparseMatrix<Scalar,Flags,StorageIndex> viewAsEigen(cholmod_sparse& cm)
           static_cast<StorageIndex*>(cm.p), static_cast<StorageIndex*>(cm.i),static_cast<Scalar*>(cm.x) );
 }
 
-namespace internal {
-
-// template specializations for int and long that call the correct cholmod method
-
-#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
-    template<typename _StorageIndex> inline ret cm_ ## name       (cholmod_common &Common) { return cholmod_ ## name   (&Common); } \
-    template<>                       inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
-
-#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
-    template<typename _StorageIndex> inline ret cm_ ## name       (t1& a1, cholmod_common &Common) { return cholmod_ ## name   (&a1, &Common); } \
-    template<>                       inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
-
-EIGEN_CHOLMOD_SPECIALIZE0(int, start)
-EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
-
-EIGEN_CHOLMOD_SPECIALIZE1(int, free_factor, cholmod_factor*, L)
-EIGEN_CHOLMOD_SPECIALIZE1(int, free_dense,  cholmod_dense*,  X)
-EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
-
-EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
-
-template<typename _StorageIndex> inline cholmod_dense*  cm_solve         (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_solve     (sys, &L, &B, &Common); }
-template<>                       inline cholmod_dense*  cm_solve<SuiteSparse_long>   (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_l_solve   (sys, &L, &B, &Common); }
-
-template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve       (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve   (sys, &L, &B, &Common); }
-template<>                       inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
-
-template<typename _StorageIndex>
-inline int  cm_factorize_p       (cholmod_sparse*  A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p   (A, beta, fset, fsize, L, &Common); }
-template<>
-inline int  cm_factorize_p<SuiteSparse_long> (cholmod_sparse*  A, double beta[2], SuiteSparse_long* fset,          std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
-
-#undef EIGEN_CHOLMOD_SPECIALIZE0
-#undef EIGEN_CHOLMOD_SPECIALIZE1
-
-}  // namespace internal
-
-
 enum CholmodMode {
   CholmodAuto, CholmodSimplicialLLt, CholmodSupernodalLLt, CholmodLDLt
 };
@@ -236,7 +195,7 @@ class CholmodBase : public SparseSolverBase<Derived>
     {
       EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
       m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
-      internal::cm_start<StorageIndex>(m_cholmod);
+      cholmod_start(&m_cholmod);
     }
 
     explicit CholmodBase(const MatrixType& matrix)
@@ -244,23 +203,23 @@ class CholmodBase : public SparseSolverBase<Derived>
     {
       EIGEN_STATIC_ASSERT((internal::is_same<double,RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
       m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
-      internal::cm_start<StorageIndex>(m_cholmod);
+      cholmod_start(&m_cholmod);
       compute(matrix);
     }
 
     ~CholmodBase()
     {
       if(m_cholmodFactor)
-        internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
-      internal::cm_finish<StorageIndex>(m_cholmod);
+        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
+      cholmod_finish(&m_cholmod);
     }
-
+    
     inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
     inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
-
+    
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -276,29 +235,29 @@ class CholmodBase : public SparseSolverBase<Derived>
       factorize(matrix);
       return derived();
     }
-
+    
     /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
       *
       * This function is particularly useful when solving for several problems having the same structure.
-      *
+      * 
       * \sa factorize()
       */
     void analyzePattern(const MatrixType& matrix)
     {
       if(m_cholmodFactor)
       {
-        internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
+        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
         m_cholmodFactor = 0;
       }
       cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
-      m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
-
+      m_cholmodFactor = cholmod_analyze(&A, &m_cholmod);
+      
       this->m_isInitialized = true;
       this->m_info = Success;
       m_analysisIsOk = true;
       m_factorizationIsOk = false;
     }
-
+    
     /** Performs a numeric decomposition of \a matrix
       *
       * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
@@ -309,17 +268,17 @@ class CholmodBase : public SparseSolverBase<Derived>
     {
       eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
       cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
-      internal::cm_factorize_p<StorageIndex>(&A, m_shiftOffset, 0, 0, m_cholmodFactor, m_cholmod);
+      cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod);
 
       // If the factorization failed, minor is the column at which it did. On success minor == n.
       this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
       m_factorizationIsOk = true;
     }
-
+    
     /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
      *  See the Cholmod user guide for details. */
     cholmod_common& cholmod() { return m_cholmod; }
-
+    
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal */
     template<typename Rhs,typename Dest>
@@ -329,23 +288,22 @@ class CholmodBase : public SparseSolverBase<Derived>
       const Index size = m_cholmodFactor->n;
       EIGEN_UNUSED_VARIABLE(size);
       eigen_assert(size==b.rows());
-
-      // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
+      
+      // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
       Ref<const Matrix<typename Rhs::Scalar,Dynamic,Dynamic,ColMajor> > b_ref(b.derived());
 
       cholmod_dense b_cd = viewAsCholmod(b_ref);
-      cholmod_dense* x_cd = internal::cm_solve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cd, m_cholmod);
+      cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod);
       if(!x_cd)
       {
         this->m_info = NumericalIssue;
         return;
       }
       // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
-      // NOTE Actually, the copy can be avoided by calling cholmod_solve2 instead of cholmod_solve
       dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
-      internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
+      cholmod_free_dense(&x_cd, &m_cholmod);
     }
-
+    
     /** \internal */
     template<typename RhsDerived, typename DestDerived>
     void _solve_impl(const SparseMatrixBase<RhsDerived> &b, SparseMatrixBase<DestDerived> &dest) const
@@ -358,20 +316,19 @@ class CholmodBase : public SparseSolverBase<Derived>
       // note: cs stands for Cholmod Sparse
       Ref<SparseMatrix<typename RhsDerived::Scalar,ColMajor,typename RhsDerived::StorageIndex> > b_ref(b.const_cast_derived());
       cholmod_sparse b_cs = viewAsCholmod(b_ref);
-      cholmod_sparse* x_cs = internal::cm_spsolve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cs, m_cholmod);
+      cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod);
       if(!x_cs)
       {
         this->m_info = NumericalIssue;
         return;
       }
       // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
-      // NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's sparse solver)
       dest.derived() = viewAsEigen<typename DestDerived::Scalar,ColMajor,typename DestDerived::StorageIndex>(*x_cs);
-      internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
+      cholmod_free_sparse(&x_cs, &m_cholmod);
     }
     #endif // EIGEN_PARSED_BY_DOXYGEN
-
-
+    
+    
     /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
       *
       * During the numerical factorization, an offset term is added to the diagonal coefficients:\n
@@ -386,7 +343,7 @@ class CholmodBase : public SparseSolverBase<Derived>
       m_shiftOffset[0] = double(offset);
       return derived();
     }
-
+    
     /** \returns the determinant of the underlying matrix from the current factorization */
     Scalar determinant() const
     {
@@ -441,7 +398,7 @@ class CholmodBase : public SparseSolverBase<Derived>
     template<typename Stream>
     void dumpMemory(Stream& /*s*/)
     {}
-
+    
   protected:
     mutable cholmod_common m_cholmod;
     cholmod_factor* m_cholmodFactor;
@@ -478,11 +435,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
     using Base::m_cholmod;
-
+    
   public:
-
+    
     typedef _MatrixType MatrixType;
-
+    
     CholmodSimplicialLLT() : Base() { init(); }
 
     CholmodSimplicialLLT(const MatrixType& matrix) : Base()
@@ -529,11 +486,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
     using Base::m_cholmod;
-
+    
   public:
-
+    
     typedef _MatrixType MatrixType;
-
+    
     CholmodSimplicialLDLT() : Base() { init(); }
 
     CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
@@ -578,11 +535,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
     using Base::m_cholmod;
-
+    
   public:
-
+    
     typedef _MatrixType MatrixType;
-
+    
     CholmodSupernodalLLT() : Base() { init(); }
 
     CholmodSupernodalLLT(const MatrixType& matrix) : Base()
@@ -629,11 +586,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
 {
     typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
     using Base::m_cholmod;
-
+    
   public:
-
+    
     typedef _MatrixType MatrixType;
-
+    
     CholmodDecomposition() : Base() { init(); }
 
     CholmodDecomposition(const MatrixType& matrix) : Base()
@@ -643,7 +600,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom
     }
 
     ~CholmodDecomposition() {}
-
+    
     void setMode(CholmodMode mode)
     {
       switch(mode)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/ArithmeticSequence.h b/uppsrc/plugin/Eigen/Eigen/src/Core/ArithmeticSequence.h
deleted file mode 100644
index b6200fac1..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/ArithmeticSequence.h
+++ /dev/null
@@ -1,413 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_ARITHMETIC_SEQUENCE_H
-#define EIGEN_ARITHMETIC_SEQUENCE_H
-
-namespace Eigen {
-
-namespace internal {
-
-#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
-template<typename T> struct aseq_negate {};
-
-template<> struct aseq_negate<Index> {
-  typedef Index type;
-};
-
-template<int N> struct aseq_negate<FixedInt<N> > {
-  typedef FixedInt<-N> type;
-};
-
-// Compilation error in the following case:
-template<> struct aseq_negate<FixedInt<DynamicIndex> > {};
-
-template<typename FirstType,typename SizeType,typename IncrType,
-         bool FirstIsSymbolic=symbolic::is_symbolic<FirstType>::value,
-         bool SizeIsSymbolic =symbolic::is_symbolic<SizeType>::value>
-struct aseq_reverse_first_type {
-  typedef Index type;
-};
-
-template<typename FirstType,typename SizeType,typename IncrType>
-struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,true> {
-  typedef symbolic::AddExpr<FirstType,
-                            symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
-                                                  symbolic::ValueExpr<IncrType> >
-                           > type;
-};
-
-template<typename SizeType,typename IncrType,typename EnableIf = void>
-struct aseq_reverse_first_type_aux {
-  typedef Index type;
-};
-
-template<typename SizeType,typename IncrType>
-struct aseq_reverse_first_type_aux<SizeType,IncrType,typename internal::enable_if<bool((SizeType::value+IncrType::value)|0x1)>::type> {
-  typedef FixedInt<(SizeType::value-1)*IncrType::value> type;
-};
-
-template<typename FirstType,typename SizeType,typename IncrType>
-struct aseq_reverse_first_type<FirstType,SizeType,IncrType,true,false> {
-  typedef typename aseq_reverse_first_type_aux<SizeType,IncrType>::type Aux;
-  typedef symbolic::AddExpr<FirstType,symbolic::ValueExpr<Aux> > type;
-};
-
-template<typename FirstType,typename SizeType,typename IncrType>
-struct aseq_reverse_first_type<FirstType,SizeType,IncrType,false,true> {
-  typedef symbolic::AddExpr<symbolic::ProductExpr<symbolic::AddExpr<SizeType,symbolic::ValueExpr<FixedInt<-1> > >,
-                                                  symbolic::ValueExpr<IncrType> >,
-                            symbolic::ValueExpr<> > type;
-};
-#endif
-
-// Helper to cleanup the type of the increment:
-template<typename T> struct cleanup_seq_incr {
-  typedef typename cleanup_index_type<T,DynamicIndex>::type type;
-};
-
-}
-
-//--------------------------------------------------------------------------------
-// seq(first,last,incr) and seqN(first,size,incr)
-//--------------------------------------------------------------------------------
-
-template<typename FirstType=Index,typename SizeType=Index,typename IncrType=internal::FixedInt<1> >
-class ArithmeticSequence;
-
-template<typename FirstType,typename SizeType,typename IncrType>
-ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
-                   typename internal::cleanup_index_type<SizeType>::type,
-                   typename internal::cleanup_seq_incr<IncrType>::type >
-seqN(FirstType first, SizeType size, IncrType incr);
-
-/** \class ArithmeticSequence
-  * \ingroup Core_Module
-  *
-  * This class represents an arithmetic progression \f$ a_0, a_1, a_2, ..., a_{n-1}\f$ defined by
-  * its \em first value \f$ a_0 \f$, its \em size (aka length) \em n, and the \em increment (aka stride)
-  * that is equal to \f$ a_{i+1}-a_{i}\f$ for any \em i.
-  *
-  * It is internally used as the return type of the Eigen::seq and Eigen::seqN functions, and as the input arguments
-  * of DenseBase::operator()(const RowIndices&, const ColIndices&), and most of the time this is the
-  * only way it is used.
-  *
-  * \tparam FirstType type of the first element, usually an Index,
-  *                   but internally it can be a symbolic expression
-  * \tparam SizeType type representing the size of the sequence, usually an Index
-  *                  or a compile time integral constant. Internally, it can also be a symbolic expression
-  * \tparam IncrType type of the increment, can be a runtime Index, or a compile time integral constant (default is compile-time 1)
-  *
-  * \sa Eigen::seq, Eigen::seqN, DenseBase::operator()(const RowIndices&, const ColIndices&), class IndexedView
-  */
-template<typename FirstType,typename SizeType,typename IncrType>
-class ArithmeticSequence
-{
-public:
-  ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {}
-  ArithmeticSequence(FirstType first, SizeType size, IncrType incr) : m_first(first), m_size(size), m_incr(incr) {}
-
-  enum {
-    SizeAtCompileTime = internal::get_fixed_value<SizeType>::value,
-    IncrAtCompileTime = internal::get_fixed_value<IncrType,DynamicIndex>::value
-  };
-
-  /** \returns the size, i.e., number of elements, of the sequence */
-  Index size()  const { return m_size; }
-
-  /** \returns the first element \f$ a_0 \f$ in the sequence */
-  Index first()  const { return m_first; }
-
-  /** \returns the value \f$ a_i \f$ at index \a i in the sequence. */
-  Index operator[](Index i) const { return m_first + i * m_incr; }
-
-  const FirstType& firstObject() const { return m_first; }
-  const SizeType&  sizeObject()  const { return m_size; }
-  const IncrType&  incrObject()  const { return m_incr; }
-
-protected:
-  FirstType m_first;
-  SizeType  m_size;
-  IncrType  m_incr;
-
-public:
-
-#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48)
-  auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) {
-    return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr);
-  }
-#else
-protected:
-  typedef typename internal::aseq_negate<IncrType>::type ReverseIncrType;
-  typedef typename internal::aseq_reverse_first_type<FirstType,SizeType,IncrType>::type ReverseFirstType;
-public:
-  ArithmeticSequence<ReverseFirstType,SizeType,ReverseIncrType>
-  reverse() const {
-    return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr);
-  }
-#endif
-};
-
-/** \returns an ArithmeticSequence starting at \a first, of length \a size, and increment \a incr
-  *
-  * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
-template<typename FirstType,typename SizeType,typename IncrType>
-ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type,typename internal::cleanup_seq_incr<IncrType>::type >
-seqN(FirstType first, SizeType size, IncrType incr)  {
-  return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type,typename internal::cleanup_seq_incr<IncrType>::type>(first,size,incr);
-}
-
-/** \returns an ArithmeticSequence starting at \a first, of length \a size, and unit increment
-  *
-  * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) */
-template<typename FirstType,typename SizeType>
-ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type >
-seqN(FirstType first, SizeType size)  {
-  return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,typename internal::cleanup_index_type<SizeType>::type>(first,size);
-}
-
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-
-/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr
-  *
-  * It is essentially an alias to:
-  * \code
-  * seqN(f, (l-f+incr)/incr, incr);
-  * \endcode
-  *
-  * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType)
-  */
-template<typename FirstType,typename LastType, typename IncrType>
-auto seq(FirstType f, LastType l, IncrType incr);
-
-/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment
-  *
-  * It is essentially an alias to:
-  * \code
-  * seqN(f,l-f+1);
-  * \endcode
-  *
-  * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType)
-  */
-template<typename FirstType,typename LastType>
-auto seq(FirstType f, LastType l);
-
-#else // EIGEN_PARSED_BY_DOXYGEN
-
-#if EIGEN_HAS_CXX11
-template<typename FirstType,typename LastType>
-auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
-                                                   (  typename internal::cleanup_index_type<LastType>::type(l)
-                                                    - typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>())))
-{
-  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
-              (typename internal::cleanup_index_type<LastType>::type(l)
-               -typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
-}
-
-template<typename FirstType,typename LastType, typename IncrType>
-auto seq(FirstType f, LastType l, IncrType incr)
-  -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
-                   (   typename internal::cleanup_index_type<LastType>::type(l)
-                     - typename internal::cleanup_index_type<FirstType>::type(f)+typename internal::cleanup_seq_incr<IncrType>::type(incr)
-                   ) / typename internal::cleanup_seq_incr<IncrType>::type(incr),
-                   typename internal::cleanup_seq_incr<IncrType>::type(incr)))
-{
-  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
-  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
-              ( typename internal::cleanup_index_type<LastType>::type(l)
-               -typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr),
-              CleanedIncrType(incr));
-}
-
-#else // EIGEN_HAS_CXX11
-
-template<typename FirstType,typename LastType>
-typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
-                             ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index> >::type
-seq(FirstType f, LastType l)
-{
-  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
-              Index((typename internal::cleanup_index_type<LastType>::type(l)-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>())));
-}
-
-template<typename FirstTypeDerived,typename LastType>
-typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
-    ArithmeticSequence<FirstTypeDerived, symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,symbolic::ValueExpr<> >,
-                                                            symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
-seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l)
-{
-  return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+fix<1>()));
-}
-
-template<typename FirstType,typename LastTypeDerived>
-typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
-    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
-                        symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
-                                          symbolic::ValueExpr<internal::FixedInt<1> > > > >::type
-seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l)
-{
-  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),(l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+fix<1>()));
-}
-
-template<typename FirstTypeDerived,typename LastTypeDerived>
-ArithmeticSequence<FirstTypeDerived,
-                    symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::NegateExpr<FirstTypeDerived> >,symbolic::ValueExpr<internal::FixedInt<1> > > >
-seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l)
-{
-  return seqN(f.derived(),(l.derived()-f.derived()+fix<1>()));
-}
-
-
-template<typename FirstType,typename LastType, typename IncrType>
-typename internal::enable_if<!(symbolic::is_symbolic<FirstType>::value || symbolic::is_symbolic<LastType>::value),
-    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,Index,typename internal::cleanup_seq_incr<IncrType>::type> >::type
-seq(FirstType f, LastType l, IncrType incr)
-{
-  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
-  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
-              Index((typename internal::cleanup_index_type<LastType>::type(l)-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr)), incr);
-}
-
-template<typename FirstTypeDerived,typename LastType, typename IncrType>
-typename internal::enable_if<!symbolic::is_symbolic<LastType>::value,
-    ArithmeticSequence<FirstTypeDerived,
-                        symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<symbolic::NegateExpr<FirstTypeDerived>,
-                                                                                   symbolic::ValueExpr<> >,
-                                                                 symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
-                                              symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
-                        typename internal::cleanup_seq_incr<IncrType>::type> >::type
-seq(const symbolic::BaseExpr<FirstTypeDerived> &f, LastType l, IncrType incr)
-{
-  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
-  return seqN(f.derived(),(typename internal::cleanup_index_type<LastType>::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
-}
-
-template<typename FirstType,typename LastTypeDerived, typename IncrType>
-typename internal::enable_if<!symbolic::is_symbolic<FirstType>::value,
-    ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
-                        symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,symbolic::ValueExpr<> >,
-                                                                 symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
-                                               symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
-                        typename internal::cleanup_seq_incr<IncrType>::type> >::type
-seq(FirstType f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
-{
-  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
-  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
-              (l.derived()-typename internal::cleanup_index_type<FirstType>::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
-}
-
-template<typename FirstTypeDerived,typename LastTypeDerived, typename IncrType>
-ArithmeticSequence<FirstTypeDerived,
-                    symbolic::QuotientExpr<symbolic::AddExpr<symbolic::AddExpr<LastTypeDerived,
-                                                                               symbolic::NegateExpr<FirstTypeDerived> >,
-                                                             symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
-                                          symbolic::ValueExpr<typename internal::cleanup_seq_incr<IncrType>::type> >,
-                    typename internal::cleanup_seq_incr<IncrType>::type>
-seq(const symbolic::BaseExpr<FirstTypeDerived> &f, const symbolic::BaseExpr<LastTypeDerived> &l, IncrType incr)
-{
-  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
-  return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr);
-}
-#endif // EIGEN_HAS_CXX11
-
-#endif // EIGEN_PARSED_BY_DOXYGEN
-
-
-#if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN)
-/** \cpp11
-  * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
-  *
-  * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
-  * 
-  * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
-template<typename SizeType,typename IncrType>
-auto lastN(SizeType size, IncrType incr)
--> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr))
-{
-  return seqN(Eigen::last-(size-fix<1>())*incr, size, incr);
-}
-
-/** \cpp11
-  * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
-  *
-  *  It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
-  * 
-  * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */
-template<typename SizeType>
-auto lastN(SizeType size)
--> decltype(seqN(Eigen::last+fix<1>()-size, size))
-{
-  return seqN(Eigen::last+fix<1>()-size, size);
-}
-#endif
-
-namespace internal {
-
-// Convert a symbolic span into a usable one (i.e., remove last/end "keywords")
-template<typename T>
-struct make_size_type {
-  typedef typename internal::conditional<symbolic::is_symbolic<T>::value, Index, T>::type type;
-};
-
-template<typename FirstType,typename SizeType,typename IncrType,int XprSize>
-struct IndexedViewCompatibleType<ArithmeticSequence<FirstType,SizeType,IncrType>, XprSize> {
-  typedef ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType> type;
-};
-
-template<typename FirstType,typename SizeType,typename IncrType>
-ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType>
-makeIndexedViewCompatible(const ArithmeticSequence<FirstType,SizeType,IncrType>& ids, Index size,SpecializedType) {
-  return ArithmeticSequence<Index,typename make_size_type<SizeType>::type,IncrType>(
-            eval_expr_given_size(ids.firstObject(),size),eval_expr_given_size(ids.sizeObject(),size),ids.incrObject());
-}
-
-template<typename FirstType,typename SizeType,typename IncrType>
-struct get_compile_time_incr<ArithmeticSequence<FirstType,SizeType,IncrType> > {
-  enum { value = get_fixed_value<IncrType,DynamicIndex>::value };
-};
-
-} // end namespace internal
-
-/** \namespace Eigen::indexing
-  * \ingroup Core_Module
-  * 
-  * The sole purpose of this namespace is to be able to import all functions
-  * and symbols that are expected to be used within operator() for indexing
-  * and slicing. If you already imported the whole Eigen namespace:
-  * \code using namespace Eigen; \endcode
-  * then you are already all set. Otherwise, if you don't want/cannot import
-  * the whole Eigen namespace, the following line:
-  * \code using namespace Eigen::indexing; \endcode
-  * is equivalent to:
-  * \code
-  using Eigen::all;
-  using Eigen::seq;
-  using Eigen::seqN;
-  using Eigen::lastN; // c++11 only
-  using Eigen::last;
-  using Eigen::lastp1;
-  using Eigen::fix;
-  \endcode
-  */
-namespace indexing {
-  using Eigen::all;
-  using Eigen::seq;
-  using Eigen::seqN;
-  #if EIGEN_HAS_CXX11
-  using Eigen::lastN;
-  #endif
-  using Eigen::last;
-  using Eigen::lastp1;
-  using Eigen::fix;
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_ARITHMETIC_SEQUENCE_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Array.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Array.h
index 64fd02ddf..16770fc7b 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Array.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Array.h
@@ -162,45 +162,6 @@ class Array
     }
 #endif
 
-    #if EIGEN_HAS_CXX11
-    /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
-     *
-     * Example: \include Array_variadic_ctor_cxx11.cpp
-     * Output: \verbinclude Array_variadic_ctor_cxx11.out
-     *
-     * \sa Array(const std::initializer_list<std::initializer_list<Scalar>>&)
-     * \sa Array(const Scalar&), Array(const Scalar&,const Scalar&)
-     */
-    template <typename... ArgTypes>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
-      : Base(a0, a1, a2, a3, args...) {}
-
-    /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
-      * 
-      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
-      * 
-      * Example: \include Array_initializer_list_23_cxx11.cpp
-      * Output: \verbinclude Array_initializer_list_23_cxx11.out
-      * 
-      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
-      * 
-      * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed.
-      * Therefore <code> Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax
-      * <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided:
-      * 
-      * Example: \include Array_initializer_list_vector_cxx11.cpp
-      * Output: \verbinclude Array_initializer_list_vector_cxx11.out
-      * 
-      * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes,
-      * and implicit transposition is allowed for compile-time 1D arrays only.
-      * 
-      * \sa  Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
-      */
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Array(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
-    #endif // end EIGEN_HAS_CXX11
-
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename T>
     EIGEN_DEVICE_FUNC
@@ -217,7 +178,6 @@ class Array
       Base::_check_template_params();
       this->template _init2<T0,T1>(val0, val1);
     }
-
     #else
     /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
     EIGEN_DEVICE_FUNC explicit Array(const Scalar *data);
@@ -229,8 +189,7 @@ class Array
       */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE explicit Array(Index dim);
-    /** constructs an initialized 1x1 Array with the given coefficient
-      * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */
+    /** constructs an initialized 1x1 Array with the given coefficient */
     Array(const Scalar& value);
     /** constructs an uninitialized array with \a rows rows and \a cols columns.
       *
@@ -238,14 +197,11 @@ class Array
       * it is redundant to pass these parameters, so one should use the default constructor
       * Array() instead. */
     Array(Index rows, Index cols);
-    /** constructs an initialized 2D vector with given coefficients
-      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
+    /** constructs an initialized 2D vector with given coefficients */
     Array(const Scalar& val0, const Scalar& val1);
-    #endif  // end EIGEN_PARSED_BY_DOXYGEN 
+    #endif
 
-    /** constructs an initialized 3D vector with given coefficients
-      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
-      */
+    /** constructs an initialized 3D vector with given coefficients */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
     {
@@ -255,9 +211,7 @@ class Array
       m_storage.data()[1] = val1;
       m_storage.data()[2] = val2;
     }
-    /** constructs an initialized 4D vector with given coefficients
-      * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
-      */
+    /** constructs an initialized 4D vector with given coefficients */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
     {
@@ -304,7 +258,7 @@ class Array
 /** \defgroup arraytypedefs Global array typedefs
   * \ingroup Core_Module
   *
-  * %Eigen defines several typedef shortcuts for most common 1D and 2D array types.
+  * Eigen defines several typedef shortcuts for most common 1D and 2D array types.
   *
   * The general patterns are the following:
   *
@@ -317,12 +271,6 @@ class Array
   * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is
   * a fixed-size 1D array of 4 complex floats.
   *
-  * With \cpp11, template alias are also defined for common sizes.
-  * They follow the same pattern as above except that the scalar type suffix is replaced by a
-  * template parameter, i.e.:
-  *   - `ArrayRowsCols<Type>` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size.
-  *   - `ArraySize<Type>` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays.
-  * 
   * \sa class Array
   */
 
@@ -355,43 +303,9 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
 
 #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES
 #undef EIGEN_MAKE_ARRAY_TYPEDEFS
-#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
 
-#if EIGEN_HAS_CXX11
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE
 
-#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix)               \
-/** \ingroup arraytypedefs */                                     \
-/** \brief \cpp11 */                                              \
-template <typename Type>                                          \
-using Array##SizeSuffix##SizeSuffix = Array<Type, Size, Size>;    \
-/** \ingroup arraytypedefs */                                     \
-/** \brief \cpp11 */                                              \
-template <typename Type>                                          \
-using Array##SizeSuffix = Array<Type, Size, 1>; 
-
-#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size)                     \
-/** \ingroup arraytypedefs */                                     \
-/** \brief \cpp11 */                                              \
-template <typename Type>                                          \
-using Array##Size##X = Array<Type, Size, Dynamic>;                \
-/** \ingroup arraytypedefs */                                     \
-/** \brief \cpp11 */                                              \
-template <typename Type>                                          \
-using Array##X##Size = Array<Type, Dynamic, Size>;
-
-EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2)
-EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3)
-EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4)
-EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X)
-EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2)
-EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3)
-EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4)
-
-#undef EIGEN_MAKE_ARRAY_TYPEDEFS
-#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
-
-#endif // EIGEN_HAS_CXX11
-  
 #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
 using Eigen::Matrix##SizeSuffix##TypeSuffix; \
 using Eigen::Vector##SizeSuffix##TypeSuffix; \
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayBase.h
index ea3dd1c3b..33f644e21 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayBase.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayBase.h
@@ -69,7 +69,6 @@ template<typename Derived> class ArrayBase
     using Base::coeff;
     using Base::coeffRef;
     using Base::lazyAssign;
-    using Base::operator-;
     using Base::operator=;
     using Base::operator+=;
     using Base::operator-=;
@@ -89,6 +88,7 @@ template<typename Derived> class ArrayBase
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
 #define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/ArrayCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayWrapper.h b/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayWrapper.h
index 757b31825..688aadd62 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayWrapper.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayWrapper.h
@@ -90,8 +90,8 @@ class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
     EIGEN_DEVICE_FUNC
     inline void evalTo(Dest& dst) const { dst = m_expression; }
 
-    EIGEN_DEVICE_FUNC
     const typename internal::remove_all<NestedExpressionType>::type& 
+    EIGEN_DEVICE_FUNC
     nestedExpression() const 
     {
       return m_expression;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Assign.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Assign.h
index 655412efd..53806ba33 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Assign.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Assign.h
@@ -16,7 +16,7 @@ namespace Eigen {
 
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
+EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
   ::lazyAssign(const DenseBase<OtherDerived>& other)
 {
   enum{
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/AssignEvaluator.h b/uppsrc/plugin/Eigen/Eigen/src/Core/AssignEvaluator.h
index 229e25854..dbe435d86 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/AssignEvaluator.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/AssignEvaluator.h
@@ -24,7 +24,7 @@ namespace internal {
 
 // copy_using_evaluator_traits is based on assign_traits
 
-template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = -1>
+template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
 struct copy_using_evaluator_traits
 {
   typedef typename DstEvaluator::XprType Dst;
@@ -51,15 +51,13 @@ private:
     InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
               : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
               : int(Dst::MaxRowsAtCompileTime),
-    RestrictedInnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(InnerSize,MaxPacketSize),
-    RestrictedLinearSize = EIGEN_SIZE_MIN_PREFER_FIXED(Dst::SizeAtCompileTime,MaxPacketSize),
     OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
     MaxSizeAtCompileTime = Dst::SizeAtCompileTime
   };
 
   // TODO distinguish between linear traversal and inner-traversals
-  typedef typename find_best_packet<DstScalar,RestrictedLinearSize>::type LinearPacketType;
-  typedef typename find_best_packet<DstScalar,RestrictedInnerSize>::type InnerPacketType;
+  typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType;
+  typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType;
 
   enum {
     LinearPacketSize = unpacket_traits<LinearPacketType>::size,
@@ -99,7 +97,7 @@ private:
 
 public:
   enum {
-    Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal)
+    Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
               : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
               : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
               : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
@@ -174,8 +172,6 @@ public:
     EIGEN_DEBUG_VAR(MaySliceVectorize)
     std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
     EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
-    EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost)
-    EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime)
     EIGEN_DEBUG_VAR(UnrollingLimit)
     EIGEN_DEBUG_VAR(MayUnrollCompletely)
     EIGEN_DEBUG_VAR(MayUnrollInner)
@@ -534,7 +530,7 @@ struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
     const Scalar *dst_ptr = kernel.dstDataPtr();
     if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
     {
-      // the pointer is not aligned-on scalar, so alignment is not possible
+      // the pointer is not aligned-on scalar, so alignment is not possible
       return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
     }
     const Index packetAlignedMask = packetSize - 1;
@@ -611,8 +607,7 @@ public:
   typedef typename AssignmentTraits::PacketType PacketType;
   
   
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
+  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
     : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
   {
     #ifdef EIGEN_DEBUG_ASSIGN
@@ -702,27 +697,6 @@ protected:
   DstXprType& m_dstExpr;
 };
 
-// Special kernel used when computing small products whose operands have dynamic dimensions.  It ensures that the
-// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used
-// when computing the product.
-
-template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
-class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn>
-{
-protected:
-  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> Base;
- public:
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::DstXprType DstXprType;
-    typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, 4> AssignmentTraits;
-    typedef typename AssignmentTraits::PacketType PacketType;
-    
-    EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
-    : Base(dst, src, func, dstExpr)
-  {
-  }
- };
- 
 /***************************************************************************
 * Part 5 : Entry point for dense rectangular assignment
 ***************************************************************************/
@@ -782,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType
 // AssignmentKind must define a Kind typedef.
 template<typename DstShape, typename SrcShape> struct AssignmentKind;
 
-// Assignment kind defined in this file:
+// Assignment kind defined in this file:
 struct Dense2Dense {};
 struct EigenBase2EigenBase {};
 
@@ -861,27 +835,6 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
   
   Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
 }
-
-template<typename Dst, typename Src, typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
-{
-    typedef evaluator<Dst> DstEvaluatorType;
-    typedef evaluator<Src> SrcEvaluatorType;
-    typedef restricted_packet_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Func> Kernel;
-
-    EIGEN_STATIC_ASSERT_LVALUE(Dst)
-    EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);
-
-    SrcEvaluatorType srcEvaluator(src);
-    resize_if_allowed(dst, src, func);
-    
-    DstEvaluatorType dstEvaluator(dst);
-    Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
-
-    dense_assignment_loop<Kernel>::run(kernel);
-}
-
 template<typename Dst, typename Src>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 void call_assignment_no_alias(Dst& dst, const Src& src)
@@ -946,7 +899,7 @@ struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
     src.evalTo(dst);
   }
 
-  // NOTE The following two functions are templated to avoid their instantiation if not needed
+  // NOTE The following two functions are templated to avoid their instantiation if not needed
   //      This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
   template<typename SrcScalarType>
   EIGEN_DEVICE_FUNC
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Assign_MKL.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Assign_MKL.h
index c6140d185..6866095bf 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Assign_MKL.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Assign_MKL.h
@@ -68,16 +68,16 @@ class vml_assign_traits
 
 #define EIGEN_PP_EXPAND(ARG) ARG
 #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
-#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA
+#define EIGEN_VMLMODE_EXPAND_LA , VML_HA
 #else
-#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA
+#define EIGEN_VMLMODE_EXPAND_LA , VML_LA
 #endif
 
-#define EIGEN_VMLMODE_EXPAND_x_
+#define EIGEN_VMLMODE_EXPAND__ 
 
-#define EIGEN_VMLMODE_PREFIX_xLA vm
-#define EIGEN_VMLMODE_PREFIX_x_  v
-#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x,VMLMODE)
+#define EIGEN_VMLMODE_PREFIX_LA vm
+#define EIGEN_VMLMODE_PREFIX__  v
+#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE)
 
 #define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                                           \
   template< typename DstXprType, typename SrcXprNested>                                                                         \
@@ -89,7 +89,7 @@ class vml_assign_traits
       eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                                       \
       if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal) {                                              \
         VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(),                                                        \
-              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) );                                           \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                           \
       } else {                                                                                                                  \
         const Index outerSize = dst.outerSize();                                                                                \
         for(Index outer = 0; outer < outerSize; ++outer) {                                                                      \
@@ -97,7 +97,7 @@ class vml_assign_traits
                                                       &(src.nestedExpression().coeffRef(0, outer));                             \
           EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                           \
           VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr,                                                                      \
-                (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                                             \
+                (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                             \
         }                                                                                                                       \
       }                                                                                                                         \
     }                                                                                                                           \
@@ -152,7 +152,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
       if(vml_assign_traits<DstXprType,SrcXprNested>::Traversal==LinearTraversal)                                              \
       {                                                                                                                       \
         VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent,                                                        \
-              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) );                                         \
+              (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) );                                         \
       } else {                                                                                                                \
         const Index outerSize = dst.outerSize();                                                                              \
         for(Index outer = 0; outer < outerSize; ++outer) {                                                                    \
@@ -160,7 +160,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil,  Ceil,   _)
                                                       &(src.lhs().coeffRef(0, outer));                                        \
           EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));                         \
           VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent,                                                          \
-                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                                          \
+                 (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE));                                          \
         }                                                                                                                     \
       }                                                                                                                       \
     }                                                                                                                         \
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Block.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Block.h
index 6e938ea58..11de45c2e 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Block.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Block.h
@@ -114,8 +114,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
   
     /** Column or Row constructor
       */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Block(XprType& xpr, Index i) : Impl(xpr,i)
+    EIGEN_DEVICE_FUNC
+    inline Block(XprType& xpr, Index i) : Impl(xpr,i)
     {
       eigen_assert( (i>=0) && (
           ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i<xpr.rows())
@@ -124,8 +124,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
 
     /** Fixed-size constructor
       */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Block(XprType& xpr, Index startRow, Index startCol)
+    EIGEN_DEVICE_FUNC
+    inline Block(XprType& xpr, Index startRow, Index startCol)
       : Impl(xpr, startRow, startCol)
     {
       EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
@@ -135,8 +135,8 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class
 
     /** Dynamic-size constructor
       */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Block(XprType& xpr,
+    EIGEN_DEVICE_FUNC
+    inline Block(XprType& xpr,
           Index startRow, Index startCol,
           Index blockRows, Index blockCols)
       : Impl(xpr, startRow, startCol, blockRows, blockCols)
@@ -159,10 +159,10 @@ class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
   public:
     typedef Impl Base;
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
+    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
+    EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {}
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+    inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
       : Impl(xpr, startRow, startCol, blockRows, blockCols) {}
 };
 
@@ -294,22 +294,22 @@ template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool H
     EIGEN_DEVICE_FUNC inline Index outerStride() const;
     #endif
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
     { 
       return m_xpr; 
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     XprType& nestedExpression() { return m_xpr; }
       
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     StorageIndex startRow() const
     { 
       return m_startRow.value(); 
     }
       
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     StorageIndex startCol() const
     { 
       return m_startCol.value(); 
@@ -342,8 +342,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     /** Column or Row constructor
       */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    BlockImpl_dense(XprType& xpr, Index i)
+    EIGEN_DEVICE_FUNC
+    inline BlockImpl_dense(XprType& xpr, Index i)
       : Base(xpr.data() + i * (    ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) 
                                 || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()),
              BlockRows==1 ? 1 : xpr.rows(),
@@ -357,8 +357,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     /** Fixed-size constructor
       */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+    EIGEN_DEVICE_FUNC
+    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
       : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)),
         m_xpr(xpr), m_startRow(startRow), m_startCol(startCol)
     {
@@ -367,8 +367,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     /** Dynamic-size constructor
       */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    BlockImpl_dense(XprType& xpr,
+    EIGEN_DEVICE_FUNC
+    inline BlockImpl_dense(XprType& xpr,
           Index startRow, Index startCol,
           Index blockRows, Index blockCols)
       : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols),
@@ -377,18 +377,18 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
       init();
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
     { 
       return m_xpr; 
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     XprType& nestedExpression() { return m_xpr; }
       
     /** \sa MapBase::innerStride() */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index innerStride() const
+    EIGEN_DEVICE_FUNC
+    inline Index innerStride() const
     {
       return internal::traits<BlockType>::HasSameStorageOrderAsXprType
              ? m_xpr.innerStride()
@@ -396,19 +396,19 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
     }
 
     /** \sa MapBase::outerStride() */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index outerStride() const
+    EIGEN_DEVICE_FUNC
+    inline Index outerStride() const
     {
       return m_outerStride;
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     StorageIndex startRow() const
     {
       return m_startRow.value();
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     StorageIndex startCol() const
     {
       return m_startCol.value();
@@ -422,8 +422,8 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal used by allowAligned() */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
+    EIGEN_DEVICE_FUNC
+    inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
       : Base(data, blockRows, blockCols), m_xpr(xpr)
     {
       init();
@@ -431,7 +431,7 @@ class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
     #endif
 
   protected:
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     void init()
     {
       m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/BooleanRedux.h b/uppsrc/plugin/Eigen/Eigen/src/Core/BooleanRedux.h
index e32c4ac5b..8409d8749 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/BooleanRedux.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/BooleanRedux.h
@@ -14,56 +14,58 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename Derived, int UnrollCount, int Rows>
+template<typename Derived, int UnrollCount>
 struct all_unroller
 {
+  typedef typename Derived::ExpressionTraits Traits;
   enum {
-    col = (UnrollCount-1) / Rows,
-    row = (UnrollCount-1) % Rows
+    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
+    row = (UnrollCount-1) % Traits::RowsAtCompileTime
   };
 
-  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
+  static inline bool run(const Derived &mat)
   {
-    return all_unroller<Derived, UnrollCount-1, Rows>::run(mat) && mat.coeff(row, col);
+    return all_unroller<Derived, UnrollCount-1>::run(mat) && mat.coeff(row, col);
   }
 };
 
-template<typename Derived, int Rows>
-struct all_unroller<Derived, 0, Rows>
+template<typename Derived>
+struct all_unroller<Derived, 0>
 {
-  EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; }
+  static inline bool run(const Derived &/*mat*/) { return true; }
 };
 
-template<typename Derived, int Rows>
-struct all_unroller<Derived, Dynamic, Rows>
+template<typename Derived>
+struct all_unroller<Derived, Dynamic>
 {
-  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
+  static inline bool run(const Derived &) { return false; }
 };
 
-template<typename Derived, int UnrollCount, int Rows>
+template<typename Derived, int UnrollCount>
 struct any_unroller
 {
+  typedef typename Derived::ExpressionTraits Traits;
   enum {
-    col = (UnrollCount-1) / Rows,
-    row = (UnrollCount-1) % Rows
+    col = (UnrollCount-1) / Traits::RowsAtCompileTime,
+    row = (UnrollCount-1) % Traits::RowsAtCompileTime
   };
   
-  EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat)
+  static inline bool run(const Derived &mat)
   {
-    return any_unroller<Derived, UnrollCount-1, Rows>::run(mat) || mat.coeff(row, col);
+    return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
   }
 };
 
-template<typename Derived, int Rows>
-struct any_unroller<Derived, 0, Rows>
+template<typename Derived>
+struct any_unroller<Derived, 0>
 {
-  EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; }
+  static inline bool run(const Derived & /*mat*/) { return false; }
 };
 
-template<typename Derived, int Rows>
-struct any_unroller<Derived, Dynamic, Rows>
+template<typename Derived>
+struct any_unroller<Derived, Dynamic>
 {
-  EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; }
+  static inline bool run(const Derived &) { return false; }
 };
 
 } // end namespace internal
@@ -76,7 +78,7 @@ struct any_unroller<Derived, Dynamic, Rows>
   * \sa any(), Cwise::operator<()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
+inline bool DenseBase<Derived>::all() const
 {
   typedef internal::evaluator<Derived> Evaluator;
   enum {
@@ -85,7 +87,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
   };
   Evaluator evaluator(derived());
   if(unroll)
-    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
+    return internal::all_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
   else
   {
     for(Index j = 0; j < cols(); ++j)
@@ -100,7 +102,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const
   * \sa all()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
+inline bool DenseBase<Derived>::any() const
 {
   typedef internal::evaluator<Derived> Evaluator;
   enum {
@@ -109,7 +111,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
   };
   Evaluator evaluator(derived());
   if(unroll)
-    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic, internal::traits<Derived>::RowsAtCompileTime>::run(evaluator);
+    return internal::any_unroller<Evaluator, unroll ? int(SizeAtCompileTime) : Dynamic>::run(evaluator);
   else
   {
     for(Index j = 0; j < cols(); ++j)
@@ -124,7 +126,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const
   * \sa all(), any()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase<Derived>::count() const
+inline Eigen::Index DenseBase<Derived>::count() const
 {
   return derived().template cast<bool>().template cast<Index>().sum();
 }
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/CommaInitializer.h b/uppsrc/plugin/Eigen/Eigen/src/Core/CommaInitializer.h
index c0e29c75c..d218e9814 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/CommaInitializer.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/CommaInitializer.h
@@ -33,8 +33,6 @@ struct CommaInitializer
   inline CommaInitializer(XprType& xpr, const Scalar& s)
     : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
   {
-    eigen_assert(m_xpr.rows() > 0 && m_xpr.cols() > 0
-      && "Cannot comma-initialize a 0x0 matrix (operator<<)");
     m_xpr.coeffRef(0,0) = s;
   }
 
@@ -43,8 +41,6 @@ struct CommaInitializer
   inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
     : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
   {
-    eigen_assert(m_xpr.rows() >= other.rows() && m_xpr.cols() >= other.cols()
-      && "Cannot comma-initialize a 0x0 matrix (operator<<)");
     m_xpr.block(0, 0, other.rows(), other.cols()) = other;
   }
 
@@ -107,7 +103,7 @@ struct CommaInitializer
   EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception)
 #endif
   {
-    finished();
+      finished();
   }
 
   /** \returns the built matrix once all its coefficients have been set.
@@ -145,7 +141,7 @@ struct CommaInitializer
   * \sa CommaInitializer::finished(), class CommaInitializer
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
+inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
 {
   return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
 }
@@ -153,7 +149,7 @@ EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<
 /** \sa operator<<(const Scalar&) */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC inline CommaInitializer<Derived>
+inline CommaInitializer<Derived>
 DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
 {
   return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/CoreEvaluators.h b/uppsrc/plugin/Eigen/Eigen/src/Core/CoreEvaluators.h
index a77c0fa81..910889efa 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/CoreEvaluators.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/CoreEvaluators.h
@@ -90,8 +90,7 @@ template<typename T>
 struct evaluator : public unary_evaluator<T>
 {
   typedef unary_evaluator<T> Base;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit evaluator(const T& xpr) : Base(xpr) {}
+  EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {}
 };
 
 
@@ -100,14 +99,14 @@ template<typename T>
 struct evaluator<const T>
   : evaluator<T>
 {
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  EIGEN_DEVICE_FUNC
   explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
 };
 
 // ---------- base class for all evaluators ----------
 
 template<typename ExpressionType>
-struct evaluator_base
+struct evaluator_base : public noncopyable
 {
   // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices.
   typedef traits<ExpressionType> ExpressionTraits;
@@ -115,14 +114,6 @@ struct evaluator_base
   enum {
     Alignment = 0
   };
-  // noncopyable:
-  // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization)
-  // and make complex evaluator much larger than then should do.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {}
-private:
-  EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&);
-  EIGEN_DEVICE_FUNC const evaluator_base& operator=(const evaluator_base&);
 };
 
 // -------------------- Matrix and Array --------------------
@@ -132,33 +123,6 @@ private:
 // Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense,
 // so no need for more sophisticated dispatching.
 
-// this helper permits to completely eliminate m_outerStride if it is known at compiletime.
-template<typename Scalar,int OuterStride> class plainobjectbase_evaluator_data {
-public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr)
-  {
-#ifndef EIGEN_INTERNAL_DEBUGGING
-    EIGEN_UNUSED_VARIABLE(outerStride);
-#endif
-    eigen_internal_assert(outerStride==OuterStride);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Index outerStride() const { return OuterStride; }
-  const Scalar *data;
-};
-
-template<typename Scalar> class plainobjectbase_evaluator_data<Scalar,Dynamic> {
-public:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Index outerStride() const { return m_outerStride; }
-  const Scalar *data;
-protected:
-  Index m_outerStride;
-};
-
 template<typename Derived>
 struct evaluator<PlainObjectBase<Derived> >
   : evaluator_base<Derived>
@@ -177,23 +141,18 @@ struct evaluator<PlainObjectBase<Derived> >
     Flags = traits<Derived>::EvaluatorFlags,
     Alignment = traits<Derived>::Alignment
   };
-  enum {
-    // We do not need to know the outer stride for vectors
-    OuterStrideAtCompileTime = IsVectorAtCompileTime  ? 0
-                                                      : int(IsRowMajor) ? ColsAtCompileTime
-                                                                        : RowsAtCompileTime
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  evaluator()
-    : m_d(0,OuterStrideAtCompileTime)
+  
+  EIGEN_DEVICE_FUNC evaluator()
+    : m_data(0),
+      m_outerStride(IsVectorAtCompileTime  ? 0 
+                                           : int(IsRowMajor) ? ColsAtCompileTime 
+                                           : RowsAtCompileTime)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit evaluator(const PlainObjectType& m)
-    : m_d(m.data(),IsVectorAtCompileTime ? 0 : m.outerStride())
+  
+  EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m)
+    : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) 
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
@@ -202,30 +161,30 @@ struct evaluator<PlainObjectBase<Derived> >
   CoeffReturnType coeff(Index row, Index col) const
   {
     if (IsRowMajor)
-      return m_d.data[row * m_d.outerStride() + col];
+      return m_data[row * m_outerStride.value() + col];
     else
-      return m_d.data[row + col * m_d.outerStride()];
+      return m_data[row + col * m_outerStride.value()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_d.data[index];
+    return m_data[index];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
   {
     if (IsRowMajor)
-      return const_cast<Scalar*>(m_d.data)[row * m_d.outerStride() + col];
+      return const_cast<Scalar*>(m_data)[row * m_outerStride.value() + col];
     else
-      return const_cast<Scalar*>(m_d.data)[row + col * m_d.outerStride()];
+      return const_cast<Scalar*>(m_data)[row + col * m_outerStride.value()];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return const_cast<Scalar*>(m_d.data)[index];
+    return const_cast<Scalar*>(m_data)[index];
   }
 
   template<int LoadMode, typename PacketType>
@@ -233,16 +192,16 @@ struct evaluator<PlainObjectBase<Derived> >
   PacketType packet(Index row, Index col) const
   {
     if (IsRowMajor)
-      return ploadt<PacketType, LoadMode>(m_d.data + row * m_d.outerStride() + col);
+      return ploadt<PacketType, LoadMode>(m_data + row * m_outerStride.value() + col);
     else
-      return ploadt<PacketType, LoadMode>(m_d.data + row + col * m_d.outerStride());
+      return ploadt<PacketType, LoadMode>(m_data + row + col * m_outerStride.value());
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return ploadt<PacketType, LoadMode>(m_d.data + index);
+    return ploadt<PacketType, LoadMode>(m_data + index);
   }
 
   template<int StoreMode,typename PacketType>
@@ -251,22 +210,26 @@ struct evaluator<PlainObjectBase<Derived> >
   {
     if (IsRowMajor)
       return pstoret<Scalar, PacketType, StoreMode>
-	            (const_cast<Scalar*>(m_d.data) + row * m_d.outerStride() + col, x);
+	            (const_cast<Scalar*>(m_data) + row * m_outerStride.value() + col, x);
     else
       return pstoret<Scalar, PacketType, StoreMode>
-                    (const_cast<Scalar*>(m_d.data) + row + col * m_d.outerStride(), x);
+                    (const_cast<Scalar*>(m_data) + row + col * m_outerStride.value(), x);
   }
 
   template<int StoreMode, typename PacketType>
   EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketType& x)
   {
-    return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
+    return pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_data) + index, x);
   }
 
 protected:
+  const Scalar *m_data;
 
-  plainobjectbase_evaluator_data<Scalar,OuterStrideAtCompileTime> m_d;
+  // We do not need to know the outer stride for vectors
+  variable_if_dynamic<Index, IsVectorAtCompileTime  ? 0 
+                                                    : int(IsRowMajor) ? ColsAtCompileTime 
+                                                    : RowsAtCompileTime> m_outerStride;
 };
 
 template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
@@ -275,11 +238,9 @@ struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
 {
   typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
   
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  evaluator() {}
+  EIGEN_DEVICE_FUNC evaluator() {}
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit evaluator(const XprType& m)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
     : evaluator<PlainObjectBase<XprType> >(m) 
   { }
 };
@@ -290,11 +251,9 @@ struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
 {
   typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  evaluator() {}
+  EIGEN_DEVICE_FUNC evaluator() {}
   
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit evaluator(const XprType& m)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m)
     : evaluator<PlainObjectBase<XprType> >(m) 
   { }
 };
@@ -313,8 +272,7 @@ struct unary_evaluator<Transpose<ArgType>, IndexBased>
     Alignment = evaluator<ArgType>::Alignment
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
 
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -569,7 +527,9 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit unary_evaluator(const XprType& op) : m_d(op)
+  explicit unary_evaluator(const XprType& op)
+    : m_functor(op.functor()), 
+      m_argImpl(op.nestedExpression()) 
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -580,43 +540,32 @@ struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased >
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_d.func()(m_d.argImpl.coeff(row, col));
+    return m_functor(m_argImpl.coeff(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_d.func()(m_d.argImpl.coeff(index));
+    return m_functor(m_argImpl.coeff(index));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index row, Index col) const
   {
-    return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(row, col));
+    return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(row, col));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index));
+    return m_functor.packetOp(m_argImpl.template packet<LoadMode, PacketType>(index));
   }
 
 protected:
-
-  // this helper permits to completely eliminate the functor if it is empty
-  class Data : private UnaryOp
-  {
-  public:
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {}
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const UnaryOp& func() const { return static_cast<const UnaryOp&>(*this); }
-    evaluator<ArgType> argImpl;
-  };
-
-  Data m_d;
+  const UnaryOp m_functor;
+  evaluator<ArgType> m_argImpl;
 };
 
 // -------------------- CwiseTernaryOp --------------------
@@ -660,7 +609,11 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
         evaluator<Arg3>::Alignment)
   };
 
-  EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr)
+  EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_arg1Impl(xpr.arg1()), 
+      m_arg2Impl(xpr.arg2()), 
+      m_arg3Impl(xpr.arg3())  
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<TernaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -671,47 +624,38 @@ struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_d.func()(m_d.arg1Impl.coeff(row, col), m_d.arg2Impl.coeff(row, col), m_d.arg3Impl.coeff(row, col));
+    return m_functor(m_arg1Impl.coeff(row, col), m_arg2Impl.coeff(row, col), m_arg3Impl.coeff(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_d.func()(m_d.arg1Impl.coeff(index), m_d.arg2Impl.coeff(index), m_d.arg3Impl.coeff(index));
+    return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index row, Index col) const
   {
-    return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode,PacketType>(row, col),
-                               m_d.arg2Impl.template packet<LoadMode,PacketType>(row, col),
-                               m_d.arg3Impl.template packet<LoadMode,PacketType>(row, col));
+    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(row, col),
+                              m_arg2Impl.template packet<LoadMode,PacketType>(row, col),
+                              m_arg3Impl.template packet<LoadMode,PacketType>(row, col));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode,PacketType>(index),
-                               m_d.arg2Impl.template packet<LoadMode,PacketType>(index),
-                               m_d.arg3Impl.template packet<LoadMode,PacketType>(index));
+    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode,PacketType>(index),
+                              m_arg2Impl.template packet<LoadMode,PacketType>(index),
+                              m_arg3Impl.template packet<LoadMode,PacketType>(index));
   }
 
 protected:
-  // this helper permits to completely eliminate the functor if it is empty
-  struct Data : private TernaryOp
-  {
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Data(const XprType& xpr) : TernaryOp(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {}
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TernaryOp& func() const { return static_cast<const TernaryOp&>(*this); }
-    evaluator<Arg1> arg1Impl;
-    evaluator<Arg2> arg2Impl;
-    evaluator<Arg3> arg3Impl;
-  };
-
-  Data m_d;
+  const TernaryOp m_functor;
+  evaluator<Arg1> m_arg1Impl;
+  evaluator<Arg2> m_arg2Impl;
+  evaluator<Arg3> m_arg3Impl;
 };
 
 // -------------------- CwiseBinaryOp --------------------
@@ -724,8 +668,7 @@ struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
   typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
   typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > Base;
   
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit evaluator(const XprType& xpr) : Base(xpr) {}
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
 template<typename BinaryOp, typename Lhs, typename Rhs>
@@ -753,8 +696,10 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
     Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<Lhs>::Alignment,evaluator<Rhs>::Alignment)
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit binary_evaluator(const XprType& xpr) : m_d(xpr)
+  EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr)
+    : m_functor(xpr.functor()),
+      m_lhsImpl(xpr.lhs()), 
+      m_rhsImpl(xpr.rhs())  
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -765,45 +710,35 @@ struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBase
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_d.func()(m_d.lhsImpl.coeff(row, col), m_d.rhsImpl.coeff(row, col));
+    return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_d.func()(m_d.lhsImpl.coeff(index), m_d.rhsImpl.coeff(index));
+    return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index row, Index col) const
   {
-    return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode,PacketType>(row, col),
-                               m_d.rhsImpl.template packet<LoadMode,PacketType>(row, col));
+    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(row, col),
+                              m_rhsImpl.template packet<LoadMode,PacketType>(row, col));
   }
 
   template<int LoadMode, typename PacketType>
   EIGEN_STRONG_INLINE
   PacketType packet(Index index) const
   {
-    return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode,PacketType>(index),
-                               m_d.rhsImpl.template packet<LoadMode,PacketType>(index));
+    return m_functor.packetOp(m_lhsImpl.template packet<LoadMode,PacketType>(index),
+                              m_rhsImpl.template packet<LoadMode,PacketType>(index));
   }
 
 protected:
-
-  // this helper permits to completely eliminate the functor if it is empty
-  struct Data : private BinaryOp
-  {
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Data(const XprType& xpr) : BinaryOp(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {}
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const BinaryOp& func() const { return static_cast<const BinaryOp&>(*this); }
-    evaluator<Lhs> lhsImpl;
-    evaluator<Rhs> rhsImpl;
-  };
-
-  Data m_d;
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
 };
 
 // -------------------- CwiseUnaryView --------------------
@@ -822,7 +757,9 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
     Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost...
   };
 
-  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : m_d(op)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op)
+    : m_unaryOp(op.functor()), 
+      m_argImpl(op.nestedExpression()) 
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
@@ -834,40 +771,30 @@ struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType>, IndexBased>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index row, Index col) const
   {
-    return m_d.func()(m_d.argImpl.coeff(row, col));
+    return m_unaryOp(m_argImpl.coeff(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
   {
-    return m_d.func()(m_d.argImpl.coeff(index));
+    return m_unaryOp(m_argImpl.coeff(index));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index row, Index col)
   {
-    return m_d.func()(m_d.argImpl.coeffRef(row, col));
+    return m_unaryOp(m_argImpl.coeffRef(row, col));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
   {
-    return m_d.func()(m_d.argImpl.coeffRef(index));
+    return m_unaryOp(m_argImpl.coeffRef(index));
   }
 
 protected:
-
-  // this helper permits to completely eliminate the functor if it is empty
-  struct Data : private UnaryOp
-  {
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {}
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const UnaryOp& func() const { return static_cast<const UnaryOp&>(*this); }
-    evaluator<ArgType> argImpl;
-  };
-
-  Data m_d;
+  const UnaryOp m_unaryOp;
+  evaluator<ArgType> m_argImpl;
 };
 
 // -------------------- Map --------------------
@@ -891,8 +818,7 @@ struct mapbase_evaluator : evaluator_base<Derived>
     CoeffReadCost = NumTraits<Scalar>::ReadCost
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit mapbase_evaluator(const XprType& map)
+  EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map)
     : m_data(const_cast<PointerType>(map.data())),
       m_innerStride(map.innerStride()),
       m_outerStride(map.outerStride())
@@ -956,10 +882,10 @@ struct mapbase_evaluator : evaluator_base<Derived>
     internal::pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
   }
 protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
+  EIGEN_DEVICE_FUNC
+  inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); }
+  EIGEN_DEVICE_FUNC
+  inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); }
 
   PointerType m_data;
   const internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
@@ -1012,8 +938,7 @@ struct evaluator<Ref<PlainObjectType, RefOptions, StrideType> >
     Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType> >::Alignment
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit evaluator(const XprType& ref)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref)
     : mapbase_evaluator<XprType, PlainObjectType>(ref) 
   { }
 };
@@ -1068,8 +993,7 @@ struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel> >
     Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ArgType>::Alignment, Alignment0)
   };
   typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit evaluator(const XprType& block) : block_evaluator_type(block)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
   }
@@ -1082,8 +1006,7 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAcc
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit block_evaluator(const XprType& block)
+  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
     : unary_evaluator<XprType>(block) 
   {}
 };
@@ -1094,12 +1017,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
 {
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit unary_evaluator(const XprType& block)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block)
     : m_argImpl(block.nestedExpression()), 
       m_startRow(block.startRow()), 
       m_startCol(block.startCol()),
-      m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0)
+      m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0)
   { }
  
   typedef typename XprType::Scalar Scalar;
@@ -1107,7 +1029,7 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
 
   enum {
     RowsAtCompileTime = XprType::RowsAtCompileTime,
-    ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator<ArgType>::Flags&LinearAccessBit)
+    ForwardLinearAccess = InnerPanel && bool(evaluator<ArgType>::Flags&LinearAccessBit)
   };
  
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1118,8 +1040,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   CoeffReturnType coeff(Index index) const
-  {
-    return linear_coeff_impl(index, bool_constant<ForwardLinearAccess>());
+  { 
+    if (ForwardLinearAccess)
+      return m_argImpl.coeff(m_linear_offset.value() + index); 
+    else
+      return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1130,8 +1055,11 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Scalar& coeffRef(Index index)
-  {
-    return linear_coeffRef_impl(index, bool_constant<ForwardLinearAccess>());
+  { 
+    if (ForwardLinearAccess)
+      return m_argImpl.coeffRef(m_linear_offset.value() + index); 
+    else
+      return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
   }
  
   template<int LoadMode, typename PacketType>
@@ -1172,32 +1100,10 @@ struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBa
   }
  
 protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const
-  {
-    return m_argImpl.coeff(m_linear_offset.value() + index); 
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  CoeffReturnType linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const
-  {
-    return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Scalar& linear_coeffRef_impl(Index index, internal::true_type /* ForwardLinearAccess */)
-  {
-    return m_argImpl.coeffRef(m_linear_offset.value() + index); 
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Scalar& linear_coeffRef_impl(Index index, internal::false_type /* not ForwardLinearAccess */)
-  {
-    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
-  }
-
   evaluator<ArgType> m_argImpl;
   const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows==1) ? 0 : Dynamic> m_startRow;
   const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols==1) ? 0 : Dynamic> m_startCol;
-  const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
+  const variable_if_dynamic<Index, InnerPanel ? Dynamic : 0> m_linear_offset;
 };
 
 // TODO: This evaluator does not actually use the child evaluator; 
@@ -1211,8 +1117,7 @@ struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAc
   typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
   typedef typename XprType::Scalar Scalar;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit block_evaluator(const XprType& block)
+  EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block)
     : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) 
   {
     // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
@@ -1240,8 +1145,7 @@ struct evaluator<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
     Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator<ThenMatrixType>::Alignment, evaluator<ElseMatrixType>::Alignment)
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit evaluator(const XprType& select)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select)
     : m_conditionImpl(select.conditionMatrix()),
       m_thenImpl(select.thenMatrix()),
       m_elseImpl(select.elseMatrix())
@@ -1298,8 +1202,7 @@ struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor> >
     Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit unary_evaluator(const XprType& replicate)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate)
     : m_arg(replicate.nestedExpression()),
       m_argImpl(m_arg),
       m_rows(replicate.nestedExpression().rows()),
@@ -1363,6 +1266,64 @@ protected:
   const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols;
 };
 
+
+// -------------------- PartialReduxExpr --------------------
+// Evaluator for partial reductions: applies MemberOp to one column (Vertical) or one row of the argument.
+template< typename ArgType, typename MemberOp, int Direction>
+struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
+  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
+{
+  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
+  typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
+  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
+  typedef typename ArgType::Scalar InputScalar;
+  typedef typename XprType::Scalar Scalar;
+  enum { // compile-time length of the reduced direction: rows when Vertical, columns otherwise
+    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) :  int(ArgType::ColsAtCompileTime)
+  };
+  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
+  enum { // HugeCost when the traversal size is dynamic, else argument reads plus the functor's cost
+    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
+                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
+    // RowMajorBit is taken from the expression traits, the other hereditary bits from the argument evaluator.
+    Flags = (traits<XprType>::Flags&RowMajorBit) | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit,
+    
+    Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
+  };
+  // Takes xpr by const reference, like the other evaluator constructors in this file, so xpr is not copied.
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr)
+    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
+  {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : int(CostOpType::value));
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  // Two-index access: reduces column j when Direction is Vertical, row i otherwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar coeff(Index i, Index j) const
+  {
+    if (Direction==Vertical)
+      return m_functor(m_arg.col(j));
+    else
+      return m_functor(m_arg.row(i));
+  }
+  // Linear access: index selects the column (Vertical) or the row that is reduced.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  const Scalar coeff(Index index) const
+  {
+    if (Direction==Vertical)
+      return m_functor(m_arg.col(index));
+    else
+      return m_functor(m_arg.row(index));
+  }
+
+protected:
+  typename internal::add_const_on_value_type<ArgTypeNested>::type m_arg; // nested argument, typed via nested_eval<ArgType,1>
+  const MemberOp m_functor; // copy of the expression's reduction functor
+};
+
+
 // -------------------- MatrixWrapper and ArrayWrapper --------------------
 //
 // evaluator_wrapper_base<T> is a common base class for the
@@ -1379,8 +1340,7 @@ struct evaluator_wrapper_base
     Alignment = evaluator<ArgType>::Alignment
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
+  EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
 
   typedef typename ArgType::Scalar Scalar;
   typedef typename ArgType::CoeffReturnType CoeffReturnType;
@@ -1447,8 +1407,7 @@ struct unary_evaluator<MatrixWrapper<TArgType> >
 {
   typedef MatrixWrapper<TArgType> XprType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit unary_evaluator(const XprType& wrapper)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
     : evaluator_wrapper_base<MatrixWrapper<TArgType> >(wrapper.nestedExpression())
   { }
 };
@@ -1459,8 +1418,7 @@ struct unary_evaluator<ArrayWrapper<TArgType> >
 {
   typedef ArrayWrapper<TArgType> XprType;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit unary_evaluator(const XprType& wrapper)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper)
     : evaluator_wrapper_base<ArrayWrapper<TArgType> >(wrapper.nestedExpression())
   { }
 };
@@ -1502,8 +1460,7 @@ struct unary_evaluator<Reverse<ArgType, Direction> >
     Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit unary_evaluator(const XprType& reverse)
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse)
     : m_argImpl(reverse.nestedExpression()),
       m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1),
       m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1)
@@ -1610,8 +1567,7 @@ struct evaluator<Diagonal<ArgType, DiagIndex> >
     Alignment = 0
   };
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit evaluator(const XprType& diagonal)
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal)
     : m_argImpl(diagonal.nestedExpression()),
       m_index(diagonal.index())
   { }
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/CoreIterators.h b/uppsrc/plugin/Eigen/Eigen/src/Core/CoreIterators.h
index b96719681..4eb42b93a 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/CoreIterators.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/CoreIterators.h
@@ -48,11 +48,6 @@ public:
     * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView
     */
   EIGEN_STRONG_INLINE InnerIterator& operator++()   { m_iter.operator++(); return *this; }
-  EIGEN_STRONG_INLINE InnerIterator& operator+=(Index i) { m_iter.operator+=(i); return *this; }
-  EIGEN_STRONG_INLINE InnerIterator operator+(Index i) 
-  { InnerIterator result(*this); result+=i; return result; }
-    
-
   /// \returns the column or row index of the current coefficient.
   EIGEN_STRONG_INLINE Index index() const           { return m_iter.index(); }
   /// \returns the row index of the current coefficient.
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseBinaryOp.h b/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseBinaryOp.h
index 8b8de8382..a36765e39 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseBinaryOp.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseBinaryOp.h
@@ -100,14 +100,8 @@ class CwiseBinaryOp :
     typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
     typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
 
-#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11
-    //Required for Visual Studio or the Copy constructor will probably not get inlined!
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    CwiseBinaryOp(const CwiseBinaryOp<BinaryOp,LhsType,RhsType>&) = default;
-#endif
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
       : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
     {
       EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar);
@@ -116,16 +110,16 @@ class CwiseBinaryOp :
       eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index rows() const {
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index rows() const {
       // return the fixed size type if available to enable compile time optimizations
       if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
         return m_rhs.rows();
       else
         return m_lhs.rows();
     }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index cols() const {
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Index cols() const {
       // return the fixed size type if available to enable compile time optimizations
       if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
         return m_rhs.cols();
@@ -134,13 +128,13 @@ class CwiseBinaryOp :
     }
 
     /** \returns the left hand side nested expression */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     const _LhsNested& lhs() const { return m_lhs; }
     /** \returns the right hand side nested expression */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     const _RhsNested& rhs() const { return m_rhs; }
     /** \returns the functor representing the binary operation */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     const BinaryOp& functor() const { return m_functor; }
 
   protected:
@@ -164,7 +158,7 @@ public:
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
+EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
 {
   call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -177,7 +171,7 @@ MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived &
+EIGEN_STRONG_INLINE Derived &
 MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 {
   call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -187,3 +181,4 @@ MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
 } // end namespace Eigen
 
 #endif // EIGEN_CWISE_BINARY_OP_H
+
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseNullaryOp.h b/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseNullaryOp.h
index ddac9df78..ddd607e38 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseNullaryOp.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseNullaryOp.h
@@ -105,12 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const CwiseNullaryOp<CustomNullaryOp,typename DenseBase<Derived>::PlainObject>
-#else
-const CwiseNullaryOp<CustomNullaryOp,PlainObject>
-#endif
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
 {
   return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
@@ -136,12 +131,7 @@ DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
-#else
-const CwiseNullaryOp<CustomNullaryOp, PlainObject>
-#endif
+EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
@@ -160,12 +150,7 @@ DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
   */
 template<typename Derived>
 template<typename CustomNullaryOp>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
-#else
-const CwiseNullaryOp<CustomNullaryOp, PlainObject>
-#endif
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
 DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
 {
   return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
@@ -185,7 +170,7 @@ DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
   * \sa class CwiseNullaryOp
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
 DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value)
 {
   return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
@@ -232,32 +217,27 @@ DenseBase<Derived>::Constant(const Scalar& value)
 
 /** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&)
   *
-  * \only_for_vectors
-  *
-  * Example: \include DenseBase_LinSpaced_seq_deprecated.cpp
-  * Output: \verbinclude DenseBase_LinSpaced_seq_deprecated.out
-  *
-  * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&)
+  * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&)
   */
 template<typename Derived>
-EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
 }
 
 /** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&)
   *
-  * \sa LinSpaced(const Scalar&, const Scalar&)
+  * \sa LinSpaced(Scalar,Scalar)
   */
 template<typename Derived>
-EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
 DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
 }
 
 /**
@@ -288,7 +268,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomA
 DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,PacketScalar>(low,high,size));
 }
 
 /**
@@ -301,7 +281,7 @@ DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,PacketScalar>(low,high,Derived::SizeAtCompileTime));
 }
 
 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
@@ -403,7 +383,7 @@ template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar>(low,high,newSize));
+  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,PacketScalar>(low,high,newSize));
 }
 
 /**
@@ -881,42 +861,6 @@ template<typename Derived>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
 { return Derived::Unit(3); }
 
-/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
-  *
-  * \param i index of the unique coefficient to be set to 1
-  *
-  * \only_for_vectors
-  *
-  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
-  */
-template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
-  eigen_assert(i<size());
-  derived().setZero();
-  derived().coeffRef(i) = Scalar(1);
-  return derived();
-}
-
-/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
-  *
-  * \param newSize the new size of the vector
-  * \param i index of the unique coefficient to be set to 1
-  *
-  * \only_for_vectors
-  *
-  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
-  */
-template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
-  eigen_assert(i<newSize);
-  derived().resize(newSize);
-  return setUnit(i);
-}
-
 } // end namespace Eigen
 
 #endif // EIGEN_CWISE_NULLARY_OP_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseUnaryView.h b/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseUnaryView.h
index ff3134d43..5a30fa8df 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseUnaryView.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseUnaryView.h
@@ -81,7 +81,7 @@ class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename in
 
     /** \returns the nested expression */
     typename internal::remove_reference<MatrixTypeNested>::type&
-    nestedExpression() { return m_matrix; }
+    nestedExpression() { return m_matrix.const_cast_derived(); }
 
   protected:
     MatrixTypeNested m_matrix;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseBase.h
index 59756a494..c55a68230 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseBase.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseBase.h
@@ -150,18 +150,13 @@ template<typename Derived> class DenseBase
           * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
           */
 
-      IsVectorAtCompileTime = internal::traits<Derived>::RowsAtCompileTime == 1
-                           || internal::traits<Derived>::ColsAtCompileTime == 1,
+      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
+                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
         /**< This is set to true if either the number of rows or the number of
           * columns is known at compile-time to be equal to 1. Indeed, in that case,
           * we are dealing with a column-vector (if there is only one column) or with
           * a row-vector (if there is only one row). */
 
-      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
-        /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, 
-         * and 2 for matrices.
-         */
-
       Flags = internal::traits<Derived>::Flags,
         /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
           * constructed from this one. See the \ref flags "list of flags".
@@ -266,9 +261,9 @@ template<typename Derived> class DenseBase
     /** \internal Represents a matrix with all coefficients equal to one another*/
     typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,PlainObject> ConstantReturnType;
     /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */
-    EIGEN_DEPRECATED typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> SequentialLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> SequentialLinSpacedReturnType;
     /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar>,PlainObject> RandomAccessLinSpacedReturnType;
+    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,PacketScalar>,PlainObject> RandomAccessLinSpacedReturnType;
     /** \internal the return type of MatrixBase::eigenvalues() */
     typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
 
@@ -302,17 +297,17 @@ template<typename Derived> class DenseBase
     Derived& operator=(const ReturnByValue<OtherDerived>& func);
 
     /** \internal
-      * Copies \a other into *this without evaluating other. \returns a reference to *this. */
+      * Copies \a other into *this without evaluating other. \returns a reference to *this.
+      * \deprecated */
     template<typename OtherDerived>
-    /** \deprecated */
-    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC
     Derived& lazyAssign(const DenseBase<OtherDerived>& other);
 
     EIGEN_DEVICE_FUNC
     CommaInitializer<Derived> operator<< (const Scalar& s);
 
-    template<unsigned int Added,unsigned int Removed>
     /** \deprecated it now returns \c *this */
+    template<unsigned int Added,unsigned int Removed>
     EIGEN_DEPRECATED
     const Derived& flagged() const
     { return derived(); }
@@ -337,13 +332,12 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC static const ConstantReturnType
     Constant(const Scalar& value);
 
-    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
+    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
     LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
-    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
-    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
-
     EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(Index size, const Scalar& low, const Scalar& high);
+    EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType
+    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
     EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType
     LinSpaced(const Scalar& low, const Scalar& high);
 
@@ -375,7 +369,7 @@ template<typename Derived> class DenseBase
     template<typename OtherDerived> EIGEN_DEVICE_FUNC
     bool isApprox(const DenseBase<OtherDerived>& other,
                   const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC 
     bool isMuchSmallerThan(const RealScalar& other,
                            const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     template<typename OtherDerived> EIGEN_DEVICE_FUNC
@@ -386,7 +380,7 @@ template<typename Derived> class DenseBase
     EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-
+    
     inline bool hasNaN() const;
     inline bool allFinite() const;
 
@@ -400,8 +394,8 @@ template<typename Derived> class DenseBase
       *
       * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
       * a const reference, in order to avoid a useless copy.
-      *
-      * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
+      * 
+      * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink.
       */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE EvalReturnType eval() const
@@ -416,7 +410,7 @@ template<typename Derived> class DenseBase
       *
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     void swap(const DenseBase<OtherDerived>& other)
     {
       EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
@@ -428,7 +422,7 @@ template<typename Derived> class DenseBase
       *
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     void swap(PlainObjectBase<OtherDerived>& other)
     {
       eigen_assert(rows()==other.rows() && cols()==other.cols());
@@ -499,7 +493,7 @@ template<typename Derived> class DenseBase
     typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
     typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;
 
-    /** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions
+    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
     *
     * Example: \include MatrixBase_rowwise.cpp
     * Output: \verbinclude MatrixBase_rowwise.out
@@ -512,7 +506,7 @@ template<typename Derived> class DenseBase
     }
     EIGEN_DEVICE_FUNC RowwiseReturnType rowwise();
 
-    /** \returns a VectorwiseOp wrapper of *this broadcasting and partial reductions
+    /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
     *
     * Example: \include MatrixBase_colwise.cpp
     * Output: \verbinclude MatrixBase_colwise.out
@@ -573,59 +567,16 @@ template<typename Derived> class DenseBase
     }
     EIGEN_DEVICE_FUNC void reverseInPlace();
 
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
-    /** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
-      * iterator type as returned by the begin() and end() methods.
-      */
-    typedef random_access_iterator_type iterator;
-    /** This is the const version of iterator (aka read-only) */
-    typedef random_access_iterator_type const_iterator;
-    #else
-    typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
-                                            internal::pointer_based_stl_iterator<Derived>,
-                                            internal::generic_randaccess_stl_iterator<Derived>
-                                          >::type iterator_type;
-
-    typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit,
-                                            internal::pointer_based_stl_iterator<const Derived>,
-                                            internal::generic_randaccess_stl_iterator<const Derived>
-                                          >::type const_iterator_type;
-
-    // Stl-style iterators are supported only for vectors.
-
-    typedef typename internal::conditional< IsVectorAtCompileTime,
-                                            iterator_type,
-                                            void
-                                          >::type iterator;
-
-    typedef typename internal::conditional< IsVectorAtCompileTime,
-                                            const_iterator_type,
-                                            void
-                                          >::type const_iterator;
-    #endif
-
-    inline iterator begin();
-    inline const_iterator begin() const;
-    inline const_iterator cbegin() const;
-    inline iterator end();
-    inline const_iterator end() const;
-    inline const_iterator cend() const;
-
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
 #define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 #define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
-#define EIGEN_DOC_UNARY_ADDONS(X,Y)
-#   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/BlockMethods.h"
-#   include "../plugins/IndexedViewMethods.h"
-#   include "../plugins/ReshapedMethods.h"
 #   ifdef EIGEN_DENSEBASE_PLUGIN
 #     include EIGEN_DENSEBASE_PLUGIN
 #   endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
 #undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 #undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
-#undef EIGEN_DOC_UNARY_ADDONS
 
     // disable the use of evalTo for dense objects with a nice compilation error
     template<typename Dest>
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseCoeffsBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseCoeffsBase.h
index 463b471c8..c4af48ab6 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseCoeffsBase.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseCoeffsBase.h
@@ -22,8 +22,7 @@ template<typename T> struct add_const_on_value_type_if_arithmetic
 /** \brief Base class providing read-only coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  *
-  * \note #ReadOnlyAccessors Constant indicating read-only access
+  * \note #ReadOnlyAccessors Constant indicating read-only access
   *
   * This class defines the \c operator() \c const function and friends, which can be used to read specific
   * entries of a matrix or array.
@@ -289,8 +288,7 @@ class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
 /** \brief Base class providing read/write coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  *
-  * \note #WriteAccessors Constant indicating read/write access
+  * \note #WriteAccessors Constant indicating read/write access
   *
   * This class defines the non-const \c operator() function and friends, which can be used to write specific
   * entries of a matrix or array. This class inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which
@@ -468,8 +466,7 @@ class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived,
 /** \brief Base class providing direct read-only coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  *
-  * \note #DirectAccessors Constant indicating direct access
+  * \note #DirectAccessors Constant indicating direct access
   *
   * This class defines functions to work with strides which can be used to access entries directly. This class
   * inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which defines functions to access entries read-only using
@@ -542,8 +539,7 @@ class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived
 /** \brief Base class providing direct read/write coefficient access to matrices and arrays.
   * \ingroup Core_Module
   * \tparam Derived Type of the derived class
-  *
-  * \note #DirectWriteAccessors Constant indicating direct access
+  * \note #DirectWriteAccessors Constant indicating direct access
   *
   * This class defines functions to work with strides which can be used to access entries directly. This class
   * inherits DenseCoeffsBase<Derived, WriteAccessors> which defines functions to access entries read/write using
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseStorage.h b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseStorage.h
index a8bb8a624..7d6d4e66d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseStorage.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseStorage.h
@@ -61,7 +61,7 @@ struct plain_array
 #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
   #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
 #elif EIGEN_GNUC_AT_LEAST(4,7) 
-  // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned.
+  // GCC 4.7 is too aggressive in its optimizations and removes the alignment test based on the fact the array is declared to be aligned.
   // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
   // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
   template<typename PtrType>
@@ -207,9 +207,7 @@ template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseSt
       EIGEN_UNUSED_VARIABLE(rows);
       EIGEN_UNUSED_VARIABLE(cols);
     }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
-      numext::swap(m_data, other.m_data);
-    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
     EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
     EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {}
@@ -269,11 +267,7 @@ template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index cols) : m_rows(rows), m_cols(cols) {}
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    {
-      numext::swap(m_data,other.m_data);
-      numext::swap(m_rows,other.m_rows);
-      numext::swap(m_cols,other.m_cols);
-    }
+    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
     EIGEN_DEVICE_FUNC Index rows() const {return m_rows;}
     EIGEN_DEVICE_FUNC Index cols() const {return m_cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index cols) { m_rows = rows; m_cols = cols; }
@@ -302,11 +296,7 @@ template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Si
       return *this; 
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index rows, Index) : m_rows(rows) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    {
-      numext::swap(m_data,other.m_data);
-      numext::swap(m_rows,other.m_rows);
-    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
     EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return _Cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index, Index rows, Index) { m_rows = rows; }
@@ -335,14 +325,11 @@ template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Si
       return *this;
     }
     EIGEN_DEVICE_FUNC DenseStorage(Index, Index, Index cols) : m_cols(cols) {}
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
-      numext::swap(m_data,other.m_data);
-      numext::swap(m_cols,other.m_cols);
-    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
     EIGEN_DEVICE_FUNC Index rows(void) const {return _Rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
-    EIGEN_DEVICE_FUNC void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
-    EIGEN_DEVICE_FUNC void resize(Index, Index, Index cols) { m_cols = cols; }
+    void conservativeResize(Index, Index, Index cols) { m_cols = cols; }
+    void resize(Index, Index, Index cols) { m_cols = cols; }
     EIGEN_DEVICE_FUNC const T *data() const { return m_data.array; }
     EIGEN_DEVICE_FUNC T *data() { return m_data.array; }
 };
@@ -394,19 +381,16 @@ template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynam
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      numext::swap(m_data, other.m_data);
-      numext::swap(m_rows, other.m_rows);
-      numext::swap(m_cols, other.m_cols);
+      using std::swap;
+      swap(m_data, other.m_data);
+      swap(m_rows, other.m_rows);
+      swap(m_cols, other.m_cols);
       return *this;
     }
 #endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
     EIGEN_DEVICE_FUNC void swap(DenseStorage& other)
-    {
-      numext::swap(m_data,other.m_data);
-      numext::swap(m_rows,other.m_rows);
-      numext::swap(m_cols,other.m_cols);
-    }
+    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
     EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
     void conservativeResize(Index size, Index rows, Index cols)
@@ -475,16 +459,14 @@ template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Ro
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      numext::swap(m_data, other.m_data);
-      numext::swap(m_cols, other.m_cols);
+      using std::swap;
+      swap(m_data, other.m_data);
+      swap(m_cols, other.m_cols);
       return *this;
     }
 #endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
-      numext::swap(m_data,other.m_data);
-      numext::swap(m_cols,other.m_cols);
-    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
     EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;}
     EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;}
     EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols)
@@ -551,16 +533,14 @@ template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dyn
     EIGEN_DEVICE_FUNC
     DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT
     {
-      numext::swap(m_data, other.m_data);
-      numext::swap(m_rows, other.m_rows);
+      using std::swap;
+      swap(m_data, other.m_data);
+      swap(m_rows, other.m_rows);
       return *this;
     }
 #endif
     EIGEN_DEVICE_FUNC ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) {
-      numext::swap(m_data,other.m_data);
-      numext::swap(m_rows,other.m_rows);
-    }
+    EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
     EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;}
     EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;}
     void conservativeResize(Index size, Index rows, Index)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Diagonal.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Diagonal.h
index 563135fb2..afcaf3575 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Diagonal.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Diagonal.h
@@ -187,7 +187,7 @@ template<typename MatrixType, int _DiagIndex> class Diagonal
   *
   * \sa class Diagonal */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType
+inline typename MatrixBase<Derived>::DiagonalReturnType
 MatrixBase<Derived>::diagonal()
 {
   return DiagonalReturnType(derived());
@@ -195,7 +195,7 @@ MatrixBase<Derived>::diagonal()
 
 /** This is the const version of diagonal(). */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalReturnType
+inline typename MatrixBase<Derived>::ConstDiagonalReturnType
 MatrixBase<Derived>::diagonal() const
 {
   return ConstDiagonalReturnType(derived());
@@ -213,7 +213,7 @@ MatrixBase<Derived>::diagonal() const
   *
   * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
+inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index)
 {
   return DiagonalDynamicIndexReturnType(derived(), index);
@@ -221,7 +221,7 @@ MatrixBase<Derived>::diagonal(Index index)
 
 /** This is the const version of diagonal(Index). */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
+inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
 MatrixBase<Derived>::diagonal(Index index) const
 {
   return ConstDiagonalDynamicIndexReturnType(derived(), index);
@@ -240,7 +240,6 @@ MatrixBase<Derived>::diagonal(Index index) const
   * \sa MatrixBase::diagonal(), class Diagonal */
 template<typename Derived>
 template<int Index_>
-EIGEN_DEVICE_FUNC
 inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal()
 {
@@ -250,7 +249,6 @@ MatrixBase<Derived>::diagonal()
 /** This is the const version of diagonal<int>(). */
 template<typename Derived>
 template<int Index_>
-EIGEN_DEVICE_FUNC
 inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index_>::Type
 MatrixBase<Derived>::diagonal() const
 {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalMatrix.h
index 542685c65..ecfdce8ef 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalMatrix.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalMatrix.h
@@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase<Derived>
 
     EIGEN_DEVICE_FUNC
     DenseMatrixType toDenseMatrix() const { return derived(); }
-
+    
     EIGEN_DEVICE_FUNC
     inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
     EIGEN_DEVICE_FUNC
@@ -83,30 +83,6 @@ class DiagonalBase : public EigenBase<Derived>
     {
       return DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,DiagonalVectorType,product) >(scalar * other.diagonal());
     }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
-    inline unspecified_expression_type
-    #else
-    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,sum) >
-    #endif
-    operator+(const DiagonalBase<OtherDerived>& other) const
-    {
-      return (diagonal() + other.diagonal()).asDiagonal();
-    }
-
-    template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
-    inline unspecified_expression_type
-    #else
-    inline const DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(DiagonalVectorType,typename OtherDerived::DiagonalVectorType,difference) >
-    #endif
-    operator-(const DiagonalBase<OtherDerived>& other) const
-    {
-      return (diagonal() - other.diagonal()).asDiagonal();
-    }
 };
 
 #endif
@@ -178,30 +154,6 @@ class DiagonalMatrix
     EIGEN_DEVICE_FUNC
     inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}
 
-    #if EIGEN_HAS_CXX11
-    /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11
-      * 
-      * There exists C++98 anologue constructors for fixed-size diagonal matrices having 2 or 3 coefficients.
-      * 
-      * \warning To construct a diagonal matrix of fixed size, the number of values passed to this 
-      * constructor must match the fixed dimension of \c *this.
-      * 
-      * \sa DiagonalMatrix(const Scalar&, const Scalar&)
-      * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&)
-      */
-    template <typename... ArgTypes>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args)
-      : m_diagonal(a0, a1, a2, args...) {}
-
-    /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer
-      * lists \cpp11
-      */
-    EIGEN_DEVICE_FUNC
-    explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list<std::initializer_list<Scalar>>& list)
-      : m_diagonal(list) {}
-    #endif  // EIGEN_HAS_CXX11
-
     /** Copy constructor. */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
@@ -321,7 +273,7 @@ class DiagonalWrapper
   * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
   **/
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived>
+inline const DiagonalWrapper<const Derived>
 MatrixBase<Derived>::asDiagonal() const
 {
   return DiagonalWrapper<const Derived>(derived());
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalProduct.h b/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalProduct.h
index 7911d1cd1..d372b938f 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalProduct.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalProduct.h
@@ -17,7 +17,7 @@ namespace Eigen {
   */
 template<typename Derived>
 template<typename DiagonalDerived>
-EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct>
+inline const Product<Derived, DiagonalDerived, LazyProduct>
 MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
 {
   return Product<Derived, DiagonalDerived, LazyProduct>(derived(),a_diagonal.derived());
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Dot.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Dot.h
index 11da432b2..1fe7a84a4 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Dot.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Dot.h
@@ -93,7 +93,7 @@ MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
   * \sa dot(), norm(), lpNorm()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
+EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
 {
   return numext::real((*this).cwiseAbs2().sum());
 }
@@ -105,7 +105,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::trai
   * \sa lpNorm(), dot(), squaredNorm()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
+EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
 {
   return numext::sqrt(squaredNorm());
 }
@@ -120,7 +120,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::trai
   * \sa norm(), normalize()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::normalized() const
 {
   typedef typename internal::nested_eval<Derived,2>::type _Nested;
@@ -142,7 +142,7 @@ MatrixBase<Derived>::normalized() const
   * \sa norm(), normalized()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
+EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
 {
   RealScalar z = squaredNorm();
   // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
@@ -163,7 +163,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize()
   * \sa stableNorm(), stableNormalize(), normalized()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
+EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
 MatrixBase<Derived>::stableNormalized() const
 {
   typedef typename internal::nested_eval<Derived,3>::type _Nested;
@@ -188,7 +188,7 @@ MatrixBase<Derived>::stableNormalized() const
   * \sa stableNorm(), stableNormalized(), normalize()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
+EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize()
 {
   RealScalar w = cwiseAbs().maxCoeff();
   RealScalar z = (derived()/w).squaredNorm();
@@ -260,9 +260,9 @@ struct lpNorm_selector<Derived, Infinity>
 template<typename Derived>
 template<int p>
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 #else
-EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
+MatrixBase<Derived>::RealScalar
 #endif
 MatrixBase<Derived>::lpNorm() const
 {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/EigenBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/EigenBase.h
index 0c34fb656..b195506a9 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/EigenBase.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/EigenBase.h
@@ -32,9 +32,8 @@ template<typename Derived> struct EigenBase
   
   /** \brief The interface type of indices
     * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+    * \deprecated Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
     * \sa StorageIndex, \ref TopicPreprocessorDirectives.
-    * DEPRECATED: Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
-    * Deprecation is not marked with a doxygen comment because there are too many existing usages to add the deprecation attribute.
     */
   typedef Eigen::Index Index;
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Fuzzy.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Fuzzy.h
index 43aa49b2b..3e403a09d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Fuzzy.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Fuzzy.h
@@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector<Derived, true>
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
+bool DenseBase<Derived>::isApprox(
   const DenseBase<OtherDerived>& other,
   const RealScalar& prec
 ) const
@@ -122,7 +122,7 @@ EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(
   * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
+bool DenseBase<Derived>::isMuchSmallerThan(
   const typename NumTraits<Scalar>::Real& other,
   const RealScalar& prec
 ) const
@@ -142,7 +142,7 @@ EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(
+bool DenseBase<Derived>::isMuchSmallerThan(
   const DenseBase<OtherDerived>& other,
   const RealScalar& prec
 ) const
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/GeneralProduct.h b/uppsrc/plugin/Eigen/Eigen/src/Core/GeneralProduct.h
index bf7ef54b5..6f0cc80e9 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/GeneralProduct.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/GeneralProduct.h
@@ -18,16 +18,6 @@ enum {
   Small = 3
 };
 
-// Define the threshold value to fallback from the generic matrix-matrix product
-// implementation (heavy) to the lightweight coeff-based product one.
-// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
-// in products/GeneralMatrixMatrix.h for more details.
-// TODO This threshold should also be used in the compile-time selector below.
-#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
-// This default value has been obtained on a Haswell architecture.
-#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
-#endif
-
 namespace internal {
 
 template<int Rows, int Cols, int Depth> struct product_type_selector;
@@ -35,7 +25,7 @@ template<int Rows, int Cols, int Depth> struct product_type_selector;
 template<int Size, int MaxSize> struct product_size_category
 {
   enum {
-    #ifndef EIGEN_GPU_COMPILE_PHASE
+    #ifndef EIGEN_CUDA_ARCH
     is_large = MaxSize == Dynamic ||
                Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
                (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
@@ -163,13 +153,13 @@ template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vect
 template<typename Scalar,int Size,int MaxSize>
 struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
 {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
+  EIGEN_STRONG_INLINE  Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
 };
 
 template<typename Scalar,int Size>
 struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
 {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; }
+  EIGEN_STRONG_INLINE Scalar* data() { return 0; }
 };
 
 template<typename Scalar,int Size,int MaxSize>
@@ -239,7 +229,7 @@ template<> struct gemv_dense_selector<OnTheRight,ColMajor,true>
       // on, the other hand it is good for the cache to pack the vector anyways...
       EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1),
       ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
-      MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0)
+      MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal
     };
 
     typedef const_blas_data_mapper<LhsScalar,Index,ColMajor> LhsMapper;
@@ -326,7 +316,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,true>
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
       // on, the other hand it is good for the cache to pack the vector anyways...
-      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0
+      DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1
     };
 
     gemv_static_vector_if<RhsScalar,ActualRhsTypeCleaned::SizeAtCompileTime,ActualRhsTypeCleaned::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
@@ -396,8 +386,7 @@ template<> struct gemv_dense_selector<OnTheRight,RowMajor,false>
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const Product<Derived, OtherDerived>
+inline const Product<Derived, OtherDerived>
 MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
 {
   // A note regarding the function declaration: In MSVC, this function will sometimes
@@ -439,7 +428,6 @@ MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
   */
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 const Product<Derived,OtherDerived,LazyProduct>
 MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
 {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/GenericPacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/GenericPacketMath.h
index 449793372..e59443779 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/GenericPacketMath.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/GenericPacketMath.h
@@ -44,27 +44,23 @@ struct default_packet_traits
   enum {
     HasHalfPacket = 0,
 
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 0,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasNegate = 1,
+    HasAbs    = 1,
+    HasArg    = 0,
+    HasAbs2   = 1,
+    HasMin    = 1,
+    HasMax    = 1,
+    HasConj   = 1,
     HasSetLinear = 1,
-    HasBlend     = 0,
-    HasInsert    = 0,
+    HasBlend  = 0,
 
     HasDiv    = 0,
     HasSqrt   = 0,
     HasRsqrt  = 0,
     HasExp    = 0,
-    HasExpm1  = 0,
     HasLog    = 0,
     HasLog1p  = 0,
     HasLog10  = 0,
@@ -85,19 +81,14 @@ struct default_packet_traits
     HasPolygamma = 0,
     HasErf = 0,
     HasErfc = 0,
-    HasNdtri = 0,
-    HasBessel = 0,
     HasIGamma = 0,
-    HasIGammaDerA = 0,
-    HasGammaSampleDerAlpha = 0,
     HasIGammac = 0,
     HasBetaInc = 0,
 
     HasRound  = 0,
-    HasRint   = 0,
     HasFloor  = 0,
     HasCeil   = 0,
-    HasCast   = 0, 
+
     HasSign   = 0
   };
 };
@@ -136,22 +127,6 @@ template <typename Src, typename Tgt> struct type_casting_traits {
   };
 };
 
-/** \internal Wrapper to ensure that multiple packet types can map to the same
-    same underlying vector type. */
-template<typename T, int unique_id = 0>
-struct eigen_packet_wrapper
-{
-  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
-  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
-  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
-  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
-  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
-    m_val = v;
-    return *this;
-  }
-
-  T m_val;
-};
 
 /** \internal \returns static_cast<TgtType>(a) (coeff-wise) */
 template <typename SrcPacket, typename TgtPacket>
@@ -171,21 +146,15 @@ pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const
   return static_cast<TgtPacket>(a);
 }
 
-/** \internal \returns reinterpret_cast<Target>(a) */
-template <typename Target, typename Packet>
-EIGEN_DEVICE_FUNC inline Target
-preinterpret(const Packet& a); /* { return reinterpret_cast<const Target&>(a); } */
-
 /** \internal \returns a + b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-padd(const Packet& a, const Packet& b) { return a+b; }
-// Avoid compiler warning for boolean algebra.
-template<> EIGEN_DEVICE_FUNC inline bool
-padd(const bool& a, const bool& b) { return a || b; }
+padd(const Packet& a,
+        const Packet& b) { return a+b; }
 
 /** \internal \returns a - b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-psub(const Packet& a, const Packet& b) { return a-b; }
+psub(const Packet& a,
+        const Packet& b) { return a-b; }
 
 /** \internal \returns -a (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -198,86 +167,32 @@ pconj(const Packet& a) { return numext::conj(a); }
 
 /** \internal \returns a * b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmul(const Packet& a, const Packet& b) { return a*b; }
-// Avoid compiler warning for boolean algebra.
-template<> EIGEN_DEVICE_FUNC inline bool
-pmul(const bool& a, const bool& b) { return a && b; }
+pmul(const Packet& a,
+        const Packet& b) { return a*b; }
 
 /** \internal \returns a / b (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pdiv(const Packet& a, const Packet& b) { return a/b; }
+pdiv(const Packet& a,
+        const Packet& b) { return a/b; }
 
 /** \internal \returns the min of \a a and \a b  (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmin(const Packet& a, const Packet& b) { return numext::mini(a, b); }
+pmin(const Packet& a,
+        const Packet& b) { return numext::mini(a, b); }
 
 /** \internal \returns the max of \a a and \a b  (coeff-wise) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); }
+pmax(const Packet& a,
+        const Packet& b) { return numext::maxi(a, b); }
 
 /** \internal \returns the absolute value of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pabs(const Packet& a) { using std::abs; return abs(a); }
-template<> EIGEN_DEVICE_FUNC inline unsigned int
-pabs(const unsigned int& a) { return a; }
-template<> EIGEN_DEVICE_FUNC inline unsigned long
-pabs(const unsigned long& a) { return a; }
-template<> EIGEN_DEVICE_FUNC inline unsigned long long
-pabs(const unsigned long long& a) { return a; }
 
 /** \internal \returns the phase angle of \a a */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 parg(const Packet& a) { using numext::arg; return arg(a); }
 
-
-/** \internal \returns \a a logically shifted by N bits to the right */
-template<int N> EIGEN_DEVICE_FUNC inline int
-parithmetic_shift_right(const int& a) { return a >> N; }
-template<int N> EIGEN_DEVICE_FUNC inline long int
-parithmetic_shift_right(const long int& a) { return a >> N; }
-
-/** \internal \returns \a a arithmetically shifted by N bits to the right */
-template<int N> EIGEN_DEVICE_FUNC inline int
-plogical_shift_right(const int& a) { return static_cast<int>(static_cast<unsigned int>(a) >> N); }
-template<int N> EIGEN_DEVICE_FUNC inline long int
-plogical_shift_right(const long int& a) { return static_cast<long>(static_cast<unsigned long>(a) >> N); }
-
-/** \internal \returns \a a shifted by N bits to the left */
-template<int N> EIGEN_DEVICE_FUNC inline int
-plogical_shift_left(const int& a) { return a << N; }
-template<int N> EIGEN_DEVICE_FUNC inline long int
-plogical_shift_left(const long int& a) { return a << N; }
-
-/** \internal \returns the significant and exponent of the underlying floating point numbers
-  * See https://en.cppreference.com/w/cpp/numeric/math/frexp
-  */
-template <typename Packet>
-EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) {
-  int exp;
-  EIGEN_USING_STD_MATH(frexp);
-  Packet result = frexp(a, &exp);
-  exponent = static_cast<Packet>(exp);
-  return result;
-}
-
-/** \internal \returns a * 2^exponent
-  * See https://en.cppreference.com/w/cpp/numeric/math/ldexp
-  */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pldexp(const Packet &a, const Packet &exponent) {
-  EIGEN_USING_STD_MATH(ldexp);
-  return ldexp(a, static_cast<int>(exponent));
-}
-
-// Notice: The following ops accept and operator on bitwise masks.
-// The value of each field in a masks is Scalar(0) or ~Scalar(0).
-// For boolean packet like Packet16b, this is different from the
-// representation of true and false, which are 1 and 0.
-// As an example
-//    ptrue<Packet16b>()     = 0xffffffffffffffffffffffffffffffff
-// while
-//    pset1<Packet16b>(true) = 0x01010101010101010101010101010101
-
 /** \internal \returns the bitwise and of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pand(const Packet& a, const Packet& b) { return a & b; }
@@ -290,76 +205,9 @@ por(const Packet& a, const Packet& b) { return a | b; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pxor(const Packet& a, const Packet& b) { return a ^ b; }
 
-/** \internal \returns the bitwise and of \a a and not \a b */
+/** \internal \returns the bitwise andnot of \a a and \a b */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pandnot(const Packet& a, const Packet& b) { return a & (~b); }
-
-/** \internal \returns ones */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;}
-
-/** \internal \returns zeros */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pzero(const Packet& a) { return pxor(a,a); }
-
-template<> EIGEN_DEVICE_FUNC inline float pzero<float>(const float& a) {
-  EIGEN_UNUSED_VARIABLE(a);
-  return 0.f;
-}
-
-template<> EIGEN_DEVICE_FUNC inline double pzero<double>(const double& a) {
-  EIGEN_UNUSED_VARIABLE(a);
-  return 0.;
-}
-
-template <typename RealScalar>
-EIGEN_DEVICE_FUNC inline std::complex<RealScalar> ptrue(const std::complex<RealScalar>& /*a*/) {
-  RealScalar b;
-  b = ptrue(b);
-  return std::complex<RealScalar>(b, b);
-}
-
-/** \internal \returns the bitwise not of \a a */
-template <typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pnot(const Packet& a) { return pxor(ptrue(a), a);}
-
-/** \internal \returns a <= b as a bit mask */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_le(const Packet& a, const Packet& b)  { return a<=b ? ptrue(a) : pzero(a); }
-
-/** \internal \returns a < b as a bit mask */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_lt(const Packet& a, const Packet& b)  { return a<b ? ptrue(a) : pzero(a); }
-
-/** \internal \returns a == b as a bit mask */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); }
-
-/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } 
-
-/** \internal \returns \a or \b for each field in packet according to \mask */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pselect(const Packet& mask, const Packet& a, const Packet& b) {
-  return por(pand(a,mask),pandnot(b,mask));
-}
-
-template<> EIGEN_DEVICE_FUNC inline float pselect<float>(
-    const float& cond, const float& a, const float&b) {
-  return numext::equal_strict(cond,0.f) ? b : a;
-}
-
-template<> EIGEN_DEVICE_FUNC inline double pselect<double>(
-    const double& cond, const double& a, const double& b) {
-  return numext::equal_strict(cond,0.) ? b : a;
-}
-
-
-
-/** \internal \returns the min of \a a and \a b  (coeff-wise) */
-template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
-pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); }
+pandnot(const Packet& a, const Packet& b) { return a & (~b); } // bitwise complement of b; logical '!' would collapse b to 0/1
 
 /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
@@ -369,22 +217,10 @@ pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
 
-/** \internal \returns a packet version of \a *from, (un-aligned masked load)
- * There is no generic implementation. We only have implementations for specialized
- * cases. Generic case should not be called.
- */
-template<typename Packet> EIGEN_DEVICE_FUNC inline
-typename enable_if<unpacket_traits<Packet>::masked_load_available, Packet>::type
-ploadu(const typename unpacket_traits<Packet>::type* from, typename unpacket_traits<Packet>::mask_t umask);
-
 /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
 
-/** \internal \returns a packet with constant coefficients set from bits */
-template<typename Packet,typename BitsType> EIGEN_DEVICE_FUNC inline Packet
-pset1frombits(BitsType a);
-
 /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
 pload1(const typename unpacket_traits<Packet>::type  *a) { return pset1<Packet>(*a); }
@@ -453,15 +289,6 @@ template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstore(
 template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from)
 {  (*to) = from; }
 
-/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
- * There is no generic implementation. We only have implementations for specialized
- * cases. Generic case should not be called.
- */
-template<typename Scalar, typename Packet>
-EIGEN_DEVICE_FUNC inline
-typename enable_if<unpacket_traits<Packet>::masked_store_available, void>::type
-pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
-
  template<typename Scalar, typename Packet> EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/)
  { return ploadu<Packet>(from); }
 
@@ -471,9 +298,7 @@ pstoreu(Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t
 /** \internal tries to do cache prefetching of \a addr */
 template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr)
 {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-  // do nothing
-#elif defined(EIGEN_CUDA_ARCH)
+#ifdef __CUDA_ARCH__
 #if defined(__LP64__)
   // 64-bit pointer operand constraint for inlined asm
   asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
@@ -490,52 +315,35 @@ template<typename Scalar> EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* a
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
 { return a; }
 
+/** \internal \returns a packet where the element i contains the sum of the packet of \a vecs[i] */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+preduxp(const Packet* vecs) { return vecs[0]; }
+
 /** \internal \returns the sum of the elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a)
 { return a; }
 
-/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4.
+/** \internal \returns the sum of the elements of \a a by block of 4 elements.
   * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
   * For packet-size smaller or equal to 4, this boils down to a noop.
   */
 template<typename Packet> EIGEN_DEVICE_FUNC inline
 typename conditional<(unpacket_traits<Packet>::size%8)==0,typename unpacket_traits<Packet>::half,Packet>::type
-predux_half_dowto4(const Packet& a)
+predux_downto4(const Packet& a)
 { return a; }
 
-/** \internal \returns the product of the elements of \a a */
+/** \internal \returns the product of the elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
 { return a; }
 
-/** \internal \returns the min of the elements of \a a */
+/** \internal \returns the min of the elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
 { return a; }
 
-/** \internal \returns the max of the elements of \a a */
+/** \internal \returns the max of the elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
 { return a; }
 
-/** \internal \returns true if all coeffs of \a a means "true"
-  * It is supposed to be called on values returned by pcmp_*.
-  */
-// not needed yet
-// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
-// { return bool(a); }
-
-/** \internal \returns true if any coeffs of \a a means "true"
-  * It is supposed to be called on values returned by pcmp_*.
-  */
-template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a)
-{
-  // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames.
-  // It is expected that "true" is either:
-  //  - Scalar(1)
-  //  - bits full of ones (NaN for floats),
-  //  - or first bit equals to 1 (1 for ints, smallest denormal for floats).
-  // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
-  return bool(predux(a));
-}
-
 /** \internal \returns the reversed elements of \a a*/
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a)
 { return a; }
@@ -543,7 +351,7 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet preverse(const Packet&
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
 template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a)
 {
-  return Packet(numext::imag(a),numext::real(a));
+  return Packet(a.imag(),a.real());
 }
 
 /**************************
@@ -552,51 +360,47 @@ template<typename Packet> EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet
 
 /** \internal \returns the sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psin(const Packet& a) { EIGEN_USING_STD_MATH(sin); return sin(a); }
+Packet psin(const Packet& a) { using std::sin; return sin(a); }
 
 /** \internal \returns the cosine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcos(const Packet& a) { EIGEN_USING_STD_MATH(cos); return cos(a); }
+Packet pcos(const Packet& a) { using std::cos; return cos(a); }
 
 /** \internal \returns the tan of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet ptan(const Packet& a) { EIGEN_USING_STD_MATH(tan); return tan(a); }
+Packet ptan(const Packet& a) { using std::tan; return tan(a); }
 
 /** \internal \returns the arc sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pasin(const Packet& a) { EIGEN_USING_STD_MATH(asin); return asin(a); }
+Packet pasin(const Packet& a) { using std::asin; return asin(a); }
 
 /** \internal \returns the arc cosine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pacos(const Packet& a) { EIGEN_USING_STD_MATH(acos); return acos(a); }
+Packet pacos(const Packet& a) { using std::acos; return acos(a); }
 
 /** \internal \returns the arc tangent of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet patan(const Packet& a) { EIGEN_USING_STD_MATH(atan); return atan(a); }
+Packet patan(const Packet& a) { using std::atan; return atan(a); }
 
 /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psinh(const Packet& a) { EIGEN_USING_STD_MATH(sinh); return sinh(a); }
+Packet psinh(const Packet& a) { using std::sinh; return sinh(a); }
 
 /** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcosh(const Packet& a) { EIGEN_USING_STD_MATH(cosh); return cosh(a); }
+Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); }
 
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet ptanh(const Packet& a) { EIGEN_USING_STD_MATH(tanh); return tanh(a); }
+Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); }
 
 /** \internal \returns the exp of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp(const Packet& a) { EIGEN_USING_STD_MATH(exp); return exp(a); }
-
-/** \internal \returns the expm1 of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexpm1(const Packet& a) { return numext::expm1(a); }
+Packet pexp(const Packet& a) { using std::exp; return exp(a); }
 
 /** \internal \returns the log of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog(const Packet& a) { EIGEN_USING_STD_MATH(log); return log(a); }
+Packet plog(const Packet& a) { using std::log; return log(a); }
 
 /** \internal \returns the log1p of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
@@ -604,11 +408,11 @@ Packet plog1p(const Packet& a) { return numext::log1p(a); }
 
 /** \internal \returns the log10 of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog10(const Packet& a) { EIGEN_USING_STD_MATH(log10); return log10(a); }
+Packet plog10(const Packet& a) { using std::log10; return log10(a); }
 
 /** \internal \returns the square-root of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psqrt(const Packet& a) { EIGEN_USING_STD_MATH(sqrt); return sqrt(a); }
+Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
 
 /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
@@ -624,11 +428,6 @@ Packet pround(const Packet& a) { using numext::round; return round(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pfloor(const Packet& a) { using numext::floor; return floor(a); }
 
-/** \internal \returns the rounded value of \a a (coeff-wise) with current
- * rounding mode */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet print(const Packet& a) { using numext::rint; return rint(a); }
-
 /** \internal \returns the ceil of \a a (coeff-wise) */
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
@@ -637,7 +436,7 @@ Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); }
 * The following functions might not have to be overwritten for vectorized types
 ***************************************************************************/
 
-/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
+/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
 // NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type)
 template<typename Packet>
 inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a)
@@ -685,12 +484,41 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_t
   return ploadt<Packet, LoadMode>(from);
 }
 
+/** \internal default implementation of palign() allowing partial specialization */
+template<int Offset,typename PacketType>
+struct palign_impl
+{
+  // by default data are aligned, so there is nothing to be done :)
+  static inline void run(PacketType&, const PacketType&) {}
+};
+
+/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
+  * of \a first and \a Offset first elements of \a second.
+  * 
+  * This function is currently only used to optimize matrix-vector products on unaligned matrices.
+  * It takes 2 packets that represent a contiguous memory array, and returns a packet starting
+  * at the position \a Offset. For instance, for packets of 4 elements, we have:
+  *  Input:
+  *  - first = {f0,f1,f2,f3}
+  *  - second = {s0,s1,s2,s3}
+  * Output: 
+  *   - if Offset==0 then {f0,f1,f2,f3}
+  *   - if Offset==1 then {f1,f2,f3,s0}
+  *   - if Offset==2 then {f2,f3,s0,s1}
+  *   - if Offset==3 then {f3,s0,s1,s2}
+  */
+template<int Offset,typename PacketType>
+inline void palign(PacketType& first, const PacketType& second)
+{
+  palign_impl<Offset,PacketType>::run(first,second);
+}
+
 /***************************************************************************
 * Fast complex products (GCC generates a function call which is very slow)
 ***************************************************************************/
 
 // Eigen+CUDA does not support complexes.
-#if !defined(EIGEN_GPUCC)
+#ifndef __CUDACC__
 
 template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
 { return std::complex<float>(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); }
@@ -727,21 +555,33 @@ pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket, const Packet& th
   return ifPacket.select[0] ? thenPacket : elsePacket;
 }
 
-/***************************************************************************
- * Some generic implementations to be used by implementors
-***************************************************************************/
+/** \internal \returns \a a with the first coefficient replaced by the scalar b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pinsertfirst(const Packet& a, typename unpacket_traits<Packet>::type b)
+{
+  // Default implementation based on pblend.
+  // It must be specialized for higher performance.
+  Selector<unpacket_traits<Packet>::size> mask;
+  mask.select[0] = true;
+  // This for loop should be optimized away by the compiler.
+  for(Index i=1; i<unpacket_traits<Packet>::size; ++i)
+    mask.select[i] = false;
+  return pblend(mask, pset1<Packet>(b), a);
+}
 
-/** Default implementation of pfrexp for float.
-  * It is expected to be called by implementers of template<> pfrexp.
-  */
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pfrexp_float(const Packet& a, Packet& exponent);
-
-/** Default implementation of pldexp for float.
-  * It is expected to be called by implementers of template<> pldexp.
-  */
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_float(Packet a, Packet exponent);
+/** \internal \returns \a a with the last coefficient replaced by the scalar b */
+template<typename Packet> EIGEN_DEVICE_FUNC inline Packet
+pinsertlast(const Packet& a, typename unpacket_traits<Packet>::type b)
+{
+  // Default implementation based on pblend.
+  // It must be specialized for higher performance.
+  Selector<unpacket_traits<Packet>::size> mask;
+  // This for loop should be optimized away by the compiler.
+  for(Index i=0; i<unpacket_traits<Packet>::size-1; ++i)
+    mask.select[i] = false;
+  mask.select[unpacket_traits<Packet>::size-1] = true;
+  return pblend(mask, pset1<Packet>(b), a);
+}
 
 } // end namespace internal
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/GlobalFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/GlobalFunctions.h
index 8d54f92df..769dc255c 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/GlobalFunctions.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/GlobalFunctions.h
@@ -66,19 +66,11 @@ namespace Eigen
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh)
-#if EIGEN_HAS_CXX11_MATH
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh)
-#endif
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log)
@@ -89,7 +81,6 @@ namespace Eigen
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rint,scalar_rint_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the giben value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the giben value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
@@ -97,7 +88,7 @@ namespace Eigen
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
   EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign)
-
+  
   /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
     *
     * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar).
@@ -111,18 +102,17 @@ namespace Eigen
   inline const CwiseBinaryOp<internal::scalar_pow_op<Derived::Scalar,ScalarExponent>,Derived,Constant<ScalarExponent> >
   pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent);
 #else
-  template <typename Derived,typename ScalarExponent>
-  EIGEN_DEVICE_FUNC inline
-  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
-    const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<typename Derived::Scalar
-                                                 EIGEN_COMMA ScalarExponent EIGEN_COMMA
-                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type,pow))
-  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent)
-  {
-    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,ScalarExponent,
-                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent)>::type PromotedExponent;
-    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(),
-           typename internal::plain_constant_type<Derived,PromotedExponent>::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op<PromotedExponent>(exponent)));
+  template<typename Derived,typename ScalarExponent>
+  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,ScalarExponent>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent),
+          const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type
+  pow(const Eigen::ArrayBase<Derived>& x, const ScalarExponent& exponent) {
+    return x.derived().pow(exponent);
+  }
+
+  template<typename Derived>
+  inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow)
+  pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
+    return x.derived().pow(exponent);
   }
 #endif
 
@@ -132,21 +122,21 @@ namespace Eigen
     *
     * Example: \include Cwise_array_power_array.cpp
     * Output: \verbinclude Cwise_array_power_array.out
-    *
+    * 
     * \sa ArrayBase::pow()
     *
     * \relates ArrayBase
     */
   template<typename Derived,typename ExponentDerived>
   inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>
-  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents)
+  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents) 
   {
     return Eigen::CwiseBinaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived, const ExponentDerived>(
       x.derived(),
       exponents.derived()
     );
   }
-
+  
   /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
     *
     * This function computes the coefficient-wise power between a scalar and an array of exponents.
@@ -155,7 +145,7 @@ namespace Eigen
     *
     * Example: \include Cwise_scalar_power_array.cpp
     * Output: \verbinclude Cwise_scalar_power_array.out
-    *
+    * 
     * \sa ArrayBase::pow()
     *
     * \relates ArrayBase
@@ -165,17 +155,21 @@ namespace Eigen
   inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar,Derived::Scalar>,Constant<Scalar>,Derived>
   pow(const Scalar& x,const Eigen::ArrayBase<Derived>& x);
 #else
-  template <typename Scalar, typename Derived>
-  EIGEN_DEVICE_FUNC inline
-  EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(
-    const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<typename Derived::Scalar
-                                                 EIGEN_COMMA Scalar EIGEN_COMMA
-                                                 EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type,Derived,pow))
-  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
-    typedef typename internal::promote_scalar_arg<typename Derived::Scalar,Scalar,
-                                                  EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar)>::type PromotedScalar;
-    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)(
-           typename internal::plain_constant_type<Derived,PromotedScalar>::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)), exponents.derived());
+  template<typename Scalar, typename Derived>
+  inline typename internal::enable_if<   !(internal::is_same<typename Derived::Scalar,Scalar>::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar),
+          const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type
+  pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
+  {
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)(
+            typename internal::plain_constant_type<Derived,Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
+  }
+
+  template<typename Derived>
+  inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)
+  pow(const typename Derived::Scalar& x, const Eigen::ArrayBase<Derived>& exponents)
+  {
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)(
+      typename internal::plain_constant_type<Derived,typename Derived::Scalar>::type(exponents.rows(), exponents.cols(), x), exponents.derived() );
   }
 #endif
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/IO.h b/uppsrc/plugin/Eigen/Eigen/src/Core/IO.h
index e81c31521..da7fd6cce 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/IO.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/IO.h
@@ -41,7 +41,6 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   *  - \b rowSuffix string printed at the end of each row
   *  - \b matPrefix string printed at the beginning of the matrix
   *  - \b matSuffix string printed at the end of the matrix
-  *  - \b fill character printed to fill the empty space in aligned columns
   *
   * Example: \include IOFormat.cpp
   * Output: \verbinclude IOFormat.out
@@ -54,9 +53,9 @@ struct IOFormat
   IOFormat(int _precision = StreamPrecision, int _flags = 0,
     const std::string& _coeffSeparator = " ",
     const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
-    const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ')
+    const std::string& _matPrefix="", const std::string& _matSuffix="")
   : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
-    rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags)
+    rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
   {
     // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
     // don't add rowSpacer if columns are not to be aligned
@@ -72,7 +71,6 @@ struct IOFormat
   std::string matPrefix, matSuffix;
   std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer;
   std::string coeffSeparator;
-  char fill;
   int precision;
   int flags;
 };
@@ -130,9 +128,6 @@ struct significant_decimals_impl
 template<typename Derived>
 std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt)
 {
-  using internal::is_same;
-  using internal::conditional;
-
   if(_m.size() == 0)
   {
     s << fmt.matPrefix << fmt.matSuffix;
@@ -141,22 +136,6 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   
   typename Derived::Nested m = _m;
   typedef typename Derived::Scalar Scalar;
-  typedef typename
-      conditional<
-          is_same<Scalar, char>::value ||
-            is_same<Scalar, unsigned char>::value ||
-            is_same<Scalar, numext::int8_t>::value ||
-            is_same<Scalar, numext::uint8_t>::value,
-          int,
-          typename conditional<
-              is_same<Scalar, std::complex<char> >::value ||
-                is_same<Scalar, std::complex<unsigned char> >::value ||
-                is_same<Scalar, std::complex<numext::int8_t> >::value ||
-                is_same<Scalar, std::complex<numext::uint8_t> >::value,
-              std::complex<int>,
-              const Scalar&
-            >::type
-        >::type PrintType;
 
   Index width = 0;
 
@@ -193,31 +172,23 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
       {
         std::stringstream sstr;
         sstr.copyfmt(s);
-        sstr << static_cast<PrintType>(m.coeff(i,j));
+        sstr << m.coeff(i,j);
         width = std::max<Index>(width, Index(sstr.str().length()));
       }
   }
-  std::streamsize old_width = s.width();
-  char old_fill_character = s.fill();
   s << fmt.matPrefix;
   for(Index i = 0; i < m.rows(); ++i)
   {
     if (i)
       s << fmt.rowSpacer;
     s << fmt.rowPrefix;
-    if(width) {
-      s.fill(fmt.fill);
-      s.width(width);
-    }
-    s << static_cast<PrintType>(m.coeff(i, 0));
+    if(width) s.width(width);
+    s << m.coeff(i, 0);
     for(Index j = 1; j < m.cols(); ++j)
     {
       s << fmt.coeffSeparator;
-      if(width) {
-        s.fill(fmt.fill);
-        s.width(width);
-      }
-      s << static_cast<PrintType>(m.coeff(i, j));
+      if (width) s.width(width);
+      s << m.coeff(i, j);
     }
     s << fmt.rowSuffix;
     if( i < m.rows() - 1)
@@ -225,10 +196,6 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat&
   }
   s << fmt.matSuffix;
   if(explicit_precision) s.precision(old_precision);
-  if(width) {
-    s.fill(old_fill_character);
-    s.width(old_width);
-  }
   return s;
 }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/IndexedView.h b/uppsrc/plugin/Eigen/Eigen/src/Core/IndexedView.h
deleted file mode 100644
index 377f8a5cc..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/IndexedView.h
+++ /dev/null
@@ -1,207 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_INDEXED_VIEW_H
-#define EIGEN_INDEXED_VIEW_H
-
-namespace Eigen {
-
-namespace internal {
-
-template<typename XprType, typename RowIndices, typename ColIndices>
-struct traits<IndexedView<XprType, RowIndices, ColIndices> >
- : traits<XprType>
-{
-  enum {
-    RowsAtCompileTime = int(array_size<RowIndices>::value),
-    ColsAtCompileTime = int(array_size<ColIndices>::value),
-    MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : Dynamic,
-    MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : Dynamic,
-
-    XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
-    IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
-               : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
-               : XprTypeIsRowMajor,
-
-    RowIncr = int(get_compile_time_incr<RowIndices>::value),
-    ColIncr = int(get_compile_time_incr<ColIndices>::value),
-    InnerIncr = IsRowMajor ? ColIncr : RowIncr,
-    OuterIncr = IsRowMajor ? RowIncr : ColIncr,
-
-    HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
-    XprInnerStride = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time<XprType>::ret) : int(outer_stride_at_compile_time<XprType>::ret),
-    XprOuterstride = HasSameStorageOrderAsXprType ? int(outer_stride_at_compile_time<XprType>::ret) : int(inner_stride_at_compile_time<XprType>::ret),
-
-    InnerSize = XprTypeIsRowMajor ? ColsAtCompileTime : RowsAtCompileTime,
-    IsBlockAlike = InnerIncr==1 && OuterIncr==1,
-    IsInnerPannel = HasSameStorageOrderAsXprType && is_same<AllRange<InnerSize>,typename conditional<XprTypeIsRowMajor,ColIndices,RowIndices>::type>::value,
-
-    InnerStrideAtCompileTime = InnerIncr<0 || InnerIncr==DynamicIndex || XprInnerStride==Dynamic ? Dynamic : XprInnerStride * InnerIncr,
-    OuterStrideAtCompileTime = OuterIncr<0 || OuterIncr==DynamicIndex || XprOuterstride==Dynamic ? Dynamic : XprOuterstride * OuterIncr,
-
-    ReturnAsScalar = is_same<RowIndices,SingleRange>::value && is_same<ColIndices,SingleRange>::value,
-    ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike,
-    ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock),
-
-    // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag,
-    // but this is too strict regarding negative strides...
-    DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0,
-    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
-    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
-    Flags = (traits<XprType>::Flags & (HereditaryBits | DirectAccessMask)) | FlagsLvalueBit | FlagsRowMajorBit
-  };
-
-  typedef Block<XprType,RowsAtCompileTime,ColsAtCompileTime,IsInnerPannel> BlockType;
-};
-
-}
-
-template<typename XprType, typename RowIndices, typename ColIndices, typename StorageKind>
-class IndexedViewImpl;
-
-
-/** \class IndexedView
-  * \ingroup Core_Module
-  *
-  * \brief Expression of a non-sequential sub-matrix defined by arbitrary sequences of row and column indices
-  *
-  * \tparam XprType the type of the expression in which we are taking the intersections of sub-rows and sub-columns
-  * \tparam RowIndices the type of the object defining the sequence of row indices
-  * \tparam ColIndices the type of the object defining the sequence of column indices
-  *
-  * This class represents an expression of a sub-matrix (or sub-vector) defined as the intersection
-  * of sub-sets of rows and columns, that are themself defined by generic sequences of row indices \f$ \{r_0,r_1,..r_{m-1}\} \f$
-  * and column indices \f$ \{c_0,c_1,..c_{n-1} \}\f$. Let \f$ A \f$  be the nested matrix, then the resulting matrix \f$ B \f$ has \c m
-  * rows and \c n columns, and its entries are given by: \f$ B(i,j) = A(r_i,c_j) \f$.
-  *
-  * The \c RowIndices and \c ColIndices types must be compatible with the following API:
-  * \code
-  * <integral type> operator[](Index) const;
-  * Index size() const;
-  * \endcode
-  *
-  * Typical supported types thus include:
-  *  - std::vector<int>
-  *  - std::valarray<int>
-  *  - std::array<int>
-  *  - Plain C arrays: int[N]
-  *  - Eigen::ArrayXi
-  *  - decltype(ArrayXi::LinSpaced(...))
-  *  - Any view/expressions of the previous types
-  *  - Eigen::ArithmeticSequence
-  *  - Eigen::internal::AllRange      (helper for Eigen::all)
-  *  - Eigen::internal::SingleRange  (helper for single index)
-  *  - etc.
-  *
-  * In typical usages of %Eigen, this class should never be used directly. It is the return type of
-  * DenseBase::operator()(const RowIndices&, const ColIndices&).
-  *
-  * \sa class Block
-  */
-template<typename XprType, typename RowIndices, typename ColIndices>
-class IndexedView : public IndexedViewImpl<XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind>
-{
-public:
-  typedef typename IndexedViewImpl<XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind>::Base Base;
-  EIGEN_GENERIC_PUBLIC_INTERFACE(IndexedView)
-  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedView)
-
-  typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
-  typedef typename internal::remove_all<XprType>::type NestedExpression;
-
-  template<typename T0, typename T1>
-  IndexedView(XprType& xpr, const T0& rowIndices, const T1& colIndices)
-    : m_xpr(xpr), m_rowIndices(rowIndices), m_colIndices(colIndices)
-  {}
-
-  /** \returns number of rows */
-  Index rows() const { return internal::size(m_rowIndices); }
-
-  /** \returns number of columns */
-  Index cols() const { return internal::size(m_colIndices); }
-
-  /** \returns the nested expression */
-  const typename internal::remove_all<XprType>::type&
-  nestedExpression() const { return m_xpr; }
-
-  /** \returns the nested expression */
-  typename internal::remove_reference<XprType>::type&
-  nestedExpression() { return m_xpr; }
-
-  /** \returns a const reference to the object storing/generating the row indices */
-  const RowIndices& rowIndices() const { return m_rowIndices; }
-
-  /** \returns a const reference to the object storing/generating the column indices */
-  const ColIndices& colIndices() const { return m_colIndices; }
-
-protected:
-  MatrixTypeNested m_xpr;
-  RowIndices m_rowIndices;
-  ColIndices m_colIndices;
-};
-
-
-// Generic API dispatcher
-template<typename XprType, typename RowIndices, typename ColIndices, typename StorageKind>
-class IndexedViewImpl
-  : public internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices> >::type
-{
-public:
-  typedef typename internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices> >::type Base;
-};
-
-namespace internal {
-
-
-template<typename ArgType, typename RowIndices, typename ColIndices>
-struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
-  : evaluator_base<IndexedView<ArgType, RowIndices, ColIndices> >
-{
-  typedef IndexedView<ArgType, RowIndices, ColIndices> XprType;
-
-  enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of row/col index */,
-
-    Flags = (evaluator<ArgType>::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)),
-
-    Alignment = 0
-  };
-
-  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr)
-  {
-    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
-  }
-
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  CoeffReturnType coeff(Index row, Index col) const
-  {
-    return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Scalar& coeffRef(Index row, Index col)
-  {
-    return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
-  }
-
-protected:
-
-  evaluator<ArgType> m_argImpl;
-  const XprType& m_xpr;
-
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_INDEXED_VIEW_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Inverse.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Inverse.h
index 7352d8037..b76f0439d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Inverse.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Inverse.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2014-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -44,6 +44,7 @@ class Inverse : public InverseImpl<XprType,typename internal::traits<XprType>::S
 {
 public:
   typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename XprType::PlainObject                       PlainObject;
   typedef typename XprType::Scalar                            Scalar;
   typedef typename internal::ref_selector<XprType>::type      XprTypeNested;
   typedef typename internal::remove_all<XprTypeNested>::type  XprTypeNestedCleaned;
@@ -54,8 +55,8 @@ public:
     : m_xpr(xpr)
   {}
 
-  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.cols(); }
-  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
 
   EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Map.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Map.h
index c437f1a92..548bf9a2d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Map.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Map.h
@@ -113,10 +113,10 @@ template<typename PlainObjectType, int MapOptions, typename StrideType> class Ma
     EIGEN_DEVICE_FUNC
     inline Index outerStride() const
     {
-      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
-           : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
+      return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer()
+           : int(internal::traits<Map>::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
            : IsVectorAtCompileTime ? (this->size() * innerStride())
-           : int(Flags)&RowMajorBit ? (this->cols() * innerStride())
+           : (int(Flags)&RowMajorBit) ? (this->cols() * innerStride())
            : (this->rows() * innerStride());
     }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctions.h
index 96cb24fcb..01736c2a0 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctions.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctions.h
@@ -14,6 +14,7 @@
 // TODO this should better be moved to NumTraits
 #define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L
 
+
 namespace Eigen {
 
 // On WINCE, std::abs is defined for int only, so let's defined our own overloads:
@@ -96,7 +97,7 @@ struct real_default_impl<Scalar,true>
 
 template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
 
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+#ifdef __CUDA_ARCH__
 template<typename T>
 struct real_impl<std::complex<T> >
 {
@@ -144,7 +145,7 @@ struct imag_default_impl<Scalar,true>
 
 template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
 
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+#ifdef __CUDA_ARCH__
 template<typename T>
 struct imag_impl<std::complex<T> >
 {
@@ -238,7 +239,7 @@ struct imag_ref_retval
 ****************************************************************************/
 
 template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-struct conj_default_impl
+struct conj_impl
 {
   EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
@@ -248,7 +249,7 @@ struct conj_default_impl
 };
 
 template<typename Scalar>
-struct conj_default_impl<Scalar,true>
+struct conj_impl<Scalar,true>
 {
   EIGEN_DEVICE_FUNC
   static inline Scalar run(const Scalar& x)
@@ -258,20 +259,6 @@ struct conj_default_impl<Scalar,true>
   }
 };
 
-template<typename Scalar> struct conj_impl : conj_default_impl<Scalar> {};
-
-#if defined(EIGEN_GPU_COMPILE_PHASE)
-template<typename T>
-struct conj_impl<std::complex<T> >
-{
-  EIGEN_DEVICE_FUNC
-  static inline std::complex<T> run(const std::complex<T>& x)
-  {
-    return std::complex<T>(x.real(), -x.imag());
-  }
-};
-#endif
-
 template<typename Scalar>
 struct conj_retval
 {
@@ -402,11 +389,10 @@ inline NewType cast(const OldType& x)
 #if EIGEN_HAS_CXX11_MATH
   template<typename Scalar>
   struct round_impl {
-    EIGEN_DEVICE_FUNC
     static inline Scalar run(const Scalar& x)
     {
       EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
-      EIGEN_USING_STD_MATH(round);
+      using std::round;
       return round(x);
     }
   };
@@ -414,7 +400,6 @@ inline NewType cast(const OldType& x)
   template<typename Scalar>
   struct round_impl
   {
-    EIGEN_DEVICE_FUNC
     static inline Scalar run(const Scalar& x)
     {
       EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
@@ -431,48 +416,6 @@ struct round_retval
   typedef Scalar type;
 };
 
-/****************************************************************************
-* Implementation of rint                                                    *
-****************************************************************************/
-
-template<typename Scalar>
-struct rint_impl {
-  EIGEN_DEVICE_FUNC
-  static inline Scalar run(const Scalar& x)
-  {
-    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex), NUMERIC_TYPE_MUST_BE_REAL)
-#if EIGEN_HAS_CXX11_MATH
-      EIGEN_USING_STD_MATH(rint);
-#endif
-    return rint(x);
-  }
-};
-
-#if !EIGEN_HAS_CXX11_MATH
-template<>
-struct rint_impl<double> {
-  EIGEN_DEVICE_FUNC
-  static inline double run(const double& x)
-  {
-    return ::rint(x);
-  }
-};
-template<>
-struct rint_impl<float> {
-  EIGEN_DEVICE_FUNC
-  static inline float run(const float& x)
-  {
-    return ::rintf(x);
-  }
-};
-#endif
-
-template<typename Scalar>
-struct rint_retval
-{
-  typedef Scalar type;
-};
-
 /****************************************************************************
 * Implementation of arg                                                     *
 ****************************************************************************/
@@ -480,15 +423,9 @@ struct rint_retval
 #if EIGEN_HAS_CXX11_MATH
   template<typename Scalar>
   struct arg_impl {
-    EIGEN_DEVICE_FUNC
     static inline Scalar run(const Scalar& x)
     {
-      #if defined(EIGEN_HIP_DEVICE_COMPILE)
-      // HIP does not seem to have a native device side implementation for the math routine "arg"
-      using std::arg;
-      #else 		  
       EIGEN_USING_STD_MATH(arg);
-      #endif
       return arg(x);
     }
   };
@@ -524,86 +461,6 @@ struct arg_retval
   typedef typename NumTraits<Scalar>::Real type;
 };
 
-/****************************************************************************
-* Implementation of expm1                                                   *
-****************************************************************************/
-
-// This implementation is based on GSL Math's expm1.
-namespace std_fallback {
-  // fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar,
-  // or that there is no suitable std::expm1 function available. Implementation
-  // attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
-  template<typename Scalar>
-  EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
-    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-    EIGEN_USING_STD_MATH(exp);
-    Scalar u = exp(x);
-    if (numext::equal_strict(u, Scalar(1))) {
-      return x;
-    }
-    Scalar um1 = u - RealScalar(1);
-    if (numext::equal_strict(um1, Scalar(-1))) {
-      return RealScalar(-1);
-    }
-
-    EIGEN_USING_STD_MATH(log);
-    Scalar logu = log(u);
-    return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu;
-  }
-}
-
-template<typename Scalar>
-struct expm1_impl {
-  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
-  {
-    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
-    #if EIGEN_HAS_CXX11_MATH
-    using std::expm1;
-    #else
-    using std_fallback::expm1;
-    #endif
-    return expm1(x);
-  }
-};
-
-// Specialization for complex types that are not supported by std::expm1.
-template <typename RealScalar>
-struct expm1_impl<std::complex<RealScalar> > {
-  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
-      const std::complex<RealScalar>& x) {
-    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
-    RealScalar xr = x.real();
-    RealScalar xi = x.imag();
-    // expm1(z) = exp(z) - 1
-    //          = exp(x +  i * y) - 1
-    //          = exp(x) * (cos(y) + i * sin(y)) - 1
-    //          = exp(x) * cos(y) - 1 + i * exp(x) * sin(y)
-    // Imag(expm1(z)) = exp(x) * sin(y)
-    // Real(expm1(z)) = exp(x) * cos(y) - 1
-    //          = exp(x) * cos(y) - 1.
-    //          = expm1(x) + exp(x) * (cos(y) - 1)
-    //          = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2)
-
-    // TODO better use numext::expm1 and numext::sin (but that would require forward declarations or moving this specialization down).
-    RealScalar erm1 = expm1_impl<RealScalar>::run(xr);
-    RealScalar er = erm1 + RealScalar(1.);
-    EIGEN_USING_STD_MATH(sin);
-    RealScalar sin2 = sin(xi / RealScalar(2.));
-    sin2 = sin2 * sin2;
-    RealScalar s = sin(xi);
-    RealScalar real_part = erm1 - RealScalar(2.) * er * sin2;
-    return std::complex<RealScalar>(real_part, er * s);
-  }
-};
-
-template<typename Scalar>
-struct expm1_retval
-{
-  typedef Scalar type;
-};
-
 /****************************************************************************
 * Implementation of log1p                                                   *
 ****************************************************************************/
@@ -617,36 +474,23 @@ namespace std_fallback {
     typedef typename NumTraits<Scalar>::Real RealScalar;
     EIGEN_USING_STD_MATH(log);
     Scalar x1p = RealScalar(1) + x;
-    Scalar log_1p = log(x1p);
-    const bool is_small = numext::equal_strict(x1p, Scalar(1));
-    const bool is_inf = numext::equal_strict(x1p, log_1p);
-    return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1)));
+    return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) );
   }
 }
 
 template<typename Scalar>
 struct log1p_impl {
-  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x)
+  static inline Scalar run(const Scalar& x)
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
     #if EIGEN_HAS_CXX11_MATH
     using std::log1p;
-    #else
-    using std_fallback::log1p;
     #endif
+    using std_fallback::log1p;
     return log1p(x);
   }
 };
 
-// Specialization for complex types that are not supported by std::log1p.
-template <typename RealScalar>
-struct log1p_impl<std::complex<RealScalar> > {
-  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(
-      const std::complex<RealScalar>& x) {
-    EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
-    return std_fallback::log1p(x);
-  }
-};
 
 template<typename Scalar>
 struct log1p_retval
@@ -843,7 +687,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
   return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
 }
 
-// Implementation of is* functions
+// Implementation of is* functions
 
 // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang.
 #if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG)
@@ -872,7 +716,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isfinite_impl(const T& x)
 {
-  #if defined(EIGEN_GPU_COMPILE_PHASE)
+  #ifdef __CUDA_ARCH__
     return (::isfinite)(x);
   #elif EIGEN_USE_STD_FPCLASSIFY
     using std::isfinite;
@@ -887,7 +731,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isinf_impl(const T& x)
 {
-  #if defined(EIGEN_GPU_COMPILE_PHASE)
+  #ifdef __CUDA_ARCH__
     return (::isinf)(x);
   #elif EIGEN_USE_STD_FPCLASSIFY
     using std::isinf;
@@ -902,7 +746,7 @@ EIGEN_DEVICE_FUNC
 typename internal::enable_if<(!internal::is_integral<T>::value)&&(!NumTraits<T>::IsComplex),bool>::type
 isnan_impl(const T& x)
 {
-  #if defined(EIGEN_GPU_COMPILE_PHASE)
+  #ifdef __CUDA_ARCH__
     return (::isnan)(x);
   #elif EIGEN_USE_STD_FPCLASSIFY
     using std::isnan;
@@ -959,6 +803,7 @@ template<typename T> EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x)
 template<typename T> EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
 
 template<typename T> T generic_fast_tanh_float(const T& a_x);
+
 } // end namespace internal
 
 /****************************************************************************
@@ -967,7 +812,7 @@ template<typename T> T generic_fast_tanh_float(const T& a_x);
 
 namespace numext {
 
-#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) 
+#ifndef __CUDA_ARCH__
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y)
@@ -996,24 +841,6 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y)
 {
   return fminf(x, y);
 }
-template<>
-EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
-{
-  return fmin(x, y);
-}
-template<>
-EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
-{
-#if defined(EIGEN_HIPCC)
-  // no "fminl" on HIP yet
-  return (x < y) ? x : y;
-#else
-  return fminl(x, y);
-#endif
-}
-
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y)
@@ -1026,92 +853,6 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y)
 {
   return fmaxf(x, y);
 }
-template<>
-EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
-{
-  return fmax(x, y);
-}
-template<>
-EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
-{
-#if defined(EIGEN_HIPCC)
-  // no "fmaxl" on HIP yet
-  return (x > y) ? x : y;
-#else
-  return fmaxl(x, y);
-#endif
-}
-#endif
-
-#if defined(SYCL_DEVICE_ONLY)
-
-
-#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
-  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char)   \
-  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short)  \
-  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int)    \
-  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
-#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
-  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char)   \
-  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short)  \
-  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int)    \
-  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
-#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
-  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)  \
-  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
-  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)   \
-  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
-#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
-  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)  \
-  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \
-  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)   \
-  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
-#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \
-  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
-  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
-#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \
-  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
-  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
-#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \
-  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
-  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
-#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \
-  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
-  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double)
-#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \
-  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \
-  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
-
-#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
-template<>                                               \
-  EIGEN_DEVICE_FUNC                                      \
-  EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
-    return cl::sycl::FUNC(x);                            \
-  }
-
-#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \
-  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
-
-#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \
-  template<>                                                                  \
-  EIGEN_DEVICE_FUNC                                                           \
-  EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \
-    return cl::sycl::FUNC(x, y);                                              \
-  }
-
-#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
-  SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
-
-#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \
-  SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
-
-SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
-SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
-SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
-SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
-
 #endif
 
 
@@ -1181,36 +922,6 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
 EIGEN_DEVICE_FUNC
 inline bool abs2(bool x) { return x; }
 
-template<typename T>
-EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y)
-{
-  return x > y ? x - y : y - x;
-}
-template<>
-EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE float absdiff(const float& x, const float& y)
-{
-  return fabsf(x - y);
-}
-template<>
-EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y)
-{
-  return fabs(x - y);
-}
-template<>
-EIGEN_DEVICE_FUNC
-EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y)
-{
-#if defined(EIGEN_HIPCC)
-  // no "fabsl" on HIP yet
-  return (x > y) ? x : y;
-#else
-  return fabsl(x - y);
-#endif
-}
-
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
@@ -1225,10 +936,6 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar&
   return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-  SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
-#endif
-
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
@@ -1236,11 +943,7 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x)
   return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log1p(const float &x) { return ::log1pf(x); }
 
@@ -1255,27 +958,10 @@ inline typename internal::pow_impl<ScalarX,ScalarY>::result_type pow(const Scala
   return internal::pow_impl<ScalarX,ScalarY>::run(x, y);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
-#endif
-
 template<typename T> EIGEN_DEVICE_FUNC bool (isnan)   (const T &x) { return internal::isnan_impl(x); }
 template<typename T> EIGEN_DEVICE_FUNC bool (isinf)   (const T &x) { return internal::isinf_impl(x); }
 template<typename T> EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
-#endif
-
-template<typename Scalar>
-EIGEN_DEVICE_FUNC
-inline EIGEN_MATHFUNC_RETVAL(rint, Scalar) rint(const Scalar& x)
-{
-  return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x);
-}
-
 template<typename Scalar>
 EIGEN_DEVICE_FUNC
 inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
@@ -1283,10 +969,6 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x)
   return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
-#endif
-
 template<typename T>
 EIGEN_DEVICE_FUNC
 T (floor)(const T& x)
@@ -1295,11 +977,7 @@ T (floor)(const T& x)
   return floor(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float floor(const float &x) { return ::floorf(x); }
 
@@ -1315,11 +993,7 @@ T (ceil)(const T& x)
   return ceil(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float ceil(const float &x) { return ::ceilf(x); }
 
@@ -1360,10 +1034,6 @@ T sqrt(const T &x)
   return sqrt(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
-#endif
-
 template<typename T>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 T log(const T &x) {
@@ -1371,12 +1041,7 @@ T log(const T &x) {
   return log(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
-#endif
-
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float log(const float &x) { return ::logf(x); }
 
@@ -1399,12 +1064,12 @@ abs(const T &x) {
   return x;
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
-#endif
+#if defined(__SYCL_DEVICE_ONLY__)
+EIGEN_ALWAYS_INLINE float   abs(float x) { return cl::sycl::fabs(x); }
+EIGEN_ALWAYS_INLINE double  abs(double x) { return cl::sycl::fabs(x); }
+#endif // defined(__SYCL_DEVICE_ONLY__)
 
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float abs(const float &x) { return ::fabsf(x); }
 
@@ -1429,51 +1094,12 @@ T exp(const T &x) {
   return exp(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float exp(const float &x) { return ::expf(x); }
 
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 double exp(const double &x) { return ::exp(x); }
-
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-std::complex<float> exp(const std::complex<float>& x) {
-  float com = ::expf(x.real());
-  float res_real = com * ::cosf(x.imag());
-  float res_imag = com * ::sinf(x.imag());
-  return std::complex<float>(res_real, res_imag);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-std::complex<double> exp(const std::complex<double>& x) {
-  double com = ::exp(x.real());
-  double res_real = com * ::cos(x.imag());
-  double res_imag = com * ::sin(x.imag());
-  return std::complex<double>(res_real, res_imag);
-}
-#endif
-
-template<typename Scalar>
-EIGEN_DEVICE_FUNC
-inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x)
-{
-  return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x);
-}
-
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
-#endif
-
-#if defined(EIGEN_GPUCC)
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-float expm1(const float &x) { return ::expm1f(x); }
-
-template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-double expm1(const double &x) { return ::expm1(x); }
 #endif
 
 template<typename T>
@@ -1483,11 +1109,7 @@ T cos(const T &x) {
   return cos(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cos(const float &x) { return ::cosf(x); }
 
@@ -1502,11 +1124,7 @@ T sin(const T &x) {
   return sin(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sin(const float &x) { return ::sinf(x); }
 
@@ -1521,11 +1139,7 @@ T tan(const T &x) {
   return tan(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tan(const float &x) { return ::tanf(x); }
 
@@ -1540,21 +1154,7 @@ T acos(const T &x) {
   return acos(x);
 }
 
-#if EIGEN_HAS_CXX11_MATH
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T acosh(const T &x) {
-  EIGEN_USING_STD_MATH(acosh);
-  return acosh(x);
-}
-#endif
-
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float acos(const float &x) { return ::acosf(x); }
 
@@ -1569,21 +1169,7 @@ T asin(const T &x) {
   return asin(x);
 }
 
-#if EIGEN_HAS_CXX11_MATH
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T asinh(const T &x) {
-  EIGEN_USING_STD_MATH(asinh);
-  return asinh(x);
-}
-#endif
-
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float asin(const float &x) { return ::asinf(x); }
 
@@ -1598,21 +1184,7 @@ T atan(const T &x) {
   return atan(x);
 }
 
-#if EIGEN_HAS_CXX11_MATH
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T atanh(const T &x) {
-  EIGEN_USING_STD_MATH(atanh);
-  return atanh(x);
-}
-#endif
-
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float atan(const float &x) { return ::atanf(x); }
 
@@ -1628,11 +1200,7 @@ T cosh(const T &x) {
   return cosh(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float cosh(const float &x) { return ::coshf(x); }
 
@@ -1647,11 +1215,7 @@ T sinh(const T &x) {
   return sinh(x);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float sinh(const float &x) { return ::sinhf(x); }
 
@@ -1666,16 +1230,12 @@ T tanh(const T &x) {
   return tanh(x);
 }
 
-#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY)
+#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(float x) { return internal::generic_fast_tanh_float(x); }
 #endif
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float tanh(const float &x) { return ::tanhf(x); }
 
@@ -1690,11 +1250,7 @@ T fmod(const T& a, const T& b) {
   return fmod(a, b);
 }
 
-#if defined(SYCL_DEVICE_ONLY)
-SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
-#endif
-
-#if defined(EIGEN_GPUCC)
+#ifdef __CUDACC__
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float fmod(const float& a, const float& b) {
@@ -1708,23 +1264,6 @@ double fmod(const double& a, const double& b) {
 }
 #endif
 
-#if defined(SYCL_DEVICE_ONLY)
-#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
-#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
-#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
-#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
-#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
-#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
-#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
-#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
-#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
-#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
-#undef SYCL_SPECIALIZE_UNARY_FUNC
-#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
-#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
-#undef SYCL_SPECIALIZE_BINARY_FUNC
-#endif
-
 } // end namespace numext
 
 namespace internal {
@@ -1853,13 +1392,13 @@ template<> struct random_impl<bool>
 template<> struct scalar_fuzzy_impl<bool>
 {
   typedef bool RealScalar;
-
+  
   template<typename OtherScalar> EIGEN_DEVICE_FUNC
   static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
   {
     return !x;
   }
-
+  
   EIGEN_DEVICE_FUNC
   static inline bool isApprox(bool x, bool y, bool)
   {
@@ -1871,10 +1410,10 @@ template<> struct scalar_fuzzy_impl<bool>
   {
     return (!x) || y;
   }
-
+  
 };
 
-
+  
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctionsImpl.h b/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctionsImpl.h
index 7af58fadb..9c1ceb0eb 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctionsImpl.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctionsImpl.h
@@ -17,28 +17,24 @@ namespace internal {
 
 /** \internal \returns the hyperbolic tan of \a a (coeff-wise)
     Doesn't do anything fancy, just a 13/6-degree rational interpolant which
-    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
-    outside of which tanh(x) = +/-1 in single precision. The input is clamped
-    to the range [-c, c]. The value c is chosen as the smallest value where
-    the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004]
-    the approxmation tanh(x) ~= x is used for better accuracy as x tends to zero.
+    is accurate up to a couple of ulps in the range [-9, 9], outside of which
+    tanh(x) = +/-1.
 
     This implementation works on both scalars and packets.
 */
 template<typename T>
 T generic_fast_tanh_float(const T& a_x)
 {
-  // Clamp the inputs to the range [-c, c]
-#ifdef EIGEN_VECTORIZE_FMA
-  const T plus_clamp = pset1<T>(7.99881172180175781f);
-  const T minus_clamp = pset1<T>(-7.99881172180175781f);
-#else
-  const T plus_clamp = pset1<T>(7.90531110763549805f);
-  const T minus_clamp = pset1<T>(-7.90531110763549805f);
-#endif
-  const T tiny = pset1<T>(0.0004f);
-  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
-  const T tiny_mask = pcmp_lt(pabs(a_x), tiny);
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is +/-1.0f in single-precision.
+  const T plus_9 = pset1<T>(9.f);
+  const T minus_9 = pset1<T>(-9.f);
+  // NOTE GCC prior to 6.3 might improperly optimize this max/min
+  //      step such that if a_x is nan, x will be either 9 or -9,
+  //      and tanh will return 1 or -1 instead of nan.
+  //      This is supposed to be fixed in gcc6.3,
+  //      see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  const T x = pmax(minus_9,pmin(plus_9,a_x));
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(4.89352455891786e-03f);
   const T alpha_3 = pset1<T>(6.37261928875436e-04f);
@@ -66,24 +62,24 @@ T generic_fast_tanh_float(const T& a_x)
   p = pmadd(x2, p, alpha_1);
   p = pmul(x, p);
 
-  // Evaluate the denominator polynomial q.
+  // Evaluate the denominator polynomial q.
   T q = pmadd(x2, beta_6, beta_4);
   q = pmadd(x2, q, beta_2);
   q = pmadd(x2, q, beta_0);
 
   // Divide the numerator by the denominator.
-  return pselect(tiny_mask, x, pdiv(p, q));
+  return pdiv(p, q);
 }
 
 template<typename RealScalar>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+EIGEN_STRONG_INLINE
 RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y)
 {
   EIGEN_USING_STD_MATH(sqrt);
   RealScalar p, qp;
   p = numext::maxi(x,y);
   if(p==RealScalar(0)) return RealScalar(0);
-  qp = numext::mini(y,x) / p;
+  qp = numext::mini(y,x) / p;    
   return p * sqrt(RealScalar(1) + qp*qp);
 }
 
@@ -91,8 +87,7 @@ template<typename Scalar>
 struct hypot_impl
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static EIGEN_DEVICE_FUNC
-  inline RealScalar run(const Scalar& x, const Scalar& y)
+  static inline RealScalar run(const Scalar& x, const Scalar& y)
   {
     EIGEN_USING_STD_MATH(abs);
     return positive_real_hypot<RealScalar>(abs(x), abs(y));
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Matrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Matrix.h
index fb7238265..7f4a7af93 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Matrix.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Matrix.h
@@ -255,27 +255,27 @@ class Matrix
       *
       * \sa resize(Index,Index)
       */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Matrix() : Base()
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix() : Base()
     {
       Base::_check_template_params();
       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 
     // FIXME is it still needed
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     explicit Matrix(internal::constructor_without_unaligned_array_assert)
       : Base(internal::constructor_without_unaligned_array_assert())
     { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
 
 #if EIGEN_HAS_RVALUE_REFERENCES
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
       : Base(std::move(other))
     {
       Base::_check_template_params();
     }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
     {
       other.swap(*this);
@@ -283,65 +283,25 @@ class Matrix
     }
 #endif
 
-#if EIGEN_HAS_CXX11
-    /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&... args)
-     *
-     * Example: \include Matrix_variadic_ctor_cxx11.cpp
-     * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
-     *
-     * \sa Matrix(const std::initializer_list<std::initializer_list<Scalar>>&)
-     */
-    template <typename... ArgTypes>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
-      : Base(a0, a1, a2, a3, args...) {}
-
-    /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11
-      * 
-      * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
-      * 
-      * Example: \include Matrix_initializer_list_23_cxx11.cpp
-      * Output: \verbinclude Matrix_initializer_list_23_cxx11.out
-      * 
-      * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered.
-      * 
-      * In the case of a compile-time column vector, implicit transposition from a single row is allowed.
-      * Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax
-      * <code>RowVectorXd{{1},{2},{3},{4},{5}}</code> can be avoided:
-      * 
-      * Example: \include Matrix_initializer_list_vector_cxx11.cpp
-      * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out
-      * 
-      * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes,
-      * and implicit transposition is allowed for compile-time vectors only.
-      * 
-      * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
-      */
-    EIGEN_DEVICE_FUNC
-    explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list<std::initializer_list<Scalar>>& list) : Base(list) {}
-#endif // end EIGEN_HAS_CXX11
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
 
     // This constructor is for both 1x1 matrices and dynamic vectors
     template<typename T>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    explicit Matrix(const T& x)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE explicit Matrix(const T& x)
     {
       Base::_check_template_params();
       Base::template _init1<T>(x);
     }
 
     template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Matrix(const T0& x, const T1& y)
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
     {
       Base::_check_template_params();
       Base::template _init2<T0,T1>(x, y);
     }
-
-
-#else
+    #else
     /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
     EIGEN_DEVICE_FUNC
     explicit Matrix(const Scalar *data);
@@ -359,8 +319,7 @@ class Matrix
       * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
       */
     EIGEN_STRONG_INLINE explicit Matrix(Index dim);
-    /** \brief Constructs an initialized 1x1 matrix with the given coefficient
-      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
+    /** \brief Constructs an initialized 1x1 matrix with the given coefficient */
     Matrix(const Scalar& x);
     /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
       *
@@ -377,14 +336,11 @@ class Matrix
     EIGEN_DEVICE_FUNC
     Matrix(Index rows, Index cols);
     
-    /** \brief Constructs an initialized 2D vector with given coefficients
-      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
+    /** \brief Constructs an initialized 2D vector with given coefficients */
     Matrix(const Scalar& x, const Scalar& y);
-    #endif  // end EIGEN_PARSED_BY_DOXYGEN
+    #endif
 
-    /** \brief Constructs an initialized 3D vector with given coefficients
-      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
-      */
+    /** \brief Constructs an initialized 3D vector with given coefficients */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
     {
@@ -394,9 +350,7 @@ class Matrix
       m_storage.data()[1] = y;
       m_storage.data()[2] = z;
     }
-    /** \brief Constructs an initialized 4D vector with given coefficients
-      * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
-      */
+    /** \brief Constructs an initialized 4D vector with given coefficients */
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
     {
@@ -451,7 +405,7 @@ class Matrix
   *
   * \ingroup Core_Module
   *
-  * %Eigen defines several typedef shortcuts for most common matrix and vector types.
+  * Eigen defines several typedef shortcuts for most common matrix and vector types.
   *
   * The general patterns are the following:
   *
@@ -463,15 +417,6 @@ class Matrix
   *
   * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
   * a fixed-size vector of 4 complex floats.
-  * 
-  * With \cpp11, template alias are also defined for common sizes.
-  * They follow the same pattern as above except that the scalar type suffix is replaced by a
-  * template parameter, i.e.:
-  *   - `MatrixSize<Type>` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size.
-  *   - `MatrixXSize<Type>` and `MatrixSizeX<Type>` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices.
-  *   - `VectorSize<Type>` and `RowVectorSize<Type>` for column and row vectors.
-  * 
-  * With \cpp11, you can also use fully generic column and row vector types: `Vector<Type,Size>` and `RowVector<Type,Size>`.
   *
   * \sa class Matrix
   */
@@ -509,55 +454,6 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
 #undef EIGEN_MAKE_TYPEDEFS
 #undef EIGEN_MAKE_FIXED_TYPEDEFS
 
-#if EIGEN_HAS_CXX11
-
-#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix)                     \
-/** \ingroup matrixtypedefs */                                    \
-/** \brief \cpp11 */                                              \
-template <typename Type>                                          \
-using Matrix##SizeSuffix = Matrix<Type, Size, Size>;              \
-/** \ingroup matrixtypedefs */                                    \
-/** \brief \cpp11 */                                              \
-template <typename Type>                                          \
-using Vector##SizeSuffix = Matrix<Type, Size, 1>;                 \
-/** \ingroup matrixtypedefs */                                    \
-/** \brief \cpp11 */                                              \
-template <typename Type>                                          \
-using RowVector##SizeSuffix = Matrix<Type, 1, Size>;
-
-#define EIGEN_MAKE_FIXED_TYPEDEFS(Size)                           \
-/** \ingroup matrixtypedefs */                                    \
-/** \brief \cpp11 */                                              \
-template <typename Type>                                          \
-using Matrix##Size##X = Matrix<Type, Size, Dynamic>;              \
-/** \ingroup matrixtypedefs */                                    \
-/** \brief \cpp11 */                                              \
-template <typename Type>                                          \
-using Matrix##X##Size = Matrix<Type, Dynamic, Size>;
-
-EIGEN_MAKE_TYPEDEFS(2, 2)
-EIGEN_MAKE_TYPEDEFS(3, 3)
-EIGEN_MAKE_TYPEDEFS(4, 4)
-EIGEN_MAKE_TYPEDEFS(Dynamic, X)
-EIGEN_MAKE_FIXED_TYPEDEFS(2)
-EIGEN_MAKE_FIXED_TYPEDEFS(3)
-EIGEN_MAKE_FIXED_TYPEDEFS(4)
-
-/** \ingroup matrixtypedefs
-  * \brief \cpp11 */
-template <typename Type, int Size>
-using Vector = Matrix<Type, Size, 1>;
-
-/** \ingroup matrixtypedefs
-  * \brief \cpp11 */
-template <typename Type, int Size>
-using RowVector = Matrix<Type, 1, Size>;
-
-#undef EIGEN_MAKE_TYPEDEFS
-#undef EIGEN_MAKE_FIXED_TYPEDEFS
-
-#endif // EIGEN_HAS_CXX11
-
 } // end namespace Eigen
 
 #endif // EIGEN_MATRIX_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/MatrixBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/MatrixBase.h
index 45c3a596e..f8bcc8c6f 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/MatrixBase.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/MatrixBase.h
@@ -76,7 +76,6 @@ template<typename Derived> class MatrixBase
     using Base::coeffRef;
     using Base::lazyAssign;
     using Base::eval;
-    using Base::operator-;
     using Base::operator+=;
     using Base::operator-=;
     using Base::operator*=;
@@ -123,6 +122,7 @@ template<typename Derived> class MatrixBase
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
 #define EIGEN_DOC_UNARY_ADDONS(X,Y)
+#   include "../plugins/CommonCwiseUnaryOps.h"
 #   include "../plugins/CommonCwiseBinaryOps.h"
 #   include "../plugins/MatrixCwiseUnaryOps.h"
 #   include "../plugins/MatrixCwiseBinaryOps.h"
@@ -268,8 +268,6 @@ template<typename Derived> class MatrixBase
     Derived& setIdentity();
     EIGEN_DEVICE_FUNC
     Derived& setIdentity(Index rows, Index cols);
-    EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
-    EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
 
     bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
     bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
@@ -298,7 +296,7 @@ template<typename Derived> class MatrixBase
     EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const
     { return cwiseNotEqual(other).any(); }
 
-    NoAlias<Derived,Eigen::MatrixBase > EIGEN_DEVICE_FUNC noalias();
+    NoAlias<Derived,Eigen::MatrixBase > noalias();
 
     // TODO forceAlignedAccess is temporarily disabled
     // Need to find a nicer workaround.
@@ -328,7 +326,6 @@ template<typename Derived> class MatrixBase
 
     inline const PartialPivLU<PlainObject> lu() const;
 
-    EIGEN_DEVICE_FUNC
     inline const Inverse<Derived> inverse() const;
 
     template<typename ResultType>
@@ -338,15 +335,12 @@ template<typename Derived> class MatrixBase
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
-
     template<typename ResultType>
     inline void computeInverseWithCheck(
       ResultType& inverse,
       bool& invertible,
       const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
     ) const;
-
-    EIGEN_DEVICE_FUNC
     Scalar determinant() const;
 
 /////////// Cholesky module ///////////
@@ -418,19 +412,15 @@ template<typename Derived> class MatrixBase
 
 ////////// Householder module ///////////
 
-    EIGEN_DEVICE_FUNC
     void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
     template<typename EssentialPart>
-    EIGEN_DEVICE_FUNC
     void makeHouseholder(EssentialPart& essential,
                          Scalar& tau, RealScalar& beta) const;
     template<typename EssentialPart>
-    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheLeft(const EssentialPart& essential,
                                    const Scalar& tau,
                                    Scalar* workspace);
     template<typename EssentialPart>
-    EIGEN_DEVICE_FUNC
     void applyHouseholderOnTheRight(const EssentialPart& essential,
                                     const Scalar& tau,
                                     Scalar* workspace);
@@ -438,10 +428,8 @@ template<typename Derived> class MatrixBase
 ///////// Jacobi module /////////
 
     template<typename OtherScalar>
-    EIGEN_DEVICE_FUNC
     void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
     template<typename OtherScalar>
-    EIGEN_DEVICE_FUNC
     void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
 
 ///////// SparseCore module /////////
@@ -468,11 +456,6 @@ template<typename Derived> class MatrixBase
     const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
-#if EIGEN_HAS_CXX11_MATH
-    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine)
-    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine)
-    EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine)
-#endif
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
     EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
     EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/NestByValue.h b/uppsrc/plugin/Eigen/Eigen/src/Core/NestByValue.h
index 239bbba63..13adf070e 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/NestByValue.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/NestByValue.h
@@ -16,11 +16,7 @@ namespace Eigen {
 namespace internal {
 template<typename ExpressionType>
 struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
-{
-  enum {
-    Flags = traits<ExpressionType>::Flags & ~NestByRefBit
-  };
-};
+{};
 }
 
 /** \class NestByValue
@@ -47,11 +43,55 @@ template<typename ExpressionType> class NestByValue
 
     EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); }
     EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); }
+
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const
+    {
+      return m_expression.coeff(row, col);
+    }
+
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col)
+    {
+      return m_expression.const_cast_derived().coeffRef(row, col);
+    }
+
+    EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const
+    {
+      return m_expression.coeff(index);
+    }
+
+    EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index)
+    {
+      return m_expression.const_cast_derived().coeffRef(index);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index row, Index col) const
+    {
+      return m_expression.template packet<LoadMode>(row, col);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index row, Index col, const PacketScalar& x)
+    {
+      m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
+    }
+
+    template<int LoadMode>
+    inline const PacketScalar packet(Index index) const
+    {
+      return m_expression.template packet<LoadMode>(index);
+    }
+
+    template<int LoadMode>
+    inline void writePacket(Index index, const PacketScalar& x)
+    {
+      m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
+    }
 
     EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
 
-    EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
-
   protected:
     const ExpressionType m_expression;
 };
@@ -59,27 +99,12 @@ template<typename ExpressionType> class NestByValue
 /** \returns an expression of the temporary version of *this.
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline const NestByValue<Derived>
+inline const NestByValue<Derived>
 DenseBase<Derived>::nestByValue() const
 {
   return NestByValue<Derived>(derived());
 }
 
-namespace internal {
-
-// Evaluator of Solve -> eval into a temporary
-template<typename ArgType>
-struct evaluator<NestByValue<ArgType> >
-  : public evaluator<ArgType>
-{
-  typedef evaluator<ArgType> Base;
-
-  EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr)
-    : Base(xpr.nestedExpression())
-  {}
-};
-}
-
 } // end namespace Eigen
 
 #endif // EIGEN_NESTBYVALUE_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/NoAlias.h b/uppsrc/plugin/Eigen/Eigen/src/Core/NoAlias.h
index 570283d90..33908010b 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/NoAlias.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/NoAlias.h
@@ -33,7 +33,6 @@ class NoAlias
   public:
     typedef typename ExpressionType::Scalar Scalar;
     
-    EIGEN_DEVICE_FUNC
     explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
     
     template<typename OtherDerived>
@@ -75,10 +74,10 @@ class NoAlias
   *
   * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
   * Currently, even though several expressions may alias, only product
-  * expressions have this flag. Therefore, noalias() is only useful when
+  * expressions have this flag. Therefore, noalias() is only useful when
   * the source expression contains a matrix product.
   *
-  * Here are some examples where noalias is useful:
+  * Here are some examples where noalias is useful:
   * \code
   * D.noalias()  = A * B;
   * D.noalias() += A.transpose() * B;
@@ -99,7 +98,7 @@ class NoAlias
   * \sa class NoAlias
   */
 template<typename Derived>
-NoAlias<Derived,MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias()
+NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
 {
   return NoAlias<Derived, Eigen::MatrixBase >(derived());
 }
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/NumTraits.h b/uppsrc/plugin/Eigen/Eigen/src/Core/NumTraits.h
index 9ab55534f..daf489878 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/NumTraits.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/NumTraits.h
@@ -21,14 +21,12 @@ template< typename T,
           bool is_integer = NumTraits<T>::IsInteger>
 struct default_digits10_impl
 {
-  EIGEN_DEVICE_FUNC
   static int run() { return std::numeric_limits<T>::digits10; }
 };
 
 template<typename T>
 struct default_digits10_impl<T,false,false> // Floating point
 {
-  EIGEN_DEVICE_FUNC
   static int run() {
     using std::log10;
     using std::ceil;
@@ -40,38 +38,6 @@ struct default_digits10_impl<T,false,false> // Floating point
 template<typename T>
 struct default_digits10_impl<T,false,true> // Integer
 {
-  EIGEN_DEVICE_FUNC
-  static int run() { return 0; }
-};
-
-
-// default implementation of digits(), based on numeric_limits if specialized,
-// 0 for integer types, and log2(epsilon()) otherwise.
-template< typename T,
-          bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
-          bool is_integer = NumTraits<T>::IsInteger>
-struct default_digits_impl
-{
-  EIGEN_DEVICE_FUNC
-  static int run() { return std::numeric_limits<T>::digits; }
-};
-
-template<typename T>
-struct default_digits_impl<T,false,false> // Floating point
-{
-  EIGEN_DEVICE_FUNC
-  static int run() {
-    using std::log;
-    using std::ceil;
-    typedef typename NumTraits<T>::Real Real;
-    return int(ceil(-log(NumTraits<Real>::epsilon())/log(static_cast<Real>(2))));
-  }
-};
-
-template<typename T>
-struct default_digits_impl<T,false,true> // Integer
-{
-  EIGEN_DEVICE_FUNC
   static int run() { return 0; }
 };
 
@@ -105,7 +71,7 @@ struct default_digits_impl<T,false,true> // Integer
   *     and to \c 0 otherwise.
   * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
   *     to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
-  *     Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost.
+  *     Stay vague here. No need to do architecture-specific stuff.
   * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
   * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
   *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
@@ -152,12 +118,6 @@ template<typename T> struct GenericNumTraits
     return internal::default_digits10_impl<T>::run();
   }
 
-  EIGEN_DEVICE_FUNC
-  static inline int digits()
-  {
-    return internal::default_digits_impl<T>::run();
-  }
-
   EIGEN_DEVICE_FUNC
   static inline Real dummy_precision()
   {
@@ -173,8 +133,7 @@ template<typename T> struct GenericNumTraits
 
   EIGEN_DEVICE_FUNC
   static inline T lowest()  {
-    return IsInteger ? (numext::numeric_limits<T>::min)()
-                     : static_cast<T>(-(numext::numeric_limits<T>::max)());
+    return IsInteger ? (numext::numeric_limits<T>::min)() : (-(numext::numeric_limits<T>::max)());
   }
 
   EIGEN_DEVICE_FUNC
@@ -284,8 +243,6 @@ private:
 // Empty specialization for void to allow template specialization based on NumTraits<T>::Real with T==void and SFINAE.
 template<> struct NumTraits<void> {};
 
-template<> struct NumTraits<bool> : GenericNumTraits<bool> {};
-
 } // end namespace Eigen
 
 #endif // EIGEN_NUMTRAITS_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/PartialReduxEvaluator.h b/uppsrc/plugin/Eigen/Eigen/src/Core/PartialReduxEvaluator.h
deleted file mode 100644
index 0be694259..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/PartialReduxEvaluator.h
+++ /dev/null
@@ -1,232 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PARTIALREDUX_H
-#define EIGEN_PARTIALREDUX_H
-
-namespace Eigen { 
-
-namespace internal {
-
-
-/***************************************************************************
-*
-* This file provides evaluators for partial reductions.
-* There are two modes:
-*
-*  - scalar path: simply calls the respective function on the column or row.
-*    -> nothing special here, all the tricky part is handled by the return
-*       types of VectorwiseOp's members. They embed the functor calling the
-*       respective DenseBase's member function.
-*
-*  - vectorized path: implements a packet-wise reductions followed by
-*    some (optional) processing of the outcome, e.g., division by n for mean.
-*
-* For the vectorized path let's observe that the packet-size and outer-unrolling
-* are both decided by the assignement logic. So all we have to do is to decide
-* on the inner unrolling.
-*
-* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h,
-* but be need to be careful to specify correct increment.
-*
-***************************************************************************/
-
-
-/* logic deciding a strategy for unrolling of vectorized paths */
-template<typename Func, typename Evaluator>
-struct packetwise_redux_traits
-{
-  enum {
-    OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime,
-    Cost = OuterSize == Dynamic ? HugeCost
-         : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits<Func>::Cost,
-    Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling
-  };
-
-};
-
-/* Value to be returned when size==0 , by default let's return 0 */
-template<typename PacketType,typename Func>
-EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); }
-
-/* For products the default is 1 */
-template<typename PacketType,typename Scalar>
-EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); }
-
-/* Perform the actual reduction */
-template<typename Func, typename Evaluator,
-         int Unrolling = packetwise_redux_traits<Func, Evaluator>::Unrolling
->
-struct packetwise_redux_impl;
-
-/* Perform the actual reduction with unrolling */
-template<typename Func, typename Evaluator>
-struct packetwise_redux_impl<Func, Evaluator, CompleteUnrolling>
-{
-  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
-  typedef typename Evaluator::Scalar Scalar;
-
-  template<typename PacketType>
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
-  PacketType run(const Evaluator &eval, const Func& func, Index /*size*/)
-  {
-    return redux_vec_unroller<Func, Evaluator, 0, packetwise_redux_traits<Func, Evaluator>::OuterSize>::template run<PacketType>(eval,func);
-  }
-};
-
-/* Add a specialization of redux_vec_unroller for size==0 at compiletime.
- * This specialization is not required for general reductions, which is
- * why it is defined here.
- */
-template<typename Func, typename Evaluator, int Start>
-struct redux_vec_unroller<Func, Evaluator, Start, 0>
-{
-  template<typename PacketType>
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f)
-  {
-    return packetwise_redux_empty_value<PacketType>(f);
-  }
-};
-
-/* Perform the actual reduction for dynamic sizes */
-template<typename Func, typename Evaluator>
-struct packetwise_redux_impl<Func, Evaluator, NoUnrolling>
-{
-  typedef typename Evaluator::Scalar Scalar;
-  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
-
-  template<typename PacketType>
-  EIGEN_DEVICE_FUNC
-  static PacketType run(const Evaluator &eval, const Func& func, Index size)
-  {
-    if(size==0)
-      return packetwise_redux_empty_value<PacketType>(func);
-    
-    const Index size4 = (size-1)&(~3);
-    PacketType p = eval.template packetByOuterInner<Unaligned,PacketType>(0,0);
-    Index i = 1;
-    // This loop is optimized for instruction pipelining:
-    // - each iteration generates two independent instructions
-    // - thanks to branch prediction and out-of-order execution we have independent instructions across loops
-    for(; i<size4; i+=4)
-      p = func.packetOp(p,
-            func.packetOp(
-              func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+0,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+1,0)),
-              func.packetOp(eval.template packetByOuterInner<Unaligned,PacketType>(i+2,0),eval.template packetByOuterInner<Unaligned,PacketType>(i+3,0))));
-    for(; i<size; ++i)
-      p = func.packetOp(p, eval.template packetByOuterInner<Unaligned,PacketType>(i,0));
-    return p;
-  }
-};
-
-template< typename ArgType, typename MemberOp, int Direction>
-struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
-  : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> >
-{
-  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
-  typedef typename internal::nested_eval<ArgType,1>::type ArgTypeNested;
-  typedef typename internal::add_const_on_value_type<ArgTypeNested>::type ConstArgTypeNested;
-  typedef typename internal::remove_all<ArgTypeNested>::type ArgTypeNestedCleaned;
-  typedef typename ArgType::Scalar InputScalar;
-  typedef typename XprType::Scalar Scalar;
-  enum {
-    TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) :  int(ArgType::ColsAtCompileTime)
-  };
-  typedef typename MemberOp::template Cost<int(TraversalSize)> CostOpType;
-  enum {
-    CoeffReadCost = TraversalSize==Dynamic ? HugeCost
-                  : TraversalSize==0 ? 1
-                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
-    
-    _ArgFlags = evaluator<ArgType>::Flags,
-
-    _Vectorizable =  bool(int(_ArgFlags)&PacketAccessBit)
-                  && bool(MemberOp::Vectorizable)
-                  && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0)
-                  && (TraversalSize!=0),
-                  
-    Flags = (traits<XprType>::Flags&RowMajorBit)
-          | (evaluator<ArgType>::Flags&(HereditaryBits&(~RowMajorBit)))
-          | (_Vectorizable ? PacketAccessBit : 0)
-          | LinearAccessBit,
-    
-    Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized
-  };
-
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr)
-    : m_arg(xpr.nestedExpression()), m_functor(xpr.functor())
-  {
-    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value)));
-    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
-  }
-
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  const Scalar coeff(Index i, Index j) const
-  {
-    return coeff(Direction==Vertical ? j : i);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  const Scalar coeff(Index index) const
-  {
-    return m_functor(m_arg.template subVector<DirectionType(Direction)>(index));
-  }
-
-  template<int LoadMode,typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  PacketType packet(Index i, Index j) const
-  {
-    return packet<LoadMode,PacketType>(Direction==Vertical ? j : i);
-  }
-  
-  template<int LoadMode,typename PacketType>
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
-  PacketType packet(Index idx) const
-  {
-    enum { PacketSize = internal::unpacket_traits<PacketType>::size };
-    typedef Block<const ArgTypeNestedCleaned,
-                  Direction==Vertical ? int(ArgType::RowsAtCompileTime) : int(PacketSize),
-                  Direction==Vertical ? int(PacketSize) : int(ArgType::ColsAtCompileTime),
-                  true /* InnerPanel */> PanelType;
-    
-    PanelType panel(m_arg,
-                    Direction==Vertical ? 0 : idx,
-                    Direction==Vertical ? idx : 0,
-                    Direction==Vertical ? m_arg.rows() : Index(PacketSize),
-                    Direction==Vertical ? Index(PacketSize) : m_arg.cols());
-
-    // FIXME
-    // See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of panel get reversed
-    // and methods like packetByOuterInner do not make sense anymore in this context.
-    // So let's just by pass "vectorization" in this case:
-    if(PacketSize==1)
-      return internal::pset1<PacketType>(coeff(idx));
-    
-    typedef typename internal::redux_evaluator<PanelType> PanelEvaluator;
-    PanelEvaluator panel_eval(panel);
-    typedef typename MemberOp::BinaryOp BinaryOp;
-    PacketType p = internal::packetwise_redux_impl<BinaryOp,PanelEvaluator>::template run<PacketType>(panel_eval,m_functor.binaryFunc(),m_arg.outerSize());
-    return p;
-  }
-
-protected:
-  ConstArgTypeNested m_arg;
-  const MemberOp m_functor;
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_PARTIALREDUX_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/PermutationMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/PermutationMatrix.h
index 69401bf41..b1fb455b9 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/PermutationMatrix.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/PermutationMatrix.h
@@ -87,14 +87,25 @@ class PermutationBase : public EigenBase<Derived>
       return derived();
     }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    Derived& operator=(const PermutationBase& other)
+    {
+      indices() = other.indices();
+      return derived();
+    }
+    #endif
+
     /** \returns the number of rows */
-    inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
+    inline Index rows() const { return Index(indices().size()); }
 
     /** \returns the number of columns */
-    inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
+    inline Index cols() const { return Index(indices().size()); }
 
     /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
-    inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
+    inline Index size() const { return Index(indices().size()); }
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename DenseDerived>
@@ -322,6 +333,12 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
     inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
       : m_indices(other.indices()) {}
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** Copy constructor: initializes the stored indices from other.indices(). Defined only to
+      * prevent a default copy constructor from hiding the templated constructor above. */
+    inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
+    #endif
+
     /** Generic constructor from expression of the indices. The indices
       * array has the meaning that the permutations sends each integer i to indices[i].
       *
@@ -356,6 +373,17 @@ class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompile
       return Base::operator=(tr.derived());
     }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** Copy assignment: copies m_indices from \a other and returns *this.
+      * Declared explicitly so that a default operator= cannot hide the templated operator=.
+      */
+    PermutationMatrix& operator=(const PermutationMatrix& other)
+    {
+      m_indices = other.m_indices;
+      return *this;
+    }
+    #endif
+
     /** const version of indices(). */
     const IndicesType& indices() const { return m_indices; }
     /** \returns a reference to the stored array representing the permutation. */
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/PlainObjectBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/PlainObjectBase.h
index f6497e9f7..0f3632cfd 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/PlainObjectBase.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/PlainObjectBase.h
@@ -104,7 +104,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 
     typedef typename internal::traits<Derived>::StorageKind StorageKind;
     typedef typename internal::traits<Derived>::Scalar Scalar;
-
+    
     typedef typename internal::packet_traits<Scalar>::type PacketScalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef Derived DenseType;
@@ -358,7 +358,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
     {
       const OtherDerived& other = _other.derived();
@@ -383,7 +383,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
       * conservativeResize(Index, NoChange_t).
       *
-      * Matrices are resized relative to the top-left element. In case values need to be
+      * Matrices are resized relative to the top-left element. In case values need to be 
       * appended to the matrix they will be uninitialized.
       */
     EIGEN_DEVICE_FUNC
@@ -440,7 +440,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
       * conservativeResize(Index, NoChange_t).
       *
-      * Matrices are resized relative to the top-left element. In case values need to be
+      * Matrices are resized relative to the top-left element. In case values need to be 
       * appended to the matrix they will copied from \c other.
       */
     template<typename OtherDerived>
@@ -526,71 +526,6 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
 //       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
     }
 
-    #if EIGEN_HAS_CXX11
-    /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11
-      *
-      * \only_for_vectors
-      * 
-      * This constructor is for 1D array or vectors with more than 4 coefficients.
-      * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients.
-      * 
-      * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this 
-      * constructor must match the the fixed number of rows (resp. columns) of \c *this.
-      */
-    template <typename... ArgTypes>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
-      : m_storage()
-    {
-      _check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4);
-      m_storage.data()[0] = a0;
-      m_storage.data()[1] = a1;
-      m_storage.data()[2] = a2;
-      m_storage.data()[3] = a3;
-      int i = 4;
-      auto x = {(m_storage.data()[i++] = args, 0)...};
-      static_cast<void>(x);
-    }
-
-    /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer
-      * lists \cpp11
-      */
-    EIGEN_DEVICE_FUNC
-    explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list<std::initializer_list<Scalar>>& list)
-      : m_storage()
-    {
-      _check_template_params();
-
-      size_t list_size = 0;
-      if (list.begin() != list.end()) {
-        list_size = list.begin()->size();
-      }
-
-      // This is to allow syntax like VectorXi {{1, 2, 3, 4}}
-      if (ColsAtCompileTime == 1 && list.size() == 1) {
-        eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
-        resize(list_size, ColsAtCompileTime);
-        std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data());
-      } else {
-        eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
-        eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
-        resize(list.size(), list_size);
-       
-        Index row_index = 0;
-        for (const std::initializer_list<Scalar>& row : list) {
-          eigen_assert(list_size == row.size());
-          Index col_index = 0;
-          for (const Scalar& e : row) {
-            coeffRef(row_index, col_index) = e;
-            ++col_index;
-          }
-          ++row_index;
-        }
-      }
-    }
-    #endif  // end EIGEN_HAS_CXX11
-
     /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
     template<typename OtherDerived>
     EIGEN_DEVICE_FUNC
@@ -629,7 +564,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
     {
       _resize_to_match(other);
@@ -743,7 +678,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * remain row-vectors and vectors remain vectors.
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
     {
       #ifdef EIGEN_NO_AUTOMATIC_RESIZING
@@ -770,10 +705,10 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       *
       * \internal
       */
-    // aliasing is dealt once in internal::call_assignment
+    // aliasing is dealt once in internal::call_assignment
     // so at this stage we have to assume aliasing... and resising has to be done later.
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
     {
       internal::call_assignment(this->derived(), other.derived());
@@ -786,7 +721,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       * \sa operator=(const MatrixBase<OtherDerived>&), _set()
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
     {
       // I don't think we need this resize call since the lazyAssign will anyways resize
@@ -809,18 +744,18 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                           FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
       resize(rows,cols);
     }
-
+    
     template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
     {
       EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
       m_storage.data()[0] = Scalar(val0);
       m_storage.data()[1] = Scalar(val1);
     }
-
+    
     template<typename T0, typename T1>
-    EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC 
     EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1,
                                     typename internal::enable_if<    (!internal::is_same<Index,Scalar>::value)
                                                                   && (internal::is_same<T0,Index>::value)
@@ -846,8 +781,8 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
                           FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
       resize(size);
     }
-
-    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
+    
+    // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted)
     template<typename T>
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if<Base::SizeAtCompileTime==1 && internal::is_convertible<T, Scalar>::value,T>::type* = 0)
@@ -855,7 +790,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
       EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
       m_storage.data()[0] = val0;
     }
-
+    
     // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type)
     template<typename T>
     EIGEN_DEVICE_FUNC
@@ -911,7 +846,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     {
       this->derived() = r;
     }
-
+    
     // For fixed-size Array<Scalar,...>
     template<typename T>
     EIGEN_DEVICE_FUNC
@@ -923,7 +858,7 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     {
       Base::setConstant(val0);
     }
-
+    
     // For fixed-size Array<Index,...>
     template<typename T>
     EIGEN_DEVICE_FUNC
@@ -937,34 +872,34 @@ class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
     {
       Base::setConstant(val0);
     }
-
+    
     template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
     friend struct internal::matrix_swap_impl;
 
   public:
-
+    
 #ifndef EIGEN_PARSED_BY_DOXYGEN
     /** \internal
       * \brief Override DenseBase::swap() since for dynamic-sized matrices
       * of same type it is enough to swap the data pointers.
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     void swap(DenseBase<OtherDerived> & other)
     {
       enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
       internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.derived());
     }
-
+    
     /** \internal
       * \brief const version forwarded to DenseBase::swap
       */
     template<typename OtherDerived>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     void swap(DenseBase<OtherDerived> const & other)
     { Base::swap(other.derived()); }
-
-    EIGEN_DEVICE_FUNC
+    
+    EIGEN_DEVICE_FUNC 
     static EIGEN_STRONG_INLINE void _check_template_params()
     {
       EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
@@ -988,19 +923,13 @@ namespace internal {
 template <typename Derived, typename OtherDerived, bool IsVector>
 struct conservative_resize_like_impl
 {
-  #if EIGEN_HAS_TYPE_TRAITS
-  static const bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
-  #else
-  static const bool IsRelocatable = !NumTraits<typename Derived::Scalar>::RequireInitialization;
-  #endif
   static void run(DenseBase<Derived>& _this, Index rows, Index cols)
   {
     if (_this.rows() == rows && _this.cols() == cols) return;
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
 
-    if ( IsRelocatable
-          && (( Derived::IsRowMajor && _this.cols() == cols) ||  // row-major and we change only the number of rows
-              (!Derived::IsRowMajor && _this.rows() == rows) ))  // column-major and we change only the number of columns
+    if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
+         (!Derived::IsRowMajor && _this.rows() == rows) )  // column-major and we change only the number of columns
     {
       internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
       _this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
@@ -1028,9 +957,8 @@ struct conservative_resize_like_impl
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
 
-    if ( IsRelocatable &&
-          (( Derived::IsRowMajor && _this.cols() == other.cols()) ||  // row-major and we change only the number of rows
-           (!Derived::IsRowMajor && _this.rows() == other.rows()) ))  // column-major and we change only the number of columns
+    if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
+         (!Derived::IsRowMajor && _this.rows() == other.rows()) )  // column-major and we change only the number of columns
     {
       const Index new_rows = other.rows() - _this.rows();
       const Index new_cols = other.cols() - _this.cols();
@@ -1058,18 +986,13 @@ template <typename Derived, typename OtherDerived>
 struct conservative_resize_like_impl<Derived,OtherDerived,true>
   : conservative_resize_like_impl<Derived,OtherDerived,false>
 {
-  typedef conservative_resize_like_impl<Derived,OtherDerived,false> Base;
-  using Base::run;
-  using Base::IsRelocatable;
-
+  using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
+  
   static void run(DenseBase<Derived>& _this, Index size)
   {
     const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
     const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
-    if(IsRelocatable)
-      _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
-    else
-      Base::run(_this.derived(), new_rows, new_cols);
+    _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
   }
 
   static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
@@ -1080,10 +1003,7 @@ struct conservative_resize_like_impl<Derived,OtherDerived,true>
 
     const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
     const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
-    if(IsRelocatable)
-      _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
-    else
-      Base::run(_this.derived(), new_rows, new_cols);
+    _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
 
     if (num_new_elements > 0)
       _this.tail(num_new_elements) = other.tail(num_new_elements);
@@ -1094,7 +1014,7 @@ template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
 struct matrix_swap_impl
 {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b)
+  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
   {
     a.base().swap(b);
   }
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Product.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Product.h
index 13d5662df..676c48027 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Product.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Product.h
@@ -90,23 +90,18 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option,
     typedef typename internal::remove_all<LhsNested>::type LhsNestedCleaned;
     typedef typename internal::remove_all<RhsNested>::type RhsNestedCleaned;
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
+    EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs)
     {
       eigen_assert(lhs.cols() == rhs.rows()
         && "invalid matrix product"
         && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index rows() const { return m_lhs.rows(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index cols() const { return m_rhs.cols(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const LhsNestedCleaned& lhs() const { return m_lhs; }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const RhsNestedCleaned& rhs() const { return m_rhs; }
+    EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; }
+    EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; }
 
   protected:
 
@@ -121,7 +116,7 @@ class dense_product_base
  : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
 {};
 
-/** Conversion to scalar for inner-products */
+/** Conversion to scalar for inner-products */
 template<typename Lhs, typename Rhs, int Option>
 class dense_product_base<Lhs, Rhs, Option, InnerProduct>
  : public internal::dense_xpr_base<Product<Lhs,Rhs,Option> >::type
@@ -132,7 +127,7 @@ public:
   using Base::derived;
   typedef typename Base::Scalar Scalar;
   
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const
+  EIGEN_STRONG_INLINE operator const Scalar() const
   {
     return internal::evaluator<ProductXpr>(derived()).coeff(0,0);
   }
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/ProductEvaluators.h b/uppsrc/plugin/Eigen/Eigen/src/Core/ProductEvaluators.h
index 792b1811c..bce1310c9 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/ProductEvaluators.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/ProductEvaluators.h
@@ -20,7 +20,7 @@ namespace internal {
 /** \internal
   * Evaluator of a product expression.
   * Since products require special treatments to handle all possible cases,
-  * we simply defer the evaluation logic to a product_evaluator class
+  * we simply defer the evaluation logic to a product_evaluator class
   * which offers more partial specialization possibilities.
   * 
   * \sa class product_evaluator
@@ -128,7 +128,7 @@ protected:
   PlainObject m_result;
 };
 
-// The following three shortcuts are enabled only if the scalar types match exactly.
+// The following three shortcuts are enabled only if the scalar types match exactly.
 // TODO: we could enable them for different scalar types when the product is not vectorized.
 
 // Dense = Product
@@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::assign_op<Scal
   typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
   typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE
   void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
   {
     Index dstRows = src.rows();
@@ -155,7 +155,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::add_assign_op<
   typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
   typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE
   void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<Scalar,Scalar> &)
   {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -170,7 +170,7 @@ struct Assignment<DstXprType, Product<Lhs,Rhs,Options>, internal::sub_assign_op<
   typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
 {
   typedef Product<Lhs,Rhs,Options> SrcXprType;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE
   void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<Scalar,Scalar> &)
   {
     eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
@@ -190,7 +190,7 @@ struct Assignment<DstXprType, CwiseBinaryOp<internal::scalar_product_op<ScalarBi
   typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis,Scalar>,
                         const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>,Plain>,
                         const Product<Lhs,Rhs,DefaultProduct> > SrcXprType;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE
   void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func)
   {
     call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func);
@@ -217,7 +217,7 @@ template<typename DstXprType, typename OtherXpr, typename ProductType, typename
 struct assignment_from_xpr_op_product
 {
   template<typename SrcXprType, typename InitialFunc>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  static EIGEN_STRONG_INLINE
   void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
   {
     call_assignment_no_alias(dst, src.lhs(), Func1());
@@ -246,19 +246,19 @@ template<typename Lhs, typename Rhs>
 struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
 {
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
   }
   
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum();
   }
   
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); }
 };
 
@@ -269,10 +269,10 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,InnerProduct>
 
 // Column major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
+void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&)
 {
   evaluator<Rhs> rhsEval(rhs);
-  ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs);
+  typename nested_eval<Lhs,Rhs::SizeAtCompileTime>::type actual_lhs(lhs);
   // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
   // FIXME not very good if rhs is real and lhs complex while alpha is real too
   const Index cols = dst.cols();
@@ -282,10 +282,10 @@ void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, cons
 
 // Row major result
 template<typename Dst, typename Lhs, typename Rhs, typename Func>
-void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
+void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&)
 {
   evaluator<Lhs> lhsEval(lhs);
-  ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs);
+  typename nested_eval<Rhs,Lhs::SizeAtCompileTime>::type actual_rhs(rhs);
   // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
   // FIXME not very good if lhs is real and rhs complex while alpha is real too
   const Index rows = dst.rows();
@@ -300,37 +300,37 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,OuterProduct>
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
   
   // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
-  struct set  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
-  struct add  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
-  struct sub  { template<typename Dst, typename Src> EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
+  struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
+  struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
+  struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
   struct adds {
     Scalar m_scale;
     explicit adds(const Scalar& s) : m_scale(s) {}
-    template<typename Dst, typename Src> void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
+    template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
       dst.const_cast_derived() += m_scale * src;
     }
   };
   
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
   }
   
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
   }
   
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
   }
   
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
     internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
   }
@@ -345,19 +345,19 @@ struct generic_product_impl_base
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
   
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); }
 
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); }
 
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); }
   
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); }
 
 };
@@ -373,7 +373,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemvProduct>
   typedef typename internal::remove_all<typename internal::conditional<int(Side)==OnTheRight,LhsNested,RhsNested>::type>::type MatrixType;
 
   template<typename Dest>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
     LhsNested actual_lhs(lhs);
     RhsNested actual_rhs(rhs);
@@ -390,7 +390,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
   
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     // Same as: dst.noalias() = lhs.lazyProduct(rhs);
     // but easier on the compiler side
@@ -398,71 +398,48 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,CoeffBasedProductMode>
   }
 
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     // dst.noalias() += lhs.lazyProduct(rhs);
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
   }
   
   template<typename Dst>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
+  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
     // dst.noalias() -= lhs.lazyProduct(rhs);
     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
   }
 
-  // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
-  // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance:
-  //   dst {,+,-}= (s1*A)*(B*s2)
-  // will be rewritten as:
-  //   dst {,+,-}= (s1*s2) * (A.lazyProduct(B))
-  // There are at least four benefits of doing so:
-  //  1 - huge performance gain for heap-allocated matrix types as it save costly allocations.
-  //  2 - it is faster than simply by-passing the heap allocation through stack allocation.
-  //  3 - it makes this fallback consistent with the heavy GEMM routine.
-  //  4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices.
-  //      (see https://stackoverflow.com/questions/54738495)
-  // For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower,
-  // and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently
-  // enabled only when falling back from the main GEMM.
-  template<typename Dst, typename Func>
+  // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor:
+  //    dst {,+,-}= s * (A.lazyProduct(B))
+  // This is a huge benefit for heap-allocated matrix types as it saves one costly allocation.
+  // For them, this strategy is also faster than simply by-passing the heap allocation through
+  // stack allocation.
+  // For fixed-size matrices, this is less obvious: it is sometimes x2 faster, but sometimes x3 slower,
+  // and the behavior also depends a lot on the compiler... so let's be conservative and enable it for dynamic-size only,
+  // that is, when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  template<typename Dst, typename Scalar1, typename Scalar2, typename Plain1, typename Xpr2, typename Func>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func)
+  void eval_dynamic(Dst& dst, const CwiseBinaryOp<internal::scalar_product_op<Scalar1,Scalar2>,
+                                           const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func)
   {
-    enum {
-      HasScalarFactor = blas_traits<Lhs>::HasScalarFactor || blas_traits<Rhs>::HasScalarFactor,
-      ConjLhs = blas_traits<Lhs>::NeedToConjugate,
-      ConjRhs = blas_traits<Rhs>::NeedToConjugate
-    };
-    // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
-    //        this is important for real*complex_mat
-    Scalar actualAlpha =    blas_traits<Lhs>::extractScalarFactor(lhs)
-                          * blas_traits<Rhs>::extractScalarFactor(rhs);
-    eval_dynamic_impl(dst,
-                      blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
-                      blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(),
-                      func,
-                      actualAlpha,
-                      typename conditional<HasScalarFactor,true_type,false_type>::type());
+    call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func);
   }
 
-protected:
-
-  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  // Here, we always have LhsT==Lhs, but we need to make it a template type to make the above
+  // overload more specialized.
+  template<typename Dst, typename LhsT, typename Func>
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar&  s /* == 1 */, false_type)
+  void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func)
   {
-    EIGEN_UNUSED_VARIABLE(s);
-    eigen_internal_assert(s==Scalar(1));
-    call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
-  }
-
-  template<typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type)
-  {
-    call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
   }
+  
+  
+//   template<typename Dst>
+//   static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+//   { dst.noalias() += alpha * lhs.lazyProduct(rhs); }
 };
 
 // This specialization enforces the use of a coefficient-based evaluation strategy
@@ -605,8 +582,7 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
    * which is why we don't set the LinearAccessBit.
    * TODO: this seems possible when the result is a vector
    */
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  const CoeffReturnType coeff(Index index) const
+  EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const
   {
     const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
     const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0;
@@ -614,7 +590,6 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   }
 
   template<int LoadMode, typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const PacketType packet(Index row, Index col) const
   {
     PacketType res;
@@ -626,7 +601,6 @@ struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape,
   }
 
   template<int LoadMode, typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const PacketType packet(Index index) const
   {
     const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index;
@@ -655,8 +629,7 @@ struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProduc
   enum {
     Flags = Base::Flags | EvalBeforeNestingBit
   };
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit product_evaluator(const XprType& xpr)
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(BaseProduct(xpr.lhs(),xpr.rhs()))
   {}
 };
@@ -794,8 +767,7 @@ struct generic_product_impl<Lhs,Rhs,SelfAdjointShape,DenseShape,ProductTag>
   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
   
   template<typename Dest>
-  static EIGEN_DEVICE_FUNC
-  void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha)
   {
     selfadjoint_product_impl<typename Lhs::MatrixType,Lhs::Mode,false,Rhs,0,Rhs::IsVectorAtCompileTime>::run(dst, lhs.nestedExpression(), rhs, alpha);
   }
@@ -830,21 +802,13 @@ public:
     
     MatrixFlags = evaluator<MatrixType>::Flags,
     DiagFlags = evaluator<DiagonalType>::Flags,
-    
-    _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor
-                  : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor
-                  : MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
-    _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor),
-
+    _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor,
     _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
                            ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
     _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
     // FIXME currently we need same types, but in the future the next rule should be the one
     //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))),
-    _Vectorizable =   bool(int(MatrixFlags)&PacketAccessBit)
-                  &&  _SameTypes
-                  && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit)
-                  && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
+    _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))),
     _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0,
     Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0),
     Alignment = evaluator<MatrixType>::Alignment,
@@ -854,7 +818,7 @@ public:
                       ||  (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight)
   };
   
-  EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
+  diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag)
     : m_diagImpl(diag), m_matImpl(mat)
   {
     EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
@@ -905,10 +869,10 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
   
   typedef Product<Lhs, Rhs, ProductKind> XprType;
   typedef typename XprType::PlainObject PlainObject;
-  typedef typename Lhs::DiagonalVectorType DiagonalType;
-
   
-  enum { StorageOrder = Base::_StorageOrder };
+  enum {
+    StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor
+  };
 
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(xpr.rhs(), xpr.lhs().diagonal())
@@ -920,7 +884,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalSha
     return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
   }
   
-#ifndef EIGEN_GPUCC
+#ifndef __CUDACC__
   template<int LoadMode,typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
   {
@@ -952,7 +916,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
   typedef Product<Lhs, Rhs, ProductKind> XprType;
   typedef typename XprType::PlainObject PlainObject;
   
-  enum { StorageOrder = Base::_StorageOrder };
+  enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor };
 
   EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
     : Base(xpr.lhs(), xpr.rhs().diagonal())
@@ -964,7 +928,7 @@ struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape,
     return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
   }
   
-#ifndef EIGEN_GPUCC
+#ifndef __CUDACC__
   template<int LoadMode,typename PacketType>
   EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const
   {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Random.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Random.h
index 486e9ed52..6faf789c7 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Random.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Random.h
@@ -128,7 +128,7 @@ DenseBase<Derived>::Random()
   * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom()
+inline Derived& DenseBase<Derived>::setRandom()
 {
   return *this = Random(rows(), cols());
 }
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Redux.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Redux.h
index 2eef5abc5..760e9f861 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Redux.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Redux.h
@@ -23,29 +23,23 @@ namespace internal {
 * Part 1 : the logic deciding a strategy for vectorization and unrolling
 ***************************************************************************/
 
-template<typename Func, typename Evaluator>
+template<typename Func, typename Derived>
 struct redux_traits
 {
 public:
-    typedef typename find_best_packet<typename Evaluator::Scalar,Evaluator::SizeAtCompileTime>::type PacketType;
+    typedef typename find_best_packet<typename Derived::Scalar,Derived::SizeAtCompileTime>::type PacketType;
   enum {
     PacketSize = unpacket_traits<PacketType>::size,
-    InnerMaxSize = int(Evaluator::IsRowMajor)
-                 ? Evaluator::MaxColsAtCompileTime
-                 : Evaluator::MaxRowsAtCompileTime,
-    OuterMaxSize = int(Evaluator::IsRowMajor)
-                 ? Evaluator::MaxRowsAtCompileTime
-                 : Evaluator::MaxColsAtCompileTime,
-    SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic
-                        : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0)
-                        : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize)
+    InnerMaxSize = int(Derived::IsRowMajor)
+                 ? Derived::MaxColsAtCompileTime
+                 : Derived::MaxRowsAtCompileTime
   };
 
   enum {
-    MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit)
+    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
                   && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit),
-    MaySliceVectorize  = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3)
+    MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit),
+    MaySliceVectorize  = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize
   };
 
 public:
@@ -57,8 +51,8 @@ public:
 
 public:
   enum {
-    Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
-         : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost
+         : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
   };
 
@@ -70,20 +64,18 @@ public:
 #ifdef EIGEN_DEBUG_ASSIGN
   static void debug()
   {
-    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
+    std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl;
     std::cerr.setf(std::ios::hex, std::ios::basefield);
-    EIGEN_DEBUG_VAR(Evaluator::Flags)
+    EIGEN_DEBUG_VAR(Derived::Flags)
     std::cerr.unsetf(std::ios::hex);
     EIGEN_DEBUG_VAR(InnerMaxSize)
-    EIGEN_DEBUG_VAR(OuterMaxSize)
-    EIGEN_DEBUG_VAR(SliceVectorizedWork)
     EIGEN_DEBUG_VAR(PacketSize)
     EIGEN_DEBUG_VAR(MightVectorize)
     EIGEN_DEBUG_VAR(MayLinearVectorize)
     EIGEN_DEBUG_VAR(MaySliceVectorize)
-    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(Traversal)
     EIGEN_DEBUG_VAR(UnrollingLimit)
-    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(Unrolling)
     std::cerr << std::endl;
   }
 #endif
@@ -95,86 +87,88 @@ public:
 
 /*** no vectorization ***/
 
-template<typename Func, typename Evaluator, int Start, int Length>
+template<typename Func, typename Derived, int Start, int Length>
 struct redux_novec_unroller
 {
   enum {
     HalfLength = Length/2
   };
 
-  typedef typename Evaluator::Scalar Scalar;
+  typedef typename Derived::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func)
+  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
-    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval,func),
-                redux_novec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::run(eval,func));
+    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
+                redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
   }
 };
 
-template<typename Func, typename Evaluator, int Start>
-struct redux_novec_unroller<Func, Evaluator, Start, 1>
+template<typename Func, typename Derived, int Start>
+struct redux_novec_unroller<Func, Derived, Start, 1>
 {
   enum {
-    outer = Start / Evaluator::InnerSizeAtCompileTime,
-    inner = Start % Evaluator::InnerSizeAtCompileTime
+    outer = Start / Derived::InnerSizeAtCompileTime,
+    inner = Start % Derived::InnerSizeAtCompileTime
   };
 
-  typedef typename Evaluator::Scalar Scalar;
+  typedef typename Derived::Scalar Scalar;
 
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&)
+  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
   {
-    return eval.coeffByOuterInner(outer, inner);
+    return mat.coeffByOuterInner(outer, inner);
   }
 };
 
 // This is actually dead code and will never be called. It is required
 // to prevent false warnings regarding failed inlining though
 // for 0 length run() will never be called at all.
-template<typename Func, typename Evaluator, int Start>
-struct redux_novec_unroller<Func, Evaluator, Start, 0>
+template<typename Func, typename Derived, int Start>
+struct redux_novec_unroller<Func, Derived, Start, 0>
 {
-  typedef typename Evaluator::Scalar Scalar;
+  typedef typename Derived::Scalar Scalar;
   EIGEN_DEVICE_FUNC 
-  static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
+  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
 };
 
 /*** vectorization ***/
 
-template<typename Func, typename Evaluator, int Start, int Length>
+template<typename Func, typename Derived, int Start, int Length>
 struct redux_vec_unroller
 {
-  template<typename PacketType>
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func& func)
-  {
-    enum {
-      PacketSize = unpacket_traits<PacketType>::size,
-      HalfLength = Length/2
-    };
+  enum {
+    PacketSize = redux_traits<Func, Derived>::PacketSize,
+    HalfLength = Length/2
+  };
 
+  typedef typename Derived::Scalar Scalar;
+  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+
+  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
+  {
     return func.packetOp(
-            redux_vec_unroller<Func, Evaluator, Start, HalfLength>::template run<PacketType>(eval,func),
-            redux_vec_unroller<Func, Evaluator, Start+HalfLength, Length-HalfLength>::template run<PacketType>(eval,func) );
+            redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
+            redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
   }
 };
 
-template<typename Func, typename Evaluator, int Start>
-struct redux_vec_unroller<Func, Evaluator, Start, 1>
+template<typename Func, typename Derived, int Start>
+struct redux_vec_unroller<Func, Derived, Start, 1>
 {
-  template<typename PacketType>
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func&)
+  enum {
+    index = Start * redux_traits<Func, Derived>::PacketSize,
+    outer = index / int(Derived::InnerSizeAtCompileTime),
+    inner = index % int(Derived::InnerSizeAtCompileTime),
+    alignment = Derived::Alignment
+  };
+
+  typedef typename Derived::Scalar Scalar;
+  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
+
+  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
   {
-    enum {
-      PacketSize = unpacket_traits<PacketType>::size,
-      index = Start * PacketSize,
-      outer = index / int(Evaluator::InnerSizeAtCompileTime),
-      inner = index % int(Evaluator::InnerSizeAtCompileTime),
-      alignment = Evaluator::Alignment
-    };
-    return eval.template packetByOuterInner<alignment,PacketType>(outer, inner);
+    return mat.template packetByOuterInner<alignment,PacketScalar>(outer, inner);
   }
 };
 
@@ -182,65 +176,53 @@ struct redux_vec_unroller<Func, Evaluator, Start, 1>
 * Part 3 : implementation of all cases
 ***************************************************************************/
 
-template<typename Func, typename Evaluator,
-         int Traversal = redux_traits<Func, Evaluator>::Traversal,
-         int Unrolling = redux_traits<Func, Evaluator>::Unrolling
+template<typename Func, typename Derived,
+         int Traversal = redux_traits<Func, Derived>::Traversal,
+         int Unrolling = redux_traits<Func, Derived>::Unrolling
 >
 struct redux_impl;
 
-template<typename Func, typename Evaluator>
-struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>
+template<typename Func, typename Derived>
+struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
 {
-  typedef typename Evaluator::Scalar Scalar;
-
-  template<typename XprType>
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
-  Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
-    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
     Scalar res;
-    res = eval.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < xpr.innerSize(); ++i)
-      res = func(res, eval.coeffByOuterInner(0, i));
-    for(Index i = 1; i < xpr.outerSize(); ++i)
-      for(Index j = 0; j < xpr.innerSize(); ++j)
-        res = func(res, eval.coeffByOuterInner(i, j));
+    res = mat.coeffByOuterInner(0, 0);
+    for(Index i = 1; i < mat.innerSize(); ++i)
+      res = func(res, mat.coeffByOuterInner(0, i));
+    for(Index i = 1; i < mat.outerSize(); ++i)
+      for(Index j = 0; j < mat.innerSize(); ++j)
+        res = func(res, mat.coeffByOuterInner(i, j));
     return res;
   }
 };
 
-template<typename Func, typename Evaluator>
-struct redux_impl<Func,Evaluator, DefaultTraversal, CompleteUnrolling>
-  : redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime>
-{
-  typedef redux_novec_unroller<Func,Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
-  typedef typename Evaluator::Scalar Scalar;
-  template<typename XprType>
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
-  Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/)
-  {
-    return Base::run(eval,func);
-  }
-};
+template<typename Func, typename Derived>
+struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
+  : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
+{};
 
-template<typename Func, typename Evaluator>
-struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
+template<typename Func, typename Derived>
+struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
 {
-  typedef typename Evaluator::Scalar Scalar;
-  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
 
-  template<typename XprType>
-  static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
+  static Scalar run(const Derived &mat, const Func& func)
   {
-    const Index size = xpr.size();
+    const Index size = mat.size();
     
-    const Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
+    const Index packetSize = redux_traits<Func, Derived>::PacketSize;
     const int packetAlignment = unpacket_traits<PacketScalar>::alignment;
     enum {
-      alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
-      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment)
+      alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned),
+      alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment)
     };
-    const Index alignedStart = internal::first_default_aligned(xpr);
+    const Index alignedStart = internal::first_default_aligned(mat.nestedExpression());
     const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
     const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
@@ -248,34 +230,34 @@ struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
     Scalar res;
     if(alignedSize)
     {
-      PacketScalar packet_res0 = eval.template packet<alignment,PacketScalar>(alignedStart);
+      PacketScalar packet_res0 = mat.template packet<alignment,PacketScalar>(alignedStart);
       if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
       {
-        PacketScalar packet_res1 = eval.template packet<alignment,PacketScalar>(alignedStart+packetSize);
+        PacketScalar packet_res1 = mat.template packet<alignment,PacketScalar>(alignedStart+packetSize);
         for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
         {
-          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(index));
-          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment,PacketScalar>(index+packetSize));
+          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment,PacketScalar>(index+packetSize));
         }
 
         packet_res0 = func.packetOp(packet_res0,packet_res1);
         if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment,PacketScalar>(alignedEnd2));
+          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment,PacketScalar>(alignedEnd2));
       }
       res = func.predux(packet_res0);
 
       for(Index index = 0; index < alignedStart; ++index)
-        res = func(res,eval.coeff(index));
+        res = func(res,mat.coeff(index));
 
       for(Index index = alignedEnd; index < size; ++index)
-        res = func(res,eval.coeff(index));
+        res = func(res,mat.coeff(index));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = eval.coeff(0);
+      res = mat.coeff(0);
       for(Index index = 1; index < size; ++index)
-        res = func(res,eval.coeff(index));
+        res = func(res,mat.coeff(index));
     }
 
     return res;
@@ -283,108 +265,130 @@ struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling>
 };
 
 // NOTE: for SliceVectorizedTraversal we simply bypass unrolling
-template<typename Func, typename Evaluator, int Unrolling>
-struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling>
+template<typename Func, typename Derived, int Unrolling>
+struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
 {
-  typedef typename Evaluator::Scalar Scalar;
-  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename redux_traits<Func, Derived>::PacketType PacketType;
 
-  template<typename XprType>
-  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr)
+  EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func)
   {
-    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = xpr.innerSize();
-    const Index outerSize = xpr.outerSize();
+    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
+    const Index innerSize = mat.innerSize();
+    const Index outerSize = mat.outerSize();
     enum {
-      packetSize = redux_traits<Func, Evaluator>::PacketSize
+      packetSize = redux_traits<Func, Derived>::PacketSize
     };
     const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
     Scalar res;
     if(packetedInnerSize)
     {
-      PacketType packet_res = eval.template packet<Unaligned,PacketType>(0,0);
+      PacketType packet_res = mat.template packet<Unaligned,PacketType>(0,0);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned,PacketType>(j,i));
+          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned,PacketType>(j,i));
 
       res = func.predux(packet_res);
       for(Index j=0; j<outerSize; ++j)
         for(Index i=packetedInnerSize; i<innerSize; ++i)
-          res = func(res, eval.coeffByOuterInner(j,i));
+          res = func(res, mat.coeffByOuterInner(j,i));
     }
     else // too small to vectorize anything.
          // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
+      res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
     }
 
     return res;
   }
 };
 
-template<typename Func, typename Evaluator>
-struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling>
+template<typename Func, typename Derived>
+struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
 {
-  typedef typename Evaluator::Scalar Scalar;
+  typedef typename Derived::Scalar Scalar;
 
-  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
+  typedef typename redux_traits<Func, Derived>::PacketType PacketScalar;
   enum {
-    PacketSize = redux_traits<Func, Evaluator>::PacketSize,
-    Size = Evaluator::SizeAtCompileTime,
+    PacketSize = redux_traits<Func, Derived>::PacketSize,
+    Size = Derived::SizeAtCompileTime,
     VectorizedSize = (Size / PacketSize) * PacketSize
   };
-
-  template<typename XprType>
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE
-  Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr)
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
   {
-    EIGEN_ONLY_USED_FOR_DEBUG(xpr)
-    eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix");
+    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
     if (VectorizedSize > 0) {
-      Scalar res = func.predux(redux_vec_unroller<Func, Evaluator, 0, Size / PacketSize>::template run<PacketType>(eval,func));
+      Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
       if (VectorizedSize != Size)
-        res = func(res,redux_novec_unroller<Func, Evaluator, VectorizedSize, Size-VectorizedSize>::run(eval,func));
+        res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
       return res;
     }
     else {
-      return redux_novec_unroller<Func, Evaluator, 0, Size>::run(eval,func);
+      return redux_novec_unroller<Func, Derived, 0, Size>::run(mat,func);
     }
   }
 };
 
 // evaluator adaptor
 template<typename _XprType>
-class redux_evaluator : public internal::evaluator<_XprType>
+class redux_evaluator
 {
-  typedef internal::evaluator<_XprType> Base;
 public:
   typedef _XprType XprType;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  explicit redux_evaluator(const XprType &xpr) : Base(xpr) {}
+  EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {}
   
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename XprType::PacketScalar PacketScalar;
+  typedef typename XprType::PacketReturnType PacketReturnType;
   
   enum {
     MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
     // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator
-    Flags = Base::Flags & ~DirectAccessBit,
+    Flags = evaluator<XprType>::Flags & ~DirectAccessBit,
     IsRowMajor = XprType::IsRowMajor,
     SizeAtCompileTime = XprType::SizeAtCompileTime,
-    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime,
+    CoeffReadCost = evaluator<XprType>::CoeffReadCost,
+    Alignment = evaluator<XprType>::Alignment
   };
   
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); }
+  EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); }
+  EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); }
+
+  EIGEN_DEVICE_FUNC
+  CoeffReturnType coeff(Index row, Index col) const
+  { return m_evaluator.coeff(row, col); }
+
+  EIGEN_DEVICE_FUNC
+  CoeffReturnType coeff(Index index) const
+  { return m_evaluator.coeff(index); }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index row, Index col) const
+  { return m_evaluator.template packet<LoadMode,PacketType>(row, col); }
+
+  template<int LoadMode, typename PacketType>
+  PacketType packet(Index index) const
+  { return m_evaluator.template packet<LoadMode,PacketType>(index); }
+  
+  EIGEN_DEVICE_FUNC
   CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
-  { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
   
   template<int LoadMode, typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   PacketType packetByOuterInner(Index outer, Index inner) const
-  { return Base::template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
+  { return m_evaluator.template packet<LoadMode,PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); }
   
+  const XprType & nestedExpression() const { return m_xpr; }
+  
+protected:
+  internal::evaluator<XprType> m_evaluator;
+  const XprType &m_xpr;
 };
 
 } // end namespace internal
@@ -399,42 +403,36 @@ public:
   * The template parameter \a BinaryOp is the type of the functor \a func which must be
   * an associative operator. Both current C++98 and C++11 functor styles are handled.
   *
-  * \warning the matrix must be not empty, otherwise an assertion is triggered.
-  *
   * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
   */
 template<typename Derived>
 template<typename Func>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::redux(const Func& func) const
 {
   eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
 
   typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());
-
-  // The initial expression is passed to the reducer as an additional argument instead of
-  // passing it as a member of redux_evaluator to help  
-  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
+  
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func);
 }
 
 /** \returns the minimum of all coefficients of \c *this.
-  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   * \warning the result is undefined if \c *this contains NaN.
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff() const
 {
   return derived().redux(Eigen::internal::scalar_min_op<Scalar,Scalar>());
 }
 
 /** \returns the maximum of all coefficients of \c *this.
-  * \warning the matrix must be not empty, otherwise an assertion is triggered.
   * \warning the result is undefined if \c *this contains NaN.
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff() const
 {
   return derived().redux(Eigen::internal::scalar_max_op<Scalar,Scalar>());
@@ -447,7 +445,7 @@ DenseBase<Derived>::maxCoeff() const
   * \sa trace(), prod(), mean()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::sum() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -460,7 +458,7 @@ DenseBase<Derived>::sum() const
 * \sa trace(), prod(), sum()
 */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::mean() const
 {
 #ifdef __INTEL_COMPILER
@@ -481,7 +479,7 @@ DenseBase<Derived>::mean() const
   * \sa sum(), mean(), trace()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::prod() const
 {
   if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
@@ -496,7 +494,7 @@ DenseBase<Derived>::prod() const
   * \sa diagonal(), sum()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
+EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
 MatrixBase<Derived>::trace() const
 {
   return derived().diagonal().sum();
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Ref.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Ref.h
index 172c8ffb6..17a1496b8 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Ref.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Ref.h
@@ -187,8 +187,6 @@ protected:
   * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
   * \endcode
   *
-  * See also the following stackoverflow questions for further references:
-  *  - <a href="http://stackoverflow.com/questions/21132538/correct-usage-of-the-eigenref-class">Correct usage of the Eigen::Ref<> class</a>
   *
   * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
   */
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Replicate.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Replicate.h
index 0b2d6d743..9960ef884 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Replicate.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Replicate.h
@@ -115,7 +115,7 @@ template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
   */
 template<typename Derived>
 template<int RowFactor, int ColFactor>
-EIGEN_DEVICE_FUNC const Replicate<Derived,RowFactor,ColFactor>
+const Replicate<Derived,RowFactor,ColFactor>
 DenseBase<Derived>::replicate() const
 {
   return Replicate<Derived,RowFactor,ColFactor>(derived());
@@ -130,7 +130,7 @@ DenseBase<Derived>::replicate() const
   * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
   */
 template<typename ExpressionType, int Direction>
-EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
+const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
 VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
 {
   return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Reshaped.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Reshaped.h
deleted file mode 100644
index a78fd880f..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Reshaped.h
+++ /dev/null
@@ -1,453 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2014 yoco <peter.xiau@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_RESHAPED_H
-#define EIGEN_RESHAPED_H
-
-namespace Eigen {
-namespace internal {
-
-/** \class Reshaped
-  * \ingroup Core_Module
-  *
-  * \brief Expression of a fixed-size or dynamic-size reshape
-  *
-  * \tparam XprType the type of the expression in which we are taking a reshape
-  * \tparam Rows the number of rows of the reshape we are taking at compile time (optional)
-  * \tparam Cols the number of columns of the reshape we are taking at compile time (optional)
-  * \tparam Order can be ColMajor or RowMajor, default is ColMajor.
-  *
-  * This class represents an expression of either a fixed-size or dynamic-size reshape.
-  * It is the return type of DenseBase::reshaped(NRowsType,NColsType) and
-  * most of the time this is the only way it is used.
-  *
-  * However, in C++98, if you want to directly maniputate reshaped expressions,
-  * for instance if you want to write a function returning such an expression, you
-  * will need to use this class. In C++11, it is advised to use the \em auto
-  * keyword for such use cases.
-  *
-  * Here is an example illustrating the dynamic case:
-  * \include class_Reshaped.cpp
-  * Output: \verbinclude class_Reshaped.out
-  *
-  * Here is an example illustrating the fixed-size case:
-  * \include class_FixedReshaped.cpp
-  * Output: \verbinclude class_FixedReshaped.out
-  *
-  * \sa DenseBase::reshaped(NRowsType,NColsType)
-  */
-
-template<typename XprType, int Rows, int Cols, int Order>
-struct traits<Reshaped<XprType, Rows, Cols, Order> > : traits<XprType>
-{
-  typedef typename traits<XprType>::Scalar Scalar;
-  typedef typename traits<XprType>::StorageKind StorageKind;
-  typedef typename traits<XprType>::XprKind XprKind;
-  enum{
-    MatrixRows = traits<XprType>::RowsAtCompileTime,
-    MatrixCols = traits<XprType>::ColsAtCompileTime,
-    RowsAtCompileTime = Rows,
-    ColsAtCompileTime = Cols,
-    MaxRowsAtCompileTime = Rows,
-    MaxColsAtCompileTime = Cols,
-    XpxStorageOrder = ((int(traits<XprType>::Flags) & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
-    ReshapedStorageOrder = (RowsAtCompileTime == 1 && ColsAtCompileTime != 1) ? RowMajor
-                         : (ColsAtCompileTime == 1 && RowsAtCompileTime != 1) ? ColMajor
-                         : XpxStorageOrder,
-    HasSameStorageOrderAsXprType = (ReshapedStorageOrder == XpxStorageOrder),
-    InnerSize = (ReshapedStorageOrder==int(RowMajor)) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
-    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
-                             ? int(inner_stride_at_compile_time<XprType>::ret)
-                             : Dynamic,
-    OuterStrideAtCompileTime = Dynamic,
-
-    HasDirectAccess = internal::has_direct_access<XprType>::ret
-                    && (Order==int(XpxStorageOrder))
-                    && ((evaluator<XprType>::Flags&LinearAccessBit)==LinearAccessBit),
-
-    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
-                       && (InnerStrideAtCompileTime == 1)
-                        ? PacketAccessBit : 0,
-    //MaskAlignedBit = ((OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
-    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
-    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
-    FlagsRowMajorBit = (ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0,
-    FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0,
-    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) | MaskPacketAccessBit),
-
-    Flags = (Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit | FlagsDirectAccessBit)
-  };
-};
-
-template<typename XprType, int Rows, int Cols, int Order, bool HasDirectAccess> class ReshapedImpl_dense;
-
-} // end namespace internal
-
-template<typename XprType, int Rows, int Cols, int Order, typename StorageKind> class ReshapedImpl;
-
-template<typename XprType, int Rows, int Cols, int Order> class Reshaped
-  : public ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind>
-{
-    typedef ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind> Impl;
-  public:
-    //typedef typename Impl::Base Base;
-    typedef Impl Base;
-    EIGEN_GENERIC_PUBLIC_INTERFACE(Reshaped)
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reshaped)
-
-    /** Fixed-size constructor
-      */
-    EIGEN_DEVICE_FUNC
-    inline Reshaped(XprType& xpr)
-      : Impl(xpr)
-    {
-      EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
-      eigen_assert(Rows * Cols == xpr.rows() * xpr.cols());
-    }
-
-    /** Dynamic-size constructor
-      */
-    EIGEN_DEVICE_FUNC
-    inline Reshaped(XprType& xpr,
-          Index reshapeRows, Index reshapeCols)
-      : Impl(xpr, reshapeRows, reshapeCols)
-    {
-      eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==reshapeRows)
-          && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==reshapeCols));
-      eigen_assert(reshapeRows * reshapeCols == xpr.rows() * xpr.cols());
-    }
-};
-
-// The generic default implementation for dense reshape simply forward to the internal::ReshapedImpl_dense
-// that must be specialized for direct and non-direct access...
-template<typename XprType, int Rows, int Cols, int Order>
-class ReshapedImpl<XprType, Rows, Cols, Order, Dense>
-  : public internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,internal::traits<Reshaped<XprType,Rows,Cols,Order> >::HasDirectAccess>
-{
-    typedef internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,internal::traits<Reshaped<XprType,Rows,Cols,Order> >::HasDirectAccess> Impl;
-  public:
-    typedef Impl Base;
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl)
-    EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr) : Impl(xpr) {}
-    EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols)
-      : Impl(xpr, reshapeRows, reshapeCols) {}
-};
-
-namespace internal {
-
-/** \internal Internal implementation of dense Reshaped in the general case. */
-template<typename XprType, int Rows, int Cols, int Order>
-class ReshapedImpl_dense<XprType,Rows,Cols,Order,false>
-  : public internal::dense_xpr_base<Reshaped<XprType, Rows, Cols, Order> >::type
-{
-    typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
-  public:
-
-    typedef typename internal::dense_xpr_base<ReshapedType>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
-
-    typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
-    typedef typename internal::remove_all<XprType>::type NestedExpression;
-
-    class InnerIterator;
-
-    /** Fixed-size constructor
-      */
-    EIGEN_DEVICE_FUNC
-    inline ReshapedImpl_dense(XprType& xpr)
-      : m_xpr(xpr), m_rows(Rows), m_cols(Cols)
-    {}
-
-    /** Dynamic-size constructor
-      */
-    EIGEN_DEVICE_FUNC
-    inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
-      : m_xpr(xpr), m_rows(nRows), m_cols(nCols)
-    {}
-
-    EIGEN_DEVICE_FUNC Index rows() const { return m_rows; }
-    EIGEN_DEVICE_FUNC Index cols() const { return m_cols; }
-
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
-    /** \sa MapBase::data() */
-    EIGEN_DEVICE_FUNC inline const Scalar* data() const;
-    EIGEN_DEVICE_FUNC inline Index innerStride() const;
-    EIGEN_DEVICE_FUNC inline Index outerStride() const;
-    #endif
-
-    /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<XprType>::type&
-    nestedExpression() const { return m_xpr; }
-
-    /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC
-    typename internal::remove_reference<XprType>::type&
-    nestedExpression() { return m_xpr; }
-
-  protected:
-
-    MatrixTypeNested m_xpr;
-    const internal::variable_if_dynamic<Index, Rows> m_rows;
-    const internal::variable_if_dynamic<Index, Cols> m_cols;
-};
-
-
-/** \internal Internal implementation of dense Reshaped in the direct access case. */
-template<typename XprType, int Rows, int Cols, int Order>
-class ReshapedImpl_dense<XprType, Rows, Cols, Order, true>
-  : public MapBase<Reshaped<XprType, Rows, Cols, Order> >
-{
-    typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
-    typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
-  public:
-
-    typedef MapBase<ReshapedType> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
-
-    /** Fixed-size constructor
-      */
-    EIGEN_DEVICE_FUNC
-    inline ReshapedImpl_dense(XprType& xpr)
-      : Base(xpr.data()), m_xpr(xpr)
-    {}
-
-    /** Dynamic-size constructor
-      */
-    EIGEN_DEVICE_FUNC
-    inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
-      : Base(xpr.data(), nRows, nCols),
-        m_xpr(xpr)
-    {}
-
-    EIGEN_DEVICE_FUNC
-    const typename internal::remove_all<XprTypeNested>::type& nestedExpression() const
-    {
-      return m_xpr;
-    }
-
-    EIGEN_DEVICE_FUNC
-    XprType& nestedExpression() { return m_xpr; }
-
-    /** \sa MapBase::innerStride() */
-    EIGEN_DEVICE_FUNC
-    inline Index innerStride() const
-    {
-      return m_xpr.innerStride();
-    }
-
-    /** \sa MapBase::outerStride() */
-    EIGEN_DEVICE_FUNC
-    inline Index outerStride() const
-    {
-      return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows();
-    }
-
-  protected:
-
-    XprTypeNested m_xpr;
-};
-
-// Evaluators
-template<typename ArgType, int Rows, int Cols, int Order, bool HasDirectAccess> struct reshaped_evaluator;
-
-template<typename ArgType, int Rows, int Cols, int Order>
-struct evaluator<Reshaped<ArgType, Rows, Cols, Order> >
-  : reshaped_evaluator<ArgType, Rows, Cols, Order, traits<Reshaped<ArgType,Rows,Cols,Order> >::HasDirectAccess>
-{
-  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
-  typedef typename XprType::Scalar Scalar;
-  // TODO: should check for smaller packet types
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-
-  enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
-    HasDirectAccess = traits<XprType>::HasDirectAccess,
-
-//     RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
-//     ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
-//     MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
-//     MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
-//
-//     InnerStrideAtCompileTime = traits<XprType>::HasSameStorageOrderAsXprType
-//                              ? int(inner_stride_at_compile_time<ArgType>::ret)
-//                              : Dynamic,
-//     OuterStrideAtCompileTime = Dynamic,
-
-    FlagsLinearAccessBit = (traits<XprType>::RowsAtCompileTime == 1 || traits<XprType>::ColsAtCompileTime == 1 || HasDirectAccess) ? LinearAccessBit : 0,
-    FlagsRowMajorBit = (traits<XprType>::ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0,
-    FlagsDirectAccessBit =  HasDirectAccess ? DirectAccessBit : 0,
-    Flags0 = evaluator<ArgType>::Flags & (HereditaryBits & ~RowMajorBit),
-    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit | FlagsDirectAccessBit,
-
-    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
-    Alignment = evaluator<ArgType>::Alignment
-  };
-  typedef reshaped_evaluator<ArgType, Rows, Cols, Order, HasDirectAccess> reshaped_evaluator_type;
-  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr)
-  {
-    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
-  }
-};
-
-template<typename ArgType, int Rows, int Cols, int Order>
-struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ false>
-  : evaluator_base<Reshaped<ArgType, Rows, Cols, Order> >
-{
-  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
-
-  enum {
-    CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of index computations */,
-
-    Flags = (evaluator<ArgType>::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)),
-
-    Alignment = 0
-  };
-
-  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr)
-  {
-    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
-  }
-
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-
-  typedef std::pair<Index, Index> RowCol;
-
-  inline RowCol index_remap(Index rowId, Index colId) const
-  {
-    if(Order==ColMajor)
-    {
-      const Index nth_elem_idx = colId * m_xpr.rows() + rowId;
-      return RowCol(nth_elem_idx % m_xpr.nestedExpression().rows(),
-                    nth_elem_idx / m_xpr.nestedExpression().rows());
-    }
-    else
-    {
-      const Index nth_elem_idx = colId + rowId * m_xpr.cols();
-      return RowCol(nth_elem_idx / m_xpr.nestedExpression().cols(),
-                    nth_elem_idx % m_xpr.nestedExpression().cols());
-    }
-  }
-
-  EIGEN_DEVICE_FUNC
-  inline Scalar& coeffRef(Index rowId, Index colId)
-  {
-    EIGEN_STATIC_ASSERT_LVALUE(XprType)
-    const RowCol row_col = index_remap(rowId, colId);
-    return m_argImpl.coeffRef(row_col.first, row_col.second);
-  }
-
-  EIGEN_DEVICE_FUNC
-  inline const Scalar& coeffRef(Index rowId, Index colId) const
-  {
-    const RowCol row_col = index_remap(rowId, colId);
-    return m_argImpl.coeffRef(row_col.first, row_col.second);
-  }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
-  {
-    const RowCol row_col = index_remap(rowId, colId);
-    return m_argImpl.coeff(row_col.first, row_col.second);
-  }
-
-  EIGEN_DEVICE_FUNC
-  inline Scalar& coeffRef(Index index)
-  {
-    EIGEN_STATIC_ASSERT_LVALUE(XprType)
-    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
-                                       Rows == 1 ? index : 0);
-    return m_argImpl.coeffRef(row_col.first, row_col.second);
-
-  }
-
-  EIGEN_DEVICE_FUNC
-  inline const Scalar& coeffRef(Index index) const
-  {
-    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
-                                       Rows == 1 ? index : 0);
-    return m_argImpl.coeffRef(row_col.first, row_col.second);
-  }
-
-  EIGEN_DEVICE_FUNC
-  inline const CoeffReturnType coeff(Index index) const
-  {
-    const RowCol row_col = index_remap(Rows == 1 ? 0 : index,
-                                       Rows == 1 ? index : 0);
-    return m_argImpl.coeff(row_col.first, row_col.second);
-  }
-#if 0
-  EIGEN_DEVICE_FUNC
-  template<int LoadMode>
-  inline PacketScalar packet(Index rowId, Index colId) const
-  {
-    const RowCol row_col = index_remap(rowId, colId);
-    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
-
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC
-  inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
-  {
-    const RowCol row_col = index_remap(rowId, colId);
-    m_argImpl.const_cast_derived().template writePacket<Unaligned>
-            (row_col.first, row_col.second, val);
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC
-  inline PacketScalar packet(Index index) const
-  {
-    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
-                                        RowsAtCompileTime == 1 ? index : 0);
-    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC
-  inline void writePacket(Index index, const PacketScalar& val)
-  {
-    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
-                                        RowsAtCompileTime == 1 ? index : 0);
-    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second, val);
-  }
-#endif
-protected:
-
-  evaluator<ArgType> m_argImpl;
-  const XprType& m_xpr;
-
-};
-
-template<typename ArgType, int Rows, int Cols, int Order>
-struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ true>
-: mapbase_evaluator<Reshaped<ArgType, Rows, Cols, Order>,
-                      typename Reshaped<ArgType, Rows, Cols, Order>::PlainObject>
-{
-  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
-  typedef typename XprType::Scalar Scalar;
-
-  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr)
-    : mapbase_evaluator<XprType, typename XprType::PlainObject>(xpr)
-  {
-    // TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime
-    eigen_assert(((internal::UIntPtr(xpr.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator<XprType>::Alignment)) == 0) && "data is not aligned");
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_RESHAPED_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/ReturnByValue.h b/uppsrc/plugin/Eigen/Eigen/src/Core/ReturnByValue.h
index 11dc86d07..c44b7673b 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/ReturnByValue.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/ReturnByValue.h
@@ -79,7 +79,7 @@ template<typename Derived> class ReturnByValue
 
 template<typename Derived>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
+Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
 {
   other.evalTo(derived());
   return derived();
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Reverse.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Reverse.h
index 853093923..0640cda2a 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Reverse.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Reverse.h
@@ -114,7 +114,7 @@ template<typename MatrixType, int Direction> class Reverse
   *
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType
+inline typename DenseBase<Derived>::ReverseReturnType
 DenseBase<Derived>::reverse()
 {
   return ReverseReturnType(derived());
@@ -136,7 +136,7 @@ DenseBase<Derived>::reverse()
   *
   * \sa VectorwiseOp::reverseInPlace(), reverse() */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace()
+inline void DenseBase<Derived>::reverseInPlace()
 {
   if(cols()>rows())
   {
@@ -171,10 +171,8 @@ struct vectorwise_reverse_inplace_impl<Vertical>
   template<typename ExpressionType>
   static void run(ExpressionType &xpr)
   {
-    const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2;
     Index half = xpr.rows()/2;
-    xpr.topRows(fix<HalfAtCompileTime>(half))
-       .swap(xpr.bottomRows(fix<HalfAtCompileTime>(half)).colwise().reverse());
+    xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse());
   }
 };
 
@@ -184,10 +182,8 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
   template<typename ExpressionType>
   static void run(ExpressionType &xpr)
   {
-    const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2;
     Index half = xpr.cols()/2;
-    xpr.leftCols(fix<HalfAtCompileTime>(half))
-       .swap(xpr.rightCols(fix<HalfAtCompileTime>(half)).rowwise().reverse());
+    xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse());
   }
 };
 
@@ -205,9 +201,9 @@ struct vectorwise_reverse_inplace_impl<Horizontal>
   *
   * \sa DenseBase::reverseInPlace(), reverse() */
 template<typename ExpressionType, int Direction>
-EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
+void VectorwiseOp<ExpressionType,Direction>::reverseInPlace()
 {
-  internal::vectorwise_reverse_inplace_impl<Direction>::run(m_matrix);
+  internal::vectorwise_reverse_inplace_impl<Direction>::run(_expression().const_cast_derived());
 }
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/SelfAdjointView.h b/uppsrc/plugin/Eigen/Eigen/src/Core/SelfAdjointView.h
index 2173799d9..b2e51f37a 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/SelfAdjointView.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/SelfAdjointView.h
@@ -61,7 +61,6 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
     typedef typename MatrixType::StorageIndex StorageIndex;
     typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
-    typedef SelfAdjointView<typename internal::add_const<MatrixType>::type, UpLo> ConstSelfAdjointView;
 
     enum {
       Mode = internal::traits<SelfAdjointView>::Mode,
@@ -198,18 +197,6 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     inline const ConjugateReturnType conjugate() const
     { return ConjugateReturnType(m_matrix.conjugate()); }
 
-    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
-     *           returns \c *this otherwise.
-     */
-    template<bool Cond>
-    EIGEN_DEVICE_FUNC
-    inline typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type
-    conjugateIf() const
-    {
-      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstSelfAdjointView>::type ReturnType;
-      return ReturnType(m_matrix.template conjugateIf<Cond>());
-    }
-
     typedef SelfAdjointView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
     /** \sa MatrixBase::adjoint() const */
     EIGEN_DEVICE_FUNC
@@ -337,7 +324,7 @@ public:
 /** This is the const version of MatrixBase::selfadjointView() */
 template<typename Derived>
 template<unsigned int UpLo>
-EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView() const
 {
   return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
@@ -354,7 +341,7 @@ MatrixBase<Derived>::selfadjointView() const
   */
 template<typename Derived>
 template<unsigned int UpLo>
-EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
 MatrixBase<Derived>::selfadjointView()
 {
   return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Solve.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Solve.h
index ec4b4a987..a8daea511 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Solve.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Solve.h
@@ -19,7 +19,7 @@ template<typename Decomposition, typename RhsType, typename StorageKind> class S
   *
   * \brief Pseudo expression representing a solving operation
   *
-  * \tparam Decomposition the type of the matrix or decomposition object
+  * \tparam Decomposition the type of the matrix or decomposition object
   * \tparam Rhstype the type of the right-hand side
   *
   * This class represents an expression of A.solve(B)
@@ -181,7 +181,7 @@ struct Assignment<DstXprType, Solve<CwiseUnaryOp<internal::scalar_conjugate_op<t
   }
 };
 
-} // end namespace internal
+} // end namespace internal
 
 } // end namespace Eigen
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/SolveTriangular.h b/uppsrc/plugin/Eigen/Eigen/src/Core/SolveTriangular.h
index 813fef0db..fd0acb1a5 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/SolveTriangular.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/SolveTriangular.h
@@ -164,7 +164,7 @@ struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 template<typename MatrixType, unsigned int Mode>
 template<int Side, typename OtherDerived>
-EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
+void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
 {
   OtherDerived& other = _other.const_cast_derived();
   eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/SolverBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/SolverBase.h
index 501461042..8a4adc229 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/SolverBase.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/SolverBase.h
@@ -14,35 +14,8 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename Derived>
-struct solve_assertion {
-    template<bool Transpose_, typename Rhs>
-    static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion<Transpose_>(b); }
-};
 
-template<typename Derived>
-struct solve_assertion<Transpose<Derived> >
-{
-    typedef Transpose<Derived> type;
 
-    template<bool Transpose_, typename Rhs>
-    static void run(const type& transpose, const Rhs& b)
-    {
-        internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<true>(transpose.nestedExpression(), b);
-    }
-};
-
-template<typename Scalar, typename Derived>
-struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > >
-{
-    typedef CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived> > type;
-
-    template<bool Transpose_, typename Rhs>
-    static void run(const type& adjoint, const Rhs& b)
-    {
-        internal::solve_assertion<typename internal::remove_all<Transpose<Derived> >::type>::template run<true>(adjoint.nestedExpression(), b);
-    }
-};
 } // end namespace internal
 
 /** \class SolverBase
@@ -62,7 +35,7 @@ struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>
   *
   * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors.
   *
-  * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase
+  * \sa class PartialPivLU, class FullPivLU
   */
 template<typename Derived>
 class SolverBase : public EigenBase<Derived>
@@ -73,9 +46,6 @@ class SolverBase : public EigenBase<Derived>
     typedef typename internal::traits<Derived>::Scalar Scalar;
     typedef Scalar CoeffReturnType;
 
-    template<typename Derived_>
-    friend struct internal::solve_assertion;
-
     enum {
       RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
       ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
@@ -86,8 +56,7 @@ class SolverBase : public EigenBase<Derived>
       MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
                                                              internal::traits<Derived>::MaxColsAtCompileTime>::ret),
       IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
-                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
-      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2
+                           || internal::traits<Derived>::MaxColsAtCompileTime == 1
     };
 
     /** Default constructor */
@@ -105,7 +74,7 @@ class SolverBase : public EigenBase<Derived>
     inline const Solve<Derived, Rhs>
     solve(const MatrixBase<Rhs>& b) const
     {
-      internal::solve_assertion<typename internal::remove_all<Derived>::type>::template run<false>(derived(), b);
+      eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
       return Solve<Derived, Rhs>(derived(), b.derived());
     }
 
@@ -143,13 +112,6 @@ class SolverBase : public EigenBase<Derived>
     }
 
   protected:
-
-    template<bool Transpose_, typename Rhs>
-    void _check_solve_assertion(const Rhs& b) const {
-        EIGEN_ONLY_USED_FOR_DEBUG(b);
-        eigen_assert(derived().m_isInitialized && "Solver is not initialized.");
-        eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b");
-    }
 };
 
 namespace internal {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/StableNorm.h b/uppsrc/plugin/Eigen/Eigen/src/Core/StableNorm.h
index 77ea3c261..88c8d9890 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/StableNorm.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/StableNorm.h
@@ -50,71 +50,6 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc
     ssq += (bl*invScale).squaredNorm();
 }
 
-template<typename VectorType, typename RealScalar>
-void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale)
-{
-  typedef typename VectorType::Scalar Scalar;
-  const Index blockSize = 4096;
-  
-  typedef typename internal::nested_eval<VectorType,2>::type VectorTypeCopy;
-  typedef typename internal::remove_all<VectorTypeCopy>::type VectorTypeCopyClean;
-  const VectorTypeCopy copy(vec);
-  
-  enum {
-    CanAlign = (   (int(VectorTypeCopyClean::Flags)&DirectAccessBit)
-                || (int(internal::evaluator<VectorTypeCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
-               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
-                 && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
-  };
-  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<VectorTypeCopyClean>::Alignment>,
-                                                   typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
-  Index n = vec.size();
-  
-  Index bi = internal::first_default_aligned(copy);
-  if (bi>0)
-    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
-  for (; bi<n; bi+=blockSize)
-    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
-}
-
-template<typename VectorType>
-typename VectorType::RealScalar
-stable_norm_impl(const VectorType &vec, typename enable_if<VectorType::IsVectorAtCompileTime>::type* = 0 )
-{
-  using std::sqrt;
-  using std::abs;
-
-  Index n = vec.size();
-
-  if(n==1)
-    return abs(vec.coeff(0));
-
-  typedef typename VectorType::RealScalar RealScalar;
-  RealScalar scale(0);
-  RealScalar invScale(1);
-  RealScalar ssq(0); // sum of squares
-
-  stable_norm_impl_inner_step(vec, ssq, scale, invScale);
-  
-  return scale * sqrt(ssq);
-}
-
-template<typename MatrixType>
-typename MatrixType::RealScalar
-stable_norm_impl(const MatrixType &mat, typename enable_if<!MatrixType::IsVectorAtCompileTime>::type* = 0 )
-{
-  using std::sqrt;
-
-  typedef typename MatrixType::RealScalar RealScalar;
-  RealScalar scale(0);
-  RealScalar invScale(1);
-  RealScalar ssq(0); // sum of squares
-
-  for(Index j=0; j<mat.outerSize(); ++j)
-    stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
-  return scale * sqrt(ssq);
-}
-
 template<typename Derived>
 inline typename NumTraits<typename traits<Derived>::Scalar>::Real
 blueNorm_impl(const EigenBase<Derived>& _vec)
@@ -139,7 +74,7 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
     // are used. For any specific computer, each of the assignment
     // statements can be replaced
     ibeta = std::numeric_limits<RealScalar>::radix;                 // base for floating-point numbers
-    it    = NumTraits<RealScalar>::digits();                        // number of base-beta digits in mantissa
+    it    = std::numeric_limits<RealScalar>::digits;                // number of base-beta digits in mantissa
     iemin = std::numeric_limits<RealScalar>::min_exponent;          // minimum exponent
     iemax = std::numeric_limits<RealScalar>::max_exponent;          // maximum exponent
     rbig  = (std::numeric_limits<RealScalar>::max)();               // largest floating-point number
@@ -163,16 +98,12 @@ blueNorm_impl(const EigenBase<Derived>& _vec)
   RealScalar asml = RealScalar(0);
   RealScalar amed = RealScalar(0);
   RealScalar abig = RealScalar(0);
-
-  for(Index j=0; j<vec.outerSize(); ++j)
+  for(typename Derived::InnerIterator it(vec, 0); it; ++it)
   {
-    for(typename Derived::InnerIterator it(vec, j); it; ++it)
-    {
-      RealScalar ax = abs(it.value());
-      if(ax > ab2)     abig += numext::abs2(ax*s2m);
-      else if(ax < b1) asml += numext::abs2(ax*s1m);
-      else             amed += numext::abs2(ax);
-    }
+    RealScalar ax = abs(it.value());
+    if(ax > ab2)     abig += numext::abs2(ax*s2m);
+    else if(ax < b1) asml += numext::abs2(ax*s1m);
+    else             amed += numext::abs2(ax);
   }
   if(amed!=amed)
     return amed;  // we got a NaN
@@ -225,7 +156,36 @@ template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::stableNorm() const
 {
-  return internal::stable_norm_impl(derived());
+  using std::sqrt;
+  using std::abs;
+  const Index blockSize = 4096;
+  RealScalar scale(0);
+  RealScalar invScale(1);
+  RealScalar ssq(0); // sum of square
+  
+  typedef typename internal::nested_eval<Derived,2>::type DerivedCopy;
+  typedef typename internal::remove_all<DerivedCopy>::type DerivedCopyClean;
+  const DerivedCopy copy(derived());
+  
+  enum {
+    CanAlign = (   (int(DerivedCopyClean::Flags)&DirectAccessBit)
+                || (int(internal::evaluator<DerivedCopyClean>::Alignment)>0) // FIXME Alignment)>0 might not be enough
+               ) && (blockSize*sizeof(Scalar)*2<EIGEN_STACK_ALLOCATION_LIMIT)
+                 && (EIGEN_MAX_STATIC_ALIGN_BYTES>0) // if we cannot allocate on the stack, then let's not bother about this optimization
+  };
+  typedef typename internal::conditional<CanAlign, Ref<const Matrix<Scalar,Dynamic,1,0,blockSize,1>, internal::evaluator<DerivedCopyClean>::Alignment>,
+                                                   typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper;
+  Index n = size();
+  
+  if(n==1)
+    return abs(this->coeff(0));
+  
+  Index bi = internal::first_default_aligned(copy);
+  if (bi>0)
+    internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale);
+  for (; bi<n; bi+=blockSize)
+    internal::stable_norm_kernel(SegmentWrapper(copy.segment(bi,numext::mini(blockSize, n - bi))), ssq, scale, invScale);
+  return scale * sqrt(ssq);
 }
 
 /** \returns the \em l2 norm of \c *this using the Blue's algorithm.
@@ -253,10 +213,7 @@ template<typename Derived>
 inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
 MatrixBase<Derived>::hypotNorm() const
 {
-  if(size()==1)
-    return numext::abs(coeff(0,0));
-  else
-    return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
+  return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
 }
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/StlIterators.h b/uppsrc/plugin/Eigen/Eigen/src/Core/StlIterators.h
deleted file mode 100644
index 0d8bd1aa3..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/StlIterators.h
+++ /dev/null
@@ -1,331 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-namespace Eigen {
-
-namespace internal {
-
-template<typename IteratorType>
-struct indexed_based_stl_iterator_traits;
-
-template<typename  Derived>
-class indexed_based_stl_iterator_base
-{
-protected:
-  typedef indexed_based_stl_iterator_traits<Derived> traits;
-  typedef typename traits::XprType XprType;
-  typedef indexed_based_stl_iterator_base<typename traits::non_const_iterator> non_const_iterator;
-  typedef indexed_based_stl_iterator_base<typename traits::const_iterator> const_iterator;
-  typedef typename internal::conditional<internal::is_const<XprType>::value,non_const_iterator,const_iterator>::type other_iterator;
-  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
-  friend class indexed_based_stl_iterator_base<typename traits::const_iterator>;
-  friend class indexed_based_stl_iterator_base<typename traits::non_const_iterator>;
-public:
-  typedef Index difference_type;
-  typedef std::random_access_iterator_tag iterator_category;
-
-  indexed_based_stl_iterator_base() : mp_xpr(0), m_index(0) {}
-  indexed_based_stl_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {}
-
-  indexed_based_stl_iterator_base(const non_const_iterator& other)
-    : mp_xpr(other.mp_xpr), m_index(other.m_index)
-  {}
-
-  indexed_based_stl_iterator_base& operator=(const non_const_iterator& other)
-  {
-    mp_xpr = other.mp_xpr;
-    m_index = other.m_index;
-    return *this;
-  }
-
-  Derived& operator++() { ++m_index; return derived(); }
-  Derived& operator--() { --m_index; return derived(); }
-
-  Derived operator++(int) { Derived prev(derived()); operator++(); return prev;}
-  Derived operator--(int) { Derived prev(derived()); operator--(); return prev;}
-
-  friend Derived operator+(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; }
-  friend Derived operator-(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; }
-  friend Derived operator+(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; }
-  friend Derived operator-(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; }
-  
-  Derived& operator+=(Index b) { m_index += b; return derived(); }
-  Derived& operator-=(Index b) { m_index -= b; return derived(); }
-
-  difference_type operator-(const indexed_based_stl_iterator_base& other) const
-  {
-    eigen_assert(mp_xpr == other.mp_xpr);
-    return m_index - other.m_index;
-  }
-
-  difference_type operator-(const other_iterator& other) const
-  {
-    eigen_assert(mp_xpr == other.mp_xpr);
-    return m_index - other.m_index;
-  }
-
-  bool operator==(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
-  bool operator!=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
-  bool operator< (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
-  bool operator<=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
-  bool operator> (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
-  bool operator>=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
-
-  bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; }
-  bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; }
-  bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <  other.m_index; }
-  bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; }
-  bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >  other.m_index; }
-  bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; }
-
-protected:
-
-  Derived& derived() { return static_cast<Derived&>(*this); }
-  const Derived& derived() const { return static_cast<const Derived&>(*this); }
-
-  XprType *mp_xpr;
-  Index m_index;
-};
-
-template<typename XprType>
-class pointer_based_stl_iterator
-{
-  enum { is_lvalue  = internal::is_lvalue<XprType>::value };
-  typedef pointer_based_stl_iterator<typename internal::remove_const<XprType>::type> non_const_iterator;
-  typedef pointer_based_stl_iterator<typename internal::add_const<XprType>::type> const_iterator;
-  typedef typename internal::conditional<internal::is_const<XprType>::value,non_const_iterator,const_iterator>::type other_iterator;
-  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
-  friend class pointer_based_stl_iterator<typename internal::add_const<XprType>::type>;
-  friend class pointer_based_stl_iterator<typename internal::remove_const<XprType>::type>;
-public:
-  typedef Index difference_type;
-  typedef typename XprType::Scalar value_type;
-  typedef std::random_access_iterator_tag iterator_category;
-  typedef typename internal::conditional<bool(is_lvalue), value_type*, const value_type*>::type pointer;
-  typedef typename internal::conditional<bool(is_lvalue), value_type&, const value_type&>::type reference;
-
-
-  pointer_based_stl_iterator() : m_ptr(0) {}
-  pointer_based_stl_iterator(XprType& xpr, Index index) : m_incr(xpr.innerStride())
-  {
-    m_ptr = xpr.data() + index * m_incr.value();
-  }
-
-  pointer_based_stl_iterator(const non_const_iterator& other)
-    : m_ptr(other.m_ptr), m_incr(other.m_incr)
-  {}
-
-  pointer_based_stl_iterator& operator=(const non_const_iterator& other)
-  {
-    m_ptr = other.m_ptr;
-    m_incr.setValue(other.m_incr);
-    return *this;
-  }
-
-  reference operator*()         const { return *m_ptr;   }
-  reference operator[](Index i) const { return *(m_ptr+i*m_incr.value()); }
-  pointer   operator->()        const { return m_ptr;    }
-
-  pointer_based_stl_iterator& operator++() { m_ptr += m_incr.value(); return *this; }
-  pointer_based_stl_iterator& operator--() { m_ptr -= m_incr.value(); return *this; }
-
-  pointer_based_stl_iterator operator++(int) { pointer_based_stl_iterator prev(*this); operator++(); return prev;}
-  pointer_based_stl_iterator operator--(int) { pointer_based_stl_iterator prev(*this); operator--(); return prev;}
-
-  friend pointer_based_stl_iterator operator+(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret += b; return ret; }
-  friend pointer_based_stl_iterator operator-(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret -= b; return ret; }
-  friend pointer_based_stl_iterator operator+(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret += a; return ret; }
-  friend pointer_based_stl_iterator operator-(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret -= a; return ret; }
-  
-  pointer_based_stl_iterator& operator+=(Index b) { m_ptr += b*m_incr.value(); return *this; }
-  pointer_based_stl_iterator& operator-=(Index b) { m_ptr -= b*m_incr.value(); return *this; }
-
-  difference_type operator-(const pointer_based_stl_iterator& other) const {
-    return (m_ptr - other.m_ptr)/m_incr.value();
-  }
-
-  difference_type operator-(const other_iterator& other) const {
-    return (m_ptr - other.m_ptr)/m_incr.value();
-  }
-
-  bool operator==(const pointer_based_stl_iterator& other) const { return m_ptr == other.m_ptr; }
-  bool operator!=(const pointer_based_stl_iterator& other) const { return m_ptr != other.m_ptr; }
-  bool operator< (const pointer_based_stl_iterator& other) const { return m_ptr <  other.m_ptr; }
-  bool operator<=(const pointer_based_stl_iterator& other) const { return m_ptr <= other.m_ptr; }
-  bool operator> (const pointer_based_stl_iterator& other) const { return m_ptr >  other.m_ptr; }
-  bool operator>=(const pointer_based_stl_iterator& other) const { return m_ptr >= other.m_ptr; }
-
-  bool operator==(const other_iterator& other) const { return m_ptr == other.m_ptr; }
-  bool operator!=(const other_iterator& other) const { return m_ptr != other.m_ptr; }
-  bool operator< (const other_iterator& other) const { return m_ptr <  other.m_ptr; }
-  bool operator<=(const other_iterator& other) const { return m_ptr <= other.m_ptr; }
-  bool operator> (const other_iterator& other) const { return m_ptr >  other.m_ptr; }
-  bool operator>=(const other_iterator& other) const { return m_ptr >= other.m_ptr; }
-
-protected:
-
-  pointer m_ptr;
-  internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_incr;
-};
-
-template<typename _XprType>
-struct indexed_based_stl_iterator_traits<generic_randaccess_stl_iterator<_XprType> >
-{
-  typedef _XprType XprType;
-  typedef generic_randaccess_stl_iterator<typename internal::remove_const<XprType>::type> non_const_iterator;
-  typedef generic_randaccess_stl_iterator<typename internal::add_const<XprType>::type> const_iterator;
-};
-
-template<typename XprType>
-class generic_randaccess_stl_iterator : public indexed_based_stl_iterator_base<generic_randaccess_stl_iterator<XprType> >
-{
-public:
-  typedef typename XprType::Scalar value_type;
-
-protected:
-
-  enum {
-    has_direct_access = (internal::traits<XprType>::Flags & DirectAccessBit) ? 1 : 0,
-    is_lvalue  = internal::is_lvalue<XprType>::value
-  };
-
-  typedef indexed_based_stl_iterator_base<generic_randaccess_stl_iterator> Base;
-  using Base::m_index;
-  using Base::mp_xpr;
-
-  // TODO currently const Transpose/Reshape expressions never returns const references,
-  // so lets return by value too.
-  //typedef typename internal::conditional<bool(has_direct_access), const value_type&, const value_type>::type read_only_ref_t;
-  typedef const value_type read_only_ref_t;
-
-public:
-  
-  typedef typename internal::conditional<bool(is_lvalue), value_type *, const value_type *>::type pointer;
-  typedef typename internal::conditional<bool(is_lvalue), value_type&, read_only_ref_t>::type reference;
-  
-  generic_randaccess_stl_iterator() : Base() {}
-  generic_randaccess_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {}
-  generic_randaccess_stl_iterator(const typename Base::non_const_iterator& other) : Base(other) {}
-  using Base::operator=;
-
-  reference operator*()         const { return   (*mp_xpr)(m_index);   }
-  reference operator[](Index i) const { return   (*mp_xpr)(m_index+i); }
-  pointer   operator->()        const { return &((*mp_xpr)(m_index)); }
-};
-
-template<typename _XprType, DirectionType Direction>
-struct indexed_based_stl_iterator_traits<subvector_stl_iterator<_XprType,Direction> >
-{
-  typedef _XprType XprType;
-  typedef subvector_stl_iterator<typename internal::remove_const<XprType>::type, Direction> non_const_iterator;
-  typedef subvector_stl_iterator<typename internal::add_const<XprType>::type, Direction> const_iterator;
-};
-
-template<typename XprType, DirectionType Direction>
-class subvector_stl_iterator : public indexed_based_stl_iterator_base<subvector_stl_iterator<XprType,Direction> >
-{
-protected:
-
-  enum { is_lvalue  = internal::is_lvalue<XprType>::value };
-
-  typedef indexed_based_stl_iterator_base<subvector_stl_iterator> Base;
-  using Base::m_index;
-  using Base::mp_xpr;
-
-  typedef typename internal::conditional<Direction==Vertical,typename XprType::ColXpr,typename XprType::RowXpr>::type SubVectorType;
-  typedef typename internal::conditional<Direction==Vertical,typename XprType::ConstColXpr,typename XprType::ConstRowXpr>::type ConstSubVectorType;
-
-
-public:
-  typedef typename internal::conditional<bool(is_lvalue), SubVectorType, ConstSubVectorType>::type reference;
-  typedef typename reference::PlainObject value_type;
-
-private:
-  class subvector_stl_iterator_ptr
-  {
-  public:
-      subvector_stl_iterator_ptr(const reference &subvector) : m_subvector(subvector) {}
-      reference* operator->() { return &m_subvector; }
-  private:
-      reference m_subvector;
-  };
-public:
-
-  typedef subvector_stl_iterator_ptr pointer;
-  
-  subvector_stl_iterator() : Base() {}
-  subvector_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {}
-
-  reference operator*()         const { return (*mp_xpr).template subVector<Direction>(m_index); }
-  reference operator[](Index i) const { return (*mp_xpr).template subVector<Direction>(m_index+i); }
-  pointer   operator->()        const { return (*mp_xpr).template subVector<Direction>(m_index); }
-};
-
-} // namespace internal
-
-
-/** returns an iterator to the first element of the 1D vector or array
-  * \only_for_vectors
-  * \sa end(), cbegin()
-  */
-template<typename Derived>
-inline typename DenseBase<Derived>::iterator DenseBase<Derived>::begin()
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
-  return iterator(derived(), 0);
-}
-
-/** const version of begin() */
-template<typename Derived>
-inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::begin() const
-{
-  return cbegin();
-}
-
-/** returns a read-only const_iterator to the first element of the 1D vector or array
-  * \only_for_vectors
-  * \sa cend(), begin()
-  */
-template<typename Derived>
-inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cbegin() const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
-  return const_iterator(derived(), 0);
-}
-
-/** returns an iterator to the element following the last element of the 1D vector or array
-  * \only_for_vectors
-  * \sa begin(), cend()
-  */
-template<typename Derived>
-inline typename DenseBase<Derived>::iterator DenseBase<Derived>::end()
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
-  return iterator(derived(), size());
-}
-
-/** const version of end() */
-template<typename Derived>
-inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::end() const
-{
-  return cend();
-}
-
-/** returns a read-only const_iterator to the element following the last element of the 1D vector or array
-  * \only_for_vectors
-  * \sa begin(), cend()
-  */
-template<typename Derived>
-inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cend() const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
-  return const_iterator(derived(), size());
-}
-
-} // namespace Eigen
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Swap.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Swap.h
index 180a4e5ad..d70200918 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Swap.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Swap.h
@@ -30,13 +30,12 @@ public:
   typedef typename Base::DstXprType DstXprType;
   typedef swap_assign_op<Scalar> Functor;
   
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
+  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr)
     : Base(dst, src, func, dstExpr)
   {}
   
   template<int StoreMode, int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
+  void assignPacket(Index row, Index col)
   {
     PacketType tmp = m_src.template packet<LoadMode,PacketType>(row,col);
     const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(row,col, m_dst.template packet<StoreMode,PacketType>(row,col));
@@ -44,7 +43,7 @@ public:
   }
   
   template<int StoreMode, int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE void assignPacket(Index index)
+  void assignPacket(Index index)
   {
     PacketType tmp = m_src.template packet<LoadMode,PacketType>(index);
     const_cast<SrcEvaluatorTypeT&>(m_src).template writePacket<LoadMode>(index, m_dst.template packet<StoreMode,PacketType>(index));
@@ -53,7 +52,7 @@ public:
   
   // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael)
   template<int StoreMode, int LoadMode, typename PacketType>
-  EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
+  void assignPacketByOuterInner(Index outer, Index inner)
   {
     Index row = Base::rowIndexByOuterInner(outer, inner); 
     Index col = Base::colIndexByOuterInner(outer, inner);
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Transpose.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Transpose.h
index 49804b0ab..960dc4510 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Transpose.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Transpose.h
@@ -61,27 +61,24 @@ template<typename MatrixType> class Transpose
     typedef typename internal::remove_all<MatrixType>::type NestedExpression;
 
     EIGEN_DEVICE_FUNC
-    explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {}
+    explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {}
 
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index rows() const { return m_matrix.cols(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index cols() const { return m_matrix.rows(); }
+    EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); }
+    EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); }
 
     /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     const typename internal::remove_all<MatrixTypeNested>::type&
     nestedExpression() const { return m_matrix; }
 
     /** \returns the nested expression */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    EIGEN_DEVICE_FUNC
     typename internal::remove_reference<MatrixTypeNested>::type&
     nestedExpression() { return m_matrix; }
 
     /** \internal */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     void resize(Index nrows, Index ncols) {
       m_matrix.resize(ncols,nrows);
     }
@@ -125,10 +122,8 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
     EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
     EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index innerStride() const { return derived().nestedExpression().innerStride(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    Index outerStride() const { return derived().nestedExpression().outerStride(); }
+    EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
+    EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
 
     typedef typename internal::conditional<
                        internal::is_lvalue<MatrixType>::value,
@@ -136,20 +131,18 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
                        const Scalar
                      >::type ScalarWithConstIfNotLvalue;
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const Scalar* data() const { return derived().nestedExpression().data(); }
+    EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
+    EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); }
 
     // FIXME: shall we keep the const version of coeffRef?
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const Scalar& coeffRef(Index rowId, Index colId) const
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index rowId, Index colId) const
     {
       return derived().nestedExpression().coeffRef(colId, rowId);
     }
 
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const Scalar& coeffRef(Index index) const
+    EIGEN_DEVICE_FUNC
+    inline const Scalar& coeffRef(Index index) const
     {
       return derived().nestedExpression().coeffRef(index);
     }
@@ -177,8 +170,7 @@ template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
   *
   * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Transpose<Derived>
+inline Transpose<Derived>
 DenseBase<Derived>::transpose()
 {
   return TransposeReturnType(derived());
@@ -190,8 +182,7 @@ DenseBase<Derived>::transpose()
   *
   * \sa transposeInPlace(), adjoint() */
 template<typename Derived>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename DenseBase<Derived>::ConstTransposeReturnType
+inline typename DenseBase<Derived>::ConstTransposeReturnType
 DenseBase<Derived>::transpose() const
 {
   return ConstTransposeReturnType(derived());
@@ -217,7 +208,7 @@ DenseBase<Derived>::transpose() const
   *
   * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType
+inline const typename MatrixBase<Derived>::AdjointReturnType
 MatrixBase<Derived>::adjoint() const
 {
   return AdjointReturnType(this->transpose());
@@ -239,10 +230,11 @@ struct inplace_transpose_selector;
 template<typename MatrixType>
 struct inplace_transpose_selector<MatrixType,true,false> { // square matrix
   static void run(MatrixType& m) {
-    m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
+    m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
   }
 };
 
+// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only.
 template<typename MatrixType>
 struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x PacketSize
   static void run(MatrixType& m) {
@@ -259,66 +251,16 @@ struct inplace_transpose_selector<MatrixType,true,true> { // PacketSize x Packet
   }
 };
 
-
-template <typename MatrixType, Index Alignment>
-void BlockedInPlaceTranspose(MatrixType& m) {
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
-  const Index PacketSize = internal::packet_traits<Scalar>::size;
-  eigen_assert(m.rows() == m.cols());
-  int row_start = 0;
-  for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) {
-    for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) {
-      PacketBlock<Packet> A;
-      if (row_start == col_start) {
-        for (Index i=0; i<PacketSize; ++i)
-          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
-        internal::ptranspose(A);
-        for (Index i=0; i<PacketSize; ++i)
-          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]);
-      } else {
-        PacketBlock<Packet> B;
-        for (Index i=0; i<PacketSize; ++i) {
-          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i,col_start);
-          B.packet[i] = m.template packetByOuterInner<Alignment>(col_start + i, row_start);
-        }
-        internal::ptranspose(A);
-        internal::ptranspose(B);
-        for (Index i=0; i<PacketSize; ++i) {
-          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), B.packet[i]);
-          m.template writePacket<Alignment>(m.rowIndexByOuterInner(col_start + i, row_start), m.colIndexByOuterInner(col_start + i,row_start), A.packet[i]);
-        }
-      }
-    }
-  }
-  for (Index row = row_start; row < m.rows(); ++row) {
-    m.matrix().row(row).head(row).swap(
-        m.matrix().col(row).head(row).transpose());
-  }
-}
-
 template<typename MatrixType,bool MatchPacketSize>
-struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square or dynamic matrix
+struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non square matrix
   static void run(MatrixType& m) {
-    typedef typename MatrixType::Scalar Scalar;
-    if (m.rows() == m.cols()) {
-      const Index PacketSize = internal::packet_traits<Scalar>::size;
-      if (!NumTraits<Scalar>::IsComplex && m.rows() >= PacketSize) {
-        if ((m.rows() % PacketSize) == 0)
-          BlockedInPlaceTranspose<MatrixType,internal::evaluator<MatrixType>::Alignment>(m);
-        else
-          BlockedInPlaceTranspose<MatrixType,Unaligned>(m);
-      }
-      else {
-        m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose().template triangularView<StrictlyUpper>());
-      }
-    } else {
+    if (m.rows()==m.cols())
+      m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
+    else
       m = m.transpose().eval();
-    }
   }
 };
 
-
 } // end namespace internal
 
 /** This is the "in place" version of transpose(): it replaces \c *this by its own transpose.
@@ -341,7 +283,7 @@ struct inplace_transpose_selector<MatrixType,false,MatchPacketSize> { // non squ
   *
   * \sa transpose(), adjoint(), adjointInPlace() */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
+inline void DenseBase<Derived>::transposeInPlace()
 {
   eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
                && "transposeInPlace() called on a non-square non-resizable matrix");
@@ -372,7 +314,7 @@ EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace()
   *
   * \sa transpose(), adjoint(), transposeInPlace() */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace()
+inline void MatrixBase<Derived>::adjointInPlace()
 {
   derived() = adjoint().eval();
 }
@@ -451,8 +393,7 @@ struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
 template<typename Dst, typename Src>
 void check_for_aliasing(const Dst &dst, const Src &src)
 {
-  if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1)
-    internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
+  internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
 }
 
 } // end namespace internal
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Transpositions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Transpositions.h
index f6d02f7d8..86da5af59 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Transpositions.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Transpositions.h
@@ -33,6 +33,17 @@ class TranspositionsBase
       indices() = other.indices();
       return derived();
     }
+    
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    Derived& operator=(const TranspositionsBase& other)
+    {
+      indices() = other.indices();
+      return derived();
+    }
+    #endif
 
     /** \returns the number of transpositions */
     Index size() const { return indices().size(); }
@@ -73,7 +84,7 @@ class TranspositionsBase
     }
 
     // FIXME: do we want such methods ?
-    // might be useful when the target matrix expression is complex, e.g.:
+    // might be useful when the target matrix expression is complex, e.g.:
     // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
     /*
     template<typename MatrixType>
@@ -160,6 +171,12 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
     inline Transpositions(const TranspositionsBase<OtherDerived>& other)
       : m_indices(other.indices()) {}
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** Standard copy constructor. Defined only to prevent a default copy constructor
+      * from hiding the other templated constructor */
+    inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
+    #endif
+
     /** Generic constructor from expression of the transposition indices. */
     template<typename Other>
     explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices)
@@ -172,6 +189,17 @@ class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTim
       return Base::operator=(other);
     }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    Transpositions& operator=(const Transpositions& other)
+    {
+      m_indices = other.m_indices;
+      return *this;
+    }
+    #endif
+
     /** Constructs an uninitialized permutation matrix of given size.
       */
     inline Transpositions(Index size) : m_indices(size)
@@ -278,6 +306,17 @@ class TranspositionsWrapper
       return Base::operator=(other);
     }
 
+    #ifndef EIGEN_PARSED_BY_DOXYGEN
+    /** This is a special case of the templated operator=. Its purpose is to
+      * prevent a default operator= from hiding the templated operator=.
+      */
+    TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
+    {
+      m_indices = other.m_indices;
+      return *this;
+    }
+    #endif
+
     /** const version of indices(). */
     const IndicesType& indices() const { return m_indices; }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/TriangularMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/TriangularMatrix.h
index 5e2f2de1b..9abb7e31a 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/TriangularMatrix.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/TriangularMatrix.h
@@ -65,7 +65,6 @@ template<typename Derived> class TriangularBase : public EigenBase<Derived>
     inline Index innerStride() const { return derived().innerStride(); }
     
     // dummy resize function
-    EIGEN_DEVICE_FUNC
     void resize(Index rows, Index cols)
     {
       EIGEN_UNUSED_VARIABLE(rows);
@@ -198,7 +197,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
     typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
 
     typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
-    typedef TriangularView<typename internal::add_const<MatrixType>::type, _Mode> ConstTriangularView;
     
   public:
 
@@ -242,18 +240,6 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularView
     inline const ConjugateReturnType conjugate() const
     { return ConjugateReturnType(m_matrix.conjugate()); }
 
-    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
-     *           returns \c *this otherwise.
-     */
-    template<bool Cond>
-    EIGEN_DEVICE_FUNC
-    inline typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type
-    conjugateIf() const
-    {
-      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstTriangularView>::type ReturnType;
-      return ReturnType(m_matrix.template conjugateIf<Cond>());
-    }
-
     typedef TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> AdjointReturnType;
     /** \sa MatrixBase::adjoint() const */
     EIGEN_DEVICE_FUNC
@@ -447,14 +433,14 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
     TriangularViewType& operator=(const TriangularViewImpl& other)
     { return *this = other.derived().nestedExpression(); }
 
-    template<typename OtherDerived>
     /** \deprecated */
-    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void lazyAssign(const TriangularBase<OtherDerived>& other);
 
-    template<typename OtherDerived>
     /** \deprecated */
-    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
+    template<typename OtherDerived>
+    EIGEN_DEVICE_FUNC
     void lazyAssign(const MatrixBase<OtherDerived>& other);
 #endif
 
@@ -482,7 +468,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
       * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
       * \a Side==OnTheRight.
       *
-      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
+      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
       *
       * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
       * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
@@ -500,6 +486,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
       * \sa TriangularView::solveInPlace()
       */
     template<int Side, typename Other>
+    EIGEN_DEVICE_FUNC
     inline const internal::triangular_solve_retval<Side,TriangularViewType, Other>
     solve(const MatrixBase<Other>& other) const;
 
@@ -508,7 +495,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
       * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
       * This function will const_cast it, so constness isn't honored here.
       *
-      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
+      * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
       *
       * See TriangularView:solve() for the details.
       */
@@ -534,10 +521,10 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
       call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
     }
 
-    /** Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
+    /** \deprecated
+      * Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
     template<typename OtherDerived>
-    /** \deprecated */
-    EIGEN_DEPRECATED EIGEN_DEVICE_FUNC
+    EIGEN_DEVICE_FUNC
     void swap(MatrixBase<OtherDerived> const & other)
     {
       EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
@@ -569,7 +556,7 @@ template<typename _MatrixType, unsigned int _Mode> class TriangularViewImpl<_Mat
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
+inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDerived>& other)
 {
   internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op<Scalar,typename OtherDerived::Scalar>());
@@ -579,7 +566,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const MatrixBase<OtherDer
 // FIXME should we keep that possibility
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
+void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other)
 {
   internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
 }
@@ -588,7 +575,7 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(c
 
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>&
+inline TriangularView<MatrixType, Mode>&
 TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<OtherDerived>& other)
 {
   eigen_assert(Mode == int(OtherDerived::Mode));
@@ -598,7 +585,7 @@ TriangularViewImpl<MatrixType, Mode, Dense>::operator=(const TriangularBase<Othe
 
 template<typename MatrixType, unsigned int Mode>
 template<typename OtherDerived>
-EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
+void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const TriangularBase<OtherDerived>& other)
 {
   eigen_assert(Mode == int(OtherDerived::Mode));
   internal::call_assignment_no_alias(derived(), other.derived());
@@ -613,7 +600,7 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(c
   * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
+void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
 {
   evalToLazy(other.derived());
 }
@@ -639,7 +626,6 @@ EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived>
   */
 template<typename Derived>
 template<unsigned int Mode>
-EIGEN_DEVICE_FUNC
 typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView()
 {
@@ -649,7 +635,6 @@ MatrixBase<Derived>::triangularView()
 /** This is the const version of MatrixBase::triangularView() */
 template<typename Derived>
 template<unsigned int Mode>
-EIGEN_DEVICE_FUNC
 typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
 MatrixBase<Derived>::triangularView() const
 {
@@ -732,7 +717,6 @@ struct unary_evaluator<TriangularView<MatrixType,Mode>, IndexBased>
 {
   typedef TriangularView<MatrixType,Mode> XprType;
   typedef evaluator<typename internal::remove_all<MatrixType>::type> Base;
-  EIGEN_DEVICE_FUNC
   unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {}
 };
 
@@ -948,7 +932,7 @@ struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite>
   * If the matrix is triangular, the opposite part is set to zero. */
 template<typename Derived>
 template<typename DenseDerived>
-EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
+void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
 {
   other.derived().resize(this->rows(), this->cols());
   internal::call_triangular_assignment_loop<Derived::Mode,(Derived::Mode&SelfAdjoint)==0 /* SetOpposite */>(other.derived(), derived().nestedExpression());
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/VectorBlock.h b/uppsrc/plugin/Eigen/Eigen/src/Core/VectorBlock.h
index 71c5b95ee..d72fbf7e9 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/VectorBlock.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/VectorBlock.h
@@ -35,7 +35,7 @@ struct traits<VectorBlock<VectorType, Size> >
   * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
   * most of the time this is the only way it is used.
   *
-  * However, if you want to directly manipulate sub-vector expressions,
+  * However, if you want to directly manipulate sub-vector expressions,
   * for instance if you want to write a function returning such an expression, you
   * will need to use this class.
   *
@@ -71,8 +71,8 @@ template<typename VectorType, int Size> class VectorBlock
 
     /** Dynamic-size constructor
       */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    VectorBlock(VectorType& vector, Index start, Index size)
+    EIGEN_DEVICE_FUNC
+    inline VectorBlock(VectorType& vector, Index start, Index size)
       : Base(vector,
              IsColVector ? start : 0, IsColVector ? 0 : start,
              IsColVector ? size  : 1, IsColVector ? 1 : size)
@@ -82,8 +82,8 @@ template<typename VectorType, int Size> class VectorBlock
 
     /** Fixed-size constructor
       */
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    VectorBlock(VectorType& vector, Index start)
+    EIGEN_DEVICE_FUNC
+    inline VectorBlock(VectorType& vector, Index start)
       : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/VectorwiseOp.h b/uppsrc/plugin/Eigen/Eigen/src/Core/VectorwiseOp.h
index 865691b32..4fe267e9f 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/VectorwiseOp.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/VectorwiseOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -81,46 +81,39 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr<Matri
     const MemberOp m_functor;
 };
 
-template<typename A,typename B> struct partial_redux_dummy_func;
-
-#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP)                \
-  template <typename ResultType,typename Scalar>                                                            \
-  struct member_##MEMBER {                                                                  \
-    EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER)                                                \
-    typedef ResultType result_type;                                                         \
-    typedef BINARYOP<Scalar,Scalar> BinaryOp;   \
-    template<int Size> struct Cost { enum { value = COST }; };             \
-    enum { Vectorizable = VECTORIZABLE };                                                   \
-    template<typename XprType>                                                              \
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                                                   \
-    ResultType operator()(const XprType& mat) const                                         \
-    { return mat.MEMBER(); }                                                                \
-    BinaryOp binaryFunc() const { return BinaryOp(); }                                      \
+#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST)                               \
+  template <typename ResultType>                                        \
+  struct member_##MEMBER {                                              \
+    EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER)                            \
+    typedef ResultType result_type;                                     \
+    template<typename Scalar, int Size> struct Cost                     \
+    { enum { value = COST }; };                                         \
+    template<typename XprType>                                          \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE                               \
+    ResultType operator()(const XprType& mat) const                     \
+    { return mat.MEMBER(); } \
   }
 
-#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \
-  EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,0,partial_redux_dummy_func)
-
 namespace internal {
 
+EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits<scalar_hypot_op<Scalar> >::Cost );
+EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost);
+EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
 EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
 
-EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_sum_op);
-EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_min_op);
-EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost, 1, internal::scalar_max_op);
-EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost, 1, internal::scalar_product_op);
-
-template <int p, typename ResultType,typename Scalar>
+template <int p, typename ResultType>
 struct member_lpnorm {
   typedef ResultType result_type;
-  enum { Vectorizable = 0 };
-  template<int Size> struct Cost
+  template<typename Scalar, int Size> struct Cost
   { enum { value = (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost }; };
   EIGEN_DEVICE_FUNC member_lpnorm() {}
   template<typename XprType>
@@ -128,20 +121,17 @@ struct member_lpnorm {
   { return mat.template lpNorm<p>(); }
 };
 
-template <typename BinaryOpT, typename Scalar>
+template <typename BinaryOp, typename Scalar>
 struct member_redux {
-  typedef BinaryOpT BinaryOp;
   typedef typename result_of<
                      BinaryOp(const Scalar&,const Scalar&)
                    >::type  result_type;
-  
-  enum { Vectorizable = functor_traits<BinaryOp>::PacketAccess };
-  template<int Size> struct Cost { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
+  template<typename _Scalar, int Size> struct Cost
+  { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
   EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
   template<typename Derived>
   EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase<Derived>& mat) const
   { return mat.redux(m_functor); }
-  const BinaryOp& binaryFunc() const { return m_functor; }
   const BinaryOp m_functor;
 };
 }
@@ -149,38 +139,18 @@ struct member_redux {
 /** \class VectorwiseOp
   * \ingroup Core_Module
   *
-  * \brief Pseudo expression providing broadcasting and partial reduction operations
+  * \brief Pseudo expression providing partial reduction operations
   *
   * \tparam ExpressionType the type of the object on which to do partial reductions
-  * \tparam Direction indicates whether to operate on columns (#Vertical) or rows (#Horizontal)
+  * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal)
   *
-  * This class represents a pseudo expression with broadcasting and partial reduction features.
+  * This class represents a pseudo expression with partial reduction features.
   * It is the return type of DenseBase::colwise() and DenseBase::rowwise()
-  * and most of the time this is the only way it is explicitly used.
+  * and most of the time this is the only way it is used.
   *
-  * To understand the logic of rowwise/colwise expression, let's consider a generic case `A.colwise().foo()`
-  * where `foo` is any method of `VectorwiseOp`. This expression is equivalent to applying `foo()` to each
-  * column of `A` and then re-assemble the outputs in a matrix expression:
-  * \code [A.col(0).foo(), A.col(1).foo(), ..., A.col(A.cols()-1).foo()] \endcode
-  * 
   * Example: \include MatrixBase_colwise.cpp
   * Output: \verbinclude MatrixBase_colwise.out
   *
-  * The begin() and end() methods are obviously exceptions to the previous rule as they
-  * return STL-compatible begin/end iterators to the rows or columns of the nested expression.
-  * Typical use cases include for-range-loop and calls to STL algorithms:
-  * 
-  * Example: \include MatrixBase_colwise_iterator_cxx11.cpp
-  * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out
-  * 
-  * For a partial reduction on an empty input, some rules apply.
-  * For the sake of clarity, let's consider a vertical reduction:
-  *   - If the number of columns is zero, then a 1x0 row-major vector expression is returned.
-  *   - Otherwise, if the number of rows is zero, then
-  *       - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.)
-  *       - a row vector of ones is returned for a product reduction (e.g., <code>MatrixXd(n,0).colwise().prod()</code>)
-  *       - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op))
-  * 
   * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
   */
 template<typename ExpressionType, int Direction> class VectorwiseOp
@@ -193,11 +163,11 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     typedef typename internal::ref_selector<ExpressionType>::non_const_type ExpressionTypeNested;
     typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
 
-    template<template<typename OutScalar,typename InputScalar> class Functor,
-                      typename ReturnScalar=Scalar> struct ReturnType
+    template<template<typename _Scalar> class Functor,
+                      typename Scalar_=Scalar> struct ReturnType
     {
       typedef PartialReduxExpr<ExpressionType,
-                               Functor<ReturnScalar,Scalar>,
+                               Functor<Scalar_>,
                                Direction
                               > Type;
     };
@@ -216,7 +186,24 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     };
 
   protected:
-  
+
+    typedef typename internal::conditional<isVertical,
+                               typename ExpressionType::ColXpr,
+                               typename ExpressionType::RowXpr>::type SubVector;
+    /** \internal
+      * \returns the i-th subvector according to the \c Direction */
+    EIGEN_DEVICE_FUNC
+    SubVector subVector(Index i)
+    {
+      return SubVector(m_matrix.derived(),i);
+    }
+
+    /** \internal
+      * \returns the number of subvectors in the direction \c Direction */
+    EIGEN_DEVICE_FUNC
+    Index subVectors() const
+    { return isVertical?m_matrix.cols():m_matrix.rows(); }
+
     template<typename OtherDerived> struct ExtendedType {
       typedef Replicate<OtherDerived,
                         isVertical   ? 1 : ExpressionType::RowsAtCompileTime,
@@ -271,81 +258,42 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     EIGEN_DEVICE_FUNC
     inline const ExpressionType& _expression() const { return m_matrix; }
 
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
-    /** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
-      * iterator type over the columns or rows as returned by the begin() and end() methods.
-      */
-    random_access_iterator_type iterator;
-    /** This is the const version of iterator (aka read-only) */
-    random_access_iterator_type const_iterator;
-    #else
-    typedef internal::subvector_stl_iterator<ExpressionType,       DirectionType(Direction)> iterator;
-    typedef internal::subvector_stl_iterator<const ExpressionType, DirectionType(Direction)> const_iterator;
-    #endif
-
-    /** returns an iterator to the first row (rowwise) or column (colwise) of the nested expression.
-      * \sa end(), cbegin()
-      */
-    iterator        begin()       { return iterator      (m_matrix, 0); }
-    /** const version of begin() */
-    const_iterator  begin() const { return const_iterator(m_matrix, 0); }
-    /** const version of begin() */
-    const_iterator cbegin() const { return const_iterator(m_matrix, 0); }
-
-    /** returns an iterator to the row (resp. column) following the last row (resp. column) of the nested expression
-      * \sa begin(), cend()
-      */
-    iterator        end()         { return iterator      (m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
-    /** const version of end() */
-    const_iterator  end()   const { return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
-    /** const version of end() */
-    const_iterator cend()   const { return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
-
     /** \returns a row or column vector expression of \c *this reduxed by \a func
       *
       * The template parameter \a BinaryOp is the type of the functor
       * of the custom redux operator. Note that func must be an associative operator.
       *
-      * \warning the size along the reduction direction must be strictly positive,
-      *          otherwise an assertion is triggered.
-      * 
       * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
       */
     template<typename BinaryOp>
     EIGEN_DEVICE_FUNC
     const typename ReduxReturnType<BinaryOp>::Type
     redux(const BinaryOp& func = BinaryOp()) const
-    {
-      eigen_assert(redux_length()>0 && "you are using an empty matrix");
-      return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func));
-    }
+    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp,Scalar>(func)); }
 
     typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
     typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
-    typedef PartialReduxExpr<const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const ExpressionTypeNestedCleaned>,internal::member_sum<RealScalar,RealScalar>,Direction> SquaredNormReturnType;
-    typedef CwiseUnaryOp<internal::scalar_sqrt_op<RealScalar>, const SquaredNormReturnType> NormReturnType;
+    typedef typename ReturnType<internal::member_squaredNorm,RealScalar>::Type SquaredNormReturnType;
+    typedef typename ReturnType<internal::member_norm,RealScalar>::Type NormReturnType;
     typedef typename ReturnType<internal::member_blueNorm,RealScalar>::Type BlueNormReturnType;
     typedef typename ReturnType<internal::member_stableNorm,RealScalar>::Type StableNormReturnType;
     typedef typename ReturnType<internal::member_hypotNorm,RealScalar>::Type HypotNormReturnType;
     typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
-    typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType;
+    typedef typename ReturnType<internal::member_mean>::Type MeanReturnType;
     typedef typename ReturnType<internal::member_all>::Type AllReturnType;
     typedef typename ReturnType<internal::member_any>::Type AnyReturnType;
-    typedef PartialReduxExpr<ExpressionType, internal::member_count<Index,Scalar>, Direction> CountReturnType;
+    typedef PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> CountReturnType;
     typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
     typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
     typedef Reverse<ExpressionType, Direction> ReverseReturnType;
 
     template<int p> struct LpNormReturnType {
-      typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar,Scalar>,Direction> Type;
+      typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p,RealScalar>,Direction> Type;
     };
 
     /** \returns a row (or column) vector expression of the smallest coefficient
       * of each column (or row) of the referenced expression.
       *
-      * \warning the size along the reduction direction must be strictly positive,
-      *          otherwise an assertion is triggered.
-      * 
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_minCoeff.cpp
@@ -354,17 +302,11 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa DenseBase::minCoeff() */
     EIGEN_DEVICE_FUNC
     const MinCoeffReturnType minCoeff() const
-    {
-      eigen_assert(redux_length()>0 && "you are using an empty matrix");
-      return MinCoeffReturnType(_expression());
-    }
+    { return MinCoeffReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the largest coefficient
       * of each column (or row) of the referenced expression.
       *
-      * \warning the size along the reduction direction must be strictly positive,
-      *          otherwise an assertion is triggered.
-      * 
       * \warning the result is undefined if \c *this contains NaN.
       *
       * Example: \include PartialRedux_maxCoeff.cpp
@@ -373,10 +315,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa DenseBase::maxCoeff() */
     EIGEN_DEVICE_FUNC
     const MaxCoeffReturnType maxCoeff() const
-    {
-      eigen_assert(redux_length()>0 && "you are using an empty matrix");
-      return MaxCoeffReturnType(_expression());
-    }
+    { return MaxCoeffReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the squared norm
       * of each column (or row) of the referenced expression.
@@ -388,7 +327,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa DenseBase::squaredNorm() */
     EIGEN_DEVICE_FUNC
     const SquaredNormReturnType squaredNorm() const
-    { return SquaredNormReturnType(m_matrix.cwiseAbs2()); }
+    { return SquaredNormReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression.
@@ -400,7 +339,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       * \sa DenseBase::norm() */
     EIGEN_DEVICE_FUNC
     const NormReturnType norm() const
-    { return NormReturnType(squaredNorm()); }
+    { return NormReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression of the norm
       * of each column (or row) of the referenced expression.
@@ -465,7 +404,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     * \sa DenseBase::mean() */
     EIGEN_DEVICE_FUNC
     const MeanReturnType mean() const
-    { return sum() / Scalar(Direction==Vertical?m_matrix.rows():m_matrix.cols()); }
+    { return MeanReturnType(_expression()); }
 
     /** \returns a row (or column) vector expression representing
       * whether \b all coefficients of each respective column (or row) are \c true.
@@ -561,7 +500,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
-      return m_matrix = extendedTo(other.derived());
+      return const_cast<ExpressionType&>(m_matrix = extendedTo(other.derived()));
     }
 
     /** Adds the vector \a other to each subvector of \c *this */
@@ -571,7 +510,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return m_matrix += extendedTo(other.derived());
+      return const_cast<ExpressionType&>(m_matrix += extendedTo(other.derived()));
     }
 
     /** Substracts the vector \a other to each subvector of \c *this */
@@ -581,7 +520,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     {
       EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return m_matrix -= extendedTo(other.derived());
+      return const_cast<ExpressionType&>(m_matrix -= extendedTo(other.derived()));
     }
 
     /** Multiples each subvector of \c *this by the vector \a other */
@@ -593,7 +532,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       m_matrix *= extendedTo(other.derived());
-      return m_matrix;
+      return const_cast<ExpressionType&>(m_matrix);
     }
 
     /** Divides each subvector of \c *this by the vector \a other */
@@ -605,7 +544,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
       EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
       EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
       m_matrix /= extendedTo(other.derived());
-      return m_matrix;
+      return const_cast<ExpressionType&>(m_matrix);
     }
 
     /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
@@ -670,7 +609,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     EIGEN_DEVICE_FUNC
     CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
                   const ExpressionTypeNestedCleaned,
-                  const typename OppositeExtendedType<NormReturnType>::Type>
+                  const typename OppositeExtendedType<typename ReturnType<internal::member_norm,RealScalar>::Type>::Type>
     normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); }
 
 
@@ -719,15 +658,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
     EIGEN_DEVICE_FUNC
     const HNormalizedReturnType hnormalized() const;
 
-#   ifdef EIGEN_VECTORWISEOP_PLUGIN
-#     include EIGEN_VECTORWISEOP_PLUGIN
-#   endif
-
   protected:
-    Index redux_length() const
-    {
-      return Direction==Vertical ? m_matrix.rows() : m_matrix.cols();
-    }
     ExpressionTypeNested m_matrix;
 };
 
@@ -739,7 +670,7 @@ template<typename ExpressionType, int Direction> class VectorwiseOp
   * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType
+inline typename DenseBase<Derived>::ColwiseReturnType
 DenseBase<Derived>::colwise()
 {
   return ColwiseReturnType(derived());
@@ -753,7 +684,7 @@ DenseBase<Derived>::colwise()
   * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType
+inline typename DenseBase<Derived>::RowwiseReturnType
 DenseBase<Derived>::rowwise()
 {
   return RowwiseReturnType(derived());
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Visitor.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Visitor.h
index 67a69c54f..54c1883d9 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/Visitor.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Visitor.h
@@ -40,14 +40,6 @@ struct visitor_impl<Visitor, Derived, 1>
   }
 };
 
-// This specialization enables visitors on empty matrices at compile-time
-template<typename Visitor, typename Derived>
-struct visitor_impl<Visitor, Derived, 0> {
-  EIGEN_DEVICE_FUNC
-  static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/)
-  {}
-};
-
 template<typename Visitor, typename Derived>
 struct visitor_impl<Visitor, Derived, Dynamic>
 {
@@ -106,8 +98,6 @@ protected:
   *
   * \note compared to one or two \em for \em loops, visitors offer automatic
   * unrolling for small fixed size matrix.
-  * 
-  * \note if the matrix is empty, then the visitor is left unchanged.
   *
   * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
   */
@@ -116,9 +106,6 @@ template<typename Visitor>
 EIGEN_DEVICE_FUNC
 void DenseBase<Derived>::visit(Visitor& visitor) const
 {
-  if(size()==0)
-    return;
-  
   typedef typename internal::visitor_evaluator<Derived> ThisEvaluator;
   ThisEvaluator thisEval(derived());
   
@@ -137,9 +124,6 @@ namespace internal {
 template <typename Derived>
 struct coeff_visitor
 {
-  // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc
-  EIGEN_DEVICE_FUNC
-  coeff_visitor() : row(-1), col(-1), res(0) {}
   typedef typename Derived::Scalar Scalar;
   Index row, col;
   Scalar res;
@@ -212,9 +196,6 @@ struct functor_traits<max_coeff_visitor<Scalar> > {
 
 /** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
   * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
-  * 
-  * \warning the matrix must be not empty, otherwise an assertion is triggered.
-  * 
   * \warning the result is undefined if \c *this contains NaN.
   *
   * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
@@ -225,8 +206,6 @@ EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 {
-  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
-
   internal::min_coeff_visitor<Derived> minVisitor;
   this->visit(minVisitor);
   *rowId = minVisitor.row;
@@ -235,9 +214,6 @@ DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
 }
 
 /** \returns the minimum of all coefficients of *this and puts in *index its location.
-  * 
-  * \warning the matrix must be not empty, otherwise an assertion is triggered.
-  * 
   * \warning the result is undefined if \c *this contains NaN. 
   *
   * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff()
@@ -248,8 +224,6 @@ EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::minCoeff(IndexType* index) const
 {
-  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
-
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   internal::min_coeff_visitor<Derived> minVisitor;
   this->visit(minVisitor);
@@ -259,9 +233,6 @@ DenseBase<Derived>::minCoeff(IndexType* index) const
 
 /** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
   * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
-  * 
-  * \warning the matrix must be not empty, otherwise an assertion is triggered.
-  * 
   * \warning the result is undefined if \c *this contains NaN. 
   *
   * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
@@ -272,8 +243,6 @@ EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 {
-  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
-
   internal::max_coeff_visitor<Derived> maxVisitor;
   this->visit(maxVisitor);
   *rowPtr = maxVisitor.row;
@@ -282,9 +251,6 @@ DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
 }
 
 /** \returns the maximum of all coefficients of *this and puts in *index its location.
-  * 
-  * \warning the matrix must be not empty, otherwise an assertion is triggered.
-  *
   * \warning the result is undefined if \c *this contains NaN.
   *
   * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
@@ -295,8 +261,6 @@ EIGEN_DEVICE_FUNC
 typename internal::traits<Derived>::Scalar
 DenseBase<Derived>::maxCoeff(IndexType* index) const
 {
-  eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix");
-
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   internal::max_coeff_visitor<Derived> maxVisitor;
   this->visit(maxVisitor);
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/Complex.h
index c2d5205f2..7fa61969d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/Complex.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/Complex.h
@@ -22,7 +22,6 @@ struct Packet4cf
   __m256  v;
 };
 
-#ifndef EIGEN_VECTORIZE_AVX512
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet4cf type;
@@ -42,13 +41,11 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasAbs2   = 0,
     HasMin    = 0,
     HasMax    = 0,
-    HasSetLinear = 0,
-    HasInsert = 1
+    HasSetLinear = 0
   };
 };
-#endif
 
-template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet4cf> { typedef std::complex<float> type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; };
 
 template<> EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); }
@@ -70,18 +67,10 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul<Packet4cf>(const Packet4cf& a, con
   return Packet4cf(result);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
-  __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
-  return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet4cf pnot<Packet4cf>(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet4cf pand   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf por    <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet4cf pxor   <Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); }
+template<> EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); }
 
 template<> EIGEN_STRONG_INLINE Packet4cf pload <Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload<Packet8f>(&numext::real_ref(*from))); }
 template<> EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu<Packet8f>(&numext::real_ref(*from))); }
@@ -151,12 +140,37 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packe
                      Packet2cf(_mm256_extractf128_ps(a.v,1))));
 }
 
+template<> EIGEN_STRONG_INLINE Packet4cf preduxp<Packet4cf>(const Packet4cf* vecs)
+{
+  Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t0 = _mm256_hadd_ps(t0,t1);
+  Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0));
+  t2 = _mm256_hadd_ps(t2,t3);
+  
+  t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4));
+  t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4));
+
+  return Packet4cf(_mm256_add_ps(t1,t3));
+}
+
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a)
 {
   return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)),
                          Packet2cf(_mm256_extractf128_ps(a.v, 1))));
 }
 
+template<int Offset>
+struct palign_impl<Offset,Packet4cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second)
+  {
+    if (Offset==0) return;
+    palign_impl<Offset*2,Packet8f>::run(first.v, second.v);
+  }
+};
+
 template<> struct conj_helper<Packet4cf, Packet4cf, false,true>
 {
   EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const
@@ -214,7 +228,6 @@ struct Packet2cd
   __m256d  v;
 };
 
-#ifndef EIGEN_VECTORIZE_AVX512
 template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
   typedef Packet2cd type;
@@ -237,9 +250,8 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
     HasSetLinear = 0
   };
 };
-#endif
 
-template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cd> { typedef std::complex<double> type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; };
 
 template<> EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); }
@@ -260,18 +272,10 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul<Packet2cd>(const Packet2cd& a, con
   return Packet2cd(_mm256_addsub_pd(even, odd));
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
-  __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
-  return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cd pnot<Packet2cd>(const Packet2cd& a) { return Packet2cd(pnot(Packet4d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet2cd pand   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd por    <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cd pxor   <Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); }
 
 template<> EIGEN_STRONG_INLINE Packet2cd pload <Packet2cd>(const std::complex<double>* from)
 { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload<Packet4d>((const double*)from)); }
@@ -323,12 +327,30 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Pack
                      Packet1cd(_mm256_extractf128_pd(a.v,1))));
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cd preduxp<Packet2cd>(const Packet2cd* vecs)
+{
+  Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4));
+  Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4));
+
+  return Packet2cd(_mm256_add_pd(t0,t1));
+}
+
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a)
 {
   return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)),
                      Packet1cd(_mm256_extractf128_pd(a.v,1))));
 }
 
+template<int Offset>
+struct palign_impl<Offset,Packet2cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second)
+  {
+    if (Offset==0) return;
+    palign_impl<Offset*2,Packet4d>::run(first.v, second.v);
+  }
+};
+
 template<> struct conj_helper<Packet2cd, Packet2cd, false,true>
 {
   EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const
@@ -402,6 +424,26 @@ ptranspose(PacketBlock<Packet2cd,2>& kernel) {
  kernel.packet[0].v = tmp;
 }
 
+template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex<float> b)
+{
+  return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,1|2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex<double> b)
+{
+  return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,1|2));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex<float> b)
+{
+  return Packet4cf(_mm256_blend_ps(a.v,pset1<Packet4cf>(b).v,(1<<7)|(1<<6)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex<double> b)
+{
+  return Packet2cd(_mm256_blend_pd(a.v,pset1<Packet2cd>(b).v,(1<<3)|(1<<2)));
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/MathFunctions.h
index c5394430f..6af67ce2d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/MathFunctions.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_MATH_FUNCTIONS_AVX_H
 #define EIGEN_MATH_FUNCTIONS_AVX_H
 
-/* The sin and cos functions of this file are loosely derived from
+/* The sin, cos, exp, and log functions of this file are loosely derived from
  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
  */
 
@@ -18,32 +18,187 @@ namespace Eigen {
 
 namespace internal {
 
+inline Packet8i pshiftleft(Packet8i v, int n)
+{
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_slli_epi32(v, n);
+#else
+  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n);
+  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+inline Packet8f pshiftright(Packet8f v, int n)
+{
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n));
+#else
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n);
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n);
+  return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1));
+#endif
+}
+
+// Sine function
+// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and
+// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants
+// are (anti-)symmetric and thus have only odd/even coefficients
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
 psin<Packet8f>(const Packet8f& _x) {
-  return psin_float(_x);
-}
-
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
-pcos<Packet8f>(const Packet8f& _x) {
-  return pcos_float(_x);
+  Packet8f x = _x;
+
+  // Some useful values.
+  _EIGEN_DECLARE_CONST_Packet8i(one, 1);
+  _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f);
+  _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f);
+  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f);
+  _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f);
+  _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f);
+
+  // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period.
+  Packet8f z = pmul(x, p8f_one_over_pi);
+  Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four));
+  x = pmadd(shift, p8f_neg_pi_first, x);
+  x = pmadd(shift, p8f_neg_pi_second, x);
+  x = pmadd(shift, p8f_neg_pi_third, x);
+  z = pmul(x, p8f_four_over_pi);
+
+  // Make a mask for the entries that need flipping, i.e. wherever the shift
+  // is odd.
+  Packet8i shift_ints = _mm256_cvtps_epi32(shift);
+  Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one)));
+  Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31);
+
+  // Create a mask for which interpolant to use, i.e. if z > 1, then the mask
+  // is set to ones for that entry.
+  Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ);
+
+  // Evaluate the polynomial for the interval [1,3] in z.
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f);
+  Packet8f z_minus_two = psub(z, p8f_two);
+  Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two);
+  Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4);
+  right = pmadd(right, z_minus_two2, p8f_coeff_right_2);
+  right = pmadd(right, z_minus_two2, p8f_coeff_right_0);
+
+  // Evaluate the polynomial for the interval [-1,1] in z.
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f);
+  _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f);
+  Packet8f z2 = pmul(z, z);
+  Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5);
+  left = pmadd(left, z2, p8f_coeff_left_3);
+  left = pmadd(left, z2, p8f_coeff_left_1);
+  left = pmul(left, z);
+
+  // Assemble the results, i.e. select the left and right polynomials.
+  left = _mm256_andnot_ps(ival_mask, left);
+  right = _mm256_and_ps(ival_mask, right);
+  Packet8f res = _mm256_or_ps(left, right);
+
+  // Flip the sign on the odd intervals and return the result.
+  res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask));
+  return res;
 }
 
+// Natural logarithm
+// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C = log(2)
+// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
+// be easily approximated by a polynomial centered on m=1 for stability.
+// TODO(gonnet): Further reduce the interval allowing for lower-degree
+//               polynomial interpolants -> ... -> profit!
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
 plog<Packet8f>(const Packet8f& _x) {
-  return plog_float(_x);
-}
+  Packet8f x = _x;
+  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f);
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet8f plog1p<Packet8f>(const Packet8f& _x) {
-  return generic_plog1p(_x);
-}
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000);
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet8f pexpm1<Packet8f>(const Packet8f& _x) {
-  return generic_expm1(_x);
+  // The smallest non denormalized float number.
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000);
+
+  // Polynomial coefficients.
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f);
+
+  Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN
+  Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ);
+
+  // Clamp input values from below to the minimum positive normal.
+  x = pmax(x, p8f_min_norm_pos);
+
+  Packet8f emm0 = pshiftright(x,23);
+  Packet8f e = _mm256_sub_ps(emm0, p8f_126f);
+
+  // Set the exponents to -1, i.e. x is now in the range [0.5,1).
+  x = _mm256_and_ps(x, p8f_inv_mant_mask);
+  x = _mm256_or_ps(x, p8f_half);
+
+  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+  // and shift by -1. The values are then centered around 0, which improves
+  // the stability of the polynomial evaluation.
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ);
+  Packet8f tmp = _mm256_and_ps(x, mask);
+  x = psub(x, p8f_1);
+  e = psub(e, _mm256_and_ps(p8f_1, mask));
+  x = padd(x, tmp);
+
+  Packet8f x2 = pmul(x, x);
+  Packet8f x3 = pmul(x2, x);
+
+  // Evaluate the polynomial approximant of degree 8 in three parts, probably
+  // to improve instruction-level parallelism.
+  Packet8f y, y1, y2;
+  y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1);
+  y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4);
+  y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7);
+  y = pmadd(y, x, p8f_cephes_log_p2);
+  y1 = pmadd(y1, x, p8f_cephes_log_p5);
+  y2 = pmadd(y2, x, p8f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  // Add the logarithm of the exponent back to the result of the interpolation.
+  y1 = pmul(e, p8f_cephes_log_q1);
+  tmp = pmul(x2, p8f_half);
+  y = padd(y, y1);
+  x = psub(x, tmp);
+  y2 = pmul(e, p8f_cephes_log_q2);
+  x = padd(x, y);
+  x = padd(x, y2);
+
+  // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF.
+  return _mm256_or_ps(
+      _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)),
+      _mm256_and_ps(iszero_mask, p8f_minus_inf));
 }
 
 // Exponential function. Works by writing "x = m*log(2) + r" where
@@ -52,7 +207,62 @@ Packet8f pexpm1<Packet8f>(const Packet8f& _x) {
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f
 pexp<Packet8f>(const Packet8f& _x) {
-  return pexp_float(_x);
+  _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f);
+  _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f);
+
+  _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f);
+  _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f);
+
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f);
+
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f);
+
+  // Clamp x.
+  Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo);
+
+  // Express exp(x) as exp(m*ln(2) + r), start by extracting
+  // m = floor(x/ln(2) + 0.5).
+  Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half));
+
+// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
+// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
+// truncation errors. Note that we don't use the "pmadd" function here to
+// ensure that a precision-preserving FMA instruction is used.
+#ifdef EIGEN_VECTORIZE_FMA
+  _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f);
+  Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x);
+#else
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f);
+  _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f);
+  Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1));
+  r = psub(r, pmul(m, p8f_cephes_exp_C2));
+#endif
+
+  Packet8f r2 = pmul(r, r);
+
+  // TODO(gonnet): Split into odd/even polynomials and try to exploit
+  //               instruction-level parallelism.
+  Packet8f y = p8f_cephes_exp_p0;
+  y = pmadd(y, r, p8f_cephes_exp_p1);
+  y = pmadd(y, r, p8f_cephes_exp_p2);
+  y = pmadd(y, r, p8f_cephes_exp_p3);
+  y = pmadd(y, r, p8f_cephes_exp_p4);
+  y = pmadd(y, r, p8f_cephes_exp_p5);
+  y = pmadd(y, r2, r);
+  y = padd(y, p8f_1);
+
+  // Build emm0 = 2^m.
+  Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127));
+  emm0 = pshiftleft(emm0, 23);
+
+  // Return 2^m * exp(r).
+  return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x);
 }
 
 // Hyperbolic Tangent function.
@@ -62,11 +272,84 @@ ptanh<Packet8f>(const Packet8f& x) {
   return internal::generic_fast_tanh_float(x);
 }
 
-// Exponential function for doubles.
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d
-pexp<Packet4d>(const Packet4d& x) {
-  return pexp_double(x);
+pexp<Packet4d>(const Packet4d& _x) {  // Packet exp(): clamp, reduce by multiples of log(2), rational approximation, scale by 2^n.
+  Packet4d x = _x;
+
+  _EIGEN_DECLARE_CONST_Packet4d(1, 1.0);
+  _EIGEN_DECLARE_CONST_Packet4d(2, 2.0);
+  _EIGEN_DECLARE_CONST_Packet4d(half, 0.5);
+
+  _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437);
+  _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0);
+
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125);
+  _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6);
+  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
+
+  Packet4d tmp, fx;
+
+  // clamp x
+  x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo);
+  // Express exp(x) as exp(g + n*log(2)).
+  fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half);
+
+  // Round down, giving n = floor(x/log(2) + 0.5), i.e. the "n" described above.
+  fx = _mm256_floor_pd(fx);
+
+  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+  // digits right.
+  tmp = pmul(fx, p4d_cephes_exp_C1);
+  Packet4d z = pmul(fx, p4d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet4d x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial of the rational interpolant.
+  Packet4d px = p4d_cephes_exp_p0;
+  px = pmadd(px, x2, p4d_cephes_exp_p1);
+  px = pmadd(px, x2, p4d_cephes_exp_p2);
+  px = pmul(px, x);
+
+  // Evaluate the denominator polynomial of the rational interpolant.
+  Packet4d qx = p4d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p4d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p4d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p4d_cephes_exp_q3);
+
+  // Combine the two polynomials (copied from the SSE2 routines): with
+  // px = g*P(g^2) and qx = Q(g^2), compute exp(g) = 1 + 2*px/(qx - px).
+  // TODO(gonnet): Perhaps find a better rational interpolant?
+  x = _mm256_div_pd(px, psub(qx, px));
+  x = pmadd(p4d_2, x, p4d_1);
+
+  // Build e=2^n by constructing the exponents in a 128-bit vector and
+  // shifting them to where they belong in double-precision values.
+  __m128i emm0 = _mm256_cvtpd_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, p4i_1023);
+  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
+  __m128i lo = _mm_slli_epi64(emm0, 52);
+  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
+  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
+  e = _mm256_insertf128_si256(e, hi, 1);
+
+  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+  // non-finite values in the input.
+  return pmax(pmul(x, _mm256_castsi256_pd(e)), _x);
 }
 
 // Functions for sqrt.
@@ -109,6 +392,7 @@ Packet4d psqrt<Packet4d>(const Packet4d& x) {
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
   _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000);
   _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f);
   _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f);
   _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000);
@@ -117,25 +401,20 @@ Packet8f prsqrt<Packet8f>(const Packet8f& _x) {
 
   // select only the inverse sqrt of positive normal inputs (denormals are
   // flushed to zero and cause infs as well).
-  Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
-  Packet8f inf_mask =  _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ);
-  Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask);
+  Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ);
+  Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x));
 
-  // Compute an approximate result using the rsqrt intrinsic.
-  Packet8f y_approx = _mm256_rsqrt_ps(_x);
+  // Fill in NaNs and Infs for the negative/zero entries.
+  Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ);
+  Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask);
+  Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan),
+                                        _mm256_and_ps(zero_mask, p8f_inf));
 
-  // Do a single step of Newton-Raphson iteration to improve the approximation.
-  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
-  // It is essential to evaluate the inner term like this because forming
-  // y_n^2 may over- or underflow.
-  Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five));
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five));
 
-  // Select the result of the Newton-Raphson step for positive normal arguments.
-  // For other arguments, choose the output of the intrinsic. This will
-  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
-  // x is zero or a positive denormalized float (equivalent to flushing positive
-  // denormalized inputs to zero).
-  return pselect<Packet8f>(not_normal_finite_mask, y_approx, y_newton);
+  // Insert NaNs and Infs in all the right places.
+  return _mm256_or_ps(x, infs_and_nans);
 }
 
 #else
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/PacketMath.h
index 35a329e3f..923a124b2 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/PacketMath.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -18,11 +18,11 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
-#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
 #endif
 
-#ifdef EIGEN_VECTORIZE_FMA
+#ifdef __FMA__
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
@@ -31,12 +31,10 @@ namespace internal {
 typedef __m256  Packet8f;
 typedef __m256i Packet8i;
 typedef __m256d Packet4d;
-typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
 
 template<> struct is_arithmetic<__m256>  { enum { value = true }; };
 template<> struct is_arithmetic<__m256i> { enum { value = true }; };
 template<> struct is_arithmetic<__m256d> { enum { value = true }; };
-template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
 
 #define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \
   const Packet8f p8f_##NAME = pset1<Packet8f>(X)
@@ -60,28 +58,21 @@ template<> struct packet_traits<float>  : default_packet_traits
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size = 8,
+    size=8,
     HasHalfPacket = 1,
-    HasInsert = 1,
 
-    HasDiv = 1,
-    HasSin = EIGEN_FAST_MATH,
-    HasCos = EIGEN_FAST_MATH,
-    HasLog = 1,
-    HasLog1p = 1,
-    HasExpm1 = 1,
-    HasExp = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
+    HasDiv  = 1,
+    HasSin  = EIGEN_FAST_MATH,
+    HasCos  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasTanh = EIGEN_FAST_MATH,
-    HasErf = EIGEN_FAST_MATH,
+    HasTanh  = EIGEN_FAST_MATH,
     HasBlend = 1,
     HasRound = 1,
     HasFloor = 1,
-    HasCeil = 1,
-    HasRint = 1
+    HasCeil = 1
   };
 };
 template<> struct packet_traits<double> : default_packet_traits
@@ -93,7 +84,6 @@ template<> struct packet_traits<double> : default_packet_traits
     AlignedOnScalar = 1,
     size=4,
     HasHalfPacket = 1,
-    HasInsert = 1,
 
     HasDiv  = 1,
     HasExp  = 1,
@@ -105,36 +95,6 @@ template<> struct packet_traits<double> : default_packet_traits
     HasCeil = 1
   };
 };
-
-template <>
-struct packet_traits<Eigen::half> : default_packet_traits {
-  typedef Packet8h type;
-  // There is no half-size packet for Packet8h.
-  typedef Packet8h half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-    HasHalfPacket = 0,
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasConj   = 0,
-    HasSetLinear = 0,
-    HasSqrt = 0,
-    HasRsqrt = 0,
-    HasExp = 0,
-    HasLog = 0,
-    HasBlend = 0,
-    HasInsert = 1
-  };
-};
 #endif
 
 template<> struct scalar_div_cost<float,true> { enum { value = 14 }; };
@@ -153,30 +113,14 @@ template<> struct packet_traits<int>    : default_packet_traits
 };
 */
 
-template<> struct unpacket_traits<Packet8f> {
-  typedef float     type;
-  typedef Packet4f  half;
-  typedef Packet8i  integer_packet;
-  typedef uint8_t   mask_t;
-  enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true};
-};
-template<> struct unpacket_traits<Packet4d> {
-  typedef double type;
-  typedef Packet2d half;
-  enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
-};
-template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false, masked_store_available=false}; };
+template<> struct unpacket_traits<Packet8f> { typedef float  type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet4d> { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; };
+template<> struct unpacket_traits<Packet8i> { typedef int    type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; };
 
 template<> EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float&  from) { return _mm256_set1_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) { return _mm256_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int&    from) { return _mm256_set1_epi32(from); }
 
-template<> EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) { return _mm256_castsi256_ps(pset1<Packet8i>(from)); }
-
-template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); }
-template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); }
-template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); }
-
 template<> EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float*  from) { return _mm256_broadcast_ss(from); }
 template<> EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) { return _mm256_broadcast_sd(from); }
 
@@ -185,15 +129,6 @@ template<> EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) { retur
 
 template<> EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_add_epi32(a,b);
-#else
-  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
-  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
-  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-#endif
-}
 
 template<> EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); }
@@ -222,7 +157,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& /*a*/, co
   return pset1<Packet8i>(0);
 }
 
-#ifdef EIGEN_VECTORIZE_FMA
+#ifdef __FMA__
 template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
 #if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) )
   // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers,
@@ -249,77 +184,14 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d&
 }
 #endif
 
-template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
-  // There appears to be a bug in GCC, by which the optimizer may flip
-  // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
-  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
-  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
-  Packet8f res;
-  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
-  return res;
-#else
-  // Arguments are swapped to match NaN propagation behavior of std::min.
-  return _mm256_min_ps(b,a);
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
-  // See pmin above
-  Packet4d res;
-  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
-  return res;
-#else
-  // Arguments are swapped to match NaN propagation behavior of std::min.
-  return _mm256_min_pd(b,a);
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
-  // See pmin above
-  Packet8f res;
-  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
-  return res;
-#else
-  // Arguments are swapped to match NaN propagation behavior of std::max.
-  return _mm256_max_ps(b,a);
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
-  // See pmin above
-  Packet4d res;
-  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
-  return res;
-#else
-  // Arguments are swapped to match NaN propagation behavior of std::max.
-  return _mm256_max_pd(b,a);
-#endif
-}
+template<> EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } // vminps: if either element is NaN, b's element is returned
+template<> EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } // vminpd: if either element is NaN, b's element is returned
 
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); }
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); }
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); }
-template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); }
+template<> EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } // vmaxps: if either element is NaN, b's element is returned
+template<> EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } // vmaxpd: if either element is NaN, b's element is returned
 
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LE_OQ); }
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LT_OQ); }
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); }
-template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); }
-
-
-template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_cmpeq_epi32(a,b);
-#else
-  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
-  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
-  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet8f print<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
-template<> EIGEN_STRONG_INLINE Packet4d print<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } // rounds using the current rounding direction
+template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } // rounds using the current rounding direction
 
 template<> EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) { return _mm256_ceil_ps(a); }
 template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { return _mm256_ceil_pd(a); }
@@ -327,124 +199,17 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) { ret
 template<> EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) { return _mm256_floor_ps(a); }
 template<> EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) { return _mm256_floor_pd(a); }
 
-
-template<> EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  // vpcmpeqd has lower latency than the more general vcmpps
-  return _mm256_cmpeq_epi32(a,a);
-#else
-  const __m256 b = _mm256_castsi256_ps(a);
-  return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ));
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  // vpcmpeqd has lower latency than the more general vcmpps
-  const __m256i b = _mm256_castps_si256(a);
-  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b));
-#else
-  return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ);
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  // vpcmpeqq has lower latency than the more general vcmppd
-  const __m256i b = _mm256_castpd_si256(a);
-  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b));
-#else
-  return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ);
-#endif
-}
-
 template<> EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_and_si256(a,b);
-#else
-  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
-#endif
-}
 
 template<> EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_or_si256(a,b);
-#else
-  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
-#endif
-}
 
 template<> EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_xor_si256(a,b);
-#else
-  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b)));
-#endif
-}
 
-template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); }
-template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); }
-template<> EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_andnot_si256(b,a);
-#else
-  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a)));
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a)
-{
-  const Packet8f mask = pset1frombits<Packet8f>(0x80000000u);
-  const Packet8f prev0dot5 = pset1frombits<Packet8f>(0x3EFFFFFFu);
-  return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
-}
-template<> EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a)
-{
-  const Packet4d mask = _mm256_castsi256_pd(_mm256_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull, 0x8000000000000000ull, 0x8000000000000000ull));
-  const Packet4d prev0dot5 = _mm256_castsi256_pd(_mm256_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
-  return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b)
-{ return _mm256_blendv_ps(b,a,mask); }
-template<> EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b)
-{ return _mm256_blendv_pd(b,a,mask); }
-
-template<int N> EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_srai_epi32(a, N);
-#else
-  __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N);
-  __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N);
-  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-#endif
-}
-
-template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_srli_epi32(a, N);
-#else
-  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
-  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
-  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-#endif
-}
-
-template<int N> EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
-#ifdef EIGEN_VECTORIZE_AVX2
-  return _mm256_slli_epi32(a, N);
-#else
-  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
-  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
-  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-#endif
-}
+template<> EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } // NOTE: the intrinsic computes (~a) & b
+template<> EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } // NOTE: the intrinsic computes (~a) & b
 
 template<> EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); }
@@ -454,14 +219,6 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) { EI
 template<> EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)); }
 
-template<> EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
-  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
-  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
-  mask = por<Packet8i>(mask, bit_mask);
-  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
-}
-
 // Loads 4 floats from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3, a3}
 template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
 {
@@ -469,7 +226,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from)
 //   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
 //   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
 //   return _mm256_unpacklo_ps(tmp,tmp);
-
+  
   // _mm256_insertf128_ps is very slow on Haswell, thus:
   Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
   // mimic an "inplace" permutation of the lower 128bits using a blend
@@ -499,14 +256,6 @@ template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f&
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); }
 
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet8f& from, uint8_t umask) {
-  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
-  const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
-  mask = por<Packet8i>(mask, bit_mask);
-  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
-  EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from);
-}
-
 // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
 // NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4);
 template<> EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride)
@@ -605,26 +354,47 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a)
   return _mm256_and_pd(a,mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
-  return pfrexp_float(a,exponent);
-}
+// preduxp: element i of the returned packet is the horizontal sum of vecs[i] (reads vecs[0..7]).
+// FIXME: why is this ok? why isn't the simple implementation working as expected?
+template<> EIGEN_STRONG_INLINE Packet8f preduxp<Packet8f>(const Packet8f* vecs)
+{
+    __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]);
+    __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]);
+    __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]);
+    __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]);
 
-template<> EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
-  return pldexp_float(a,exponent);
-}
+    __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+    __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+    __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+    __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
 
-template<> EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
-  // Build e=2^n by constructing the exponents in a 128-bit vector and
-  // shifting them to where they belong in double-precision values.
-  Packet4i cst_1023 = pset1<Packet4i>(1023);
-  __m128i emm0 = _mm256_cvtpd_epi32(exponent);
-  emm0 = _mm_add_epi32(emm0, cst_1023);
-  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0));
-  __m128i lo = _mm_slli_epi64(emm0, 52);
-  __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52);
-  __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0);
-  e = _mm256_insertf128_si256(e, hi, 1);
-  return pmul(a,_mm256_castsi256_pd(e));
+    __m256 perm1 =  _mm256_permute2f128_ps(hsum5, hsum5, 0x23);  // 0x23 swaps the two 128-bit halves
+    __m256 perm2 =  _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+    __m256 perm3 =  _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+    __m256 perm4 =  _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+    __m256 sum1 = _mm256_add_ps(perm1, hsum5);  // full sums of vecs[0], vecs[1], repeated across the packet
+    __m256 sum2 = _mm256_add_ps(perm2, hsum6);
+    __m256 sum3 = _mm256_add_ps(perm3, hsum7);
+    __m256 sum4 = _mm256_add_ps(perm4, hsum8);
+
+    __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);  // elements 2,3,6,7 come from sum2
+    __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+    __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);  // low half from blend1, high half from blend2
+    return final;
+}
+template<> EIGEN_STRONG_INLINE Packet4d preduxp<Packet4d>(const Packet4d* vecs)
+{  // Element i of the returned packet is the horizontal sum of vecs[i] (reads vecs[0..3]).
+ Packet4d tmp0, tmp1;
+
+  tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]);  // per 128-bit half: {pair sum of vecs[0], pair sum of vecs[1]}
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));  // add the half-swapped copy: full sums in both halves
+
+  tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]);  // same for vecs[2], vecs[3]
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+
+  return _mm256_blend_pd(tmp0, tmp1, 0xC);  // elements 0,1 from tmp0; elements 2,3 from tmp1
+}
 
 template<> EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a)
@@ -636,7 +406,7 @@ template<> EIGEN_STRONG_INLINE double predux<Packet4d>(const Packet4d& a)
   return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a)
+template<> EIGEN_STRONG_INLINE Packet4f predux_downto4<Packet8f>(const Packet8f& a)
 {
   return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1));
 }
@@ -680,16 +450,93 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet4d>(const Packet4d& a)
   return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1)));
 }
 
-// not needed yet
-// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x)
-// {
-//   return _mm256_movemask_ps(x)==0xFF;
-// }
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x)
+template<int Offset>
+struct palign_impl<Offset,Packet8f>
 {
-  return _mm256_movemask_ps(x)!=0;
-}
+  static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second)  // Offset in 1..7: first <- {first[Offset..7], second[0..Offset-1]}; otherwise first is left unchanged
+  {
+    if (Offset==1)
+    {
+      first = _mm256_blend_ps(first, second, 1);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0x88);
+    }
+    else if (Offset==2)
+    {
+      first = _mm256_blend_ps(first, second, 3);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0xcc);
+    }
+    else if (Offset==3)
+    {
+      first = _mm256_blend_ps(first, second, 7);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_blend_ps(tmp1, tmp2, 0xee);
+    }
+    else if (Offset==4)
+    {
+      first = _mm256_blend_ps(first, second, 15);
+      Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0));
+      Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1);
+      first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0));
+    }
+    else if (Offset==5)
+    {
+      first = _mm256_blend_ps(first, second, 31);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0x88);
+    }
+    else if (Offset==6)
+    {
+      first = _mm256_blend_ps(first, second, 63);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0xcc);
+    }
+    else if (Offset==7)
+    {
+      first = _mm256_blend_ps(first, second, 127);
+      first = _mm256_permute2f128_ps(first, first, 1);
+      Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3));
+      first = _mm256_permute2f128_ps(tmp, tmp, 1);
+      first = _mm256_blend_ps(tmp, first, 0xee);
+    }
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet4d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second)  // Offset in 1..3: first <- {first[Offset..3], second[0..Offset-1]}; otherwise first is left unchanged
+  {
+    if (Offset==1)
+    {
+      first = _mm256_blend_pd(first, second, 1);
+      __m256d tmp = _mm256_permute_pd(first, 5);  // swap the two doubles inside each 128-bit half
+      first = _mm256_permute2f128_pd(tmp, tmp, 1);  // swap the two 128-bit halves
+      first = _mm256_blend_pd(tmp, first, 0xA);
+    }
+    else if (Offset==2)
+    {
+      first = _mm256_blend_pd(first, second, 3);
+      first = _mm256_permute2f128_pd(first, first, 1);
+    }
+    else if (Offset==3)
+    {
+      first = _mm256_blend_pd(first, second, 7);
+      __m256d tmp = _mm256_permute_pd(first, 5);
+      first = _mm256_permute2f128_pd(tmp, tmp, 1);
+      first = _mm256_blend_pd(tmp, first, 5);
+    }
+  }
+};
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet8f,8>& kernel) {
@@ -763,274 +610,24 @@ template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, cons
   return _mm256_blendv_pd(thenPacket, elsePacket, false_mask);
 }
 
-// Packet math for Eigen::half
-template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; };
-
-template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
-  return _mm_set1_epi16(from.x);
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
-  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from, 0)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
-  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
-  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
-  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
-}
-
-template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
-  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h
-ploaddup<Packet8h>(const Eigen::half*  from) {
-  unsigned short a = from[0].x;
-  unsigned short b = from[1].x;
-  unsigned short c = from[2].x;
-  unsigned short d = from[3].x;
-  return _mm_set_epi16(d, d, c, c, b, b, a, a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h
-ploadquad<Packet8h>(const Eigen::half* from) {
-  unsigned short a = from[0].x;
-  unsigned short b = from[1].x;
-  return _mm_set_epi16(b, b, b, b, a, a, a, a);
-}
-
-EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
-#ifdef EIGEN_HAS_FP16_C
-  return _mm256_cvtph_ps(a);
-#else
-  EIGEN_ALIGN32 Eigen::half aux[8];
-  pstore(aux, a);
-  float f0(aux[0]);
-  float f1(aux[1]);
-  float f2(aux[2]);
-  float f3(aux[3]);
-  float f4(aux[4]);
-  float f5(aux[5]);
-  float f6(aux[6]);
-  float f7(aux[7]);
-
-  return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
-#endif
-}
-
-EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
-#ifdef EIGEN_HAS_FP16_C
-  return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
-#else
-  EIGEN_ALIGN32 float aux[8];
-  pstore(aux, a);
-  Eigen::half h0(aux[0]);
-  Eigen::half h1(aux[1]);
-  Eigen::half h2(aux[2]);
-  Eigen::half h3(aux[3]);
-  Eigen::half h4(aux[4]);
-  Eigen::half h5(aux[5]);
-  Eigen::half h6(aux[6]);
-  Eigen::half h7(aux[7]);
-
-  return _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
- return _mm_cmpeq_epi32(a, a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) {
-  // in some cases Packet4i is a wrapper around __m128i, so we either need to
-  // cast to Packet4i to directly call the intrinsics as below:
-  return _mm_or_si128(a,b);
-}
-template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) {
-  return _mm_xor_si128(a,b);
-}
-template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) {
-  return _mm_and_si128(a,b);
-}
-template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) {
-  return _mm_andnot_si128(b,a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
-  return _mm_blendv_epi8(b, a, mask);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) {
-  Packet8f af = half2float(a);
-  Packet8f bf = half2float(b);
-  Packet8f rf = pcmp_eq(af, bf);
-  // Pack the 32-bit flags into 16-bits flags.
-  return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
-                         _mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
-  Packet8h sign_mask = _mm_set1_epi16(static_cast<unsigned short>(0x8000));
-  return _mm_xor_si128(a, sign_mask);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  Packet8f af = half2float(a);
-  Packet8f bf = half2float(b);
-  Packet8f rf = padd(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  Packet8f af = half2float(a);
-  Packet8f bf = half2float(b);
-  Packet8f rf = psub(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  Packet8f af = half2float(a);
-  Packet8f bf = half2float(b);
-  Packet8f rf = pmul(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
-  Packet8f af = half2float(a);
-  Packet8f bf = half2float(b);
-  Packet8f rf = pdiv(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
+template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b)  // returns a with element 0 replaced by element 0 of pset1<Packet8f>(b)
 {
-  return _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+  return _mm256_blend_ps(a,pset1<Packet8f>(b),1);  // mask bit 0 set: only element 0 comes from the second operand (presumably b broadcast by pset1)
 }
 
-template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
+template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b)  // returns a with element 0 replaced by element 0 of pset1<Packet4d>(b)
 {
-  EIGEN_ALIGN32 Eigen::half aux[8];
-  pstore(aux, from);
-  to[stride*0] = aux[0];
-  to[stride*1] = aux[1];
-  to[stride*2] = aux[2];
-  to[stride*3] = aux[3];
-  to[stride*4] = aux[4];
-  to[stride*5] = aux[5];
-  to[stride*6] = aux[6];
-  to[stride*7] = aux[7];
+  return _mm256_blend_pd(a,pset1<Packet4d>(b),1);  // mask bit 0 set: only element 0 comes from the second operand (presumably b broadcast by pset1)
 }
 
-template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux_max<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux_min<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
-  Packet8f af = half2float(a);
-  float reduced = predux_mul<Packet8f>(af);
-  return Eigen::half(reduced);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a)
+template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b)  // returns a with element 7 (the highest) replaced by element 7 of pset1<Packet8f>(b)
 {
-  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
-  return _mm_shuffle_epi8(a,m);
+  return _mm256_blend_ps(a,pset1<Packet8f>(b),(1<<7));  // mask bit 7 set: only element 7 comes from the second operand (presumably b broadcast by pset1)
 }
 
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8h,8>& kernel) {
-  __m128i a = kernel.packet[0];
-  __m128i b = kernel.packet[1];
-  __m128i c = kernel.packet[2];
-  __m128i d = kernel.packet[3];
-  __m128i e = kernel.packet[4];
-  __m128i f = kernel.packet[5];
-  __m128i g = kernel.packet[6];
-  __m128i h = kernel.packet[7];
-
-  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
-  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
-  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
-  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
-  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
-  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
-  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
-  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
-
-  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
-  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
-  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
-  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
-  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
-  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
-  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
-  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
-
-  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
-  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
-  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
-  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
-  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
-  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
-  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
-  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
-
-  kernel.packet[0] = a0b0c0d0e0f0g0h0;
-  kernel.packet[1] = a1b1c1d1e1f1g1h1;
-  kernel.packet[2] = a2b2c2d2e2f2g2h2;
-  kernel.packet[3] = a3b3c3d3e3f3g3h3;
-  kernel.packet[4] = a4b4c4d4e4f4g4h4;
-  kernel.packet[5] = a5b5c5d5e5f5g5h5;
-  kernel.packet[6] = a6b6c6d6e6f6g6h6;
-  kernel.packet[7] = a7b7c7d7e7f7g7h7;
-}
-
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet8h,4>& kernel) {
-  EIGEN_ALIGN32 Eigen::half in[4][8];
-  pstore<Eigen::half>(in[0], kernel.packet[0]);
-  pstore<Eigen::half>(in[1], kernel.packet[1]);
-  pstore<Eigen::half>(in[2], kernel.packet[2]);
-  pstore<Eigen::half>(in[3], kernel.packet[3]);
-
-  EIGEN_ALIGN32 Eigen::half out[4][8];
-
-  for (int i = 0; i < 4; ++i) {
-    for (int j = 0; j < 4; ++j) {
-      out[i][j] = in[j][2*i];
-    }
-    for (int j = 0; j < 4; ++j) {
-      out[i][j+4] = in[j][2*i+1];
-    }
-  }
-
-  kernel.packet[0] = pload<Packet8h>(out[0]);
-  kernel.packet[1] = pload<Packet8h>(out[1]);
-  kernel.packet[2] = pload<Packet8h>(out[2]);
-  kernel.packet[3] = pload<Packet8h>(out[3]);
+template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b)  // returns a with element 3 (the highest) replaced by element 3 of pset1<Packet4d>(b)
+{
+  return _mm256_blend_pd(a,pset1<Packet4d>(b),(1<<3));  // mask bit 3 set: only element 3 comes from the second operand (presumably b broadcast by pset1)
 }
 
 } // end namespace internal
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/TypeCasting.h
index 181043588..83bfdc604 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/TypeCasting.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -37,51 +37,13 @@ struct type_casting_traits<int, float> {
 
 
 template<> EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
-  return _mm256_cvttps_epi32(a);
+  return _mm256_cvttps_epi32(a);  // cvtt* truncates toward zero like a C++ float->int cast; cvtps would round per MXCSR (nearest-even by default)
 }
 
 template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
   return _mm256_cvtepi32_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i,Packet8f>(const Packet8f& a) {
-  return _mm256_castps_si256(a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f,Packet8i>(const Packet8i& a) {
-  return _mm256_castsi256_ps(a);
-}
-
-#ifndef EIGEN_VECTORIZE_AVX512
-
-template <>
-struct type_casting_traits<Eigen::half, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
-  return half2float(a);
-}
-
-template <>
-struct type_casting_traits<float, Eigen::half> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-#endif  // EIGEN_VECTORIZE_AVX512
-
-template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
-  return float2half(a);
-}
-
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/Complex.h
deleted file mode 100644
index dc2ae0a35..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/Complex.h
+++ /dev/null
@@ -1,447 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_COMPLEX_AVX512_H
-#define EIGEN_COMPLEX_AVX512_H
-
-namespace Eigen {
-
-namespace internal {
-
-//---------- float ----------
-struct Packet8cf
-{
-  EIGEN_STRONG_INLINE Packet8cf() {}
-  EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {}
-  __m512  v;
-};
-
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
-{
-  typedef Packet8cf type;
-  typedef Packet4cf half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-    HasHalfPacket = 1,
-
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasSetLinear = 0,
-    HasInsert = 1
-  };
-};
-
-template<> struct unpacket_traits<Packet8cf> {
-  typedef std::complex<float> type;
-  enum {
-    size = 8,
-    alignment=unpacket_traits<Packet16f>::alignment,
-    vectorizable=true,
-    masked_load_available=false,
-    masked_store_available=false
-  };
-  typedef Packet4cf half;
-};
-
-template<> EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet8cf pnot<Packet8cf>(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a)
-{
-  return Packet8cf(pnegate(a.v));
-}
-template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a)
-{
-  const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
-    0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,
-    0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000));
-  return Packet8cf(pxor(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
-{
-  __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1)));
-  return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2));
-}
-
-template<> EIGEN_STRONG_INLINE Packet8cf pand   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf por    <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pxor   <Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); }
-
-template <>
-EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
-  __m512 eq = pcmp_eq<Packet16f>(a.v, b.v);
-  return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet8cf pload <Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from))); }
-
-
-template<> EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from)
-{
-  return Packet8cf(_mm512_castpd_ps(pload1<Packet8d>((const double*)(const void*)&from)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from)
-{
-  return Packet8cf( _mm512_castpd_ps( ploaddup<Packet8d>((const double*)(const void*)from )) );
-}
-template<> EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from)
-{
-  return Packet8cf( _mm512_castpd_ps( ploadquad<Packet8d>((const double*)(const void*)from )) );
-}
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather<std::complex<float>, Packet8cf>(const std::complex<float>* from, Index stride)
-{
-  return Packet8cf(_mm512_castpd_ps(pgather<double,Packet8d>((const double*)(const void*)from, stride)));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet8cf>(std::complex<float>* to, const Packet8cf& from, Index stride)
-{
-  pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride);
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet8cf>(const Packet8cf& a)
-{
-  return pfirst(Packet2cf(_mm512_castps512_ps128(a.v)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
-  return Packet8cf(_mm512_castsi512_ps(
-            _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7),
-                                      _mm512_castps_si512(a.v))));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a)
-{
-  return predux(padd(Packet4cf(extract256<0>(a.v)),
-                     Packet4cf(extract256<1>(a.v))));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a)
-{
-  return predux_mul(pmul(Packet4cf(extract256<0>(a.v)),
-                         Packet4cf(extract256<1>(a.v))));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a) {
-  __m256 lane0 = extract256<0>(a.v);
-  __m256 lane1 = extract256<1>(a.v);
-  __m256 res = _mm256_add_ps(lane0, lane1);
-  return Packet4cf(res);
-}
-
-template<> struct conj_helper<Packet8cf, Packet8cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet8cf, Packet8cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet8cf, Packet8cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f)
-
-template<> EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b)
-{
-  Packet8cf num = pmul(a, pconj(b));
-  __m512 tmp = _mm512_mul_ps(b.v, b.v);
-  __m512 tmp2    = _mm512_shuffle_ps(tmp,tmp,0xB1);
-  __m512 denom = _mm512_add_ps(tmp, tmp2);
-  return Packet8cf(_mm512_div_ps(num.v, denom));
-}
-
-template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x)
-{
-  return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1)));
-}
-
-//---------- double ----------
-struct Packet4cd
-{
-  EIGEN_STRONG_INLINE Packet4cd() {}
-  EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {}
-  __m512d  v;
-};
-
-template<> struct packet_traits<std::complex<double> >  : default_packet_traits
-{
-  typedef Packet4cd type;
-  typedef Packet2cd half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 0,
-    size = 4,
-    HasHalfPacket = 1,
-
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasSetLinear = 0
-  };
-};
-
-template<> struct unpacket_traits<Packet4cd> {
-  typedef std::complex<double> type;
-  enum {
-    size = 4,
-    alignment = unpacket_traits<Packet8d>::alignment,
-    vectorizable=true,
-    masked_load_available=false,
-    masked_store_available=false
-  };
-  typedef Packet2cd half;
-};
-
-template<> EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd psub<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a)
-{
-  const __m512d mask = _mm512_castsi512_pd(
-          _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0,
-                           0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0));
-  return Packet4cd(pxor(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
-{
-  __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0);
-  __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF);
-  __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55);
-  __m512d odd  = _mm512_mul_pd(tmp2, tmp3);
-  return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet4cd pnot<Packet4cd>(const Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet4cd pand   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd por    <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pxor   <Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); }
-
-template <>
-EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
-  __m512d eq = pcmp_eq<Packet8d>(a.v, b.v);
-  return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cd pload <Packet4cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu<Packet8d>((const double*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from)
-{
-  #ifdef EIGEN_VECTORIZE_AVX512DQ
-  return Packet4cd(_mm512_broadcast_f64x2(pset1<Packet1cd>(from).v));
-  #else
-  return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1<Packet1cd>(from).v))));
-  #endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
-  return Packet4cd(_mm512_insertf64x4(
-          _mm512_castpd256_pd512(ploaddup<Packet2cd>(from).v), ploaddup<Packet2cd>(from+1).v, 1));
-}
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from, Index stride)
-{
-  return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512(
-            _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+0*stride).v), ploadu<Packet1cd>(from+1*stride).v,1)),
-            _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from+2*stride).v), ploadu<Packet1cd>(from+3*stride).v,1), 1));
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from, Index stride)
-{
-  __m512i fromi = _mm512_castpd_si512(from.v);
-  double* tod = (double*)(void*)to;
-  _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) );
-  _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) );
-  _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) );
-  _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) );
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a)
-{
-  __m128d low = extract128<0>(a.v);
-  EIGEN_ALIGN16 double res[2];
-  _mm_store_pd(res, low);
-  return std::complex<double>(res[0],res[1]);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
-  return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, EIGEN_SSE_SHUFFLE_MASK(3,2,1,0)));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a)
-{
-  return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
-                     Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a)
-{
-  return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)),
-                         Packet2cd(_mm512_extractf64x4_pd(a.v,1))));
-}
-
-template<> struct conj_helper<Packet4cd, Packet4cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet4cd, Packet4cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet4cd, Packet4cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d)
-
-template<> EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b)
-{
-  Packet4cd num = pmul(a, pconj(b));
-  __m512d tmp = _mm512_mul_pd(b.v, b.v);
-  __m512d denom =  padd(_mm512_permute_pd(tmp,0x55), tmp);
-  return Packet4cd(_mm512_div_pd(num.v, denom));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x)
-{
-  return Packet4cd(_mm512_permute_pd(x.v,0x55));
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8cf,4>& kernel) {
-  PacketBlock<Packet8d,4> pb;
-  
-  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
-  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
-  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
-  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
-  ptranspose(pb);
-  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
-  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
-  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
-  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8cf,8>& kernel) {
-  PacketBlock<Packet8d,8> pb;
-  
-  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
-  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
-  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
-  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
-  pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v);
-  pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v);
-  pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v);
-  pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v);
-  ptranspose(pb);
-  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
-  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
-  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
-  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
-  kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]);
-  kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]);
-  kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]);
-  kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4cd,4>& kernel) {
-  __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [a0 a1 b0 b1]
-  __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [a2 a3 b2 b3]
-  __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [c0 c1 d0 d1]
-  __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [c2 c3 d2 d3]
-
-  kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a3 b3 c3 d3]
-  kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a2 b2 c2 d2]
-  kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a1 b1 c1 d1]
-  kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a0 b0 c0 d0]
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COMPLEX_AVX512_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h
index 67043d01b..b259c1e1f 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -15,13 +15,13 @@ namespace Eigen {
 namespace internal {
 
 // Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics.
-#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG  || EIGEN_COMP_MSVC >= 1923
+#if EIGEN_GNUC_AT_LEAST(5, 3)
 
 #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \
   const Packet16f p16f_##NAME = pset1<Packet16f>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \
-  const Packet16f p16f_##NAME =  preinterpret<Packet16f,Packet16i>(pset1<Packet16i>(X))
+  const Packet16f p16f_##NAME = _mm512_castsi512_ps(pset1<Packet16i>(X)) /* bit-pattern reinterpretation of the integer constant, no numeric conversion */
 
 #define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \
   const Packet8d p8d_##NAME = pset1<Packet8d>(X)
@@ -29,6 +29,7 @@ namespace internal {
 #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \
   const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X))
 
+
 // Natural logarithm
 // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
 // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
@@ -72,7 +73,7 @@ plog<Packet16f>(const Packet16f& _x) {
   x = pmax(x, p16f_min_norm_pos);
 
   // Extract the shifted exponents.
-  Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((preinterpret<Packet16i,Packet16f>(x)), 23));
+  Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23));
   Packet16f e = _mm512_sub_ps(emm0, p16f_126f);
 
   // Set the exponents to -1, i.e. x are in the range [0.5,1).
@@ -128,6 +129,7 @@ plog<Packet16f>(const Packet16f& _x) {
               p16f_nan),
             p16f_minus_inf);
 }
+
 #endif
 
 // Exponential function. Works by writing "x = m*log(2) + r" where
@@ -253,7 +255,6 @@ pexp<Packet8d>(const Packet8d& _x) {
   return pmax(pmul(x, e), _x);
   }*/
 
-
 // Functions for sqrt.
 // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step
 // of Newton's method, at a cost of 1-2 bits of precision as opposed to the
@@ -309,136 +310,78 @@ EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
 }
 #endif
 
-// prsqrt for float.
-#if defined(EIGEN_VECTORIZE_AVX512ER)
-
-template <>
-EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
-  return _mm512_rsqrt28_ps(x);
-}
-
-#elif EIGEN_FAST_MATH
-
+// Functions for rsqrt.
+// Almost identical to the sqrt routine, just leave out the last multiplication
+// and fill in NaN/Inf where needed. Note that this function only exists as an
+// iterative version for doubles since there is no instruction for directly
+// computing the reciprocal square root in AVX-512.
+#if EIGEN_FAST_MATH  // value test: EIGEN_FAST_MATH is used as a 0/1 value (e.g. HasRsqrt = EIGEN_FAST_MATH), so 0 must disable this branch
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
 prsqrt<Packet16f>(const Packet16f& _x) {
   _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000);  // quiet NaN bit pattern
   _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f);
   _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f);
+  _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000);  // smallest positive normal float
 
   Packet16f neg_half = pmul(_x, p16f_minus_half);
 
-  // Identity infinite, negative and denormal arguments.
-  __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ);
-  __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ);
-  __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask;
-  
-  // Compute an approximate result using the rsqrt intrinsic, forcing +inf
-  // for denormals for consistency with AVX and SSE implementations.
-  Packet16f y_approx = _mm512_rsqrt14_ps(_x);
+  // le_zero_mask marks lanes with _x < flt_min (negative, zero or denormal).
+  // Those lanes start from 0 here; the other lanes start from the rsqrt14 estimate.
+  __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ);
+  Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps());
 
-  // Do a single step of Newton-Raphson iteration to improve the approximation.
-  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
-  // It is essential to evaluate the inner term like this because forming
-  // y_n^2 may over- or underflow.
-  Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five));
+  // Replacement values: NaN where _x < 0, +Inf for the remaining masked lanes, 0 elsewhere.
+  __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ);
+  Packet16f infs_and_nans = _mm512_mask_blend_ps(
+      neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan);
 
-  // Select the result of the Newton-Raphson step for positive finite arguments.
-  // For other arguments, choose the output of the intrinsic. This will
-  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
-  return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx);
-  }
+  // One Newton step on the estimate. NOTE(review): for _x == +inf this looks like 0 * (-inf * 0 + 1.5) = NaN, not 0 - confirm.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five));
 
-#else
-
-template <>
-EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
-  _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f);
-  return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x));
+  // Masked lanes take the Inf/NaN replacements; all other lanes keep the Newton result.
+  return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans);
 }
 
-#endif
-
-// prsqrt for double.
-#if EIGEN_FAST_MATH
 template <>
 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d
 prsqrt<Packet8d>(const Packet8d& _x) {
+  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
+  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff8000000000000LL);  // quiet NaN, like 0x7fc00000 in the float variant
   _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5);
   _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5);
-  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL);
+  _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL);  // smallest positive normal double
 
   Packet8d neg_half = pmul(_x, p8d_minus_half);
 
-  // Identity infinite, negative and denormal arguments.
-  __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ);
-  __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ);
-  __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask;
+  // le_zero_mask marks lanes with _x < dbl_min (negative, zero or denormal).
+  // Those lanes start from 0 here; the other lanes start from the rsqrt14 estimate.
+  __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ);
+  Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd());
 
-  // Compute an approximate result using the rsqrt intrinsic, forcing +inf
-  // for denormals for consistency with AVX and SSE implementations.
-#if defined(EIGEN_VECTORIZE_AVX512ER)
-  Packet8d y_approx = _mm512_rsqrt28_pd(_x);
-#else
-  Packet8d y_approx = _mm512_rsqrt14_pd(_x);
-#endif
-  // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the
-  // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available).
-  // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number
-  // of correct digits for each step.
-  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
-  // It is essential to evaluate the inner term like this because forming
-  // y_n^2 may over- or underflow.
-  Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five));
-#if !defined(EIGEN_VECTORIZE_AVX512ER)
-  y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five));
-#endif
-  // Select the result of the Newton-Raphson step for positive finite arguments.
-  // For other arguments, choose the output of the intrinsic. This will
-  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf.
-  return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx);
+  // Replacement values: NaN where _x < 0, +Inf for the remaining masked lanes, 0 elsewhere.
+  __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ);
+  Packet8d infs_and_nans = _mm512_mask_blend_pd(
+      neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan);
+
+  // First Newton step. NOTE(review): for _x == +inf this looks like it yields NaN rather than 0 - confirm.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+
+  // Second Newton step, same update applied to the refined estimate.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five));
+
+  // Masked lanes take the Inf/NaN replacements; all other lanes keep the Newton result.
+  return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans);
 }
-#else
+#elif defined(EIGEN_VECTORIZE_AVX512ER)
 template <>
-EIGEN_STRONG_INLINE Packet8d prsqrt<Packet8d>(const Packet8d& x) {
-  _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f);
-  return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x));
+EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  return _mm512_rsqrt28_ps(x);  // forwards directly to the AVX-512ER rsqrt28 intrinsic, no refinement step
 }
 #endif
-
-#if defined(EIGEN_VECTORIZE_AVX512DQ)
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet16f plog1p<Packet16f>(const Packet16f& _x) {
-  return generic_plog1p(_x);
-}
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet16f pexpm1<Packet16f>(const Packet16f& _x) {
-  return generic_expm1(_x);
-}
 #endif
 
-#endif
-
-
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
-psin<Packet16f>(const Packet16f& _x) {
-  return psin_float(_x);
-}
-
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
-pcos<Packet16f>(const Packet16f& _x) {
-  return pcos_float(_x);
-}
-
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f
-ptanh<Packet16f>(const Packet16f& _x) {
-  return internal::generic_fast_tanh_float(_x);
-}
-
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/PacketMath.h
index 10a1d4adb..000b7762f 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/PacketMath.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -31,7 +31,6 @@ namespace internal {
 typedef __m512 Packet16f;
 typedef __m512i Packet16i;
 typedef __m512d Packet8d;
-typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
 
 template <>
 struct is_arithmetic<__m512> {
@@ -46,38 +45,6 @@ struct is_arithmetic<__m512d> {
   enum { value = true };
 };
 
-template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
-
-template <>
-struct packet_traits<half> : default_packet_traits {
-  typedef Packet16h type;
-  // There is no half-size packet for Packet16h.
-  typedef Packet16h half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 16,
-    HasHalfPacket = 0,
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasConj   = 0,
-    HasSetLinear = 0,
-    HasSqrt = 0,
-    HasRsqrt = 0,
-    HasExp = 0,
-    HasLog = 0,
-    HasBlend = 0,
-    HasInsert = 1
-  };
-};
-
 template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet16f type;
@@ -88,22 +55,13 @@ template<> struct packet_traits<float>  : default_packet_traits
     size = 16,
     HasHalfPacket = 1,
     HasBlend = 0,
-    HasInsert = 1,
-    HasSin = EIGEN_FAST_MATH,
-    HasCos = EIGEN_FAST_MATH,
 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
 #ifdef EIGEN_VECTORIZE_AVX512DQ
     HasLog = 1,
-    HasLog1p  = 1,
-    HasExpm1  = 1,
-    HasNdtri = 1,
-    HasBessel  = 1,
 #endif
     HasExp = 1,
     HasSqrt = EIGEN_FAST_MATH,
     HasRsqrt = EIGEN_FAST_MATH,
-    HasTanh = EIGEN_FAST_MATH,
-    HasErf = EIGEN_FAST_MATH,
 #endif
     HasDiv = 1
   };
@@ -117,7 +75,6 @@ template<> struct packet_traits<double> : default_packet_traits
     AlignedOnScalar = 1,
     size = 8,
     HasHalfPacket = 1,
-    HasInsert = 1,
 #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT)
     HasSqrt = EIGEN_FAST_MATH,
     HasRsqrt = EIGEN_FAST_MATH,
@@ -143,27 +100,19 @@ struct unpacket_traits<Packet16f> {
   typedef float type;
   typedef Packet8f half;
   typedef Packet16i integer_packet;
-  typedef uint16_t mask_t;
-  enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true };
+  enum { size = 16, alignment=Aligned64 };
 };
 template <>
 struct unpacket_traits<Packet8d> {
   typedef double type;
   typedef Packet4d half;
-  enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false };
+  enum { size = 8, alignment=Aligned64 };  // 8 double lanes; half-size packet is Packet4d
 };
 template <>
 struct unpacket_traits<Packet16i> {
   typedef int type;
   typedef Packet8i half;
-  enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false };
-};
-
-template<>
-struct unpacket_traits<Packet16h> {
-  typedef Eigen::half type;
-  typedef Packet16h half;
-  enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false};
+  enum { size = 16, alignment=Aligned64 };  // 16 int lanes; half-size packet is Packet8i
 };
 
 template <>
@@ -179,11 +128,6 @@ EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
   return _mm512_set1_epi32(from);
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(unsigned int from) {
-  return _mm512_castsi512_ps(_mm512_set1_epi32(from));
-}
-
 template <>
 EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
   return _mm512_broadcastss_ps(_mm_load_ps1(from));
@@ -300,24 +244,6 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b,
 }
 #endif
 
-template <>
-EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask,
-                                           const Packet16f& a,
-                                           const Packet16f& b) {
-  __mmask16 mask16 = _mm512_cmp_epi32_mask(
-      _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
-  return _mm512_mask_blend_ps(mask16, a, b);
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask,
-                                          const Packet8d& a,
-                                          const Packet8d& b) {
-  __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask),
-                                         _mm512_setzero_epi32(), _MM_CMPINT_EQ);
-  return _mm512_mask_blend_pd(mask8, a, b);
-}
-
 template <>
 EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a,
                                               const Packet16f& b) {
@@ -365,74 +291,23 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
 }
 #endif
 
-template <>
-EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
-  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
-  return _mm512_castsi512_ps(
-      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
-}
-template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
-  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
-  return _mm512_castsi512_ps(
-      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
-}
-
-template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
-  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
-  return _mm512_castsi512_ps(
-      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
-}
-
-template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
-  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ);
-  return _mm512_castsi512_ps(
-      _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu));
-}
-
-template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
-  __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ);
-  return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu);
-}
-
-
-template <>
-EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
-  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
-  return _mm512_castsi512_pd(
-      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) {
-  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
-  return _mm512_castsi512_pd(
-      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) {
-  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
-  return _mm512_castsi512_pd(
-      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
-}
-template <>
-EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) {
-  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGT_UQ);
-  return _mm512_castsi512_pd(
-      _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
-  return _mm512_set1_epi32(0xffffffffu);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet16f ptrue<Packet16f>(const Packet16f& a) {
-  return _mm512_castsi512_ps(ptrue<Packet16i>(_mm512_castps_si512(a)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) {
-  return _mm512_castsi512_pd(ptrue<Packet16i>(_mm512_castpd_si512(a)));
+// Helper for packing comparison flags held in a Packet16f.
+// Narrows the sixteen 32-bit lanes of rf to sixteen 16-bit lanes in a __m256i.
+EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
+  // Split data into small pieces and handle with AVX instructions
+  // to guarantee internal order of vector.
+  // Operation:
+  //   dst[15:0]    := Saturate16(rf[31:0])
+  //   dst[31:16]   := Saturate16(rf[63:32])
+  //   ...
+  //   dst[255:240] := Saturate16(rf[511:480])
+  __m256i lo = _mm256_castps_si256(extract256<0>(rf));
+  __m256i hi = _mm256_castps_si256(extract256<1>(rf));
+  __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
+                                      _mm256_extractf128_si256(lo, 1));
+  __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
+                                      _mm256_extractf128_si256(hi, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
 }
 
 template <>
@@ -576,12 +451,6 @@ EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
       reinterpret_cast<const __m512i*>(from));
 }
 
-template <>
-EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from, uint16_t umask) {
-  __mmask16 mask = static_cast<__mmask16>(umask);
-  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from);
-}
-
 // Loads 8 floats from memory a returns the packet
 // {a0, a0  a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
 template <>
@@ -666,11 +535,6 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
   EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512(
       reinterpret_cast<__m512i*>(to), from);
 }
-template <>
-EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
-  __mmask16 mask = static_cast<__mmask16>(umask);
-  EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from);
-}
 
 template <>
 EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from,
@@ -822,26 +686,27 @@ EIGEN_STRONG_INLINE double predux<Packet8d>(const Packet8d& a) {
 }
 
 template <>
-EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
+EIGEN_STRONG_INLINE Packet8f predux_downto4<Packet16f>(const Packet16f& a) {  // 16 floats -> 8: result[i] = a[i] + a[i + 8]
 #ifdef EIGEN_VECTORIZE_AVX512DQ
-  __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
-  __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
-  return _mm256_add_ps(lane0, lane1);
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);  // floats 0..7
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);  // floats 8..15
+  return padd(lane0, lane1);
 #else
-  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
-  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
-  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
-  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
-  __m128 sum0 = _mm_add_ps(lane0, lane2);
-  __m128 sum1 = _mm_add_ps(lane1, lane3);
+  Packet4f lane0 = _mm512_extractf32x4_ps(a, 0);  // floats 0..3
+  Packet4f lane1 = _mm512_extractf32x4_ps(a, 1);  // floats 4..7
+  Packet4f lane2 = _mm512_extractf32x4_ps(a, 2);  // floats 8..11
+  Packet4f lane3 = _mm512_extractf32x4_ps(a, 3);  // floats 12..15
+  Packet4f sum0 = padd(lane0, lane2);  // becomes result floats 0..3
+  Packet4f sum1 = padd(lane1, lane3);  // becomes result floats 4..7
   return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
 #endif
 }
 template <>
-EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
-  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
-  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
-  return _mm256_add_pd(lane0, lane1);
+EIGEN_STRONG_INLINE Packet4d predux_downto4<Packet8d>(const Packet8d& a) {  // 8 doubles -> 4: result[i] = a[i] + a[i + 4]
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);  // doubles 0..3
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);  // doubles 4..7
+  Packet4d res = padd(lane0, lane1);
+  return res;
 }
 
 template <>
@@ -912,13 +777,196 @@ EIGEN_STRONG_INLINE double predux_max<Packet8d>(const Packet8d& a) {
   return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1)));
 }
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x)
+template<> EIGEN_STRONG_INLINE Packet16f preduxp<Packet16f>(const Packet16f* vecs)  // reads vecs[0] .. vecs[15]
 {
-  Packet16i xi = _mm512_castps_si512(x);
-  __mmask16 tmp = _mm512_test_epi32_mask(xi,xi);
-  return !_mm512_kortestz(tmp,tmp);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14);
+  EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15);
+  // Pass 1: the *_0 parts of vecs[0..7] (presumably the lower 8 floats of each packet - macro not visible here).
+  __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0);
+  __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0);
+  __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0);
+  __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0);
+
+  __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+  __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+  __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+  __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+  __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+  __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+  __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+  __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+  __m256 sum1 = _mm256_add_ps(perm1, hsum5);
+  __m256 sum2 = _mm256_add_ps(perm2, hsum6);
+  __m256 sum3 = _mm256_add_ps(perm3, hsum7);
+  __m256 sum4 = _mm256_add_ps(perm4, hsum8);
+
+  __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+  __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+  __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0);
+  // Pass 2: the *_1 parts of vecs[0..7]; the result is added onto `final`.
+  hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1);
+  hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1);
+  hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1);
+  hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1);
+
+  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+  sum1 = _mm256_add_ps(perm1, hsum5);
+  sum2 = _mm256_add_ps(perm2, hsum6);
+  sum3 = _mm256_add_ps(perm3, hsum7);
+  sum4 = _mm256_add_ps(perm4, hsum8);
+
+  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+  final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0));
+  // Pass 3: the *_0 parts of vecs[8..15]; starts the second accumulator `final_1`.
+  hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0);
+  hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0);
+  hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0);
+  hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0);
+
+  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+  sum1 = _mm256_add_ps(perm1, hsum5);
+  sum2 = _mm256_add_ps(perm2, hsum6);
+  sum3 = _mm256_add_ps(perm3, hsum7);
+  sum4 = _mm256_add_ps(perm4, hsum8);
+
+  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+  __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0);
+  // Pass 4: the *_1 parts of vecs[8..15]; the result is added onto `final_1`.
+  hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1);
+  hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1);
+  hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1);
+  hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1);
+
+  hsum5 = _mm256_hadd_ps(hsum1, hsum1);
+  hsum6 = _mm256_hadd_ps(hsum2, hsum2);
+  hsum7 = _mm256_hadd_ps(hsum3, hsum3);
+  hsum8 = _mm256_hadd_ps(hsum4, hsum4);
+
+  perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23);
+  perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23);
+  perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23);
+  perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23);
+
+  sum1 = _mm256_add_ps(perm1, hsum5);
+  sum2 = _mm256_add_ps(perm2, hsum6);
+  sum3 = _mm256_add_ps(perm3, hsum7);
+  sum4 = _mm256_add_ps(perm4, hsum8);
+
+  blend1 = _mm256_blend_ps(sum1, sum2, 0xcc);
+  blend2 = _mm256_blend_ps(sum3, sum4, 0xcc);
+
+  final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0));
+  // NOTE(review): final_output has no initializer; whether EIGEN_INSERT_8f_INTO_16f reads it before writing is not visible here - confirm.
+  __m512 final_output;
+
+  EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1);
+  return final_output;
 }
 
+template<> EIGEN_STRONG_INLINE Packet8d preduxp<Packet8d>(const Packet8d* vecs)
+{  // Result lane i is the horizontal sum of the 8 doubles of vecs[i], i = 0..7.
+  Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0);
+  Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1);
+
+  Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0);
+  Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1);
+
+  Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0);
+  Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1);
+
+  Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0);
+  Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1);
+
+  Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0);
+  Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1);
+
+  Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0);
+  Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1);
+
+  Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0);
+  Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1);
+
+  Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0);
+  Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1);
+
+  Packet4d tmp0, tmp1;
+  // hadd + half-swap add leaves the two pair totals replicated in both 128-bit halves.
+  tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+
+  tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+  // Lanes 0..1 from tmp0 (vecs[0], vecs[1]); lanes 2..3 from tmp1 (vecs[2], vecs[3]).
+  __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC);
+
+  tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+
+  tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+  // Add the upper-half totals of vecs[0..3] to the lower-half totals.
+  final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC));
+
+  tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+
+  tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+  // Same scheme for vecs[4..7], producing result lanes 4..7.
+  __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC);
+
+  tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1);
+  tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1));
+
+  tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1);
+  tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1));
+
+  final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC));
+  // Place final_0 in the low 256 bits via a cast; the insert below fills the
+  // upper 256 bits, so no uninitialized vector is read.
+  __m512d final_output = _mm512_castpd256_pd512(final_0);
+  return _mm512_insertf64x4(final_output, final_1, 1);
+}
+ 
 
 
 #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
@@ -1202,418 +1250,52 @@ template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packe
   return _mm512_cvtepi32_ps(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i,Packet16f>(const Packet16f& a) {
-  return _mm512_castps_si512(a);
-}
+template <int Offset>
+struct palign_impl<Offset, Packet16f> {
+  static EIGEN_STRONG_INLINE void run(Packet16f& first,
+                                      const Packet16f& second) {
+    if (Offset != 0) {
+      __m512i first_idx = _mm512_set_epi32(
+          Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11,
+          Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6,
+          Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset);
 
-template<> EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f,Packet16i>(const Packet16i& a) {
-  return _mm512_castsi512_ps(a);
-}
+      __m512i second_idx =
+          _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4,
+                           Offset - 5, Offset - 6, Offset - 7, Offset - 8,
+                           Offset - 9, Offset - 10, Offset - 11, Offset - 12,
+                           Offset - 13, Offset - 14, Offset - 15, Offset - 16);
 
+      unsigned short mask = 0xFFFF;
+      mask <<= (16 - Offset);
 
-// Packet math for Eigen::half
-template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
-  return _mm256_set1_epi16(from.x);
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
-  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from, 0)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
-  return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
-  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
-  // (void*) -> workaround clang warning:
-  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
-  _mm256_store_si256((__m256i*)(void*)to, from);
-}
-
-template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
-  // (void*) -> workaround clang warning:
-  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
-  _mm256_storeu_si256((__m256i*)(void*)to, from);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h
-ploaddup<Packet16h>(const Eigen::half*  from) {
-  unsigned short a = from[0].x;
-  unsigned short b = from[1].x;
-  unsigned short c = from[2].x;
-  unsigned short d = from[3].x;
-  unsigned short e = from[4].x;
-  unsigned short f = from[5].x;
-  unsigned short g = from[6].x;
-  unsigned short h = from[7].x;
-  return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h
-ploadquad(const Eigen::half* from) {
-  unsigned short a = from[0].x;
-  unsigned short b = from[1].x;
-  unsigned short c = from[2].x;
-  unsigned short d = from[3].x;
-  return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
-}
-
-EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
-#ifdef EIGEN_HAS_FP16_C
-  return _mm512_cvtph_ps(a);
-#else
-  EIGEN_ALIGN64 half aux[16];
-  pstore(aux, a);
-  float f0(aux[0]);
-  float f1(aux[1]);
-  float f2(aux[2]);
-  float f3(aux[3]);
-  float f4(aux[4]);
-  float f5(aux[5]);
-  float f6(aux[6]);
-  float f7(aux[7]);
-  float f8(aux[8]);
-  float f9(aux[9]);
-  float fa(aux[10]);
-  float fb(aux[11]);
-  float fc(aux[12]);
-  float fd(aux[13]);
-  float fe(aux[14]);
-  float ff(aux[15]);
-
-  return _mm512_set_ps(
-      ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
-#endif
-}
-
-EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
-#ifdef EIGEN_HAS_FP16_C
-  return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);
-#else
-  EIGEN_ALIGN64 float aux[16];
-  pstore(aux, a);
-  half h0(aux[0]);
-  half h1(aux[1]);
-  half h2(aux[2]);
-  half h3(aux[3]);
-  half h4(aux[4]);
-  half h5(aux[5]);
-  half h6(aux[6]);
-  half h7(aux[7]);
-  half h8(aux[8]);
-  half h9(aux[9]);
-  half ha(aux[10]);
-  half hb(aux[11]);
-  half hc(aux[12]);
-  half hd(aux[13]);
-  half he(aux[14]);
-  half hf(aux[15]);
-
-  return _mm256_set_epi16(
-      hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
-      h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
-  return ptrue(Packet8i(a));
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) {
-  return _mm256_xor_si256(a, ptrue(a));
-}
-
-
-template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) {
-  // in some cases Packet8i is a wrapper around __m256i, so we need to
-  // cast to Packet8i to call the correct overload.
-  return por(Packet8i(a),Packet8i(b));
-}
-template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) {
-  return pxor(Packet8i(a),Packet8i(b));
-}
-template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) {
-  return pand(Packet8i(a),Packet8i(b));
-}
-template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) {
-  return pandnot(Packet8i(a),Packet8i(b));
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
-  return _mm256_blendv_epi8(b, a, mask);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) {
-  Packet16f af = half2float(a);
-  Packet16f bf = half2float(b);
-  Packet16f rf = pcmp_eq(af, bf);
-  // Pack the 32-bit flags into 16-bits flags.
-  __m256i lo = _mm256_castps_si256(extract256<0>(rf));
-  __m256i hi = _mm256_castps_si256(extract256<1>(rf));
-  __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0),
-                                      _mm256_extractf128_si256(lo, 1));
-  __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0),
-                                      _mm256_extractf128_si256(hi, 1));
-  return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
-  Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
-  return _mm256_xor_si256(a, sign_mask);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  Packet16f af = half2float(a);
-  Packet16f bf = half2float(b);
-  Packet16f rf = padd(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  Packet16f af = half2float(a);
-  Packet16f bf = half2float(b);
-  Packet16f rf = psub(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  Packet16f af = half2float(a);
-  Packet16f bf = half2float(b);
-  Packet16f rf = pmul(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
-  Packet16f af = half2float(a);
-  Packet16f bf = half2float(b);
-  Packet16f rf = pdiv(af, bf);
-  return float2half(rf);
-}
-
-template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
-  Packet16f from_float = half2float(from);
-  return half(predux(from_float));
-}
-
-template<> EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& from) {
-  Packet16f from_float = half2float(from);
-  return half(predux_mul(from_float));
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a)
-{
-  __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1);
-  return _mm256_insertf128_si256(
-                    _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a,1),m)),
-                                           _mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
-{
-  return _mm256_set_epi16(
-      from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
-      from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
-      from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
-      from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
-}
-
-template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
-{
-  EIGEN_ALIGN64 half aux[16];
-  pstore(aux, from);
-  to[stride*0].x = aux[0].x;
-  to[stride*1].x = aux[1].x;
-  to[stride*2].x = aux[2].x;
-  to[stride*3].x = aux[3].x;
-  to[stride*4].x = aux[4].x;
-  to[stride*5].x = aux[5].x;
-  to[stride*6].x = aux[6].x;
-  to[stride*7].x = aux[7].x;
-  to[stride*8].x = aux[8].x;
-  to[stride*9].x = aux[9].x;
-  to[stride*10].x = aux[10].x;
-  to[stride*11].x = aux[11].x;
-  to[stride*12].x = aux[12].x;
-  to[stride*13].x = aux[13].x;
-  to[stride*14].x = aux[14].x;
-  to[stride*15].x = aux[15].x;
-}
-
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,16>& kernel) {
-  __m256i a = kernel.packet[0];
-  __m256i b = kernel.packet[1];
-  __m256i c = kernel.packet[2];
-  __m256i d = kernel.packet[3];
-  __m256i e = kernel.packet[4];
-  __m256i f = kernel.packet[5];
-  __m256i g = kernel.packet[6];
-  __m256i h = kernel.packet[7];
-  __m256i i = kernel.packet[8];
-  __m256i j = kernel.packet[9];
-  __m256i k = kernel.packet[10];
-  __m256i l = kernel.packet[11];
-  __m256i m = kernel.packet[12];
-  __m256i n = kernel.packet[13];
-  __m256i o = kernel.packet[14];
-  __m256i p = kernel.packet[15];
-
-  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
-  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
-  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
-  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
-  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
-  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
-  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
-  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
-
-  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
-  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
-  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
-  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
-  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
-  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
-  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
-  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
-
-  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
-  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
-  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
-  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
-  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
-  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
-  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
-  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
-
-  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
-  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
-  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
-  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
-  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
-  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
-  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
-  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
-
-  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
-  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
-  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
-  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
-  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
-  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
-  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
-  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
-  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
-  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
-  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
-  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
-  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
-  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
-  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
-  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
-
-  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
-  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
-  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
-  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
-  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
-  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
-  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
-  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
-  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
-  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
-  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
-  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
-  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
-  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
-  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
-  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
-  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
-
-  kernel.packet[0] = a_p_0;
-  kernel.packet[1] = a_p_1;
-  kernel.packet[2] = a_p_2;
-  kernel.packet[3] = a_p_3;
-  kernel.packet[4] = a_p_4;
-  kernel.packet[5] = a_p_5;
-  kernel.packet[6] = a_p_6;
-  kernel.packet[7] = a_p_7;
-  kernel.packet[8] = a_p_8;
-  kernel.packet[9] = a_p_9;
-  kernel.packet[10] = a_p_a;
-  kernel.packet[11] = a_p_b;
-  kernel.packet[12] = a_p_c;
-  kernel.packet[13] = a_p_d;
-  kernel.packet[14] = a_p_e;
-  kernel.packet[15] = a_p_f;
-}
-
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,8>& kernel) {
-  EIGEN_ALIGN64 half in[8][16];
-  pstore<half>(in[0], kernel.packet[0]);
-  pstore<half>(in[1], kernel.packet[1]);
-  pstore<half>(in[2], kernel.packet[2]);
-  pstore<half>(in[3], kernel.packet[3]);
-  pstore<half>(in[4], kernel.packet[4]);
-  pstore<half>(in[5], kernel.packet[5]);
-  pstore<half>(in[6], kernel.packet[6]);
-  pstore<half>(in[7], kernel.packet[7]);
-
-  EIGEN_ALIGN64 half out[8][16];
-
-  for (int i = 0; i < 8; ++i) {
-    for (int j = 0; j < 8; ++j) {
-      out[i][j] = in[j][2*i];
-    }
-    for (int j = 0; j < 8; ++j) {
-      out[i][j+8] = in[j][2*i+1];
+      first = _mm512_permutexvar_ps(first_idx, first);
+      Packet16f tmp = _mm512_permutexvar_ps(second_idx, second);
+      first = _mm512_mask_blend_ps(mask, first, tmp);
     }
   }
+};
+template <int Offset>
+struct palign_impl<Offset, Packet8d> {
+  static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) {
+    if (Offset != 0) {
+      __m512i first_idx = _mm512_set_epi32(
+          0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0,
+          Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset);
 
-  kernel.packet[0] = pload<Packet16h>(out[0]);
-  kernel.packet[1] = pload<Packet16h>(out[1]);
-  kernel.packet[2] = pload<Packet16h>(out[2]);
-  kernel.packet[3] = pload<Packet16h>(out[3]);
-  kernel.packet[4] = pload<Packet16h>(out[4]);
-  kernel.packet[5] = pload<Packet16h>(out[5]);
-  kernel.packet[6] = pload<Packet16h>(out[6]);
-  kernel.packet[7] = pload<Packet16h>(out[7]);
-}
+      __m512i second_idx = _mm512_set_epi32(
+          0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0,
+          Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8);
 
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet16h,4>& kernel) {
-  EIGEN_ALIGN64 half in[4][16];
-  pstore<half>(in[0], kernel.packet[0]);
-  pstore<half>(in[1], kernel.packet[1]);
-  pstore<half>(in[2], kernel.packet[2]);
-  pstore<half>(in[3], kernel.packet[3]);
+      unsigned char mask = 0xFF;
+      mask <<= (8 - Offset);
 
-  EIGEN_ALIGN64 half out[4][16];
-
-  for (int i = 0; i < 4; ++i) {
-    for (int j = 0; j < 4; ++j) {
-      out[i][j] = in[j][4*i];
-    }
-    for (int j = 0; j < 4; ++j) {
-      out[i][j+4] = in[j][4*i+1];
-    }
-    for (int j = 0; j < 4; ++j) {
-      out[i][j+8] = in[j][4*i+2];
-    }
-    for (int j = 0; j < 4; ++j) {
-      out[i][j+12] = in[j][4*i+3];
+      first = _mm512_permutexvar_pd(first_idx, first);
+      Packet8d tmp = _mm512_permutexvar_pd(second_idx, second);
+      first = _mm512_mask_blend_pd(mask, first, tmp);
     }
   }
-
-  kernel.packet[0] = pload<Packet16h>(out[0]);
-  kernel.packet[1] = pload<Packet16h>(out[1]);
-  kernel.packet[2] = pload<Packet16h>(out[2]);
-  kernel.packet[3] = pload<Packet16h>(out[3]);
-}
+};
 
 
 } // end namespace internal
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h
deleted file mode 100644
index a82176941..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h
+++ /dev/null
@@ -1,47 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_TYPE_CASTING_AVX512_H
-#define EIGEN_TYPE_CASTING_AVX512_H
-
-namespace Eigen {
-
-namespace internal {
-
-template <>
-struct type_casting_traits<half, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
-  return half2float(a);
-}
-
-template <>
-struct type_casting_traits<float, half> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
-  return float2half(a);
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_TYPE_CASTING_AVX512_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/Complex.h
index 69d2ceca8..3e665730c 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -60,7 +60,7 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
 
 template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
@@ -82,14 +82,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<f
 
 template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
-  EIGEN_ALIGN16 std::complex<float> af[2];
+  std::complex<float> EIGEN_ALIGN16 af[2];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
   return pload<Packet2cf>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
-  EIGEN_ALIGN16 std::complex<float> af[2];
+  std::complex<float> EIGEN_ALIGN16 af[2];
   pstore<std::complex<float> >((std::complex<float> *) af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
@@ -128,7 +128,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::co
 
 template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
-  EIGEN_ALIGN16 std::complex<float> res[2];
+  std::complex<float> EIGEN_ALIGN16 res[2];
   pstore((float *)&res, a.v);
 
   return res[0];
@@ -149,6 +149,22 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
   return pfirst<Packet2cf>(Packet2cf(b));
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{ // Reduces two packets at once: result[i] = vecs[i][0] + vecs[i][1] (complex sums).
+  Packet4f b1, b2;
+#ifdef _BIG_ENDIAN  
+  b1 = vec_sld(vecs[0].v, vecs[1].v, 8); // b1 = (vecs[0][1], vecs[1][0])
+  b2 = vec_sld(vecs[1].v, vecs[0].v, 8); // b2 = (vecs[1][1], vecs[0][0])
+#else
+  b1 = vec_sld(vecs[1].v, vecs[0].v, 8); // operands swapped; presumably yields the same logical layout as the big-endian branch
+  b2 = vec_sld(vecs[0].v, vecs[1].v, 8);
+#endif
+  b2 = vec_sld(b2, b2, 8); // rotate by 8 bytes, i.e. swap the two complex coefficients of b2
+  b2 = padd<Packet4f>(b1, b2); // float-wise add of the two shuffled packets
+
+  return Packet2cf(b2);
+}
+
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
 {
   Packet4f b;
@@ -159,6 +175,22 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
   return pfirst<Packet2cf>(prod);
 }
 
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{ // In-place alignment helper for packets of two complex<float> coefficients.
+  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1) // any other Offset leaves `first` untouched
+    {
+#ifdef _BIG_ENDIAN
+      first.v = vec_sld(first.v, second.v, 8); // first <- (first[1], second[0])
+#else
+      first.v = vec_sld(second.v, first.v, 8); // operands swapped for little-endian targets
+#endif
+    }
+  }
+};
+
 template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
 {
   EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
@@ -214,11 +246,6 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
   kernel.packet[0].v = tmp;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
-  Packet4f eq = reinterpret_cast<Packet4f>(vec_cmpeq(a.v,b.v));
-  return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
-}
-
 #ifdef __VSX__
 template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
   Packet2cf result;
@@ -259,7 +286,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
 
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { return Packet1cd(pload<Packet2d>((const double*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { return Packet1cd(ploadu<Packet2d>((const double*)from)); }
@@ -271,14 +298,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<dou
 
 template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
 {
-  EIGEN_ALIGN16 std::complex<double> af[2];
+  std::complex<double> EIGEN_ALIGN16 af[2];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
   return pload<Packet1cd>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
 {
-  EIGEN_ALIGN16 std::complex<double> af[2];
+  std::complex<double> EIGEN_ALIGN16 af[2];
   pstore<std::complex<double> >(af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
@@ -318,7 +345,7 @@ template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::c
 
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
 {
-  EIGEN_ALIGN16 std::complex<double> res[2];
+  std::complex<double> EIGEN_ALIGN16 res[2];
   pstore<std::complex<double> >(res, a);
 
   return res[0];
@@ -327,9 +354,20 @@ template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Pac
 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
 
 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)        { return vecs[0]; } // size==1 packet: vecs[0] is returned unchanged
 
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // Deliberately a no-op for every Offset. FIXME: are we sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> occupies 16 bytes, it is not necessarily aligned on a 16-byte boundary...
+  }
+};
+
 template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
 {
   EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
@@ -384,18 +422,6 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
   kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
   kernel.packet[0].v = tmp;
 }
-
-template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
-  // Compare real and imaginary parts of a and b to get the mask vector:
-  // [re(a)==re(b), im(a)==im(b)]
-  Packet2d eq = reinterpret_cast<Packet2d>(vec_cmpeq(a.v,b.v));
-  // Swap real/imag elements in the mask in to get:
-  // [im(a)==im(b), re(a)==re(b)]
-  Packet2d eq_swapped = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(eq), reinterpret_cast<Packet4ui>(eq), 8));
-  // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
-  return Packet1cd(vec_and(eq, eq_swapped));
-}
-
 #endif // __VSX__
 } // end namespace internal
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h
index 3a7a32936..c5e4bede7 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -9,6 +9,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
 #ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
 #define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
 
@@ -16,28 +20,180 @@ namespace Eigen {
 
 namespace internal {
 
+static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+/* the smallest non denormalized float number */
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
+static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
+  
+/* natural logarithm computed for 4 simultaneous floats
+  returns NaN for x < 0 (and for NaN input) and -INF for x == 0
+*/
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+#ifdef __VSX__
+static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+static _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
+#ifdef __POWER8_VECTOR__
+static Packet2l p2l_1023 = { 1023, 1023 };
+static Packet2ul p2ul_52 = { 52, 52 };
+#endif
+
+#endif
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f plog<Packet4f>(const Packet4f& _x)
 {
-  return plog_float(_x);
+  Packet4f x = _x;
+
+  Packet4i emm0;
+
+  /* isvalid_mask is 0 if x < 0 or x is NaN. */
+  Packet4ui isvalid_mask = reinterpret_cast<Packet4ui>(vec_cmpge(x, p4f_ZERO));
+  Packet4ui iszero_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(x, p4f_ZERO));
+
+  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
+  emm0 = vec_sr(reinterpret_cast<Packet4i>(x),
+                reinterpret_cast<Packet4ui>(p4i_23));
+
+  /* keep only the fractional part */
+  x = pand(x, p4f_inv_mant_mask);
+  x = por(x, p4f_half);
+
+  emm0 = psub(emm0, p4i_0x7f);
+  Packet4f e = padd(vec_ctf(emm0, 0), p4f_1);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  Packet4f mask = reinterpret_cast<Packet4f>(vec_cmplt(x, p4f_cephes_SQRTHF));
+  Packet4f tmp = pand(x, mask);
+  x = psub(x, p4f_1);
+  e = psub(e, pand(p4f_1, mask));
+  x = padd(x, tmp);
+
+  Packet4f x2 = pmul(x,x);
+  Packet4f x3 = pmul(x2,x);
+
+  Packet4f y, y1, y2;
+  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+  y  = pmadd(y , x, p4f_cephes_log_p2);
+  y1 = pmadd(y1, x, p4f_cephes_log_p5);
+  y2 = pmadd(y2, x, p4f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  y1 = pmul(e, p4f_cephes_log_q1);
+  tmp = pmul(x2, p4f_half);
+  y = padd(y, y1);
+  x = psub(x, tmp);
+  y2 = pmul(e, p4f_cephes_log_q2);
+  x = padd(x, y);
+  x = padd(x, y2);
+  // negative arg will be NAN, 0 will be -INF
+  x = vec_sel(x, p4f_minus_inf, iszero_mask);
+  x = vec_sel(p4f_minus_nan, x, isvalid_mask);
+  return x;
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f pexp<Packet4f>(const Packet4f& _x)
 {
-  return pexp_float(_x);
-}
+  Packet4f x = _x;
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f psin<Packet4f>(const Packet4f& _x)
-{
-  return psin_float(_x);
-}
+  Packet4f tmp, fx;
+  Packet4i emm0;
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pcos<Packet4f>(const Packet4f& _x)
-{
-  return pcos_float(_x);
+  // clamp x
+  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
+
+  // express exp(x) as exp(g + n*log(2))
+  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+
+  fx = pfloor(fx);
+
+  tmp = pmul(fx, p4f_cephes_exp_C1);
+  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  z = pmul(x,x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // build 2^n
+  emm0 = vec_cts(fx, 0);
+  emm0 = vec_add(emm0, p4i_0x7f);
+  emm0 = vec_sl(emm0, reinterpret_cast<Packet4ui>(p4i_23));
+
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
+  // inputs and return them unmodified.
+  Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
+                 isnumber_mask);
 }
 
 #ifndef EIGEN_COMP_CLANG
@@ -69,19 +225,95 @@ Packet2d psqrt<Packet2d>(const Packet2d& x)
   return  vec_sqrt(x);
 }
 
+// VSX support varies between different compilers and even different
+// versions of the same compiler.  When the EIGEN_GNUC_AT_LEAST(5, 4) guard
+// below holds, vec_cts converts Packet2d to Packet2l efficiently.  Otherwise,
+// use a slow scalar version that works with older compilers.
+// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
+// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
+static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
+#if EIGEN_GNUC_AT_LEAST(5, 4) || \
+    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
+  return vec_cts(x, 0);    // TODO: check clang version.
+#else
+  double tmp[2];
+  memcpy(tmp, &x, sizeof(tmp));  // copy both lanes out to plain doubles
+  Packet2l l = { static_cast<long long>(tmp[0]),  // convert each lane with a scalar cast
+                 static_cast<long long>(tmp[1]) };
+  return l;
+#endif
+}
+
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d pexp<Packet2d>(const Packet2d& _x)
 {
-  return pexp_double(_x);
-}
+  Packet2d x = _x;  // working copy; _x itself is reused at the end for the NaN pass-through
+
+  Packet2d tmp, fx;
+  Packet2l emm0;
+
+  // clamp x to the range [p2d_exp_lo, p2d_exp_hi]
+  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)); fx ends up holding n */
+  fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half);
+
+  fx = pfloor(fx);  // n = floor(x * LOG2EF + half)
+
+  tmp = pmul(fx, p2d_cephes_exp_C1);
+  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);  // x is now the remainder g = x - n*C1 - n*C2
+
+  Packet2d x2 = pmul(x,x);
+
+  Packet2d px = p2d_cephes_exp_p0;  // numerator: g * ((p0*g^2 + p1)*g^2 + p2)
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul (px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;  // denominator polynomial in g^2 over q0..q3
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px,psub(qx,px));
+  x = pmadd(p2d_2,x,p2d_1);  // x = p2d_2 * px/(qx - px) + p2d_1
+
+  // build 2^n: convert n to 64-bit integers, add 1023, shift left by 52
+  emm0 = ConvertToPacket2l(fx);
+
+#ifdef __POWER8_VECTOR__ 
+  emm0 = vec_add(emm0, p2l_1023);
+  emm0 = vec_sl(emm0, p2ul_52);
+#else
+  // Code is a bit complex for POWER7.  There is actually a
+  // vec_xxsldi intrinsic but it is not supported by some gcc versions.
+  // So we shift (52-32) bits and do a word swap with zeros.
+  _EIGEN_DECLARE_CONST_Packet4i(1023, 1023);
+  _EIGEN_DECLARE_CONST_Packet4i(20, 20);    // 52 - 32
+
+  Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);  // view the two 64-bit lanes as four 32-bit words
+  emm04i = vec_add(emm04i, p4i_1023);
+  emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
+  static const Packet16uc perm = {  // byte selector for the vec_perm calls below
+    0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
+    0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
+#ifdef  _BIG_ENDIAN
+  emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
+#else
+  emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
 #endif
 
-// Hyperbolic Tangent function.
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-ptanh<Packet4f>(const Packet4f& x) {
-  return internal::generic_fast_tanh_float(x);
+#endif
+
+  // Altivec's max & min operators just drop silent NaNs. Detect NaN lanes in the
+  // input (a NaN never compares equal to itself) and return them unmodified.
+  Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x),
+                 isnumber_mask);
 }
+#endif
 
 }  // end namespace internal
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h
index 83b75b974..08a27d153 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -31,33 +31,22 @@ namespace internal {
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
 #endif
 
-typedef __vector float                   Packet4f;
-typedef __vector int                     Packet4i;
-typedef __vector unsigned int            Packet4ui;
-typedef __vector __bool int              Packet4bi;
-typedef __vector short int               Packet8s;
-typedef __vector unsigned short int      Packet8us;
-typedef __vector int8_t                  Packet16c;
-typedef __vector uint8_t                 Packet16uc;
+typedef __vector float          Packet4f;
+typedef __vector int            Packet4i;
+typedef __vector unsigned int   Packet4ui;
+typedef __vector __bool int     Packet4bi;
+typedef __vector short int      Packet8i;
+typedef __vector unsigned char  Packet16uc;
 
 // We don't want to write the same code all the time, but we need to reuse the constants
 // and it doesn't really work to declare them global, so we define macros instead
 
 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = {X, X, X, X}
+  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
 
 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
   Packet4i p4i_##NAME = vec_splat_s32(X)
 
-#define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \
-  Packet4ui p4ui_##NAME = {X, X, X, X}
-
-#define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \
-  Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
-
-#define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \
-  Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
-
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
@@ -76,39 +65,32 @@ typedef __vector uint8_t                 Packet16uc;
 #define DST_CHAN 1
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
 
+
 // These constants are endian-agnostic
 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
-static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
-static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
-static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1}
-static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1);
 static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
 #ifndef __VSX__
 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0}
 #endif
 
-static Packet4f  p4f_COUNTDOWN  = { 0.0, 1.0, 2.0, 3.0 };
-static Packet4i  p4i_COUNTDOWN  = { 0, 1, 2, 3 };
-static Packet8s  p8s_COUNTDOWN  = { 0, 1, 2, 3, 4, 5, 6, 7 };
-static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 };
-static Packet16c  p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7,
-                                    8, 9, 10, 11, 12, 13, 14, 15};
-static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, 
-                                    8, 9, 10, 11, 12, 13, 14, 15};
+static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
+static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
 
 static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 };
-static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 };
-static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 };
-
 static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 };
-static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 };
-static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 };
 
-static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 };
+// Alignment mask: every bit set except the low four, so AND-ing an address with it rounds down to a multiple of 16
+#ifdef __PPC64__
+#define _EIGEN_MASK_ALIGNMENT	0xfffffffffffffff0
+#else
+#define _EIGEN_MASK_ALIGNMENT	0xfffffff0
+#endif
+
+#define _EIGEN_ALIGNED_PTR(x)	((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT)
 
 // Handle endianness properly while loading constants
 // Define global static constants:
@@ -147,27 +129,27 @@ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_L
   #define EIGEN_PPC_PREFETCH(ADDR) asm( "   dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
 #endif
 
-template <>
-struct packet_traits<float> : default_packet_traits {
+template<> struct packet_traits<float>  : default_packet_traits
+{
   typedef Packet4f type;
   typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size = 4,
+    size=4,
     HasHalfPacket = 1,
 
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasDiv = 1,
-    HasMin = 1,
-    HasMax = 1,
-    HasAbs = 1,
-    HasSin = EIGEN_FAST_MATH,
-    HasCos = EIGEN_FAST_MATH,
-    HasLog = 1,
-    HasExp = 1,
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 1,
 #ifdef __VSX__
     HasSqrt = 1,
 #if !EIGEN_COMP_CLANG
@@ -178,8 +160,6 @@ struct packet_traits<float> : default_packet_traits {
 #else
     HasSqrt = 0,
     HasRsqrt = 0,
-    HasTanh = EIGEN_FAST_MATH,
-    HasErf = EIGEN_FAST_MATH,
 #endif
     HasRound = 1,
     HasFloor = 1,
@@ -188,8 +168,8 @@ struct packet_traits<float> : default_packet_traits {
     HasBlend = 1
   };
 };
-template <>
-struct packet_traits<int> : default_packet_traits {
+template<> struct packet_traits<int>    : default_packet_traits
+{
   typedef Packet4i type;
   typedef Packet4i half;
   enum {
@@ -198,25 +178,6 @@ struct packet_traits<int> : default_packet_traits {
     size = 4,
     HasHalfPacket = 0,
 
-    HasAdd   = 1,
-    HasSub   = 1,
-    HasShift = 1,
-    HasMul   = 1,
-    HasDiv   = 0,
-    HasBlend = 1
-  };
-};
-
-template <>
-struct packet_traits<short int> : default_packet_traits {
-  typedef Packet8s type;
-  typedef Packet8s half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-    HasHalfPacket = 0,
-
     HasAdd  = 1,
     HasSub  = 1,
     HasMul  = 1,
@@ -225,120 +186,19 @@ struct packet_traits<short int> : default_packet_traits {
   };
 };
 
-template <>
-struct packet_traits<unsigned short int> : default_packet_traits {
-  typedef Packet8us type;
-  typedef Packet8us half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-    HasHalfPacket = 0,
 
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 0,
-    HasBlend = 1
-  };
-};
-
-template <>
-struct packet_traits<int8_t> : default_packet_traits {
-  typedef Packet16c type;
-  typedef Packet16c half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 16,
-    HasHalfPacket = 0,
-
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 0,
-    HasBlend = 1
-  };
-};
-
-template <>
-struct packet_traits<uint8_t> : default_packet_traits {
-  typedef Packet16uc type;
-  typedef Packet16uc half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 16,
-    HasHalfPacket = 0,
-
-    HasAdd  = 1,
-    HasSub  = 1,
-    HasMul  = 1,
-    HasDiv  = 0,
-    HasBlend = 1
-  };
-};
-
-template<> struct unpacket_traits<Packet4f>
-{
-  typedef float     type;
-  typedef Packet4f  half;
-  typedef Packet4i  integer_packet;
-  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
-};
-template<> struct unpacket_traits<Packet4i>
-{
-  typedef int       type;
-  typedef Packet4i  half;
-  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
-};
-template<> struct unpacket_traits<Packet8s>
-{
-  typedef short int type;
-  typedef Packet8s  half;
-  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
-};
-template<> struct unpacket_traits<Packet8us>
-{
-  typedef unsigned short int type;
-  typedef Packet8us          half;
-  enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
-};
-
-template<> struct unpacket_traits<Packet16c>
-{
-  typedef int8_t type;
-  typedef Packet16c  half;
-  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
-};
-template<> struct unpacket_traits<Packet16uc>
-{
-  typedef uint8_t type;
-  typedef Packet16uc  half;
-  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
-};
-
-inline std::ostream & operator <<(std::ostream & s, const Packet16c & v)
-{
-  union {
-    Packet16c   v;
-    int8_t n[16];
-  } vt;
-  vt.v = v;
-  for (int i=0; i< 16; i++)
-    s << vt.n[i] << ", ";
-  return s;
-}
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
 
 inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v)
 {
   union {
     Packet16uc   v;
-    uint8_t n[16];
+    unsigned char n[16];
   } vt;
   vt.v = v;
   for (int i=0; i< 16; i++)
-    s << vt.n[i] << ", ";
+    s << (int)vt.n[i] << ", ";
   return s;
 }
 
@@ -378,12 +238,9 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
 // Need to define them first or we get specialization after instantiation errors
 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
 {
-  // some versions of GCC throw "unused-but-set-parameter".
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(from);
   EIGEN_DEBUG_ALIGNED_LOAD
 #ifdef __VSX__
-  return vec_xl(0, from);
+  return vec_vsx_ld(0, from);
 #else
   return vec_ld(0, from);
 #endif
@@ -391,61 +248,19 @@ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
 
 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
 {
-  // some versions of GCC throw "unused-but-set-parameter".
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(from);
   EIGEN_DEBUG_ALIGNED_LOAD
 #ifdef __VSX__
-  return vec_xl(0, from);
+  return vec_vsx_ld(0, from);
 #else
   return vec_ld(0, from);
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from)
-{
-  // some versions of GCC throw "unused-but-set-parameter".
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(from);
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return vec_ld(0, from);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from)
-{
-  // some versions of GCC throw "unused-but-set-parameter".
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(from);
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return vec_ld(0, from);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t*     from)
-{
-  // some versions of GCC throw "unused-but-set-parameter".
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(from);
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return vec_ld(0, from);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t*     from)
-{
-  // some versions of GCC throw "unused-but-set-parameter".
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(from);
-  EIGEN_DEBUG_ALIGNED_LOAD
-  return vec_ld(0, from);
-}
-
 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
 {
-  // some versions of GCC throw "unused-but-set-parameter" (float *to).
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(to);
   EIGEN_DEBUG_ALIGNED_STORE
 #ifdef __VSX__
-  vec_xst(from, 0, to);
+  vec_vsx_st(from, 0, to);
 #else
   vec_st(from, 0, to);
 #endif
@@ -453,52 +268,14 @@ template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& f
 
 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
 {
-  // some versions of GCC throw "unused-but-set-parameter" (float *to).
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(to);
   EIGEN_DEBUG_ALIGNED_STORE
 #ifdef __VSX__
-  vec_xst(from, 0, to);
+  vec_vsx_st(from, 0, to);
 #else
   vec_st(from, 0, to);
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<short int>(short int*       to, const Packet8s& from)
-{
-  // some versions of GCC throw "unused-but-set-parameter" (float *to).
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(to);
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_st(from, 0, to);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int*       to, const Packet8us& from)
-{
-  // some versions of GCC throw "unused-but-set-parameter" (float *to).
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(to);
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_st(from, 0, to);
-}
-template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t*       to, const Packet16c& from)
-{
-  // some versions of GCC throw "unused-but-set-parameter" (float *to).
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(to);
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_st(from, 0, to);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t*       to, const Packet16uc& from)
-{
-  // some versions of GCC throw "unused-but-set-parameter" (float *to).
-  // ignoring these warnings for now.
-  EIGEN_UNUSED_VARIABLE(to);
-  EIGEN_DEBUG_ALIGNED_STORE
-  vec_st(from, 0, to);
-}
-
 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
   Packet4f v = {from, from, from, from};
   return v;
@@ -508,31 +285,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
   Packet4i v = {from, from, from, from};
   return v;
 }
-
-template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int&    from)   {
-  Packet8s v = {from, from, from, from, from, from, from, from};
-  return v;
-}
-
-template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int&    from)   {
-  Packet8us v = {from, from, from, from, from, from, from, from};
-  return v;
-}
-
-template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t&    from)   {
-  Packet16c v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
-  return v;
-}
-
-template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t&    from)   {
-  Packet16uc v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
-  return v;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
-  return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
-}
-
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4f>(const float *a,
                       Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
@@ -556,7 +308,7 @@ pbroadcast4<Packet4i>(const int *a,
 
 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
-  EIGEN_ALIGN16 float af[4];
+  float EIGEN_ALIGN16 af[4];
   af[0] = from[0*stride];
   af[1] = from[1*stride];
   af[2] = from[2*stride];
@@ -565,88 +317,16 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
 }
 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
 {
-  EIGEN_ALIGN16 int ai[4];
+  int EIGEN_ALIGN16 ai[4];
   ai[0] = from[0*stride];
   ai[1] = from[1*stride];
   ai[2] = from[2*stride];
   ai[3] = from[3*stride];
  return pload<Packet4i>(ai);
 }
-template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<short int, Packet8s>(const short int* from, Index stride)
-{
-  EIGEN_ALIGN16 short int ai[8];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
-  ai[4] = from[4*stride];
-  ai[5] = from[5*stride];
-  ai[6] = from[6*stride];
-  ai[7] = from[7*stride];
-  return pload<Packet8s>(ai);
-}
-
-template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from, Index stride)
-{
-  EIGEN_ALIGN16 unsigned short int ai[8];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
-  ai[4] = from[4*stride];
-  ai[5] = from[5*stride];
-  ai[6] = from[6*stride];
-  ai[7] = from[7*stride];
-  return pload<Packet8us>(ai);
-}
-
-template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
-{
-  EIGEN_ALIGN16 int8_t ai[16];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
-  ai[4] = from[4*stride];
-  ai[5] = from[5*stride];
-  ai[6] = from[6*stride];
-  ai[7] = from[7*stride];
-  ai[8] = from[8*stride];
-  ai[9] = from[9*stride];
-  ai[10] = from[10*stride];
-  ai[11] = from[11*stride];
-  ai[12] = from[12*stride];
-  ai[13] = from[13*stride];
-  ai[14] = from[14*stride];
-  ai[15] = from[15*stride];
-  return pload<Packet16c>(ai);
-}
-
-template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
-{
-  EIGEN_ALIGN16 uint8_t ai[16];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
-  ai[4] = from[4*stride];
-  ai[5] = from[5*stride];
-  ai[6] = from[6*stride];
-  ai[7] = from[7*stride];
-  ai[8] = from[8*stride];
-  ai[9] = from[9*stride];
-  ai[10] = from[10*stride];
-  ai[11] = from[11*stride];
-  ai[12] = from[12*stride];
-  ai[13] = from[13*stride];
-  ai[14] = from[14*stride];
-  ai[15] = from[15*stride];
-  return pload<Packet16uc>(ai);
-}
-
 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
-  EIGEN_ALIGN16 float af[4];
+  float EIGEN_ALIGN16 af[4];
   pstore<float>(af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
@@ -655,7 +335,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
 {
-  EIGEN_ALIGN16 int ai[4];
+  int EIGEN_ALIGN16 ai[4];
   pstore<int>((int *)ai, from);
   to[0*stride] = ai[0];
   to[1*stride] = ai[1];
@@ -663,52 +343,14 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const
   to[3*stride] = ai[3];
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<short int, Packet8s>(short int* to, const Packet8s& from, Index stride)
-{
-  EIGEN_ALIGN16 short int ai[8];
-  pstore<short int>((short int *)ai, from);
-  to[0*stride] = ai[0];
-  to[1*stride] = ai[1];
-  to[2*stride] = ai[2];
-  to[3*stride] = ai[3];
-  to[4*stride] = ai[4];
-  to[5*stride] = ai[5];
-  to[6*stride] = ai[6];
-  to[7*stride] = ai[7];
-}
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; }
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)   { return pset1<Packet4i>(a) + p4i_COUNTDOWN; }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<unsigned short int, Packet8us>(unsigned short int* to, const Packet8us& from, Index stride)
-{
-  EIGEN_ALIGN16 unsigned short int ai[8];
-  pstore<unsigned short int>((unsigned short int *)ai, from);
-  to[0*stride] = ai[0];
-  to[1*stride] = ai[1];
-  to[2*stride] = ai[2];
-  to[3*stride] = ai[3];
-  to[4*stride] = ai[4];
-  to[5*stride] = ai[5];
-  to[6*stride] = ai[6];
-  to[7*stride] = ai[7];
-}
+template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; }
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; }
 
-template<> EIGEN_STRONG_INLINE Packet4f   plset<Packet4f>(const float&     a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN;  }
-template<> EIGEN_STRONG_INLINE Packet4i   plset<Packet4i>(const int&       a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN;  }
-template<> EIGEN_STRONG_INLINE Packet8s   plset<Packet8s>(const short int& a) { return pset1<Packet8s>(a) + p8s_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet8us  plset<Packet8us>(const unsigned short int& a) { return pset1<Packet8us>(a) + p8us_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet16c  plset<Packet16c>(const int8_t& a)   { return pset1<Packet16c>(a) + p16c_COUNTDOWN; }
-template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a)   { return pset1<Packet16uc>(a) + p16uc_COUNTDOWN; }
-
-template<> EIGEN_STRONG_INLINE Packet4f   padd<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet4i   padd<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet8s   padd<Packet8s>  (const Packet8s&   a, const Packet8s&   b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet8us  padd<Packet8us> (const Packet8us&  a, const Packet8us&  b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet16c  padd<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return a + b; }
-template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a + b; }
-
-template<> EIGEN_STRONG_INLINE Packet4f   psub<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet4i   psub<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet16c  psub<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return a - b; }
-template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; }
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; }
 
 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; }
 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; }
@@ -716,10 +358,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
 
-template<> EIGEN_STRONG_INLINE Packet4f   pmul<Packet4f>  (const Packet4f&   a, const Packet4f&   b) { return vec_madd(a,b, p4f_MZERO); }
-template<> EIGEN_STRONG_INLINE Packet4i   pmul<Packet4i>  (const Packet4i&   a, const Packet4i&   b) { return a * b; }
-template<> EIGEN_STRONG_INLINE Packet16c  pmul<Packet16c> (const Packet16c&  a, const Packet16c&  b) { return vec_mul(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); }
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; }
 
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
@@ -751,7 +391,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i&
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   #ifdef __VSX__
-  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
   Packet4f ret;
   __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;
@@ -760,15 +399,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const
   #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   #ifdef __VSX__
-  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
   Packet4f ret;
   __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;
@@ -777,19 +411,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const
   #endif
 }
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmple(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast<Packet4f>(vec_cmpeq(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
-  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a,b));
-  return vec_nor(c,c);
-}
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast<Packet4i>(vec_cmpeq(a,b)); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
@@ -803,19 +424,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
-  return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
-}
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
-    Packet4f t = vec_add(reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
-    Packet4f res;
-
-    __asm__("vrfiz %0, %1\n\t"
-        : "=v" (res)
-        : "v" (t));
-
-    return res;
-}
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); }
 
@@ -842,82 +451,17 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
   mask = vec_lvsl(0, from);                        // create the permute mask
   return (Packet4i) vec_perm(MSQ, LSQ, mask);    // align the data
 }
-template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
-{
-  EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return static_cast<Packet8s>(vec_perm(MSQ, LSQ, mask));    // align the data
-}
-template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
-{
-  EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return static_cast<Packet8us>(vec_perm(MSQ, LSQ, mask));    // align the data
-}
-template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const char* from)
-{
-  EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, from);          // most significant quadword
-  LSQ = vec_ld(15, from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return static_cast<Packet16c>(vec_perm(MSQ, LSQ, mask));    // align the data
-}
-
-template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from)
-{
-  EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, from);          // most significant quadword
-  LSQ = vec_ld(15, from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return static_cast<Packet16uc>(vec_perm(MSQ, LSQ, mask));    // align the data
-}
 #else
-// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
+// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
-  return vec_xl(0, from);
+  return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from));
 }
 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
 {
   EIGEN_DEBUG_UNALIGNED_LOAD
-  return vec_xl(0, from);
-}
-template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return vec_vsx_ld(0, from);
-}
-template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return vec_vsx_ld(0, from);
-}
-template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return vec_vsx_ld(0, from);
-}
-template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return vec_vsx_ld(0, from);
+  return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from));
 }
 #endif
 
@@ -928,7 +472,6 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
   else                                  p = ploadu<Packet4f>(from);
   return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }
-
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 {
   Packet4i p;
@@ -937,54 +480,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
   return vec_perm(p, p, p16uc_DUPLICATE32_HI);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int*     from)
-{
-  Packet8s p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8s>(from);
-  else                                  p = ploadu<Packet8s>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE16_HI);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int*     from)
-{
-  Packet8us p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8us>(from);
-  else                                  p = ploadu<Packet8us>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE16_HI);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int*     from)
-{
-  Packet8s p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8s>(from);
-  else                                  p = ploadu<Packet8s>(from);
-  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
-}
-
-template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int*     from)
-{
-  Packet8us p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet8us>(from);
-  else                                  p = ploadu<Packet8us>(from);
-  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t*     from)
-{
-  Packet16c p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet16c>(from);
-  else                                  p = ploadu<Packet16c>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE8_HI);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t*     from)
-{
-  Packet16uc p;
-  if((std::ptrdiff_t(from) % 16) == 0)  p = pload<Packet16uc>(from);
-  else                                  p = ploadu<Packet16uc>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE8_HI);
-}
-
 #ifdef _BIG_ENDIAN
 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
 {
@@ -1022,151 +517,25 @@ template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& f
   vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
   vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int*      to, const Packet8s& from)
-{
-  EIGEN_DEBUG_UNALIGNED_STORE
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  // Warning: not thread safe!
-  Packet16uc MSQ, LSQ, edges;
-  Packet16uc edgeAlign, align;
-
-  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
-  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
-  edges = vec_perm(LSQ, MSQ, edgeAlign);                      // extract the edges
-  align = vec_lvsr( 0, to );                                // permute map to misalign data
-  MSQ = vec_perm(edges, (Packet16uc) from, align);          // misalign the data (MSQ)
-  LSQ = vec_perm((Packet16uc) from, edges, align);          // misalign the data (LSQ)
-  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int*      to, const Packet8us& from)
-{
-  EIGEN_DEBUG_UNALIGNED_STORE
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  // Warning: not thread safe!
-  Packet16uc MSQ, LSQ, edges;
-  Packet16uc edgeAlign, align;
-
-  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
-  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
-  edges = vec_perm(LSQ, MSQ, edgeAlign);                      // extract the edges
-  align = vec_lvsr( 0, to );                                // permute map to misalign data
-  MSQ = vec_perm(edges, (Packet16uc) from, align);          // misalign the data (MSQ)
-  LSQ = vec_perm((Packet16uc) from, edges, align);          // misalign the data (LSQ)
-  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
-}
-
-template<> EIGEN_STRONG_INLINE void pstoreu<char>(char*      to, const Packet16c& from)
-{
-  EIGEN_DEBUG_UNALIGNED_STORE
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  // Warning: not thread safe!
-  Packet16uc MSQ, LSQ, edges;
-  Packet16uc edgeAlign, align;
-
-  MSQ = vec_ld(0, to);                     // most significant quadword
-  LSQ = vec_ld(15,to);                    // least significant quadword
-  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
-  edges=vec_perm(LSQ, MSQ, edgeAlign);                      // extract the edges
-  align = vec_lvsr( 0, to );                                // permute map to misalign data
-  MSQ = vec_perm(edges, (Packet16uc) from, align);          // misalign the data (MSQ)
-  LSQ = vec_perm((Packet16uc) from, edges, align);          // misalign the data (LSQ)
-  vec_st( LSQ, 15, to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, to );                    // Store the MSQ part
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char*      to, const Packet16uc& from)
-{
-  EIGEN_DEBUG_UNALIGNED_STORE
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  // Warning: not thread safe!
-  Packet16uc MSQ, LSQ, edges;
-  Packet16uc edgeAlign, align;
-
-  MSQ = vec_ld(0, to);                     // most significant quadword
-  LSQ = vec_ld(15,to);                    // least significant quadword
-  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
-  edges=vec_perm(LSQ, MSQ, edgeAlign);                      // extract the edges
-  align = vec_lvsr( 0, to );                                // permute map to misalign data
-  MSQ = vec_perm(edges, (Packet16uc) from, align);          // misalign the data (MSQ)
-  LSQ = vec_perm((Packet16uc) from, edges, align);          // misalign the data (LSQ)
-  vec_st( LSQ, 15, to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, to );                    // Store the MSQ part
-}
 #else
-// We also need to redefine little endian loading of Packet4i/Packet4f using VSX
+// We also need to redefine little endian storing of Packet4i/Packet4f using VSX
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from)
 {
-  EIGEN_DEBUG_UNALIGNED_STORE
-  vec_xst(from, 0, to);
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to));
 }
 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from)
 {
-  EIGEN_DEBUG_UNALIGNED_STORE
-  vec_xst(from, 0, to);
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<short int>(short int*       to, const Packet8s& from)
-{
-  EIGEN_DEBUG_UNALIGNED_STORE
-  /*GCC provides a commonly used synonym for vec_xst called vec_vsx_st.
-   * Although these have the same behavior,
-   *  only vec_xst is guaranteed to be portable across compliant compilers
-   *  vec_xst should be preferred. */
-  vec_xst(from, 0, to);
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int*       to, const Packet8us& from)
-{
-  EIGEN_DEBUG_UNALIGNED_STORE
-  /*GCC provides a commonly used synonym for vec_xst called vec_vsx_st.
-   * Although these have the same behavior,
-   *  only vec_xst is guaranteed to be portable across compliant compilers
-   *  vec_xst should be preferred. */
-  vec_xst(from, 0, to);
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t*   to, const Packet16c& from)
-{
-  EIGEN_DEBUG_UNALIGNED_STORE
-  vec_vsx_st(from, 0, to);
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t*   to, const Packet16uc& from)
-{
-  EIGEN_DEBUG_UNALIGNED_STORE
-  vec_vsx_st(from, 0, to);
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to));
 }
 #endif
 
 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr)    { EIGEN_PPC_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr)    { EIGEN_PPC_PREFETCH(addr); }
 
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { EIGEN_ALIGN16 int   x; vec_ste(a, 0, &x); return x; }
-
-template<> EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) { 
-  EIGEN_ALIGN16 short int x;
-  vec_ste(a, 0, &x);
-  return x;
-}
-
-template<> EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) { 
-  EIGEN_ALIGN16 unsigned short int x;
-  vec_ste(a, 0, &x);
-  return x;
-}
-
-template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a)
-{
-  EIGEN_ALIGN16 int8_t x;
-  vec_ste(a, 0, &x);
-  return x;
-}
-template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a)
-{
-  EIGEN_ALIGN16 uint8_t x;
-  vec_ste(a, 0, &x);
-  return x;
-}
+template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; }
 
 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
 {
@@ -1174,46 +543,10 @@ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
 }
 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
 {
-  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
-}
-template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
-{
-  return reinterpret_cast<Packet8s>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
-}
-template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
-{
-  return reinterpret_cast<Packet8us>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
-}
-template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
-{
-  return vec_perm(a, a, p16uc_REVERSE8);
-}
-template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
-{
-  return vec_perm(a, a, p16uc_REVERSE8);
-}
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
-
-template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a)
-{ return vec_sra(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
-{ return vec_sr(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a)
-{ return vec_sl(a,reinterpret_cast<Packet4ui>(pset1<Packet4i>(N))); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
-  return pfrexp_float(a,exponent);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
-  return pldexp_float(a,exponent);
-}
 
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
@@ -1225,6 +558,34 @@ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
   return pfirst(sum);
 }
 
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  Packet4f v[4], sum[4];
+
+  // It's easier and faster to transpose then add as columns
+  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
+  // Do the transpose, first set of moves
+  v[0] = vec_mergeh(vecs[0], vecs[2]);
+  v[1] = vec_mergel(vecs[0], vecs[2]);
+  v[2] = vec_mergeh(vecs[1], vecs[3]);
+  v[3] = vec_mergel(vecs[1], vecs[3]);
+  // Get the resulting vectors
+  sum[0] = vec_mergeh(v[0], v[2]);
+  sum[1] = vec_mergel(v[0], v[2]);
+  sum[2] = vec_mergeh(v[1], v[3]);
+  sum[3] = vec_mergel(v[1], v[3]);
+
+  // Now do the summation:
+  // Lines 0+1
+  sum[0] = sum[0] + sum[1];
+  // Lines 2+3
+  sum[1] = sum[2] + sum[3];
+  // Add the results
+  sum[0] = sum[0] + sum[1];
+
+  return sum[0];
+}
+
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i sum;
@@ -1237,85 +598,34 @@ template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
   return pfirst(sum);
 }
 
-template<> EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a)
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 {
-  union{
-    Packet8s v;
-    short int n[8];
-  } vt;
-  vt.v = a;
+  Packet4i v[4], sum[4];
 
-  EIGEN_ALIGN16 int  first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
-  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
-  Packet4i first_half  = pload<Packet4i>(first_loader);
-  Packet4i second_half = pload<Packet4i>(second_loader);
+  // It's easier and faster to transpose then add as columns
+  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
+  // Do the transpose, first set of moves
+  v[0] = vec_mergeh(vecs[0], vecs[2]);
+  v[1] = vec_mergel(vecs[0], vecs[2]);
+  v[2] = vec_mergeh(vecs[1], vecs[3]);
+  v[3] = vec_mergel(vecs[1], vecs[3]);
+  // Get the resulting vectors
+  sum[0] = vec_mergeh(v[0], v[2]);
+  sum[1] = vec_mergel(v[0], v[2]);
+  sum[2] = vec_mergeh(v[1], v[3]);
+  sum[3] = vec_mergel(v[1], v[3]);
 
-  return static_cast<short int>(predux(first_half) + predux(second_half));
+  // Now do the summation:
+  // Lines 0+1
+  sum[0] = sum[0] + sum[1];
+  // Lines 2+3
+  sum[1] = sum[2] + sum[3];
+  // Add the results
+  sum[0] = sum[0] + sum[1];
+
+  return sum[0];
 }
 
-template<> EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a)
-{
-  union{
-    Packet8us v;
-    unsigned short int n[8];
-  } vt;
-  vt.v = a;
-
-  //There is no predux for Packet4ui. So we are intentionally using int
-  EIGEN_ALIGN16 int first_loader[4]  = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
-  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
-  Packet4i first_half  = pload<Packet4i>(first_loader);
-  Packet4i second_half = pload<Packet4i>(second_loader);
-
-  return static_cast<unsigned short int>(predux(first_half) + predux(second_half));
-}
-
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a)
-{
-  union{
-    Packet16c v;
-    int8_t n[16];
-  } vt;
-  vt.v = a;
-
-  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
-  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
-  EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
-  EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
-
-  Packet4i first_quarter = pload<Packet4i>(first_loader);
-  Packet4i second_quarter = pload<Packet4i>(second_loader);
-  Packet4i third_quarter = pload<Packet4i>(third_loader);
-  Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
-
-  return static_cast<int8_t>(predux(first_quarter) + predux(second_quarter)
-		                  + predux(third_quarter) + predux(fourth_quarter));
-}
-
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a)
-{
-  union{
-    Packet16uc v;
-    uint8_t n[16];
-  } vt;
-  vt.v = a;
-
-  EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] };
-  EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] };
-  EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] };
-  EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] };
-
-  Packet4i first_quarter = pload<Packet4i>(first_loader);
-  Packet4i second_quarter = pload<Packet4i>(second_loader);
-  Packet4i third_quarter = pload<Packet4i>(third_loader);
-  Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
-
-
-  return static_cast<uint8_t>(predux(first_quarter) + predux(second_quarter)
-		                  + predux(third_quarter) + predux(fourth_quarter));
-}
-
-
 // Other reduction functions:
 // mul
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
@@ -1332,52 +642,6 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
   return aux[0] * aux[1] * aux[2] * aux[3];
 }
 
-template<> EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a)
-{
-  Packet8s pair, quad, octo;
-
-  pair = vec_mul(a, vec_sld(a, a, 8));
-  quad = vec_mul(pair, vec_sld(pair, pair, 4));
-  octo = vec_mul(quad, vec_sld(quad, quad, 2));
-
-  return pfirst(octo);
-}
-
-template<> EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a)
-{
-  Packet8us pair, quad, octo;
-
-  pair = vec_mul(a, vec_sld(a, a, 8));
-  quad = vec_mul(pair, vec_sld(pair, pair, 4));
-  octo = vec_mul(quad, vec_sld(quad, quad, 2));
-
-  return pfirst(octo);
-}
-
-template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
-{
-  Packet16c pair, quad, octo, result;
-
-  pair = vec_mul(a, vec_sld(a, a, 8));
-  quad = vec_mul(pair, vec_sld(pair, pair, 4));
-  octo = vec_mul(quad, vec_sld(quad, quad, 2));
-  result = vec_mul(octo, vec_sld(octo, octo, 1));
-
-  return pfirst(result);
-}
-
-template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
-{
-  Packet16uc pair, quad, octo, result;
-
-  pair = vec_mul(a, vec_sld(a, a, 8));
-  quad = vec_mul(pair, vec_sld(pair, pair, 4));
-  octo = vec_mul(quad, vec_sld(quad, quad, 2));
-  result = vec_mul(octo, vec_sld(octo, octo, 1));
-
-  return pfirst(result);
-}
-
 // min
 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
 {
@@ -1395,59 +659,6 @@ template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
   return pfirst(res);
 }
 
-template<> EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a)
-{
-  Packet8s pair, quad, octo;
-  
-  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
-  pair = vec_min(a, vec_sld(a, a, 8)); 
-
-  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
-  quad = vec_min(pair, vec_sld(pair, pair, 4));
-
-  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
-  octo = vec_min(quad, vec_sld(quad, quad, 2));
-  return pfirst(octo);
-}
-
-template<> EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a)
-{
-  Packet8us pair, quad, octo;
-  
-  //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
-  pair = vec_min(a, vec_sld(a, a, 8)); 
-
-  //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
-  quad = vec_min(pair, vec_sld(pair, pair, 4));
-
-  //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
-  octo = vec_min(quad, vec_sld(quad, quad, 2));
-  return pfirst(octo);
-}
-
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a)
-{
-  Packet16c pair, quad, octo, result;
-
-  pair = vec_min(a, vec_sld(a, a, 8));
-  quad = vec_min(pair, vec_sld(pair, pair, 4));
-  octo = vec_min(quad, vec_sld(quad, quad, 2));
-  result = vec_min(octo, vec_sld(octo, octo, 1));
-
-  return pfirst(result);
-}
-
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a)
-{
-  Packet16uc pair, quad, octo, result;
-
-  pair = vec_min(a, vec_sld(a, a, 8));
-  quad = vec_min(pair, vec_sld(pair, pair, 4));
-  octo = vec_min(quad, vec_sld(quad, quad, 2));
-  result = vec_min(octo, vec_sld(octo, octo, 1));
-
-  return pfirst(result);
-}
 // max
 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
 {
@@ -1465,64 +676,59 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
   return pfirst(res);
 }
 
-template<> EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a)
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
 {
-  Packet8s pair, quad, octo;
-  
-  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
-  pair = vec_max(a, vec_sld(a, a, 8)); 
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+#ifdef _BIG_ENDIAN
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(first, second, 4); break;
+    case 2:
+      first = vec_sld(first, second, 8); break;
+    case 3:
+      first = vec_sld(first, second, 12); break;
+    }
+#else
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(second, first, 12); break;
+    case 2:
+      first = vec_sld(second, first, 8); break;
+    case 3:
+      first = vec_sld(second, first, 4); break;
+    }
+#endif
+  }
+};
 
-  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
-  quad = vec_max(pair, vec_sld(pair, pair, 4));
-
-  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
-  octo = vec_max(quad, vec_sld(quad, quad, 2));
-  return pfirst(octo);
-}
-
-template<> EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a)
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
 {
-  Packet8us pair, quad, octo;
-  
-  //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
-  pair = vec_max(a, vec_sld(a, a, 8)); 
-
-  //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
-  quad = vec_max(pair, vec_sld(pair, pair, 4));
-
-  //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
-  octo = vec_max(quad, vec_sld(quad, quad, 2));
-  return pfirst(octo);
-}
-
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a)
-{
-  Packet16c pair, quad, octo, result;
-
-  pair = vec_max(a, vec_sld(a, a, 8));
-  quad = vec_max(pair, vec_sld(pair, pair, 4));
-  octo = vec_max(quad, vec_sld(quad, quad, 2));
-  result = vec_max(octo, vec_sld(octo, octo, 1));
-
-  return pfirst(result);
-}
-
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a)
-{
-  Packet16uc pair, quad, octo, result;
-
-  pair = vec_max(a, vec_sld(a, a, 8));
-  quad = vec_max(pair, vec_sld(pair, pair, 4));
-  octo = vec_max(quad, vec_sld(quad, quad, 2));
-  result = vec_max(octo, vec_sld(octo, octo, 1));
-
-  return pfirst(result);
-}
-
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
-{
-  return vec_any_ne(x, pzero(x));
-}
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
+  {
+#ifdef _BIG_ENDIAN
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(first, second, 4); break;
+    case 2:
+      first = vec_sld(first, second, 8); break;
+    case 3:
+      first = vec_sld(first, second, 12); break;
+    }
+#else
+    switch (Offset % 4) {
+    case 1:
+      first = vec_sld(second, first, 12); break;
+    case 2:
+      first = vec_sld(second, first, 8); break;
+    case 3:
+      first = vec_sld(second, first, 4); break;
+    }
+#endif
+  }
+};
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4f,4>& kernel) {
@@ -1550,267 +756,6 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
   kernel.packet[3] = vec_mergel(t1, t3);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8s,4>& kernel) {
-  Packet8s t0, t1, t2, t3;
-  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
-  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
-  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
-  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
-  kernel.packet[0] = vec_mergeh(t0, t2);
-  kernel.packet[1] = vec_mergel(t0, t2);
-  kernel.packet[2] = vec_mergeh(t1, t3);
-  kernel.packet[3] = vec_mergel(t1, t3);
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8us,4>& kernel) {
-  Packet8us t0, t1, t2, t3;
-  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
-  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
-  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
-  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
-  kernel.packet[0] = vec_mergeh(t0, t2);
-  kernel.packet[1] = vec_mergel(t0, t2);
-  kernel.packet[2] = vec_mergeh(t1, t3);
-  kernel.packet[3] = vec_mergel(t1, t3);
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16c,4>& kernel) {
-  Packet16c t0, t1, t2, t3;
-  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
-  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
-  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
-  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
-  kernel.packet[0] = vec_mergeh(t0, t2);
-  kernel.packet[1] = vec_mergel(t0, t2);
-  kernel.packet[2] = vec_mergeh(t1, t3);
-  kernel.packet[3] = vec_mergel(t1, t3);
-}
-
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16uc,4>& kernel) {
-  Packet16uc t0, t1, t2, t3;
-  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
-  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
-  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
-  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
-  kernel.packet[0] = vec_mergeh(t0, t2);
-  kernel.packet[1] = vec_mergel(t0, t2);
-  kernel.packet[2] = vec_mergeh(t1, t3);
-  kernel.packet[3] = vec_mergel(t1, t3);
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8s,8>& kernel) {
-  Packet8s v[8], sum[8];
-
-  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
-  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
-  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
-  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
-  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
-  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
-  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
-  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
-  sum[0] = vec_mergeh(v[0], v[4]);
-  sum[1] = vec_mergel(v[0], v[4]);
-  sum[2] = vec_mergeh(v[1], v[5]);
-  sum[3] = vec_mergel(v[1], v[5]);
-  sum[4] = vec_mergeh(v[2], v[6]);
-  sum[5] = vec_mergel(v[2], v[6]);
-  sum[6] = vec_mergeh(v[3], v[7]);
-  sum[7] = vec_mergel(v[3], v[7]);
-
-  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
-  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
-  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
-  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
-  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
-  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
-  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
-  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet8us,8>& kernel) {
-  Packet8us v[8], sum[8];
-
-  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
-  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
-  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
-  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
-  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
-  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
-  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
-  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
-  sum[0] = vec_mergeh(v[0], v[4]);
-  sum[1] = vec_mergel(v[0], v[4]);
-  sum[2] = vec_mergeh(v[1], v[5]);
-  sum[3] = vec_mergel(v[1], v[5]);
-  sum[4] = vec_mergeh(v[2], v[6]);
-  sum[5] = vec_mergel(v[2], v[6]);
-  sum[6] = vec_mergeh(v[3], v[7]);
-  sum[7] = vec_mergel(v[3], v[7]);
-
-  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
-  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
-  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
-  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
-  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
-  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
-  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
-  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16c,16>& kernel) {
-  Packet16c step1[16], step2[16], step3[16];
-
-  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
-  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
-  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
-  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
-  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
-  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
-  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
-  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
-  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
-  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
-  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
-  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
-  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
-  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
-  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
-  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
-
-  step2[0]  = vec_mergeh(step1[0], step1[8]);
-  step2[1]  = vec_mergel(step1[0], step1[8]);
-  step2[2]  = vec_mergeh(step1[1], step1[9]);
-  step2[3]  = vec_mergel(step1[1], step1[9]);
-  step2[4]  = vec_mergeh(step1[2], step1[10]);
-  step2[5]  = vec_mergel(step1[2], step1[10]);
-  step2[6]  = vec_mergeh(step1[3], step1[11]);
-  step2[7]  = vec_mergel(step1[3], step1[11]);
-  step2[8]  = vec_mergeh(step1[4], step1[12]);
-  step2[9]  = vec_mergel(step1[4], step1[12]);
-  step2[10] = vec_mergeh(step1[5], step1[13]);
-  step2[11] = vec_mergel(step1[5], step1[13]);
-  step2[12] = vec_mergeh(step1[6], step1[14]);
-  step2[13] = vec_mergel(step1[6], step1[14]);
-  step2[14] = vec_mergeh(step1[7], step1[15]);
-  step2[15] = vec_mergel(step1[7], step1[15]);
-
-  step3[0]  = vec_mergeh(step2[0], step2[8]);
-  step3[1]  = vec_mergel(step2[0], step2[8]);
-  step3[2]  = vec_mergeh(step2[1], step2[9]);
-  step3[3]  = vec_mergel(step2[1], step2[9]);
-  step3[4]  = vec_mergeh(step2[2], step2[10]);
-  step3[5]  = vec_mergel(step2[2], step2[10]);
-  step3[6]  = vec_mergeh(step2[3], step2[11]);
-  step3[7]  = vec_mergel(step2[3], step2[11]);
-  step3[8]  = vec_mergeh(step2[4], step2[12]);
-  step3[9]  = vec_mergel(step2[4], step2[12]);
-  step3[10] = vec_mergeh(step2[5], step2[13]);
-  step3[11] = vec_mergel(step2[5], step2[13]);
-  step3[12] = vec_mergeh(step2[6], step2[14]);
-  step3[13] = vec_mergel(step2[6], step2[14]);
-  step3[14] = vec_mergeh(step2[7], step2[15]);
-  step3[15] = vec_mergel(step2[7], step2[15]);
-
-  kernel.packet[0]  = vec_mergeh(step3[0], step3[8]);
-  kernel.packet[1]  = vec_mergel(step3[0], step3[8]);
-  kernel.packet[2]  = vec_mergeh(step3[1], step3[9]);
-  kernel.packet[3]  = vec_mergel(step3[1], step3[9]);
-  kernel.packet[4]  = vec_mergeh(step3[2], step3[10]);
-  kernel.packet[5]  = vec_mergel(step3[2], step3[10]);
-  kernel.packet[6]  = vec_mergeh(step3[3], step3[11]);
-  kernel.packet[7]  = vec_mergel(step3[3], step3[11]);
-  kernel.packet[8]  = vec_mergeh(step3[4], step3[12]);
-  kernel.packet[9]  = vec_mergel(step3[4], step3[12]);
-  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
-  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
-  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
-  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
-  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
-  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16uc,16>& kernel) {
-  Packet16uc step1[16], step2[16], step3[16];
-
-  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
-  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
-  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
-  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
-  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
-  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
-  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
-  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
-  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
-  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
-  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
-  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
-  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
-  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
-  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
-  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
-
-  step2[0]  = vec_mergeh(step1[0], step1[8]);
-  step2[1]  = vec_mergel(step1[0], step1[8]);
-  step2[2]  = vec_mergeh(step1[1], step1[9]);
-  step2[3]  = vec_mergel(step1[1], step1[9]);
-  step2[4]  = vec_mergeh(step1[2], step1[10]);
-  step2[5]  = vec_mergel(step1[2], step1[10]);
-  step2[6]  = vec_mergeh(step1[3], step1[11]);
-  step2[7]  = vec_mergel(step1[3], step1[11]);
-  step2[8]  = vec_mergeh(step1[4], step1[12]);
-  step2[9]  = vec_mergel(step1[4], step1[12]);
-  step2[10] = vec_mergeh(step1[5], step1[13]);
-  step2[11] = vec_mergel(step1[5], step1[13]);
-  step2[12] = vec_mergeh(step1[6], step1[14]);
-  step2[13] = vec_mergel(step1[6], step1[14]);
-  step2[14] = vec_mergeh(step1[7], step1[15]);
-  step2[15] = vec_mergel(step1[7], step1[15]);
-
-  step3[0]  = vec_mergeh(step2[0], step2[8]);
-  step3[1]  = vec_mergel(step2[0], step2[8]);
-  step3[2]  = vec_mergeh(step2[1], step2[9]);
-  step3[3]  = vec_mergel(step2[1], step2[9]);
-  step3[4]  = vec_mergeh(step2[2], step2[10]);
-  step3[5]  = vec_mergel(step2[2], step2[10]);
-  step3[6]  = vec_mergeh(step2[3], step2[11]);
-  step3[7]  = vec_mergel(step2[3], step2[11]);
-  step3[8]  = vec_mergeh(step2[4], step2[12]);
-  step3[9]  = vec_mergel(step2[4], step2[12]);
-  step3[10] = vec_mergeh(step2[5], step2[13]);
-  step3[11] = vec_mergel(step2[5], step2[13]);
-  step3[12] = vec_mergeh(step2[6], step2[14]);
-  step3[13] = vec_mergel(step2[6], step2[14]);
-  step3[14] = vec_mergeh(step2[7], step2[15]);
-  step3[15] = vec_mergel(step2[7], step2[15]);
-
-  kernel.packet[0]  = vec_mergeh(step3[0], step3[8]);
-  kernel.packet[1]  = vec_mergel(step3[0], step3[8]);
-  kernel.packet[2]  = vec_mergeh(step3[1], step3[9]);
-  kernel.packet[3]  = vec_mergel(step3[1], step3[9]);
-  kernel.packet[4]  = vec_mergeh(step3[2], step3[10]);
-  kernel.packet[5]  = vec_mergel(step3[2], step3[10]);
-  kernel.packet[6]  = vec_mergeh(step3[3], step3[11]);
-  kernel.packet[7]  = vec_mergel(step3[3], step3[11]);
-  kernel.packet[8]  = vec_mergeh(step3[4], step3[12]);
-  kernel.packet[9]  = vec_mergel(step3[4], step3[12]);
-  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
-  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
-  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
-  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
-  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
-  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
-}
-
 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
   Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
   Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE)));
@@ -1823,77 +768,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
   return vec_sel(elsePacket, thenPacket, mask);
 }
 
-template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) {
-  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
-                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
-  Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(select, p8us_ONE));
-  Packet8s result = vec_sel(elsePacket, thenPacket, mask);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) {
-  Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
-                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] };
-  Packet8us mask = reinterpret_cast<Packet8us>(vec_cmpeq(reinterpret_cast<Packet8us>(select), p8us_ONE));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) {
-  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
-                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
-                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
-                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
-
-  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
-template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) {
-  Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
-                       ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7],
-                       ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11],
-                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] };
-
-  Packet16uc mask = reinterpret_cast<Packet16uc>(vec_cmpeq(reinterpret_cast<Packet16uc>(select), p16uc_ONE));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
-template <>
-struct type_casting_traits<float, int> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template <>
-struct type_casting_traits<int, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
-  return vec_cts(a,0);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
-  return vec_ctf(a,0);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
-  return reinterpret_cast<Packet4i>(a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
-  return reinterpret_cast<Packet4f>(a);
-}
-
-
 
 //---------- double ----------
 #ifdef __VSX__
@@ -1961,7 +835,7 @@ template<> struct packet_traits<double> : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
 
 inline std::ostream & operator <<(std::ostream & s, const Packet2l & v)
 {
@@ -1989,13 +863,21 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
 {
   EIGEN_DEBUG_ALIGNED_LOAD
-  return vec_xl(0, const_cast<double *>(from)); // cast needed by Clang
+#ifdef __VSX__ // NOTE(review): the double section above is already opened by `#ifdef __VSX__`; if that guard is still open here the #else branch is dead - confirm
+  return vec_vsx_ld(0, from); // VSX build: load from `from` at byte offset 0
+#else
+  return vec_ld(0, from); // non-VSX build: vec_ld from `from` at byte offset 0
+#endif
 }
 
 template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
 {
   EIGEN_DEBUG_ALIGNED_STORE
-  vec_xst(from, 0, to);
+#ifdef __VSX__ // NOTE(review): the double section above is already opened by `#ifdef __VSX__`; if that guard is still open here the #else branch is dead - confirm
+  vec_vsx_st(from, 0, to); // VSX build: store `from` to `to` at byte offset 0
+#else
+  vec_st(from, 0, to); // non-VSX build: vec_st of `from` to `to` at byte offset 0
+#endif
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) {
@@ -2017,14 +899,14 @@ pbroadcast4<Packet2d>(const double *a,
 
 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
-  EIGEN_ALIGN16 double af[2];
+  double EIGEN_ALIGN16 af[2]; // two-element scratch filled from from[0] and from[stride], then handed to pload; EIGEN_ALIGN16 presumably makes it 16-byte aligned - confirm
   af[0] = from[0*stride];
   af[1] = from[1*stride];
  return pload<Packet2d>(af);
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
-  EIGEN_ALIGN16 double af[2];
+  double EIGEN_ALIGN16 af[2];
   pstore<double>(af, from);
   to[0*stride] = af[0];
   to[1*stride] = af[1];
@@ -2048,7 +930,6 @@ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d&
 
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
-  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
   Packet2d ret;
   __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;
@@ -2056,20 +937,11 @@ template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const
 
 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b)
 {
-  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
   Packet2d ret;
   __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b));
   return ret;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmple(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmplt(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast<Packet2d>(vec_cmpeq(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
-  Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a,b));
-  return vec_nor(c,c);
-}
-
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
 
 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
@@ -2084,8 +956,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { re
 
 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
 {
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return vec_xl(0, from);
+  EIGEN_DEBUG_ALIGNED_LOAD // NOTE(review): this is the unaligned load, yet it uses the ALIGNED debug hook (the replaced line used EIGEN_DEBUG_UNALIGNED_LOAD) - confirm whether this is intended
+  return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); // byte offset = low 4 bits of the address; base = _EIGEN_ALIGNED_PTR(from), presumably `from` rounded down to 16 bytes - confirm; the (long) cast assumes long is pointer-sized
 }
 
 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
@@ -2098,13 +970,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
 
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from)
 {
-  EIGEN_DEBUG_UNALIGNED_STORE
-  vec_xst(from, 0, to);
+  EIGEN_DEBUG_ALIGNED_STORE // NOTE(review): this is the unaligned store, yet it uses the ALIGNED debug hook (the replaced line used EIGEN_DEBUG_UNALIGNED_STORE) - confirm whether this is intended
+  vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); // stores the packet viewed as Packet4f; byte offset = low 4 bits of the address; base = _EIGEN_ALIGNED_PTR(to), presumably `to` rounded down to 16 bytes - confirm
 }
 
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); }
 
-template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore<double>(x, a); return x[0]; }
+template<> EIGEN_STRONG_INLINE double  pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; }
 
 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 {
@@ -2112,59 +984,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
 }
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); }
 
-// VSX support varies between different compilers and even different
-// versions of the same compiler.  For gcc version >= 4.9.3, we can use
-// vec_cts to efficiently convert Packet2d to Packet2l.  Otherwise, use
-// a slow version that works with older compilers. 
-// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles
-// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963
-static inline Packet2l ConvertToPacket2l(const Packet2d& x) {
-#if EIGEN_GNUC_AT_LEAST(5, 4) || \
-    (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1)
-  return vec_cts(x, 0);    // TODO: check clang version.
-#else
-  double tmp[2];
-  memcpy(tmp, &x, sizeof(tmp));
-  Packet2l l = { static_cast<long long>(tmp[0]),
-                 static_cast<long long>(tmp[1]) };
-  return l;
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
-  
-  // build 2^n
-  Packet2l emm0 = ConvertToPacket2l(exponent);
-
-#ifdef __POWER8_VECTOR__ 
-  const Packet2l  p2l_1023 = { 1023, 1023 };
-  const Packet2ul p2ul_52 = { 52, 52 };
-  emm0 = vec_add(emm0, p2l_1023);
-  emm0 = vec_sl(emm0, p2ul_52);
-#else
-  // Code is a bit complex for POWER7.  There is actually a
-  // vec_xxsldi intrinsic but it is not supported by some gcc versions.
-  // So we shift (52-32) bits and do a word swap with zeros.
-  const Packet4i p4i_1023 = pset1<Packet4i>(1023);
-  const Packet4i p4i_20 = pset1<Packet4i>(20);    // 52 - 32
-
-  Packet4i emm04i = reinterpret_cast<Packet4i>(emm0);
-  emm04i = vec_add(emm04i, p4i_1023);
-  emm04i = vec_sl(emm04i, reinterpret_cast<Packet4ui>(p4i_20));
-  static const Packet16uc perm = {
-    0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, 
-    0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b };
-#ifdef  _BIG_ENDIAN
-  emm0 = reinterpret_cast<Packet2l>(vec_perm(p4i_ZERO, emm04i, perm));
-#else
-  emm0 = reinterpret_cast<Packet2l>(vec_perm(emm04i, p4i_ZERO, perm));
-#endif
-
-#endif
-
-  return pmul(a, reinterpret_cast<Packet2d>(emm0));
-}
-
 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
 {
   Packet2d b, sum;
@@ -2173,6 +992,20 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
   return pfirst<Packet2d>(sum);
 }
 
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) // Reads vecs[0] and vecs[1] and combines them into one Packet2d.
+{
+  Packet2d v[2], sum;
+  v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8)); // vecs[0] plus vec_sld(vecs[0], vecs[0], 8), done on a Packet4f view
+  v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8)); // same step for vecs[1]
+  // Merge v[0] and v[1] with one more 8-byte vec_sld; the operand order is swapped when _BIG_ENDIAN is not defined.
+#ifdef _BIG_ENDIAN
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8));
+#else
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8));
+#endif
+  // NOTE(review): presumably sum = { vecs[0][0]+vecs[0][1], vecs[1][0]+vecs[1][1] } - confirm against vec_sld semantics.
+  return sum;
+}
 // Other reduction functions:
 // mul
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
@@ -2192,6 +1025,20 @@ template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
   return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
 }
 
+template<int Offset>
+struct palign_impl<Offset,Packet2d> // palign_impl specialization for Packet2d; all work happens in run().
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) // Overwrites `first` in place; `second` is only read.
+  {
+    if (Offset == 1) // only Offset == 1 changes `first`; every other Offset leaves it as is. The guarded statement is whichever preprocessor branch below is compiled.
+#ifdef _BIG_ENDIAN
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8)); // big-endian: 8-byte vec_sld over (first, second), done on a Packet4ui view
+#else
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8)); // otherwise: same shift with the operands swapped to (second, first)
+#endif
+  }
+};
+
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet2d,2>& kernel) {
   Packet2d t0, t1;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Complex.h
index 57d1201f4..9c2536509 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Complex.h
@@ -16,7 +16,7 @@ namespace Eigen {
 
 namespace internal {
 
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
 
 // Many std::complex methods such as operator+, operator-, operator* and
 // operator/ are not constexpr. Due to this, clang does not treat them as device
@@ -55,7 +55,7 @@ template<typename T> struct scalar_difference_op<std::complex<T>, std::complex<T
 // Product
 template<typename T> struct scalar_product_op<const std::complex<T>, const std::complex<T> >  : binary_op_base<const std::complex<T>, const std::complex<T> > {
   enum {
-    Vectorizable = packet_traits<std::complex<T> >::HasMul
+    Vectorizable = packet_traits<std::complex<T>>::HasMul
   };
   typedef typename std::complex<T> result_type;
 
@@ -76,7 +76,7 @@ template<typename T> struct scalar_product_op<std::complex<T>, std::complex<T> >
 // Quotient
 template<typename T> struct scalar_quotient_op<const std::complex<T>, const std::complex<T> > : binary_op_base<const std::complex<T>, const std::complex<T> > {
   enum {
-    Vectorizable = packet_traits<std::complex<T> >::HasDiv
+    Vectorizable = packet_traits<std::complex<T>>::HasDiv
   };
   typedef typename std::complex<T> result_type;
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Half.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Half.h
similarity index 79%
rename from uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Half.h
rename to uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Half.h
index cfd0bdc06..59717b4fe 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Half.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Half.h
@@ -26,15 +26,15 @@
 
 
 // Standard 16-bit float type, mostly useful for GPUs. Defines a new
-// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
+// type Eigen::half (inheriting from CUDA's __half struct) with
 // operator overloads such that it behaves basically as an arithmetic
 // type. It will be quite slow on CPUs (so it is recommended to stay
-// in fp32 for CPUs, except for simple parameter conversions, I/O
+// in fp32 for CPUs, except for simple parameter conversions, I/O
 // to disk and the likes), but fast on GPUs.
 
 
-#ifndef EIGEN_HALF_H
-#define EIGEN_HALF_H
+#ifndef EIGEN_HALF_CUDA_H
+#define EIGEN_HALF_CUDA_H
 
 #if __cplusplus > 199711L
 #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type()
@@ -50,25 +50,16 @@ struct half;
 
 namespace half_impl {
 
-#if !defined(EIGEN_HAS_GPU_FP16)
+#if !defined(EIGEN_HAS_CUDA_FP16)
 // Make our own __half_raw definition that is similar to CUDA's.
 struct __half_raw {
   EIGEN_DEVICE_FUNC __half_raw() : x(0) {}
   explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {}
   unsigned short x;
 };
-#elif defined(EIGEN_HAS_HIP_FP16)
-  // Nothing to do here
-  // HIP fp16 header file has a definition for __half_raw
-#elif defined(EIGEN_HAS_CUDA_FP16)
- #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000
 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
- typedef __half __half_raw;
- #endif // defined(EIGEN_HAS_CUDA_FP16)
-
-#elif defined(SYCL_DEVICE_ONLY)
-typedef cl::sycl::half __half_raw;
-
+typedef __half __half_raw;
 #endif
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x);
@@ -77,16 +68,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
 
 struct half_base : public __half_raw {
   EIGEN_DEVICE_FUNC half_base() {}
+  EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {}
   EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {}
-
-#if defined(EIGEN_HAS_GPU_FP16)
- #if defined(EIGEN_HAS_HIP_FP16)
-  EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); }
- #elif defined(EIGEN_HAS_CUDA_FP16)
-  #if (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000)
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
   EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
-  #endif
- #endif    
 #endif
 };
 
@@ -94,38 +79,18 @@ struct half_base : public __half_raw {
 
 // Class definition.
 struct half : public half_impl::half_base {
-
-  // Writing this out as separate #if-else blocks to make the code easier to follow
-  // The same applies to most #if-else blocks in this file
-#if !defined(EIGEN_HAS_GPU_FP16)
-  typedef half_impl::__half_raw __half_raw;
-#elif defined(EIGEN_HAS_HIP_FP16)
-  // Nothing to do here
-  // HIP fp16 header file has a definition for __half_raw
-#elif defined(EIGEN_HAS_CUDA_FP16)
-  // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
-  // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP!  So keeping this within
-  // #if defined(EIGEN_HAS_CUDA_FP16) is needed
-  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+  #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000)
     typedef half_impl::__half_raw __half_raw;
   #endif
-#endif
 
   EIGEN_DEVICE_FUNC half() {}
 
   EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {}
-
-#if defined(EIGEN_HAS_GPU_FP16)
- #if defined(EIGEN_HAS_HIP_FP16)
+  EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {}
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000
   EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
- #elif defined(EIGEN_HAS_CUDA_FP16)
-  #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {}
-  #endif
- #endif
 #endif
 
-
   explicit EIGEN_DEVICE_FUNC half(bool b)
       : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
   template<class T>
@@ -174,6 +139,11 @@ struct half : public half_impl::half_base {
   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const {
     return static_cast<double>(half_impl::half_to_float(*this));
   }
+
+  EIGEN_DEVICE_FUNC half& operator=(const half& other) { // Copy assignment: copies only the raw member x from `other`.
+    x = other.x;
+    return *this; // returns *this so assignments can be chained
+  }
 };
 
 } // end namespace Eigen
@@ -232,24 +202,15 @@ namespace Eigen {
 
 namespace half_impl {
 
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \
-     EIGEN_CUDA_ARCH >= 530) ||                                  \
-    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
-#define EIGEN_HAS_NATIVE_FP16
-#endif
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
 
 // Intrinsics for native fp16 support. Note that on current hardware,
-// these are no faster than fp32 arithmetic (you need to use the half2
+// these are no faster than fp32 arithmetic (you need to use the half2
 // versions to get the ALU speed increased), but you do save the
 // conversion steps back and forth.
 
-#if defined(EIGEN_HAS_NATIVE_FP16)
 EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hadd(::__half(a), ::__half(b));
-#else
   return __hadd(a, b);
-#endif
 }
 EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) {
   return __hmul(a, b);
@@ -258,13 +219,9 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) {
   return __hsub(a, b);
 }
 EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) {
-#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
-  return __hdiv(a, b);
-#else
   float num = __half2float(a);
   float denom = __half2float(b);
   return __float2half(num / denom);
-#endif
 }
 EIGEN_STRONG_INLINE __device__ half operator - (const half& a) {
   return __hneg(a);
@@ -304,26 +261,10 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) {
   return __hge(a, b);
 }
 
-#endif
+#else  // Emulate support for half floats
 
-// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
-// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
-// of the functions, while the latter can only deal with one of them.
-#if !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats
-
-#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
-// We need to provide emulated *host-side* FP16 operators for clang.
-#pragma push_macro("EIGEN_DEVICE_FUNC")
-#undef EIGEN_DEVICE_FUNC
-#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16)
-#define EIGEN_DEVICE_FUNC __host__
-#else // both host and device need emulated ops.
-#define EIGEN_DEVICE_FUNC __host__ __device__
-#endif
-#endif
-
-// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
-// to/from fp32.
+// Definitions for CPUs and older CUDA, mostly working through conversion
+// to/from fp32.
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) {
   return half(float(a) + float(b));
@@ -377,9 +318,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const hal
   return float(a) >= float(b);
 }
 
-#if defined(__clang__) && defined(__CUDA__)
-#pragma pop_macro("EIGEN_DEVICE_FUNC")
-#endif
 #endif  // Emulate support for half floats
 
 // Division by an index. Do it in full float precision to avoid accuracy
@@ -405,8 +343,7 @@ union float32_bits {
 };
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
   __half tmp_ff = __float2half(ff);
   return *(__half_raw*)&tmp_ff;
 
@@ -462,8 +399,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
   return __half2float(h);
 
 #elif defined(EIGEN_HAS_FP16_C)
@@ -497,8 +433,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) {
   return (a.x & 0x7fff) == 0x7c00;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
   return __hisnan(a);
 #else
   return (a.x & 0x7fff) > 0x7c00;
@@ -514,19 +449,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
   return result;
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
   return half(hexp(a));
 #else
    return half(::expf(float(a)));
 #endif
 }
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) {
-  return half(numext::expm1(float(a)));
-}
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
   return half(::hlog(a));
 #else
   return half(::logf(float(a)));
@@ -539,8 +469,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) {
   return half(::log10f(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
   return half(hsqrt(a));
 #else
     return half(::sqrtf(float(a)));
@@ -562,16 +491,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) {
   return half(::tanhf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
   return half(hfloor(a));
 #else
   return half(::floorf(float(a)));
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300
   return half(hceil(a));
 #else
   return half(::ceilf(float(a)));
@@ -579,8 +506,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
 }
 
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
   return __hlt(b, a) ? b : a;
 #else
   const float f1 = static_cast<float>(a);
@@ -589,8 +515,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) {
 #endif
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
   return __hlt(a, b) ? b : a;
 #else
   const float f1 = static_cast<float>(a);
@@ -599,12 +524,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) {
 #endif
 }
 
-#ifndef EIGEN_NO_IO
 EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) {
   os << static_cast<float>(v);
   return os;
 }
-#endif
 
 } // end namespace half_impl
 
@@ -670,8 +593,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) {
   return Eigen::half(::expf(float(a)));
 }
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) {
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
+#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530
   return Eigen::half(::hlog(a));
 #else
   return Eigen::half(::logf(float(a)));
@@ -705,12 +627,9 @@ struct hash<Eigen::half> {
 
 
 // Add the missing shfl_xor intrinsic
-#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-  defined(EIGEN_HIPCC)
-
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300
 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) {
-  #if (EIGEN_CUDA_SDK_VER < 90000) || \
-    defined(EIGEN_HAS_HIP_FP16)
+  #if EIGEN_CUDACC_VER < 90000
   return static_cast<Eigen::half>(__shfl_xor(static_cast<float>(var), laneMask, width));
   #else
   return static_cast<Eigen::half>(__shfl_xor_sync(0xFFFFFFFF, static_cast<float>(var), laneMask, width));
@@ -719,8 +638,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM
 #endif
 
 // ldg() has an overload for __half_raw, but we also need one for Eigen::half.
-#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \
-  defined(EIGEN_HIPCC)
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) {
   return Eigen::half_impl::raw_uint16_to_half(
       __ldg(reinterpret_cast<const unsigned short*>(ptr)));
@@ -728,7 +646,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr)
 #endif
 
 
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(EIGEN_CUDA_ARCH)
 namespace Eigen {
 namespace numext {
 
@@ -754,4 +672,4 @@ bool (isfinite)(const Eigen::half& h) {
 }  // namespace numext
 #endif
 
-#endif // EIGEN_HALF_H
+#endif // EIGEN_HALF_CUDA_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h
similarity index 82%
rename from uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/MathFunctions.h
rename to uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h
index d2b3a2568..0348b41db 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/MathFunctions.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
-#define EIGEN_MATH_FUNCTIONS_GPU_H
+#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H
+#define EIGEN_MATH_FUNCTIONS_CUDA_H
 
 namespace Eigen {
 
@@ -17,7 +17,7 @@ namespace internal {
 // Make sure this is only available when targeting a GPU: we don't want to
 // introduce conflicts between these packet_traits definitions and the ones
 // we'll use on the host side (SSE, AVX, ...)
-#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 plog<float4>(const float4& a)
 {
@@ -56,18 +56,6 @@ double2 pexp<double2>(const double2& a)
   return make_double2(exp(a.x), exp(a.y));
 }
 
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pexpm1<float4>(const float4& a)
-{
-  return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pexpm1<double2>(const double2& a)
-{
-  return make_double2(expm1(a.x), expm1(a.y));
-}
-
 template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 float4 psqrt<float4>(const float4& a)
 {
@@ -100,4 +88,4 @@ double2 prsqrt<double2>(const double2& a)
 
 } // end namespace Eigen
 
-#endif // EIGEN_MATH_FUNCTIONS_GPU_H
+#endif // EIGEN_MATH_FUNCTIONS_CUDA_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMath.h
new file mode 100644
index 000000000..4dda63188
--- /dev/null
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMath.h
@@ -0,0 +1,333 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_CUDA_H
+#define EIGEN_PACKET_MATH_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+template<> struct is_arithmetic<float4>  { enum { value = true }; };
+template<> struct is_arithmetic<double2> { enum { value = true }; };
+
+template<> struct packet_traits<float> : default_packet_traits
+{
+  typedef float4 type;
+  typedef float4 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=4,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasIGamma = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+
+    HasBlend = 0,
+  };
+};
+
+template<> struct packet_traits<double> : default_packet_traits
+{
+  typedef double2 type;
+  typedef double2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+
+    HasDiv  = 1,
+    HasLog  = 1,
+    HasExp  = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasIGamma = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+
+    HasBlend = 0,
+  };
+};
+
+
+template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16}; typedef float4 half; };
+template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; };
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
+  return make_float4(from, from, from, from);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
+  return make_double2(from, from);
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
+  return make_float4(a, a+1, a+2, a+3);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
+  return make_double2(a, a+1);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x+b.x, a.y+b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x-b.x, a.y-b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
+  return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
+  return make_double2(-a.x, -a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x*b.x, a.y*b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x/b.x, a.y/b.y);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
+  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
+  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
+  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
+  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
+  return *reinterpret_cast<const float4*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
+  return *reinterpret_cast<const double2*>(from);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
+  return make_float4(from[0], from[1], from[2], from[3]);
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
+  return make_double2(from[0], from[1]);
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {  // EIGEN_DEVICE_FUNC added to match the sibling pload/ploadu specializations
+  return make_float4(from[0], from[0], from[1], from[1]);  // reads two scalars and duplicates each: {from[0], from[0], from[1], from[1]}
+}
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {  // EIGEN_DEVICE_FUNC added to match the sibling pload/ploadu specializations
+  return make_double2(from[0], from[0]);  // reads one scalar and places it in both lanes: {from[0], from[0]}
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float*   to, const float4& from) {
+  *reinterpret_cast<float4*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
+  *reinterpret_cast<double2*>(to) = from;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const float4& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+  to[2] = from.z;
+  to[3] = from.w;
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return __ldg((const float4*)from);
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return __ldg((const double2*)from);
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template<>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
+  return make_double2(__ldg(from+0), __ldg(from+1));
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
+  return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
+  return make_double2(from[0*stride], from[1*stride]);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
+  to[stride*0] = from.x;
+  to[stride*1] = from.y;
+  to[stride*2] = from.z;
+  to[stride*3] = from.w;
+}
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
+  to[stride*0] = from.x;
+  to[stride*1] = from.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a) {
+  return a.x + a.y + a.z + a.w;
+}
+template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
+  return a.x + a.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a) {
+  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
+  return fmax(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a) {
+  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
+  return fmin(a.x, a.y);
+}
+
+template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a) {
+  return a.x * a.y * a.z * a.w;
+}
+template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
+  return a.x * a.y;
+}
+
+template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+}
+template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(fabs(a.x), fabs(a.y));
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<float4,4>& kernel) {
+  float tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+
+  tmp = kernel.packet[0].z;
+  kernel.packet[0].z = kernel.packet[2].x;
+  kernel.packet[2].x = tmp;
+
+  tmp = kernel.packet[0].w;
+  kernel.packet[0].w = kernel.packet[3].x;
+  kernel.packet[3].x = tmp;
+
+  tmp = kernel.packet[1].z;
+  kernel.packet[1].z = kernel.packet[2].y;
+  kernel.packet[2].y = tmp;
+
+  tmp = kernel.packet[1].w;
+  kernel.packet[1].w = kernel.packet[3].y;
+  kernel.packet[3].y = tmp;
+
+  tmp = kernel.packet[2].w;
+  kernel.packet[2].w = kernel.packet[3].z;
+  kernel.packet[3].z = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<double2,2>& kernel) {
+  double tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+
+#endif // EIGEN_PACKET_MATH_CUDA_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
new file mode 100644
index 000000000..f749c573f
--- /dev/null
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h
@@ -0,0 +1,1124 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
+#define EIGEN_PACKET_MATH_HALF_CUDA_H
+
+
+namespace Eigen {
+namespace internal {
+
+// Most of the following operations require arch >= 3.0
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+
+template<> struct is_arithmetic<half2> { enum { value = true }; };
+
+template<> struct packet_traits<Eigen::half> : default_packet_traits
+{
+  typedef half2 type;
+  typedef half2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size=2,
+    HasHalfPacket = 0,
+    HasAdd    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasSqrt   = 1,
+    HasRsqrt  = 1,
+    HasExp    = 1,
+    HasLog    = 1,
+    HasLog1p  = 1
+  };
+};
+
+template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; };
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+  return __half2half2(from);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
+  return *reinterpret_cast<const half2*>(from);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
+  return __halves2half2(from[0], from[1]);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half*  from) {  // __device__ added to match the neighbouring half2 pset1/pload/ploadu specializations
+  return __halves2half2(from[0], from[0]);  // reads one half and places it in both lanes of the half2
+}
+
+template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
+  *reinterpret_cast<half2*>(to) = from;
+}
+
+template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
+  to[0] = __low2half(from);
+  to[1] = __high2half(from);
+}
+
+template<>
+ __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
+#if __CUDA_ARCH__ >= 350
+   return __ldg((const half2*)from);
+#else
+  return __halves2half2(*(from+0), *(from+1));
+#endif
+}
+
+template<>
+__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
+#if __CUDA_ARCH__ >= 350
+   return __halves2half2(__ldg(from+0), __ldg(from+1));
+#else
+  return __halves2half2(*(from+0), *(from+1));
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
+  return __halves2half2(from[0*stride], from[1*stride]);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
+  to[stride*0] = __low2half(from);
+  to[stride*1] = __high2half(from);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
+  return __low2half(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
+  half2 result;
+  unsigned temp = *(reinterpret_cast<const unsigned*>(&(a)));
+  *(reinterpret_cast<unsigned*>(&(result))) = temp & 0x7FFF7FFF;
+  return result;
+}
+
+
+__device__ EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<half2,2>& kernel) {
+  __half a1 = __low2half(kernel.packet[0]);
+  __half a2 = __high2half(kernel.packet[0]);
+  __half b1 = __low2half(kernel.packet[1]);
+  __half b2 = __high2half(kernel.packet[1]);
+  kernel.packet[0] = __halves2half2(a1, b1);
+  kernel.packet[1] = __halves2half2(a2, b2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
+#if __CUDA_ARCH__ >= 530
+  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+#else
+  float f = __half2float(a) + 1.0f;
+  return __halves2half2(a, __float2half(f));
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
+  return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
+  return __hsub2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 - b1;
+  float r2 = a2 - b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  return __hneg2(a);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return __floats2half2_rn(-a1, -a2);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+#if __CUDA_ARCH__ >= 530
+  return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
+#if __CUDA_ARCH__ >= 530
+   return __hfma2(a, b, c);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float c1 = __low2float(c);
+  float c2 = __high2float(c);
+  float r1 = a1 * b1 + c1;
+  float r2 = a2 * b2 + c2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  return __hadd(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(__float2half_rn(a1 + a2));
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hgt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 > a2 ? __low2half(a) : __high2half(a);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hlt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 < a2 ? __low2half(a) : __high2half(a);
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
+#if __CUDA_ARCH__ >= 530
+  return __hmul(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(__float2half_rn(a1 * a2));
+#endif
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = log1pf(a1);
+  float r2 = log1pf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
+
+template<>  __device__ EIGEN_STRONG_INLINE
+half2 plog<half2>(const half2& a) {
+  return h2log(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 pexp<half2>(const half2& a) {
+  return h2exp(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 psqrt<half2>(const half2& a) {
+  return h2sqrt(a);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE
+half2 prsqrt<half2>(const half2& a) {
+  return h2rsqrt(a);
+}
+
+#else
+
+template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
+  // Fallback branch: take logf of each lane in float precision and repack
+  // the two results into a half2 (low lane first).
+  const float lo = logf(__low2float(a));
+  const float hi = logf(__high2float(a));
+  return __floats2half2_rn(lo, hi);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
+  // Fallback branch: take expf of each lane in float precision and repack
+  // the two results into a half2 (low lane first).
+  const float lo = expf(__low2float(a));
+  const float hi = expf(__high2float(a));
+  return __floats2half2_rn(lo, hi);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
+  // Fallback branch: take sqrtf of each lane in float precision and repack
+  // the two results into a half2 (low lane first).
+  const float lo = sqrtf(__low2float(a));
+  const float hi = sqrtf(__high2float(a));
+  return __floats2half2_rn(lo, hi);
+}
+
+template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
+  // Fallback branch: take rsqrtf of each lane in float precision and repack
+  // the two results into a half2 (low lane first).
+  const float lo = rsqrtf(__low2float(a));
+  const float hi = rsqrtf(__high2float(a));
+  return __floats2half2_rn(lo, hi);
+}
+
+#endif
+
+#elif defined EIGEN_VECTORIZE_AVX512
+
+typedef struct {
+  __m256i x;  // raw 16-bit patterns of 16 half values packed in one 256-bit integer register
+} Packet16h;
+
+
+template<> struct is_arithmetic<Packet16h> { enum { value = true }; };  // flags Packet16h as an arithmetic type for Eigen's traits
+
+template <>
+struct packet_traits<half> : default_packet_traits {  // vectorization traits for the scalar type half in this branch
+  typedef Packet16h type;
+  // There is no half-size packet for Packet16h.
+  typedef Packet16h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,  // half values per Packet16h
+    HasHalfPacket = 0,
+    HasAdd    = 0,  // NOTE(review): padd/pmul are specialized further down although HasAdd/HasMul are 0 - confirm intended
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasDiv = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasBlend = 0
+  };
+};
+
+
+template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; };  // reverse mapping: packet -> scalar type, lane count, alignment
+
+// Replicates the raw 16-bit pattern of `from` into all 16 lanes.
+template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+  const Packet16h splat = { _mm256_set1_epi16(from.x) };
+  return splat;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {  // returns lane 0 as a scalar half
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, 0)));  // rebuild a half from the extracted 16-bit value
+}
+
+// Loads 16 consecutive half values with the aligned-load intrinsic _mm256_load_si256.
+template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+  const Packet16h loaded = { _mm256_load_si256(reinterpret_cast<const __m256i*>(from)) };
+  return loaded;
+}
+
+// Loads 16 consecutive half values with the unaligned-load intrinsic _mm256_loadu_si256.
+template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+  const Packet16h loaded = { _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from)) };
+  return loaded;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+  _mm256_store_si256(reinterpret_cast<__m256i*>(to), from.x);  // aligned store of all 16 lanes
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from.x);  // unaligned store of all 16 lanes
+}
+
+// Loads four half values and repeats each four times: lanes 4k..4k+3 all hold from[k], k = 0..3.
+template<> EIGEN_STRONG_INLINE Packet16h ploadquad<Packet16h>(const Eigen::half* from) {
+  const unsigned short q0 = from[0].x;
+  const unsigned short q1 = from[1].x;
+  const unsigned short q2 = from[2].x;
+  const unsigned short q3 = from[3].x;
+  // _mm256_set_epi16 takes its arguments from lane 15 down to lane 0.
+  const Packet16h quad = { _mm256_set_epi16(q3, q3, q3, q3, q2, q2, q2, q2, q1, q1, q1, q1, q0, q0, q0, q0) };
+  return quad;
+}
+
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {  // widens 16 half lanes to 16 float lanes
+#ifdef EIGEN_HAS_FP16_C
+  return _mm512_cvtph_ps(a.x);  // one conversion intrinsic when EIGEN_HAS_FP16_C is defined
+#else
+  EIGEN_ALIGN64 half aux[16];  // otherwise spill the packet and convert lane by lane
+  pstore(aux, a);
+  float f0(aux[0]);
+  float f1(aux[1]);
+  float f2(aux[2]);
+  float f3(aux[3]);
+  float f4(aux[4]);
+  float f5(aux[5]);
+  float f6(aux[6]);
+  float f7(aux[7]);
+  float f8(aux[8]);
+  float f9(aux[9]);
+  float fa(aux[10]);
+  float fb(aux[11]);
+  float fc(aux[12]);
+  float fd(aux[13]);
+  float fe(aux[14]);
+  float ff(aux[15]);
+
+  return _mm512_set_ps(  // arguments run from lane 15 down to lane 0
+      ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {  // narrows 16 float lanes to 16 half lanes
+#ifdef EIGEN_HAS_FP16_C
+  Packet16h result;
+  result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);  // round to nearest, exceptions suppressed
+  return result;
+#else
+  EIGEN_ALIGN64 float aux[16];  // otherwise spill the packet and convert lane by lane
+  pstore(aux, a);
+  half h0(aux[0]);
+  half h1(aux[1]);
+  half h2(aux[2]);
+  half h3(aux[3]);
+  half h4(aux[4]);
+  half h5(aux[5]);
+  half h6(aux[6]);
+  half h7(aux[7]);
+  half h8(aux[8]);
+  half h9(aux[9]);
+  half ha(aux[10]);
+  half hb(aux[11]);
+  half hc(aux[12]);
+  half hd(aux[13]);
+  half he(aux[14]);
+  half hf(aux[15]);
+
+  Packet16h result;
+  result.x = _mm256_set_epi16(  // arguments run from lane 15 down to lane 0
+      hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
+      h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
+  return result;
+#endif
+}
+
+// Lane-wise sum: widen both operands to float, add there, narrow the result back to half.
+template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  const Packet16f lhs = half2float(a);
+  const Packet16f rhs = half2float(b);
+  return float2half(padd(lhs, rhs));
+}
+
+// Lane-wise product: widen both operands to float, multiply there, narrow the result back to half.
+template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  const Packet16f lhs = half2float(a);
+  const Packet16f rhs = half2float(b);
+  return float2half(pmul(lhs, rhs));
+}
+
+template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
+  // Reduce the 16 lanes with the float predux, then convert the result back to half.
+  return half(predux(half2float(from)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
+{
+  // Lane k receives from[k*stride]; _mm256_set_epi16 lists lanes from 15 down to 0.
+  const Packet16h gathered = { _mm256_set_epi16(
+      from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x,
+      from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x,
+      from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x,
+      from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x) };
+  return gathered;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)  // writes lane k to to[stride*k]
+{
+  EIGEN_ALIGN64 half aux[16];  // spill the packet so lanes can be read individually
+  pstore(aux, from);
+  to[stride*0].x = aux[0].x;  // only the raw 16-bit member .x is copied
+  to[stride*1].x = aux[1].x;
+  to[stride*2].x = aux[2].x;
+  to[stride*3].x = aux[3].x;
+  to[stride*4].x = aux[4].x;
+  to[stride*5].x = aux[5].x;
+  to[stride*6].x = aux[6].x;
+  to[stride*7].x = aux[7].x;
+  to[stride*8].x = aux[8].x;
+  to[stride*9].x = aux[9].x;
+  to[stride*10].x = aux[10].x;
+  to[stride*11].x = aux[11].x;
+  to[stride*12].x = aux[12].x;
+  to[stride*13].x = aux[13].x;
+  to[stride*14].x = aux[14].x;
+  to[stride*15].x = aux[15].x;
+}
+
+// In-place 16x16 transpose: on return, lane c of kernel.packet[r] holds what was lane r of kernel.packet[c].
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h,16>& kernel) {
+  __m256i a = kernel.packet[0].x;
+  __m256i b = kernel.packet[1].x;
+  __m256i c = kernel.packet[2].x;
+  __m256i d = kernel.packet[3].x;
+  __m256i e = kernel.packet[4].x;
+  __m256i f = kernel.packet[5].x;
+  __m256i g = kernel.packet[6].x;
+  __m256i h = kernel.packet[7].x;
+  __m256i i = kernel.packet[8].x;
+  __m256i j = kernel.packet[9].x;
+  __m256i k = kernel.packet[10].x;
+  __m256i l = kernel.packet[11].x;
+  __m256i m = kernel.packet[12].x;
+  __m256i n = kernel.packet[13].x;
+  __m256i o = kernel.packet[14].x;
+  __m256i p = kernel.packet[15].x;
+  // 16-bit interleave. Unpack works per 128-bit lane, so each *_07 holds input lanes 0-3 (low) and 8-11 (high).
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+  // Likewise each *_8f holds input lanes 4-7 (low 128 bits) and 12-15 (high 128 bits).
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+  // 32-bit interleave: *_03 holds input lanes 0,1 | 8,9 and *_47 holds 2,3 | 10,11 (low | high 128 bits).
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+  // *_8b holds input lanes 4,5 | 12,13 and *_cf holds 6,7 | 14,15 (low | high 128 bits).
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+  // 64-bit interleave: suffixes _01,_23,_45,_67,_89,_ab,_cd,_ef hold output rows 0..7 (low 128 bits) and 8..15 (high).
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr. Selector 0x20 joins the two low
+  // 128-bit halves (output rows 0-7); 0x31 joins the two high halves (output rows 8-15).
+  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+  // Write the transposed rows back in order.
+  kernel.packet[0].x = a_p_0;
+  kernel.packet[1].x = a_p_1;
+  kernel.packet[2].x = a_p_2;
+  kernel.packet[3].x = a_p_3;
+  kernel.packet[4].x = a_p_4;
+  kernel.packet[5].x = a_p_5;
+  kernel.packet[6].x = a_p_6;
+  kernel.packet[7].x = a_p_7;
+  kernel.packet[8].x = a_p_8;
+  kernel.packet[9].x = a_p_9;
+  kernel.packet[10].x = a_p_a;
+  kernel.packet[11].x = a_p_b;
+  kernel.packet[12].x = a_p_c;
+  kernel.packet[13].x = a_p_d;
+  kernel.packet[14].x = a_p_e;
+  kernel.packet[15].x = a_p_f;
+}
+
+// Rearranges 8 packets of 16 halves: output packet i is built from lane 2*i of every input packet
+// (placed in lanes 0-7) followed by lane 2*i+1 of every input packet (placed in lanes 8-15).
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h,8>& kernel) {
+  // Spill the packets to memory so that individual lanes can be addressed.
+  EIGEN_ALIGN64 half src[8][16];
+  pstore<half>(src[0], kernel.packet[0]);
+  pstore<half>(src[1], kernel.packet[1]);
+  pstore<half>(src[2], kernel.packet[2]);
+  pstore<half>(src[3], kernel.packet[3]);
+  pstore<half>(src[4], kernel.packet[4]);
+  pstore<half>(src[5], kernel.packet[5]);
+  pstore<half>(src[6], kernel.packet[6]);
+  pstore<half>(src[7], kernel.packet[7]);
+
+  EIGEN_ALIGN64 half dst[8][16];
+
+  for (int row = 0; row < 8; ++row) {
+    for (int p = 0; p < 8; ++p) {
+      dst[row][p]   = src[p][2*row];
+      dst[row][p+8] = src[p][2*row+1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(dst[0]);
+  kernel.packet[1] = pload<Packet16h>(dst[1]);
+  kernel.packet[2] = pload<Packet16h>(dst[2]);
+  kernel.packet[3] = pload<Packet16h>(dst[3]);
+  kernel.packet[4] = pload<Packet16h>(dst[4]);
+  kernel.packet[5] = pload<Packet16h>(dst[5]);
+  kernel.packet[6] = pload<Packet16h>(dst[6]);
+  kernel.packet[7] = pload<Packet16h>(dst[7]);
+}
+
+// Rearranges 4 packets of 16 halves: output packet i lane 4*g+p is lane 4*i+g of input packet p (g, p = 0..3).
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h,4>& kernel) {
+  EIGEN_ALIGN64 half src[4][16];
+  pstore<half>(src[0], kernel.packet[0]);
+  pstore<half>(src[1], kernel.packet[1]);
+  pstore<half>(src[2], kernel.packet[2]);
+  pstore<half>(src[3], kernel.packet[3]);
+
+  EIGEN_ALIGN64 half dst[4][16];
+
+  for (int row = 0; row < 4; ++row) {
+    for (int p = 0; p < 4; ++p) {
+      dst[row][p] = src[p][4*row];
+    }
+    for (int p = 0; p < 4; ++p) {
+      dst[row][p+4] = src[p][4*row+1];
+    }
+    for (int p = 0; p < 4; ++p) {
+      dst[row][p+8] = src[p][4*row+2];
+    }
+    for (int p = 0; p < 4; ++p) {
+      dst[row][p+12] = src[p][4*row+3];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(dst[0]);
+  kernel.packet[1] = pload<Packet16h>(dst[1]);
+  kernel.packet[2] = pload<Packet16h>(dst[2]);
+  kernel.packet[3] = pload<Packet16h>(dst[3]);
+}
+
+
+#elif defined EIGEN_VECTORIZE_AVX
+
+typedef struct {
+  __m128i x;  // raw 16-bit patterns of 8 half values packed in one 128-bit integer register
+} Packet8h;
+
+
+template<> struct is_arithmetic<Packet8h> { enum { value = true }; };  // flags Packet8h as an arithmetic type for Eigen's traits
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {  // vectorization traits for the scalar type Eigen::half in this branch
+  typedef Packet8h type;
+  // There is no half-size packet for Packet8h.
+  typedef Packet8h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,  // half values per Packet8h
+    HasHalfPacket = 0,
+    HasAdd    = 0,  // NOTE(review): padd/pmul/pconj are specialized further down although these flags are 0 - confirm intended
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasDiv = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasBlend = 0
+  };
+};
+
+
+template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; };  // reverse mapping: packet -> scalar type, lane count, alignment
+
+// Replicates the raw 16-bit pattern of `from` into all 8 lanes.
+template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+  const Packet8h splat = { _mm_set1_epi16(from.x) };
+  return splat;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {  // returns lane 0 as a scalar half
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, 0)));  // rebuild a half from the extracted 16-bit value
+}
+
+// Loads 8 consecutive half values with the aligned-load intrinsic _mm_load_si128.
+template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+  const Packet8h loaded = { _mm_load_si128(reinterpret_cast<const __m128i*>(from)) };
+  return loaded;
+}
+
+// Loads 8 consecutive half values with the unaligned-load intrinsic _mm_loadu_si128.
+template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+  const Packet8h loaded = { _mm_loadu_si128(reinterpret_cast<const __m128i*>(from)) };
+  return loaded;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {  // aligned store of all 8 lanes
+  _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {  // unaligned store of all 8 lanes
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
+}
+
+// Loads two half values and repeats each four times: lanes 0-3 = from[0], lanes 4-7 = from[1].
+template<> EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
+  const unsigned short q0 = from[0].x;
+  const unsigned short q1 = from[1].x;
+  // _mm_set_epi16 takes its arguments from lane 7 down to lane 0.
+  const Packet8h quad = { _mm_set_epi16(q1, q1, q1, q1, q0, q0, q0, q0) };
+  return quad;
+}
+
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {  // widens 8 half lanes to 8 float lanes
+#ifdef EIGEN_HAS_FP16_C
+  return _mm256_cvtph_ps(a.x);  // one conversion intrinsic when EIGEN_HAS_FP16_C is defined
+#else
+  EIGEN_ALIGN32 Eigen::half aux[8];  // otherwise spill the packet and convert lane by lane
+  pstore(aux, a);
+  float f0(aux[0]);
+  float f1(aux[1]);
+  float f2(aux[2]);
+  float f3(aux[3]);
+  float f4(aux[4]);
+  float f5(aux[5]);
+  float f6(aux[6]);
+  float f7(aux[7]);
+
+  return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);  // arguments run from lane 7 down to lane 0
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {  // narrows 8 float lanes to 8 half lanes
+#ifdef EIGEN_HAS_FP16_C
+  Packet8h result;
+  result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC);  // round to nearest, exceptions suppressed
+  return result;
+#else
+  EIGEN_ALIGN32 float aux[8];  // otherwise spill the packet and convert lane by lane
+  pstore(aux, a);
+  Eigen::half h0(aux[0]);
+  Eigen::half h1(aux[1]);
+  Eigen::half h2(aux[2]);
+  Eigen::half h3(aux[3]);
+  Eigen::half h4(aux[4]);
+  Eigen::half h5(aux[5]);
+  Eigen::half h6(aux[6]);
+  Eigen::half h7(aux[7]);
+
+  Packet8h result;
+  result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);  // arguments run from lane 7 down to lane 0
+  return result;
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }  // conjugate returns the packet unchanged
+
+// Lane-wise sum: widen both operands to float, add there, narrow the result back to half.
+template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  const Packet8f lhs = half2float(a);
+  const Packet8f rhs = half2float(b);
+  return float2half(padd(lhs, rhs));
+}
+
+// Lane-wise product: widen both operands to float, multiply there, narrow the result back to half.
+template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  const Packet8f lhs = half2float(a);
+  const Packet8f rhs = half2float(b);
+  return float2half(pmul(lhs, rhs));
+}
+
+template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
+{
+  // Lane k receives from[k*stride]; _mm_set_epi16 lists lanes from 7 down to 0.
+  const Packet8h gathered = { _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x) };
+  return gathered;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)  // writes lane k to to[stride*k]
+{
+  EIGEN_ALIGN32 Eigen::half aux[8];  // spill the packet so lanes can be read individually
+  pstore(aux, from);
+  to[stride*0].x = aux[0].x;  // only the raw 16-bit member .x is copied
+  to[stride*1].x = aux[1].x;
+  to[stride*2].x = aux[2].x;
+  to[stride*3].x = aux[3].x;
+  to[stride*4].x = aux[4].x;
+  to[stride*5].x = aux[5].x;
+  to[stride*6].x = aux[6].x;
+  to[stride*7].x = aux[7].x;
+}
+
+// Reduces the 8 lanes with the float predux and converts the result back to half.
+template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
+  const float total = predux<Packet8f>(half2float(a));
+  return Eigen::half(total);
+}
+
+// Reduces the 8 lanes with the float predux_max and converts the result back to half.
+template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
+  const float largest = predux_max<Packet8f>(half2float(a));
+  return Eigen::half(largest);
+}
+
+// Reduces the 8 lanes with the float predux_min and converts the result back to half.
+template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
+  const float smallest = predux_min<Packet8f>(half2float(a));
+  return Eigen::half(smallest);
+}
+
+// Reduces the 8 lanes with the float predux_mul and converts the result back to half.
+template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
+  const float product = predux_mul<Packet8f>(half2float(a));
+  return Eigen::half(product);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet8h,8>& kernel) {  // in-place 8x8 transpose of eight 8-lane packets
+  __m128i a = kernel.packet[0].x;
+  __m128i b = kernel.packet[1].x;
+  __m128i c = kernel.packet[2].x;
+  __m128i d = kernel.packet[3].x;
+  __m128i e = kernel.packet[4].x;
+  __m128i f = kernel.packet[5].x;
+  __m128i g = kernel.packet[6].x;
+  __m128i h = kernel.packet[7].x;
+  // Stage 1: interleave 16-bit lanes of row pairs (a03b03 = a0 b0 a1 b1 a2 b2 a3 b3, and so on).
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+  // Stage 2: interleave 32-bit groups so each value holds two lanes from four rows.
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+  // Stage 3: interleave 64-bit halves; each result is one full output row (lane k of all eight inputs).
+  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+  // Write the transposed rows back in order.
+  kernel.packet[0].x = a0b0c0d0e0f0g0h0;
+  kernel.packet[1].x = a1b1c1d1e1f1g1h1;
+  kernel.packet[2].x = a2b2c2d2e2f2g2h2;
+  kernel.packet[3].x = a3b3c3d3e3f3g3h3;
+  kernel.packet[4].x = a4b4c4d4e4f4g4h4;
+  kernel.packet[5].x = a5b5c5d5e5f5g5h5;
+  kernel.packet[6].x = a6b6c6d6e6f6g6h6;
+  kernel.packet[7].x = a7b7c7d7e7f7g7h7;
+}
+
+// Rearranges 4 packets of 8 halves: output packet i is built from lane 2*i of every input packet
+// (placed in lanes 0-3) followed by lane 2*i+1 of every input packet (placed in lanes 4-7).
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h,4>& kernel) {
+  // Spill the packets to memory so that individual lanes can be addressed.
+  EIGEN_ALIGN32 Eigen::half src[4][8];
+  pstore<Eigen::half>(src[0], kernel.packet[0]);
+  pstore<Eigen::half>(src[1], kernel.packet[1]);
+  pstore<Eigen::half>(src[2], kernel.packet[2]);
+  pstore<Eigen::half>(src[3], kernel.packet[3]);
+
+  EIGEN_ALIGN32 Eigen::half dst[4][8];
+
+  for (int row = 0; row < 4; ++row) {
+    for (int p = 0; p < 4; ++p) {
+      dst[row][p]   = src[p][2*row];
+      dst[row][p+4] = src[p][2*row+1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet8h>(dst[0]);
+  kernel.packet[1] = pload<Packet8h>(dst[1]);
+  kernel.packet[2] = pload<Packet8h>(dst[2]);
+  kernel.packet[3] = pload<Packet8h>(dst[3]);
+}
+
+
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#elif 0
+
+typedef struct {
+  __m64 x;  // raw 16-bit patterns of 4 half values in one 64-bit MMX value (this branch is disabled by `#elif 0`)
+} Packet4h;
+
+
+template<> struct is_arithmetic<Packet4h> { enum { value = true }; };  // flags Packet4h as an arithmetic type for Eigen's traits
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {  // vectorization traits for Eigen::half in the disabled Packet4h branch
+  typedef Packet4h type;
+  // There is no half-size packet for Packet4h.
+  typedef Packet4h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,  // half values per Packet4h
+    HasHalfPacket = 0,
+    HasAdd    = 0,  // NOTE(review): padd/pmul/pconj are specialized further down although these flags are 0 - confirm intended
+    HasSub    = 0,
+    HasMul    = 0,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+    HasDiv = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasExp = 0,
+    HasLog = 0,
+    HasBlend = 0
+  };
+};
+
+
+template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; };  // reverse mapping: packet -> scalar type, lane count, alignment
+
+template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {  // broadcasts one half to all 4 lanes
+  Packet4h result;
+  result.x = _mm_set1_pi16(from.x);  // replicate the raw 16-bit pattern
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {  // returns lane 0 as a scalar half
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));  // the cast keeps only the low 16 bits
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }  // conjugate returns the packet unchanged
+
+template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {  // lane-wise sum using scalar half additions
+  __int64_t a64 = _mm_cvtm64_si64(a.x);  // view each packet as one 64-bit integer
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];  // per-lane results
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));  // lane 0: low 16 bits
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));  // lane 1
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));  // lane 2
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));  // lane 3
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha + hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);  // arguments run from lane 3 down to lane 0
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {  // lane-wise product using scalar half multiplications
+  __int64_t a64 = _mm_cvtm64_si64(a.x);  // view each packet as one 64-bit integer
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];  // per-lane results
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));  // lane 0: low 16 bits
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));  // lane 1
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));  // lane 2
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));  // lane 3
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha * hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);  // arguments run from lane 3 down to lane 0
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {  // loads 4 consecutive half values
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));  // reads the 4 halves as one 64-bit integer
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {  // same body as pload
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));  // NOTE(review): dereferences an __int64_t* that may be unaligned - confirm before re-enabling
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {  // stores all 4 lanes
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;  // written as one 64-bit integer
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {  // same body as pstore
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;  // NOTE(review): writes through an __int64_t* that may be unaligned - confirm before re-enabling
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h
+ploadquad<Packet4h>(const Eigen::half* from) {  // with 4 lanes, "each value repeated four times" is a broadcast of from[0]
+  return pset1<Packet4h>(*from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)  // lane k receives from[k*stride]
+{
+  Packet4h result;
+  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);  // arguments run from lane 3 down to lane 0
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)  // writes lane k to to[stride*k]
+{
+  __int64_t a = _mm_cvtm64_si64(from.x);  // view the packet as one 64-bit integer
+  to[stride*0].x = static_cast<unsigned short>(a);  // lane 0: low 16 bits
+  to[stride*1].x = static_cast<unsigned short>(a >> 16);
+  to[stride*2].x = static_cast<unsigned short>(a >> 32);
+  to[stride*3].x = static_cast<unsigned short>(a >> 48);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet4h,4>& kernel) {  // in-place 4x4 transpose of four 4-lane packets
+  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);  // lanes 0,1 of packets 0 and 1, interleaved
+  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);  // lanes 0,1 of packets 2 and 3, interleaved
+  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);  // lanes 2,3 of packets 0 and 1, interleaved
+  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);  // lanes 2,3 of packets 2 and 3, interleaved
+
+  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);  // lane 0 of all four inputs
+  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);  // lane 1 of all four inputs
+  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);  // lane 2 of all four inputs
+  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);  // lane 3 of all four inputs
+}
+
+#endif
+
+}
+}
+
+#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h
new file mode 100644
index 000000000..aa5fbce8e
--- /dev/null
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h
@@ -0,0 +1,212 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_CUDA_H
+#define EIGEN_TYPE_CASTING_CUDA_H
+
+namespace Eigen {
+
+namespace internal {
+
+template<>
+struct scalar_cast_op<float, Eigen::half> {  // functor converting a float scalar to Eigen::half
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+      return __float2half(a);  // device code on arch >= 300: CUDA conversion intrinsic
+    #else
+      return Eigen::half(a);  // everywhere else: Eigen::half's own constructor
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<float, Eigen::half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };  // cost set equal to one float add; PacketAccess is off
+
+
+template<>
+struct scalar_cast_op<int, Eigen::half> {  // functor converting an int scalar to Eigen::half, going through float
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef Eigen::half result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+      return __float2half(static_cast<float>(a));  // device code on arch >= 300: CUDA conversion intrinsic
+    #else
+      return Eigen::half(static_cast<float>(a));  // everywhere else: Eigen::half's own constructor
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<int, Eigen::half> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };  // cost set equal to one float add; PacketAccess is off
+
+
+template<>
+struct scalar_cast_op<Eigen::half, float> {  // functor converting an Eigen::half scalar to float
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
+  typedef float result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
+    #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+      return __half2float(a);  // device code on arch >= 300: CUDA conversion intrinsic
+    #else
+      return static_cast<float>(a);  // everywhere else: Eigen::half's conversion to float
+    #endif
+  }
+};
+
+template<>
+struct functor_traits<scalar_cast_op<Eigen::half, float> >
+{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };  // cost set equal to one float add; PacketAccess is off
+
+
+
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+
+template <>
+struct type_casting_traits<Eigen::half, float> {  // declares a vectorized half -> float cast for the CUDA branch
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 2,  // the matching pcast takes two half2 packets ...
+    TgtCoeffRatio = 1   // ... and returns one float4
+  };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
+  const float2 first = __half22float2(a);   // becomes the x and y components of the result
+  const float2 second = __half22float2(b);  // becomes the z and w components of the result
+  return make_float4(first.x, first.y, second.x, second.y);
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {  // declares a vectorized float -> half cast for the CUDA branch
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,  // NOTE(review): the matching pcast takes one float4 and converts only its x and y -
+    TgtCoeffRatio = 2   // confirm these ratios describe that behaviour as intended
+  };
+};
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+  // Simply discard the second half of the input
+  return __floats2half2_rn(a.x, a.y);  // only a.x and a.y are converted; a.z and a.w are not read
+}
+
+#elif defined EIGEN_VECTORIZE_AVX512
+template <>
+struct type_casting_traits<half, float> {  // declares a vectorized half -> float cast for the AVX512 branch
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,  // one Packet16h in ...
+    TgtCoeffRatio = 1   // ... one Packet16f out
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {  // packet cast half -> float
+  return half2float(a);  // forwards to the Packet16h half2float helper
+}
+
+template <>
+struct type_casting_traits<float, half> {  // declares a vectorized float -> half cast for the AVX512 branch
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,  // one Packet16f in ...
+    TgtCoeffRatio = 1   // ... one Packet16h out
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {  // packet cast float -> half
+  return float2half(a);  // forwards to the Packet16f float2half helper
+}
+
+#elif defined EIGEN_VECTORIZE_AVX
+
+template <>
+struct type_casting_traits<Eigen::half, float> {  // declares a vectorized half -> float cast for the AVX branch
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,  // one Packet8h in ...
+    TgtCoeffRatio = 1   // ... one Packet8f out
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {  // packet cast half -> float
+  return half2float(a);  // forwards to the Packet8h half2float helper
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {  // declares a vectorized float -> half cast for the AVX branch
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,  // one Packet8f in ...
+    TgtCoeffRatio = 1   // ... one Packet8h out
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {  // packet cast float -> half
+  return float2half(a);  // forwards to the Packet8f float2half helper
+}
+
+// Disable the following code since it's broken on too many platforms / compilers.
+//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#elif 0
+
+template <>
+struct type_casting_traits<Eigen::half, float> {  // half -> float cast traits for the disabled (`#elif 0`) Packet4h branch
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,  // one Packet4h in ...
+    TgtCoeffRatio = 1   // ... one Packet4f out
+  };
+};
+
+// Widens the four half lanes of `a` to floats, lane order preserved. Only compiled if the `#elif 0` above is re-enabled.
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
+  // raw_uint16_to_half is qualified with half_impl::, as in the Packet8h/Packet16h pfirst code.
+  const __int64_t a64 = _mm_cvtm64_si64(a.x);
+  const Eigen::half h0 = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  const Eigen::half h1 = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  const Eigen::half h2 = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  const Eigen::half h3 = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  // _mm_set_ps takes its arguments from lane 3 down to lane 0.
+  return _mm_set_ps(static_cast<float>(h3), static_cast<float>(h2),
+                    static_cast<float>(h1), static_cast<float>(h0));
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {  // float -> half cast traits for the disabled (`#elif 0`) Packet4h branch
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,  // one Packet4f in ...
+    TgtCoeffRatio = 1   // ... one Packet4h out
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {  // packet cast float -> half, lane by lane
+  EIGEN_ALIGN16 float aux[4];  // spill the packet so lanes can be converted individually
+  pstore(aux, a);
+  Eigen::half h0(aux[0]);
+  Eigen::half h1(aux[1]);
+  Eigen::half h2(aux[2]);
+  Eigen::half h3(aux[3]);
+
+  Packet4h result;
+  result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);  // arguments run from lane 3 down to lane 0
+  return result;
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_TYPE_CASTING_CUDA_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
deleted file mode 100644
index 4d9b3b44c..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ /dev/null
@@ -1,655 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2007 Julien Pommier
-// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
-// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/* The exp and log functions of this file initially come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
-#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
-#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
-
-namespace Eigen {
-namespace internal {
-
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pfrexp_float(const Packet& a, Packet& exponent) {
-  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  const Packet cst_126f = pset1<Packet>(126.0f);
-  const Packet cst_half = pset1<Packet>(0.5f);
-  const Packet cst_inv_mant_mask  = pset1frombits<Packet>(~0x7f800000u);
-  exponent = psub(pcast<PacketI,Packet>(plogical_shift_right<23>(preinterpret<PacketI>(a))), cst_126f);
-  return por(pand(a, cst_inv_mant_mask), cst_half);
-}
-
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_float(Packet a, Packet exponent)
-{
-  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-  const Packet cst_127 = pset1<Packet>(127.f);
-  // return a * 2^exponent
-  PacketI ei = pcast<Packet,PacketI>(padd(exponent, cst_127));
-  return pmul(a, preinterpret<Packet>(plogical_shift_left<23>(ei)));
-}
-
-// Natural logarithm
-// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
-// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
-// be easily approximated by a polynomial centered on m=1 for stability.
-// TODO(gonnet): Further reduce the interval allowing for lower-degree
-//               polynomial interpolants -> ... -> profit!
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_UNUSED
-Packet plog_float(const Packet _x)
-{
-  Packet x = _x;
-
-  const Packet cst_1              = pset1<Packet>(1.0f);
-  const Packet cst_half           = pset1<Packet>(0.5f);
-  // The smallest non denormalized float number.
-  const Packet cst_min_norm_pos   = pset1frombits<Packet>( 0x00800000u);
-  const Packet cst_minus_inf      = pset1frombits<Packet>( 0xff800000u);
-  const Packet cst_pos_inf        = pset1frombits<Packet>( 0x7f800000u);
-
-  // Polynomial coefficients.
-  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
-  const Packet cst_cephes_log_p0 = pset1<Packet>(7.0376836292E-2f);
-  const Packet cst_cephes_log_p1 = pset1<Packet>(-1.1514610310E-1f);
-  const Packet cst_cephes_log_p2 = pset1<Packet>(1.1676998740E-1f);
-  const Packet cst_cephes_log_p3 = pset1<Packet>(-1.2420140846E-1f);
-  const Packet cst_cephes_log_p4 = pset1<Packet>(+1.4249322787E-1f);
-  const Packet cst_cephes_log_p5 = pset1<Packet>(-1.6668057665E-1f);
-  const Packet cst_cephes_log_p6 = pset1<Packet>(+2.0000714765E-1f);
-  const Packet cst_cephes_log_p7 = pset1<Packet>(-2.4999993993E-1f);
-  const Packet cst_cephes_log_p8 = pset1<Packet>(+3.3333331174E-1f);
-  const Packet cst_cephes_log_q1 = pset1<Packet>(-2.12194440e-4f);
-  const Packet cst_cephes_log_q2 = pset1<Packet>(0.693359375f);
-
-  // Truncate input values to the minimum positive normal.
-  x = pmax(x, cst_min_norm_pos);
-
-  Packet e;
-  // extract significant in the range [0.5,1) and exponent
-  x = pfrexp(x,e);
-
-  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
-  // and shift by -1. The values are then centered around 0, which improves
-  // the stability of the polynomial evaluation.
-  //   if( x < SQRTHF ) {
-  //     e -= 1;
-  //     x = x + x - 1.0;
-  //   } else { x = x - 1.0; }
-  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
-  Packet tmp = pand(x, mask);
-  x = psub(x, cst_1);
-  e = psub(e, pand(cst_1, mask));
-  x = padd(x, tmp);
-
-  Packet x2 = pmul(x, x);
-  Packet x3 = pmul(x2, x);
-
-  // Evaluate the polynomial approximant of degree 8 in three parts, probably
-  // to improve instruction-level parallelism.
-  Packet y, y1, y2;
-  y  = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
-  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
-  y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7);
-  y  = pmadd(y, x, cst_cephes_log_p2);
-  y1 = pmadd(y1, x, cst_cephes_log_p5);
-  y2 = pmadd(y2, x, cst_cephes_log_p8);
-  y  = pmadd(y, x3, y1);
-  y  = pmadd(y, x3, y2);
-  y  = pmul(y, x3);
-
-  // Add the logarithm of the exponent back to the result of the interpolation.
-  y1  = pmul(e, cst_cephes_log_q1);
-  tmp = pmul(x2, cst_half);
-  y   = padd(y, y1);
-  x   = psub(x, tmp);
-  y2  = pmul(e, cst_cephes_log_q2);
-  x   = padd(x, y);
-  x   = padd(x, y2);
-
-  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
-  Packet iszero_mask  = pcmp_eq(_x,pzero(_x));
-  Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf);
-  // Filter out invalid inputs, i.e.:
-  //  - negative arg will be NAN
-  //  - 0 will be -INF
-  //  - +INF will be +INF
-  return pselect(iszero_mask, cst_minus_inf,
-                              por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask));
-}
-
-/** \internal \returns log(1 + x) computed using W. Kahan's formula.
-    See: http://www.plunk.org/~hatch/rightway.php
- */
-template<typename Packet>
-Packet generic_plog1p(const Packet& x)
-{
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  const Packet one = pset1<Packet>(ScalarType(1));
-  Packet xp1 = padd(x, one);
-  Packet small_mask = pcmp_eq(xp1, one);
-  Packet log1 = plog(xp1);
-  Packet inf_mask = pcmp_eq(xp1, log1);
-  Packet log_large = pmul(x, pdiv(log1, psub(xp1, one)));
-  return pselect(por(small_mask, inf_mask), x, log_large);
-}
-
-/** \internal \returns exp(x)-1 computed using W. Kahan's formula.
-    See: http://www.plunk.org/~hatch/rightway.php
- */
-template<typename Packet>
-Packet generic_expm1(const Packet& x)
-{
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  const Packet one = pset1<Packet>(ScalarType(1));
-  const Packet neg_one = pset1<Packet>(ScalarType(-1));
-  Packet u = pexp(x);
-  Packet one_mask = pcmp_eq(u, one);
-  Packet u_minus_one = psub(u, one);
-  Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one);
-  Packet logu = plog(u);
-  // The following comparison is to catch the case where
-  // exp(x) = +inf. It is written in this way to avoid having
-  // to form the constant +inf, which depends on the packet
-  // type.
-  Packet pos_inf_mask = pcmp_eq(logu, u);
-  Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
-  expm1 = pselect(pos_inf_mask, u, expm1);
-  return pselect(one_mask,
-                 x,
-                 pselect(neg_one_mask,
-                         neg_one,
-                         expm1));
-}
-
-
-// Exponential function. Works by writing "x = m*log(2) + r" where
-// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
-// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_UNUSED
-Packet pexp_float(const Packet _x)
-{
-  const Packet cst_1      = pset1<Packet>(1.0f);
-  const Packet cst_half   = pset1<Packet>(0.5f);
-  const Packet cst_exp_hi = pset1<Packet>( 88.3762626647950f);
-  const Packet cst_exp_lo = pset1<Packet>(-88.3762626647949f);
-
-  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
-  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.9875691500E-4f);
-  const Packet cst_cephes_exp_p1 = pset1<Packet>(1.3981999507E-3f);
-  const Packet cst_cephes_exp_p2 = pset1<Packet>(8.3334519073E-3f);
-  const Packet cst_cephes_exp_p3 = pset1<Packet>(4.1665795894E-2f);
-  const Packet cst_cephes_exp_p4 = pset1<Packet>(1.6666665459E-1f);
-  const Packet cst_cephes_exp_p5 = pset1<Packet>(5.0000001201E-1f);
-
-  // Clamp x.
-  Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo);
-
-  // Express exp(x) as exp(m*ln(2) + r), start by extracting
-  // m = floor(x/ln(2) + 0.5).
-  Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
-
-  // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
-  // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
-  // truncation errors.
-  Packet r;
-#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-  const Packet cst_nln2 = pset1<Packet>(-0.6931471805599453f);
-  r = pmadd(m, cst_nln2, x);
-#else
-  const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693359375f);
-  const Packet cst_cephes_exp_C2 = pset1<Packet>(-2.12194440e-4f);
-  r = psub(x, pmul(m, cst_cephes_exp_C1));
-  r = psub(r, pmul(m, cst_cephes_exp_C2));
-#endif
-
-  Packet r2 = pmul(r, r);
-
-  // TODO(gonnet): Split into odd/even polynomials and try to exploit
-  //               instruction-level parallelism.
-  Packet y = cst_cephes_exp_p0;
-  y = pmadd(y, r, cst_cephes_exp_p1);
-  y = pmadd(y, r, cst_cephes_exp_p2);
-  y = pmadd(y, r, cst_cephes_exp_p3);
-  y = pmadd(y, r, cst_cephes_exp_p4);
-  y = pmadd(y, r, cst_cephes_exp_p5);
-  y = pmadd(y, r2, r);
-  y = padd(y, cst_1);
-
-  // Return 2^m * exp(r).
-  return pmax(pldexp(y,m), _x);
-}
-
-// make it the default path for scalar float
-template<>
-EIGEN_DEVICE_FUNC inline float pexp(const float& a) { return pexp_float(a); }
-
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_UNUSED
-Packet pexp_double(const Packet _x)
-{
-  Packet x = _x;
-
-  const Packet cst_1 = pset1<Packet>(1.0);
-  const Packet cst_2 = pset1<Packet>(2.0);
-  const Packet cst_half = pset1<Packet>(0.5);
-
-  const Packet cst_exp_hi = pset1<Packet>(709.437);
-  const Packet cst_exp_lo = pset1<Packet>(-709.436139303);
-
-  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
-  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
-  const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
-  const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
-  const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
-  const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
-  const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
-  const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
-  const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
-  const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
-
-  Packet tmp, fx;
-
-  // clamp x
-  x = pmax(pmin(x, cst_exp_hi), cst_exp_lo);
-  // Express exp(x) as exp(g + n*log(2)).
-  fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
-
-  // Get the integer modulus of log(2), i.e. the "n" described above.
-  fx = pfloor(fx);
-
-  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
-  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
-  // digits right.
-  tmp = pmul(fx, cst_cephes_exp_C1);
-  Packet z = pmul(fx, cst_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  Packet x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial of the rational interpolant.
-  Packet px = cst_cephes_exp_p0;
-  px = pmadd(px, x2, cst_cephes_exp_p1);
-  px = pmadd(px, x2, cst_cephes_exp_p2);
-  px = pmul(px, x);
-
-  // Evaluate the denominator polynomial of the rational interpolant.
-  Packet qx = cst_cephes_exp_q0;
-  qx = pmadd(qx, x2, cst_cephes_exp_q1);
-  qx = pmadd(qx, x2, cst_cephes_exp_q2);
-  qx = pmadd(qx, x2, cst_cephes_exp_q3);
-
-  // I don't really get this bit, copied from the SSE2 routines, so...
-  // TODO(gonnet): Figure out what is going on here, perhaps find a better
-  // rational interpolant?
-  x = pdiv(px, psub(qx, px));
-  x = pmadd(cst_2, x, cst_1);
-
-  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
-  // non-finite values in the input.
-  return pmax(pldexp(x,fx), _x);
-}
-
-// make it the default path for scalar double
-template<>
-EIGEN_DEVICE_FUNC inline double pexp(const double& a) { return pexp_double(a); }
-
-// The following code is inspired by the following stack-overflow answer:
-//   https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
-// It has been largely optimized:
-//  - By-pass calls to frexp.
-//  - Aligned loads of required 96 bits of 2/pi. This is accomplished by
-//    (1) balancing the mantissa and exponent to the required bits of 2/pi are
-//    aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
-//  - Avoid a branch in rounding and extraction of the remaining fractional part.
-// Overall, I measured a speed up higher than x2 on x86-64.
-inline float trig_reduce_huge (float xf, int *quadrant)
-{
-  using Eigen::numext::int32_t;
-  using Eigen::numext::uint32_t;
-  using Eigen::numext::int64_t;
-  using Eigen::numext::uint64_t;
-
-  const double pio2_62 = 3.4061215800865545e-19;    // pi/2 * 2^-62
-  const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt
-
-  // 192 bits of 2/pi for Payne-Hanek reduction
-  // Bits are introduced by packet of 8 to enable aligned reads.
-  static const uint32_t two_over_pi [] = 
-  {
-    0x00000028, 0x000028be, 0x0028be60, 0x28be60db,
-    0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a,
-    0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4,
-    0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770,
-    0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566,
-    0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410,
-    0x10e41000, 0xe4100000
-  };
-  
-  uint32_t xi = numext::as_uint(xf);
-  // Below, -118 = -126 + 8.
-  //   -126 is to get the exponent,
-  //   +8 is to enable alignment of 2/pi's bits on 8 bits.
-  // This is possible because the fractional part of x as only 24 meaningful bits.
-  uint32_t e = (xi >> 23) - 118;
-  // Extract the mantissa and shift it to align it wrt the exponent
-  xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7);
-
-  uint32_t i = e >> 3;
-  uint32_t twoopi_1  = two_over_pi[i-1];
-  uint32_t twoopi_2  = two_over_pi[i+3];
-  uint32_t twoopi_3  = two_over_pi[i+7];
-
-  // Compute x * 2/pi in 2.62-bit fixed-point format.
-  uint64_t p;
-  p = uint64_t(xi) * twoopi_3;
-  p = uint64_t(xi) * twoopi_2 + (p >> 32);
-  p = (uint64_t(xi * twoopi_1) << 32) + p;
-
-  // Round to nearest: add 0.5 and extract integral part.
-  uint64_t q = (p + zero_dot_five) >> 62;
-  *quadrant = int(q);
-  // Now it remains to compute "r = x - q*pi/2" with high accuracy,
-  // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
-  //   r = (p-q)*pi/2,
-  // where the product can be be carried out with sufficient accuracy using double precision.
-  p -= q<<62;
-  return float(double(int64_t(p)) * pio2_62);
-}
-
-template<bool ComputeSine,typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_UNUSED
-#if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT
-__attribute__((optimize("-fno-unsafe-math-optimizations")))
-#endif
-Packet psincos_float(const Packet& _x)
-{
-// Workaround -ffast-math aggressive optimizations
-// See bug 1674
-#if EIGEN_COMP_CLANG && defined(EIGEN_VECTORIZE_SSE)
-#define EIGEN_SINCOS_DONT_OPT(X) __asm__  ("" : "+x" (X));
-#else
-#define EIGEN_SINCOS_DONT_OPT(X)
-#endif
-
-  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
-
-  const Packet  cst_2oPI            = pset1<Packet>(0.636619746685028076171875f); // 2/PI
-  const Packet  cst_rounding_magic  = pset1<Packet>(12582912); // 2^23 for rounding
-  const PacketI csti_1              = pset1<PacketI>(1);
-  const Packet  cst_sign_mask       = pset1frombits<Packet>(0x80000000u);
-
-  Packet x = pabs(_x);
-
-  // Scale x by 2/Pi to find x's octant.
-  Packet y = pmul(x, cst_2oPI);
-
-  // Rounding trick:
-  Packet y_round = padd(y, cst_rounding_magic);
-  EIGEN_SINCOS_DONT_OPT(y_round)
-  PacketI y_int = preinterpret<PacketI>(y_round); // last 23 digits represent integer (if abs(x)<2^24)
-  y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi
-
-  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4
-  // using "Extended precision modular arithmetic"
-  #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD)
-  // This version requires true FMA for high accuracy
-  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
-  const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
-  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
-  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
-  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
-  #else
-  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
-  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
-  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
-
-  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
-  // and 2 ULP up to:
-  const float huge_th = ComputeSine ? 25966.f : 18838.f;
-  x = pmadd(y, pset1<Packet>(-1.5703125), x); // = 0xbfc90000
-  EIGEN_SINCOS_DONT_OPT(x)
-  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x); // = 0xb9fdc000
-  EIGEN_SINCOS_DONT_OPT(x)
-  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x); // = 0x342ee000
-  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee
-
-  // For the record, the following set of coefficients maintain 2ULP up
-  // to a slightly larger range:
-  // const float huge_th = ComputeSine ? 51981.f : 39086.125f;
-  // but it slightly fails to maintain 1ULP for two values of sin below pi.
-  // x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
-  // x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
-  // x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
-  // x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
-
-  // For the record, with only 3 iterations it is possible to maintain
-  // 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
-  // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
-  #endif
-
-  if(predux_any(pcmp_le(pset1<Packet>(huge_th),pabs(_x))))
-  {
-    const int PacketSize = unpacket_traits<Packet>::size;
-    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
-    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
-    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize];
-    pstoreu(vals, pabs(_x));
-    pstoreu(x_cpy, x);
-    pstoreu(y_int2, y_int);
-    for(int k=0; k<PacketSize;++k)
-    {
-      float val = vals[k];
-      if(val>=huge_th && (numext::isfinite)(val))
-        x_cpy[k] = trig_reduce_huge(val,&y_int2[k]);
-    }
-    x = ploadu<Packet>(x_cpy);
-    y_int = ploadu<PacketI>(y_int2);
-  }
-
-  // Compute the sign to apply to the polynomial.
-  // sin: sign = second_bit(y_int) xor signbit(_x)
-  // cos: sign = second_bit(y_int+1)
-  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
-                                : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int,csti_1)));
-  sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit
-
-  // Get the polynomial selection mask from the second bit of y_int
-  // We'll calculate both (sin and cos) polynomials and then select from the two.
-  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
-
-  Packet x2 = pmul(x,x);
-
-  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
-  Packet y1 =        pset1<Packet>(2.4372266125283204019069671630859375e-05f);
-  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f     ));
-  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f           ));
-  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
-  y1 = pmadd(y1, x2, pset1<Packet>(1.f));
-
-  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
-  // octave/matlab code to compute those coefficients:
-  //    x = (0:0.0001:pi/4)';
-  //    A = [x.^3 x.^5 x.^7];
-  //    w = ((1.-(x/(pi/4)).^2).^5)*2000+1;         # weights trading relative accuracy
-  //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
-  //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
-  //
-  Packet y2 =        pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
-  y2 = pmadd(y2, x2, pset1<Packet>( 0.0083326873655616851693794799871284340042620897293090820312500000f));
-  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
-  y2 = pmul(y2, x2);
-  y2 = pmadd(y2, x, x);
-
-  // Select the correct result from the two polynomials.
-  y = ComputeSine ? pselect(poly_mask,y2,y1)
-                  : pselect(poly_mask,y1,y2);
-
-  // Update the sign and filter huge inputs
-  return pxor(y, sign_bit);
-
-#undef EIGEN_SINCOS_DONT_OPT
-}
-
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_UNUSED
-Packet psin_float(const Packet& x)
-{
-  return psincos_float<true>(x);
-}
-
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_UNUSED
-Packet pcos_float(const Packet& x)
-{
-  return psincos_float<false>(x);
-}
-
-/* polevl (modified for Eigen)
- *
- *      Evaluate polynomial
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N+1];
- *
- * y = polevl<decltype(x), N>( x, coef);
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates polynomial of degree N:
- *
- *                     2          N
- * y  =  C  + C x + C x  +...+ C x
- *        0    1     2          N
- *
- * Coefficients are stored in reverse order:
- *
- * coef[0] = C  , ..., coef[N] = C  .
- *            N                   0
- *
- *  The function p1evl() assumes that coef[N] = 1.0 and is
- * omitted from the array.  Its calling arguments are
- * otherwise the same as polevl().
- *
- *
- * The Eigen implementation is templatized.  For best speed, store
- * coef as a const array (constexpr), e.g.
- *
- * const double coef[] = {1.0, 2.0, 3.0, ...};
- *
- */
-template <typename Packet, int N>
-struct ppolevl {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
-    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    return pmadd(ppolevl<Packet, N-1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
-  }
-};
-
-template <typename Packet>
-struct ppolevl<Packet, 0> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits<Packet>::type coeff[]) {
-    EIGEN_UNUSED_VARIABLE(x);
-    return pset1<Packet>(coeff[0]);
-  }
-};
-
-/* chbevl (modified for Eigen)
- *
- *     Evaluate Chebyshev series
- *
- *
- *
- * SYNOPSIS:
- *
- * int N;
- * Scalar x, y, coef[N], chebevl();
- *
- * y = chbevl( x, coef, N );
- *
- *
- *
- * DESCRIPTION:
- *
- * Evaluates the series
- *
- *        N-1
- *         - '
- *  y  =   >   coef[i] T (x/2)
- *         -            i
- *        i=0
- *
- * of Chebyshev polynomials Ti at argument x/2.
- *
- * Coefficients are stored in reverse order, i.e. the zero
- * order term is last in the array.  Note N is the number of
- * coefficients, not the order.
- *
- * If coefficients are for the interval a to b, x must
- * have been transformed to x -> 2(2x - b - a)/(b-a) before
- * entering the routine.  This maps x from (a, b) to (-1, 1),
- * over which the Chebyshev polynomials are defined.
- *
- * If the coefficients are for the inverted interval, in
- * which (a, b) is mapped to (1/b, 1/a), the transformation
- * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity,
- * this becomes x -> 4a/x - 1.
- *
- *
- *
- * SPEED:
- *
- * Taking advantage of the recurrence properties of the
- * Chebyshev polynomials, the routine requires one more
- * addition per loop than evaluating a nested polynomial of
- * the same degree.
- *
- */
-
-template <typename Packet, int N>
-struct pchebevl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits<Packet>::type coef[]) {
-    typedef typename unpacket_traits<Packet>::type Scalar;
-    Packet b0 = pset1<Packet>(coef[0]);
-    Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
-    Packet b2;
-
-    for (int i = 1; i < N; i++) {
-      b2 = b1;
-      b1 = b0;
-      b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
-    }
-
-    return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
-  }
-};
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
deleted file mode 100644
index 68153cae3..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
+++ /dev/null
@@ -1,69 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2019 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
-#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
-
-namespace Eigen {
-namespace internal {
-
-// Forward declarations of the generic math functions
-// implemented in GenericPacketMathFunctions.h
-// This is needed to workaround a circular dependency.
-
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pfrexp_float(const Packet& a, Packet& exponent);
-
-template<typename Packet> EIGEN_STRONG_INLINE Packet
-pldexp_float(Packet a, Packet exponent);
-
-/** \internal \returns log(x) for single precision float */
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_UNUSED
-Packet plog_float(const Packet _x);
-
-/** \internal \returns log(1 + x) */
-template<typename Packet>
-Packet generic_plog1p(const Packet& x);
-
-/** \internal \returns exp(x)-1 */
-template<typename Packet>
-Packet generic_expm1(const Packet& x);
-
-/** \internal \returns exp(x) for single precision float */
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_UNUSED
-Packet pexp_float(const Packet _x);
-
-/** \internal \returns exp(x) for double precision real numbers */
-template <typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_UNUSED
-Packet pexp_double(const Packet _x);
-
-/** \internal \returns sin(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_UNUSED
-Packet psin_float(const Packet& x);
-
-/** \internal \returns cos(x) for single precision float */
-template<typename Packet>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-EIGEN_UNUSED
-Packet pcos_float(const Packet& x);
-
-template <typename Packet, int N> struct ppolevl;
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Settings.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Settings.h
index a5c3ada4c..097373c84 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Settings.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Settings.h
@@ -21,7 +21,7 @@
   * it does not correspond to the number of iterations or the number of instructions
   */
 #ifndef EIGEN_UNROLLING_LIMIT
-#define EIGEN_UNROLLING_LIMIT 110
+#define EIGEN_UNROLLING_LIMIT 100
 #endif
 
 /** Defines the threshold between a "small" and a "large" matrix.
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/TypeCasting.h
deleted file mode 100644
index b6df98468..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/TypeCasting.h
+++ /dev/null
@@ -1,77 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_GENERIC_TYPE_CASTING_H
-#define EIGEN_GENERIC_TYPE_CASTING_H
-
-namespace Eigen {
-
-namespace internal {
-
-template<>
-struct scalar_cast_op<float, Eigen::half> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef Eigen::half result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const {
-    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
-      return __float2half(a);
-    #else
-      return Eigen::half(a);
-    #endif
-  }
-};
-
-template<>
-struct functor_traits<scalar_cast_op<float, Eigen::half> >
-{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
-
-
-template<>
-struct scalar_cast_op<int, Eigen::half> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef Eigen::half result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const {
-    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
-      return __float2half(static_cast<float>(a));
-    #else
-      return Eigen::half(static_cast<float>(a));
-    #endif
-  }
-};
-
-template<>
-struct functor_traits<scalar_cast_op<int, Eigen::half> >
-{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
-
-
-template<>
-struct scalar_cast_op<Eigen::half, float> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef float result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const {
-    #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-      (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
-      return __half2float(a);
-    #else
-      return static_cast<float>(a);
-    #endif
-  }
-};
-
-template<>
-struct functor_traits<scalar_cast_op<Eigen::half, float> >
-{ enum { Cost = NumTraits<float>::AddCost, PacketAccess = false }; };
-
-}
-}
-
-#endif  // EIGEN_GENERIC_TYPE_CASTING_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/PacketMath.h
deleted file mode 100644
index dd4e77d3a..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/PacketMath.h
+++ /dev/null
@@ -1,1786 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_GPU_H
-#define EIGEN_PACKET_MATH_GPU_H
-
-namespace Eigen {
-
-namespace internal {
-
-// Make sure this is only available when targeting a GPU: we don't want to
-// introduce conflicts between these packet_traits definitions and the ones
-// we'll use on the host side (SSE, AVX, ...)
-#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
-template<> struct is_arithmetic<float4>  { enum { value = true }; };
-template<> struct is_arithmetic<double2> { enum { value = true }; };
-
-template<> struct packet_traits<float> : default_packet_traits
-{
-  typedef float4 type;
-  typedef float4 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size=4,
-    HasHalfPacket = 0,
-
-    HasDiv  = 1,
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 1,
-    HasExp  = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
-
-    HasBlend = 0,
-    HasFloor = 1,
-  };
-};
-
-template<> struct packet_traits<double> : default_packet_traits
-{
-  typedef double2 type;
-  typedef double2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size=2,
-    HasHalfPacket = 0,
-
-    HasDiv  = 1,
-    HasLog  = 1,
-    HasExp  = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasLGamma = 1,
-    HasDiGamma = 1,
-    HasZeta = 1,
-    HasPolygamma = 1,
-    HasErf = 1,
-    HasErfc = 1,
-    HasNdtri = 1,
-    HasBessel = 1,
-    HasIGamma = 1,
-    HasIGammaDerA = 1,
-    HasGammaSampleDerAlpha = 1,
-    HasIGammac = 1,
-    HasBetaInc = 1,
-
-    HasBlend = 0,
-    HasFloor = 1,
-  };
-};
-
-
-template<> struct unpacket_traits<float4>  { typedef float  type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; };
-template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; };
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float&  from) {
-  return make_float4(from, from, from, from);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
-  return make_double2(from, from);
-}
-
-// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
-// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
-// of the functions, while the latter can only deal with one of them.
-#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
-namespace {
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
-                                                        const float& b) {
-  return __int_as_float(__float_as_int(a) & __float_as_int(b));
-}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
-                                                         const double& b) {
-  return __longlong_as_double(__double_as_longlong(a) &
-                              __double_as_longlong(b));
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
-                                                       const float& b) {
-  return __int_as_float(__float_as_int(a) | __float_as_int(b));
-}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
-                                                        const double& b) {
-  return __longlong_as_double(__double_as_longlong(a) |
-                              __double_as_longlong(b));
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
-                                                        const float& b) {
-  return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
-}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
-                                                         const double& b) {
-  return __longlong_as_double(__double_as_longlong(a) ^
-                              __double_as_longlong(b));
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
-                                                           const float& b) {
-  return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
-}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
-                                                            const double& b) {
-  return __longlong_as_double(__double_as_longlong(a) &
-                              ~__double_as_longlong(b));
-}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
-                                                    const float& b) {
-  return __int_as_float(a == b ? 0xffffffffu : 0u);
-}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
-                                                     const double& b) {
-  return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a,
-                                                    const float& b) {
-  return __int_as_float(a < b ? 0xffffffffu : 0u);
-}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a,
-                                                     const double& b) {
-  return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
-}
-
-}  // namespace
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
-                                                          const float4& b) {
-  return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
-                     bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
-                                                            const double2& b) {
-  return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
-                                                         const float4& b) {
-  return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
-                     bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
-                                                           const double2& b) {
-  return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
-                                                          const float4& b) {
-  return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
-                     bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
-                                                            const double2& b) {
-  return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
-                                                             const float4& b) {
-  return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
-                     bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pandnot<double2>(const double2& a, const double2& b) {
-  return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
-                                                             const float4& b) {
-  return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
-                     eq_mask(a.w, b.w));
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a,
-                                                             const float4& b) {
-  return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
-                     lt_mask(a.w, b.w));
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pcmp_eq<double2>(const double2& a, const double2& b) {
-  return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pcmp_lt<double2>(const double2& a, const double2& b) {
-  return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
-}
-#endif  // EIGEN_CUDA_ARCH || defined(EIGEN_HIP_DEVICE_COMPILE)
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
-  return make_float4(a, a+1, a+2, a+3);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
-  return make_double2(a, a+1);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x+b.x, a.y+b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x-b.x, a.y-b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
-  return make_float4(-a.x, -a.y, -a.z, -a.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
-  return make_double2(-a.x, -a.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x*b.x, a.y*b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
-  return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
-  return make_double2(a.x/b.x, a.y/b.y);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
-  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
-  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
-  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
-  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
-  return *reinterpret_cast<const float4*>(from);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
-  return *reinterpret_cast<const double2*>(from);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
-  return make_float4(from[0], from[1], from[2], from[3]);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
-  return make_double2(from[0], from[1]);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float*   from) {
-  return make_float4(from[0], from[0], from[1], from[1]);
-}
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double*  from) {
-  return make_double2(from[0], from[0]);
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float*   to, const float4& from) {
-  *reinterpret_cast<float4*>(to) = from;
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
-  *reinterpret_cast<double2*>(to) = from;
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const float4& from) {
-  to[0] = from.x;
-  to[1] = from.y;
-  to[2] = from.z;
-  to[3] = from.w;
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
-  to[0] = from.x;
-  to[1] = from.y;
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
-  return __ldg((const float4*)from);
-#else
-  return make_float4(from[0], from[1], from[2], from[3]);
-#endif
-}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
-  return __ldg((const double2*)from);
-#else
-  return make_double2(from[0], from[1]);
-#endif
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
-  return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
-#else
-  return make_float4(from[0], from[1], from[2], from[3]);
-#endif
-}
-template<>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
-  return make_double2(__ldg(from+0), __ldg(from+1));
-#else
-  return make_double2(from[0], from[1]);
-#endif
-}
-
-template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
-  return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
-}
-
-template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
-  return make_double2(from[0*stride], from[1*stride]);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
-  to[stride*0] = from.x;
-  to[stride*1] = from.y;
-  to[stride*2] = from.z;
-  to[stride*3] = from.w;
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
-  to[stride*0] = from.x;
-  to[stride*1] = from.y;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float  pfirst<float4>(const float4& a) {
-  return a.x;
-}
-template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
-  return a.x;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float  predux<float4>(const float4& a) {
-  return a.x + a.y + a.z + a.w;
-}
-template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
-  return a.x + a.y;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float  predux_max<float4>(const float4& a) {
-  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
-}
-template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
-  return fmax(a.x, a.y);
-}
-
-template<> EIGEN_DEVICE_FUNC inline float  predux_min<float4>(const float4& a) {
-  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
-}
-template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
-  return fmin(a.x, a.y);
-}
-
-template<> EIGEN_DEVICE_FUNC inline float  predux_mul<float4>(const float4& a) {
-  return a.x * a.y * a.z * a.w;
-}
-template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
-  return a.x * a.y;
-}
-
-template<> EIGEN_DEVICE_FUNC inline float4  pabs<float4>(const float4& a) {
-  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
-}
-template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
-  return make_double2(fabs(a.x), fabs(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC inline float4  pfloor<float4>(const float4& a) {
-  return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
-}
-template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
-  return make_double2(floor(a.x), floor(a.y));
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<float4,4>& kernel) {
-  float tmp = kernel.packet[0].y;
-  kernel.packet[0].y = kernel.packet[1].x;
-  kernel.packet[1].x = tmp;
-
-  tmp = kernel.packet[0].z;
-  kernel.packet[0].z = kernel.packet[2].x;
-  kernel.packet[2].x = tmp;
-
-  tmp = kernel.packet[0].w;
-  kernel.packet[0].w = kernel.packet[3].x;
-  kernel.packet[3].x = tmp;
-
-  tmp = kernel.packet[1].z;
-  kernel.packet[1].z = kernel.packet[2].y;
-  kernel.packet[2].y = tmp;
-
-  tmp = kernel.packet[1].w;
-  kernel.packet[1].w = kernel.packet[3].y;
-  kernel.packet[3].y = tmp;
-
-  tmp = kernel.packet[2].w;
-  kernel.packet[2].w = kernel.packet[3].z;
-  kernel.packet[3].z = tmp;
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<double2,2>& kernel) {
-  double tmp = kernel.packet[0].y;
-  kernel.packet[0].y = kernel.packet[1].x;
-  kernel.packet[1].x = tmp;
-}
-
-#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
-
-// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
-// its corresponding packet_traits<Eigen::half> must be visible on host.
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC)) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC)) || \
-  (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__))
-
-typedef ulonglong2 Packet4h2;
-template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
-template<> struct is_arithmetic<Packet4h2> { enum { value = true }; };
-
-template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
-template<> struct is_arithmetic<half2> { enum { value = true }; };
-
-template<> struct packet_traits<Eigen::half> : default_packet_traits
-{
-  typedef Packet4h2 type;
-  typedef Packet4h2 half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size=8,
-    HasHalfPacket = 0,
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasSqrt   = 1,
-    HasRsqrt  = 1,
-    HasExp    = 1,
-    HasExpm1  = 1,
-    HasLog    = 1,
-    HasLog1p  = 1
-  };
-};
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC)
-  half2 r;
-  r.x = from;
-  r.y = from;
-  return r;
-#elif defined(EIGEN_HIPCC)
-  return __half2{from,from};
-#else
-  return __half2half2(from);
-#endif
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pset1<Packet4h2>(const Eigen::half& from) {
-  Packet4h2 r;
-  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = pset1<half2>(from);
-  p_alias[1] = pset1<half2>(from);
-  p_alias[2] = pset1<half2>(from);
-  p_alias[3] = pset1<half2>(from);
-  return r;
-}
-
-#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
-namespace {
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
-  return *reinterpret_cast<const half2*>(from);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
-  return __halves2half2(from[0], from[1]);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half*  from) {
-  return __halves2half2(from[0], from[0]);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
-                                                  const half2& from) {
-  *reinterpret_cast<half2*>(to) = from;
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
-                                                   const half2& from) {
-#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC)
-  to[0] = from.x;
-  to[1] = from.y;
-#else
-  to[0] = __low2half(from);
-  to[1] = __high2half(from);
-#endif
-}
-
-
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
-    const Eigen::half* from) {
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __ldg((const half2*)from);
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 350
-   return __ldg((const half2*)from);
-#else
-  return __halves2half2(*(from+0), *(from+1));
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
-    const Eigen::half* from) {
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __halves2half2(__ldg(from+0), __ldg(from+1));
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 350
-   return __halves2half2(__ldg(from+0), __ldg(from+1));
-#else
-  return __halves2half2(*(from+0), *(from+1));
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
-                                                    Index stride) {
-  return __halves2half2(from[0*stride], from[1*stride]);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
-    Eigen::half* to, const half2& from, Index stride) {
-  to[stride*0] = __low2half(from);
-  to[stride*1] = __high2half(from);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
-  return __low2half(a);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
-  half a1 = __low2half(a);
-  half a2 = __high2half(a);
-  half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
-  half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
-  return __halves2half2(result1, result2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) {
-  half true_half = half_impl::raw_uint16_to_half(0xffffu);
-  return pset1<half2>(true_half);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& a) {
-  half false_half = half_impl::raw_uint16_to_half(0x0000u);
-  return pset1<half2>(false_half);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<half2,2>& kernel) {
-  __half a1 = __low2half(kernel.packet[0]);
-  __half a2 = __high2half(kernel.packet[0]);
-  __half b1 = __low2half(kernel.packet[1]);
-  __half b2 = __high2half(kernel.packet[1]);
-  kernel.packet[0] = __halves2half2(a1, b1);
-  kernel.packet[1] = __halves2half2(a2, b2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
-#else
-  float f = __half2float(a) + 1.0f;
-  return __halves2half2(a, __float2half(f));
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
-                                                    const half2& a,
-                                                    const half2& b) {
-  half mask_low = __low2half(mask);
-  half mask_high = __high2half(mask);
-  half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
-  half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
-  return __halves2half2(result_low, result_high);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
-                                                    const half2& b) {
-  half true_half = half_impl::raw_uint16_to_half(0xffffu);
-  half false_half = half_impl::raw_uint16_to_half(0x0000u);
-  half a1 = __low2half(a);
-  half a2 = __high2half(a);
-  half b1 = __low2half(b);
-  half b2 = __high2half(b);
-  half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
-  half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
-  return __halves2half2(eq1, eq2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
-                                                    const half2& b) {
-  half true_half = half_impl::raw_uint16_to_half(0xffffu);
-  half false_half = half_impl::raw_uint16_to_half(0x0000u);
-  half a1 = __low2half(a);
-  half a2 = __high2half(a);
-  half b1 = __low2half(b);
-  half b2 = __high2half(b);
-  half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
-  half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
-  return __halves2half2(eq1, eq2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
-                                                 const half2& b) {
-  half a1 = __low2half(a);
-  half a2 = __high2half(a);
-  half b1 = __low2half(b);
-  half b2 = __high2half(b);
-  half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
-  half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
-  return __halves2half2(result1, result2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
-                                                const half2& b) {
-  half a1 = __low2half(a);
-  half a2 = __high2half(a);
-  half b1 = __low2half(b);
-  half b2 = __high2half(b);
-  half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
-  half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
-  return __halves2half2(result1, result2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
-                                                 const half2& b) {
-  half a1 = __low2half(a);
-  half a2 = __high2half(a);
-  half b1 = __low2half(b);
-  half b2 = __high2half(b);
-  half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
-  half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
-  return __halves2half2(result1, result2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
-                                                    const half2& b) {
-  half a1 = __low2half(a);
-  half a2 = __high2half(a);
-  half b1 = __low2half(b);
-  half b2 = __high2half(b);
-  half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
-  half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
-  return __halves2half2(result1, result2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
-                                                 const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __hadd2(a, b);
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  return __hadd2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 + b1;
-  float r2 = a2 + b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
-                                                 const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __hsub2(a, b);
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  return __hsub2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 - b1;
-  float r2 = a2 - b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __hneg2(a);
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  return __hneg2(a);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return __floats2half2_rn(-a1, -a2);
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
-                                                 const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __hmul2(a, b);
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  return __hmul2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 * b1;
-  float r2 = a2 * b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
-                                                  const half2& b,
-                                                  const half2& c) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-   return __hfma2(a, b, c);
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-   return __hfma2(a, b, c);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float c1 = __low2float(c);
-  float c2 = __high2float(c);
-  float r1 = a1 * b1 + c1;
-  float r2 = a2 * b2 + c2;
-  return __floats2half2_rn(r1, r2);
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
-                                                 const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __h2div(a, b);
-
-#else // EIGEN_CUDA_ARCH
-
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 / b1;
-  float r2 = a2 / b2;
-  return __floats2half2_rn(r1, r2);
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
-                                                 const half2& b) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
-  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
-  return __halves2half2(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
-                                                 const half2& b) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
-  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
-  return __halves2half2(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __hadd(__low2half(a), __high2half(a));
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  return __hadd(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half(a1 + a2));
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  __half first = __low2half(a);
-  __half second = __high2half(a);
-  return __hgt(first, second) ? first : second;
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  __half first = __low2half(a);
-  __half second = __high2half(a);
-  return __hgt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 > a2 ? __low2half(a) : __high2half(a);
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  __half first = __low2half(a);
-  __half second = __high2half(a);
-  return __hlt(first, second) ? first : second;
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  __half first = __low2half(a);
-  __half second = __high2half(a);
-  return __hlt(first, second) ? first : second;
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return a1 < a2 ? __low2half(a) : __high2half(a);
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __hmul(__low2half(a), __high2half(a));
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  return __hmul(__low2half(a), __high2half(a));
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  return Eigen::half(__float2half(a1 * a2));
-#endif
-
-#endif
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = log1pf(a1);
-  float r2 = log1pf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = expm1f(a1);
-  float r2 = expm1f(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
-  defined(EIGEN_HIP_DEVICE_COMPILE)
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 plog(const half2& a) {
-  return h2log(a);
-}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 pexp(const half2& a) {
-  return h2exp(a);
-}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 psqrt(const half2& a) {
-  return h2sqrt(a);
-}
-
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-half2 prsqrt(const half2& a) {
-  return h2rsqrt(a);
-}
-
-#else
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = logf(a1);
-  float r2 = logf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = expf(a1);
-  float r2 = expf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = sqrtf(a1);
-  float r2 = sqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float r1 = rsqrtf(a1);
-  float r2 = rsqrtf(a2);
-  return __floats2half2_rn(r1, r2);
-}
-#endif
-} // namespace
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pload<Packet4h2>(const Eigen::half* from) {
-  return *reinterpret_cast<const Packet4h2*>(from);
-}
-
-// unaligned load;
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-ploadu<Packet4h2>(const Eigen::half* from) {
-  Packet4h2 r;
-  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = ploadu(from + 0);
-  p_alias[1] = ploadu(from + 2);
-  p_alias[2] = ploadu(from + 4);
-  p_alias[3] = ploadu(from + 6);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-ploaddup<Packet4h2>(const Eigen::half* from) {
-  Packet4h2 r;
-  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = ploaddup(from + 0);
-  p_alias[1] = ploaddup(from + 1);
-  p_alias[2] = ploaddup(from + 2);
-  p_alias[3] = ploaddup(from + 3);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(
-    Eigen::half* to, const Packet4h2& from) {
-  *reinterpret_cast<Packet4h2*>(to) = from;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
-    Eigen::half* to, const Packet4h2& from) {
-  const half2* from_alias = reinterpret_cast<const half2*>(&from);
-  pstoreu(to + 0,from_alias[0]);
-  pstoreu(to + 2,from_alias[1]);
-  pstoreu(to + 4,from_alias[2]);
-  pstoreu(to + 6,from_alias[3]);
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
-ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  Packet4h2 r;
-  r = __ldg((const Packet4h2*)from);
-  return r;
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 350
-  Packet4h2 r;
-  r = __ldg((const Packet4h2*)from);
-  return r;
-#else
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  r_alias[0] = ploadt_ro_aligned(from + 0);
-  r_alias[1] = ploadt_ro_aligned(from + 2);
-  r_alias[2] = ploadt_ro_aligned(from + 4);
-  r_alias[3] = ploadt_ro_aligned(from + 6);
-  return r;
-#endif
-
-#endif
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
-ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  r_alias[0] = ploadt_ro_unaligned(from + 0);
-  r_alias[1] = ploadt_ro_unaligned(from + 2);
-  r_alias[2] = ploadt_ro_unaligned(from + 4);
-  r_alias[3] = ploadt_ro_unaligned(from + 6);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
-  Packet4h2 r;
-  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
-  p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
-  p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
-  p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
-    Eigen::half* to, const Packet4h2& from, Index stride) {
-  const half2* from_alias = reinterpret_cast<const half2*>(&from);
-  pscatter(to + stride * 0, from_alias[0], stride);
-  pscatter(to + stride * 2, from_alias[1], stride);
-  pscatter(to + stride * 4, from_alias[2], stride);
-  pscatter(to + stride * 6, from_alias[3], stride);
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
-    const Packet4h2& a) {
-  return pfirst(*(reinterpret_cast<const half2*>(&a)));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
-    const Packet4h2& a) {
-  Packet4h2 r;
-  half2* p_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  p_alias[0] = pabs(a_alias[0]);
-  p_alias[1] = pabs(a_alias[1]);
-  p_alias[2] = pabs(a_alias[2]);
-  p_alias[3] = pabs(a_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
-    const Packet4h2& a) {
-  half true_half = half_impl::raw_uint16_to_half(0xffffu);
-  return pset1<Packet4h2>(true_half);
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& a) {
-  half false_half = half_impl::raw_uint16_to_half(0x0000u);
-  return pset1<Packet4h2>(false_half);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
-    double* d_row0, double* d_row1, double* d_row2, double* d_row3,
-    double* d_row4, double* d_row5, double* d_row6, double* d_row7) {
-  double d_tmp;
-  d_tmp = d_row0[1];
-  d_row0[1] = d_row4[0];
-  d_row4[0] = d_tmp;
-
-  d_tmp = d_row1[1];
-  d_row1[1] = d_row5[0];
-  d_row5[0] = d_tmp;
-
-  d_tmp = d_row2[1];
-  d_row2[1] = d_row6[0];
-  d_row6[0] = d_tmp;
-
-  d_tmp = d_row3[1];
-  d_row3[1] = d_row7[0];
-  d_row7[0] = d_tmp;
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
-    half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
-  half2 f_tmp;
-  f_tmp = f_row0[1];
-  f_row0[1] = f_row2[0];
-  f_row2[0] = f_tmp;
-
-  f_tmp = f_row1[1];
-  f_row1[1] = f_row3[0];
-  f_row3[0] = f_tmp;
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose_half(half2& f0, half2& f1) {
-  __half a1 = __low2half(f0);
-  __half a2 = __high2half(f0);
-  __half b1 = __low2half(f1);
-  __half b2 = __high2half(f1);
-  f0 = __halves2half2(a1, b1);
-  f1 = __halves2half2(a2, b2);
-}
-
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet4h2,8>& kernel) {
-  double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
-  double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
-  double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
-  double* d_row3 = reinterpret_cast<double*>(&kernel.packet[3]);
-  double* d_row4 = reinterpret_cast<double*>(&kernel.packet[4]);
-  double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
-  double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
-  double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
-  ptranspose_double(d_row0, d_row1, d_row2, d_row3,
-                    d_row4, d_row5, d_row6, d_row7);
-
-
-  half2* f_row0 = reinterpret_cast<half2*>(d_row0);
-  half2* f_row1 = reinterpret_cast<half2*>(d_row1);
-  half2* f_row2 = reinterpret_cast<half2*>(d_row2);
-  half2* f_row3 = reinterpret_cast<half2*>(d_row3);
-  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
-  ptranspose_half(f_row0[0], f_row1[0]);
-  ptranspose_half(f_row0[1], f_row1[1]);
-  ptranspose_half(f_row2[0], f_row3[0]);
-  ptranspose_half(f_row2[1], f_row3[1]);
-
-  f_row0 = reinterpret_cast<half2*>(d_row0 + 1);
-  f_row1 = reinterpret_cast<half2*>(d_row1 + 1);
-  f_row2 = reinterpret_cast<half2*>(d_row2 + 1);
-  f_row3 = reinterpret_cast<half2*>(d_row3 + 1);
-  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
-  ptranspose_half(f_row0[0], f_row1[0]);
-  ptranspose_half(f_row0[1], f_row1[1]);
-  ptranspose_half(f_row2[0], f_row3[0]);
-  ptranspose_half(f_row2[1], f_row3[1]);
-
-  f_row0 = reinterpret_cast<half2*>(d_row4);
-  f_row1 = reinterpret_cast<half2*>(d_row5);
-  f_row2 = reinterpret_cast<half2*>(d_row6);
-  f_row3 = reinterpret_cast<half2*>(d_row7);
-  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
-  ptranspose_half(f_row0[0], f_row1[0]);
-  ptranspose_half(f_row0[1], f_row1[1]);
-  ptranspose_half(f_row2[0], f_row3[0]);
-  ptranspose_half(f_row2[1], f_row3[1]);
-
-  f_row0 = reinterpret_cast<half2*>(d_row4 + 1);
-  f_row1 = reinterpret_cast<half2*>(d_row5 + 1);
-  f_row2 = reinterpret_cast<half2*>(d_row6 + 1);
-  f_row3 = reinterpret_cast<half2*>(d_row7 + 1);
-  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
-  ptranspose_half(f_row0[0], f_row1[0]);
-  ptranspose_half(f_row0[1], f_row1[1]);
-  ptranspose_half(f_row2[0], f_row3[0]);
-  ptranspose_half(f_row2[1], f_row3[1]);
-  
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-plset<Packet4h2>(const Eigen::half& a) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  Packet4h2 r;
-  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
-  p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
-                              __hadd(a, __float2half(3.0f)));
-  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)),
-                              __hadd(a, __float2half(5.0f)));
-  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
-                              __hadd(a, __float2half(7.0f)));
-  return r;
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-
-  half2 b = pset1<half2>(a);
-  half2 c;
-  half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
-  half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
-
-  c = __hadd2(b, half_offset0);
-  r_alias[0] = plset(__low2half(c));
-  r_alias[1] = plset(__high2half(c));
-
-  c = __hadd2(b, half_offset1);
-  r_alias[2] = plset(__low2half(c));
-  r_alias[3] = plset(__high2half(c));
-
-  return r;
-
-#else
-  float f = __half2float(a);
-  Packet4h2 r;
-  half2* p_alias = reinterpret_cast<half2*>(&r);
-  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
-  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
-  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
-  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
-  return r;
-#endif
-
-#endif
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
-                   const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
-  r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
-  r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
-  r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
-  r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
-  r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
-  r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = pand(a_alias[0], b_alias[0]);
-  r_alias[1] = pand(a_alias[1], b_alias[1]);
-  r_alias[2] = pand(a_alias[2], b_alias[2]);
-  r_alias[3] = pand(a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = por(a_alias[0], b_alias[0]);
-  r_alias[1] = por(a_alias[1], b_alias[1]);
-  r_alias[2] = por(a_alias[2], b_alias[2]);
-  r_alias[3] = por(a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = pxor(a_alias[0], b_alias[0]);
-  r_alias[1] = pxor(a_alias[1], b_alias[1]);
-  r_alias[2] = pxor(a_alias[2], b_alias[2]);
-  r_alias[3] = pxor(a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = pandnot(a_alias[0], b_alias[0]);
-  r_alias[1] = pandnot(a_alias[1], b_alias[1]);
-  r_alias[2] = pandnot(a_alias[2], b_alias[2]);
-  r_alias[3] = pandnot(a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = padd(a_alias[0], b_alias[0]);
-  r_alias[1] = padd(a_alias[1], b_alias[1]);
-  r_alias[2] = padd(a_alias[2], b_alias[2]);
-  r_alias[3] = padd(a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = psub(a_alias[0], b_alias[0]);
-  r_alias[1] = psub(a_alias[1], b_alias[1]);
-  r_alias[2] = psub(a_alias[2], b_alias[2]);
-  r_alias[3] = psub(a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  r_alias[0] = pnegate(a_alias[0]);
-  r_alias[1] = pnegate(a_alias[1]);
-  r_alias[2] = pnegate(a_alias[2]);
-  r_alias[3] = pnegate(a_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
-  return a;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = pmul(a_alias[0], b_alias[0]);
-  r_alias[1] = pmul(a_alias[1], b_alias[1]);
-  r_alias[2] = pmul(a_alias[2], b_alias[2]);
-  r_alias[3] = pmul(a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  const half2* c_alias = reinterpret_cast<const half2*>(&c);
-  r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
-  r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
-  r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
-  r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = pdiv(a_alias[0], b_alias[0]);
-  r_alias[1] = pdiv(a_alias[1], b_alias[1]);
-  r_alias[2] = pdiv(a_alias[2], b_alias[2]);
-  r_alias[3] = pdiv(a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = pmin(a_alias[0], b_alias[0]);
-  r_alias[1] = pmin(a_alias[1], b_alias[1]);
-  r_alias[2] = pmin(a_alias[2], b_alias[2]);
-  r_alias[3] = pmin(a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
-    const Packet4h2& a, const Packet4h2& b) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  const half2* b_alias = reinterpret_cast<const half2*>(&b);
-  r_alias[0] = pmax(a_alias[0], b_alias[0]);
-  r_alias[1] = pmax(a_alias[1], b_alias[1]);
-  r_alias[2] = pmax(a_alias[2], b_alias[2]);
-  r_alias[3] = pmax(a_alias[3], b_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
-    const Packet4h2& a) {
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-
-  return predux(a_alias[0]) + predux(a_alias[1]) +
-         predux(a_alias[2]) + predux(a_alias[3]);
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
-    const Packet4h2& a) {
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  half2 m0 = __halves2half2(predux_max(a_alias[0]),
-                            predux_max(a_alias[1]));
-  half2 m1 = __halves2half2(predux_max(a_alias[2]),
-                            predux_max(a_alias[3]));
-  __half first  = predux_max(m0);
-  __half second = predux_max(m1);
-#if EIGEN_CUDA_ARCH >= 530
-  return (__hgt(first, second) ? first : second);
-#else
-  float ffirst  = __half2float(first);
-  float fsecond = __half2float(second);
-  return (ffirst > fsecond)? first: second;
-#endif
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
-    const Packet4h2& a) {
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  half2 m0 = __halves2half2(predux_min(a_alias[0]),
-                            predux_min(a_alias[1]));
-  half2 m1 = __halves2half2(predux_min(a_alias[2]),
-                            predux_min(a_alias[3]));
-  __half first  = predux_min(m0);
-  __half second = predux_min(m1);
-#if EIGEN_CUDA_ARCH >= 530
-  return (__hlt(first, second) ? first : second);
-#else
-  float ffirst  = __half2float(first);
-  float fsecond = __half2float(second);
-  return (ffirst < fsecond)? first: second;
-#endif
-}
-
-// likely overflow/underflow
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
-    const Packet4h2& a) {
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  return predux_mul(pmul(pmul(a_alias[0], a_alias[1]),
-                                       pmul(a_alias[2], a_alias[3])));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-plog1p<Packet4h2>(const Packet4h2& a) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  r_alias[0] = plog1p(a_alias[0]);
-  r_alias[1] = plog1p(a_alias[1]);
-  r_alias[2] = plog1p(a_alias[2]);
-  r_alias[3] = plog1p(a_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-pexpm1<Packet4h2>(const Packet4h2& a) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  r_alias[0] = pexpm1(a_alias[0]);
-  r_alias[1] = pexpm1(a_alias[1]);
-  r_alias[2] = pexpm1(a_alias[2]);
-  r_alias[3] = pexpm1(a_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(const Packet4h2& a) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  r_alias[0] = plog(a_alias[0]);
-  r_alias[1] = plog(a_alias[1]);
-  r_alias[2] = plog(a_alias[2]);
-  r_alias[3] = plog(a_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(const Packet4h2& a) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  r_alias[0] = pexp(a_alias[0]);
-  r_alias[1] = pexp(a_alias[1]);
-  r_alias[2] = pexp(a_alias[2]);
-  r_alias[3] = pexp(a_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2& a) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  r_alias[0] = psqrt(a_alias[0]);
-  r_alias[1] = psqrt(a_alias[1]);
-  r_alias[2] = psqrt(a_alias[2]);
-  r_alias[3] = psqrt(a_alias[3]);
-  return r;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
-prsqrt<Packet4h2>(const Packet4h2& a) {
-  Packet4h2 r;
-  half2* r_alias = reinterpret_cast<half2*>(&r);
-  const half2* a_alias = reinterpret_cast<const half2*>(&a);
-  r_alias[0] = prsqrt(a_alias[0]);
-  r_alias[1] = prsqrt(a_alias[1]);
-  r_alias[2] = prsqrt(a_alias[2]);
-  r_alias[3] = prsqrt(a_alias[3]);
-  return r;
-}
-
-// The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
-// the implementation of GPU half reduction.
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
-                                                        const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __hadd2(a, b);
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  return __hadd2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 + b1;
-  float r2 = a2 + b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-
-#endif
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
-                                                        const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __hmul2(a, b);
-
-#else  // EIGEN_CUDA_ARCH
-
-#if EIGEN_CUDA_ARCH >= 530
-  return __hmul2(a, b);
-#else
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 * b1;
-  float r2 = a2 * b2;
-  return __floats2half2_rn(r1, r2);
-#endif
-
-#endif
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
-                                                        const half2& b) {
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-
-  return __h2div(a, b);
-
-#else // EIGEN_CUDA_ARCH
-
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  float r1 = a1 / b1;
-  float r2 = a2 / b2;
-  return __floats2half2_rn(r1, r2);
-
-#endif
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
-                                                        const half2& b) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
-  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
-  return __halves2half2(r1, r2);
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
-                                                        const half2& b) {
-  float a1 = __low2float(a);
-  float a2 = __high2float(a);
-  float b1 = __low2float(b);
-  float b2 = __high2float(b);
-  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
-  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
-  return __halves2half2(r1, r2);
-}
-
-#endif // defined(EIGEN_CUDA_ARCH)
-
-#endif // defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC)
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-
-#endif // EIGEN_PACKET_MATH_GPU_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/TypeCasting.h
deleted file mode 100644
index 754546225..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/TypeCasting.h
+++ /dev/null
@@ -1,80 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_TYPE_CASTING_GPU_H
-#define EIGEN_TYPE_CASTING_GPU_H
-
-namespace Eigen {
-
-namespace internal {
-
-#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
-  (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
-
-
-template <>
-struct type_casting_traits<Eigen::half, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 2
-  };
-};
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
-  float2 r1 = __half22float2(a);
-  float2 r2 = __half22float2(b);
-  return make_float4(r1.x, r1.y, r2.x, r2.y);
-}
-
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast<float4, Packet4h2>(const float4& a, const float4& b) {
-  Packet4h2 r;
-  half2* r_alias=reinterpret_cast<half2*>(&r);
-  r_alias[0]=__floats2half2_rn(a.x,a.y);
-  r_alias[1]=__floats2half2_rn(a.z,a.w);
-  r_alias[2]=__floats2half2_rn(b.x,b.y);
-  r_alias[3]=__floats2half2_rn(b.z,b.w);
-  return r;
-}
-
-template <>
-struct type_casting_traits<float, Eigen::half> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 2,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<Packet4h2, float4>(const Packet4h2& a) {
-  // Simply discard the second half of the input
-  float4 r;
-  const half2* a_alias=reinterpret_cast<const half2*>(&a);
-  float2 r1 = __half22float2(a_alias[0]);
-  float2 r2 = __half22float2(a_alias[1]);
-  r.x=static_cast<float>(r1.x);
-  r.y=static_cast<float>(r1.y);
-  r.z=static_cast<float>(r2.x);
-  r.w=static_cast<float>(r2.y);
-  return r;
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
-  // Simply discard the second half of the input
-  return __floats2half2_rn(a.x, a.y);
-}
-
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_TYPE_CASTING_GPU_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h
deleted file mode 100644
index 25375a0a4..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * math_constants.h - 
- *  HIP equivalent of the CUDA header of the same name
- */
-
-#ifndef __MATH_CONSTANTS_H__
-#define __MATH_CONSTANTS_H__
-
-/* single precision constants */
-
-#define HIPRT_INF_F        __int_as_float(0x7f800000)
-#define HIPRT_NAN_F        __int_as_float(0x7fffffff)
-#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
-#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
-#define HIPRT_NEG_ZERO_F   __int_as_float(0x80000000)
-#define HIPRT_ZERO_F       0.0f
-#define HIPRT_ONE_F        1.0f
-
-/* double precision constants */
-#define HIPRT_INF          __hiloint2double(0x7ff00000, 0x00000000)
-#define HIPRT_NAN          __hiloint2double(0xfff80000, 0x00000000)
-
-#endif
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/Complex.h
deleted file mode 100644
index 4877a95a8..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/Complex.h
+++ /dev/null
@@ -1,720 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Wave Computing, Inc.
-// Written by:
-//   Chris Larsen
-//   Alexey Frunze (afrunze@wavecomp.com)
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_COMPLEX_MSA_H
-#define EIGEN_COMPLEX_MSA_H
-
-#include <iostream>
-
-namespace Eigen {
-
-namespace internal {
-
-//---------- float ----------
-struct Packet2cf {
-  EIGEN_STRONG_INLINE Packet2cf() {
-  }
-  EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a,
-                                         const std::complex<float>& b) {
-    Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) };
-    v = t;
-  }
-  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {
-  }
-  EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {
-  }
-  EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
-    v = b.v;
-    return *this;
-  }
-  EIGEN_STRONG_INLINE Packet2cf conjugate(void) const {
-    return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63));
-  }
-  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
-    Packet4f v1, v2;
-
-    // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
-    v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v);
-    // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
-    v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v);
-    // Multiply the real a with b
-    v1 = pmul(v1, b.v);
-    // Multiply the imag a with b
-    v2 = pmul(v2, b.v);
-    // Conjugate v2
-    v2 = Packet2cf(v2).conjugate().v;
-    // Swap real/imag elements in v2.
-    v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
-    // Add and return the result
-    v = padd(v1, v2);
-    return *this;
-  }
-  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const {
-    return Packet2cf(*this) *= b;
-  }
-  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
-    v = padd(v, b.v);
-    return *this;
-  }
-  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const {
-    return Packet2cf(*this) += b;
-  }
-  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
-    v = psub(v, b.v);
-    return *this;
-  }
-  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const {
-    return Packet2cf(*this) -= b;
-  }
-  EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
-    *this *= b.conjugate();
-    Packet4f s = pmul<Packet4f>(b.v, b.v);
-    s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
-    v = pdiv(v, s);
-    return *this;
-  }
-  EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const {
-    return Packet2cf(*this) /= b;
-  }
-  EIGEN_STRONG_INLINE Packet2cf operator-(void) const {
-    return Packet2cf(pnegate(v));
-  }
-
-  Packet4f v;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) {
-  os << "[ (" << value.v[0] << ", " << value.v[1]
-     << "i),"
-        "  ("
-     << value.v[2] << ", " << value.v[3] << "i) ]";
-  return os;
-}
-
-template <>
-struct packet_traits<std::complex<float> > : default_packet_traits {
-  typedef Packet2cf type;
-  typedef Packet2cf half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 2,
-    HasHalfPacket = 0,
-
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasDiv = 1,
-    HasNegate = 1,
-    HasAbs = 0,
-    HasAbs2 = 0,
-    HasMin = 0,
-    HasMax = 0,
-    HasSetLinear = 0,
-    HasBlend = 1
-  };
-};
-
-template <>
-struct unpacket_traits<Packet2cf> {
-  typedef std::complex<float> type;
-  enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
-  typedef Packet2cf half;
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
-  EIGEN_MSA_DEBUG;
-
-  float f0 = from.real(), f1 = from.imag();
-  Packet4f v0 = { f0, f0, f0, f0 };
-  Packet4f v1 = { f1, f1, f1, f1 };
-  return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  EIGEN_MSA_DEBUG;
-
-  return a + b;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  EIGEN_MSA_DEBUG;
-
-  return a - b;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
-  EIGEN_MSA_DEBUG;
-
-  return -a;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
-  EIGEN_MSA_DEBUG;
-
-  return a.conjugate();
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  EIGEN_MSA_DEBUG;
-
-  return a * b;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet2cf(pand(a.v, b.v));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet2cf(por(a.v, b.v));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet2cf(pxor(a.v, b.v));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet2cf(pandnot(a.v, b.v));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
-  EIGEN_MSA_DEBUG;
-
-  return pset1<Packet2cf>(*from);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to,
-                                                      const Packet2cf& from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_ALIGNED_STORE pstore<float>((float*)to, from.v);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to,
-                                                       const Packet2cf& from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>((float*)to, from.v);
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
-    const std::complex<float>* from, Index stride) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet2cf(from[0 * stride], from[1 * stride]);
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
-                                                                       const Packet2cf& from,
-                                                                       Index stride) {
-  EIGEN_MSA_DEBUG;
-
-  *to = std::complex<float>(from.v[0], from.v[1]);
-  to += stride;
-  *to = std::complex<float>(from.v[2], from.v[3]);
-}
-
-template <>
-EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
-  EIGEN_MSA_DEBUG;
-
-  prefetch(reinterpret_cast<const float*>(addr));
-}
-
-template <>
-EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
-  EIGEN_MSA_DEBUG;
-
-  return std::complex<float>(a.v[0], a.v[1]);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
-}
-
-template <>
-EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
-  EIGEN_MSA_DEBUG;
-
-  Packet4f value = (Packet4f)preverse((Packet2d)a.v);
-  value += a.v;
-  return std::complex<float>(value[0], value[1]);
-}
-
-template <>
-EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
-  EIGEN_MSA_DEBUG;
-
-  return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]),
-                             (a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
-}
-
-template <>
-struct conj_helper<Packet2cf, Packet2cf, false, true> {
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
-                                      const Packet2cf& c) const {
-    return padd(pmul(x, y), c);
-  }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template <>
-struct conj_helper<Packet2cf, Packet2cf, true, false> {
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
-                                      const Packet2cf& c) const {
-    return padd(pmul(x, y), c);
-  }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template <>
-struct conj_helper<Packet2cf, Packet2cf, true, true> {
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y,
-                                      const Packet2cf& c) const {
-    return padd(pmul(x, y), c);
-  }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
-  EIGEN_MSA_DEBUG;
-
-  return a / b;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2cf, 2>& value) {
-  os << "[ " << value.packet[0] << ", " << std::endl << "  " << value.packet[1] << " ]";
-  return os;
-}
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
-  EIGEN_MSA_DEBUG;
-
-  Packet4f tmp =
-      (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
-  kernel.packet[0].v =
-      (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
-  kernel.packet[1].v = tmp;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
-                                     const Packet2cf& elsePacket) {
-  return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v,
-                                               (Packet2d)elsePacket.v);
-}
-
-//---------- double ----------
-
-struct Packet1cd {
-  EIGEN_STRONG_INLINE Packet1cd() {
-  }
-  EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
-    v[0] = std::real(a);
-    v[1] = std::imag(a);
-  }
-  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {
-  }
-  EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {
-  }
-  EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
-    v = b.v;
-    return *this;
-  }
-  EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
-    static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 };
-    return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
-  }
-  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
-    Packet2d v1, v2;
-
-    // Get the real values of a | a1_re | a1_re
-    v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v);
-    // Get the imag values of a | a1_im | a1_im
-    v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v);
-    // Multiply the real a with b
-    v1 = pmul(v1, b.v);
-    // Multiply the imag a with b
-    v2 = pmul(v2, b.v);
-    // Conjugate v2
-    v2 = Packet1cd(v2).conjugate().v;
-    // Swap real/imag elements in v2.
-    v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
-    // Add and return the result
-    v = padd(v1, v2);
-    return *this;
-  }
-  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const {
-    return Packet1cd(*this) *= b;
-  }
-  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
-    v = padd(v, b.v);
-    return *this;
-  }
-  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const {
-    return Packet1cd(*this) += b;
-  }
-  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
-    v = psub(v, b.v);
-    return *this;
-  }
-  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const {
-    return Packet1cd(*this) -= b;
-  }
-  EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
-    *this *= b.conjugate();
-    Packet2d s = pmul<Packet2d>(b.v, b.v);
-    s = padd(s, preverse<Packet2d>(s));
-    v = pdiv(v, s);
-    return *this;
-  }
-  EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const {
-    return Packet1cd(*this) /= b;
-  }
-  EIGEN_STRONG_INLINE Packet1cd operator-(void) const {
-    return Packet1cd(pnegate(v));
-  }
-
-  Packet2d v;
-};
-
-inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) {
-  os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]";
-  return os;
-}
-
-template <>
-struct packet_traits<std::complex<double> > : default_packet_traits {
-  typedef Packet1cd type;
-  typedef Packet1cd half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 0,
-    size = 1,
-    HasHalfPacket = 0,
-
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasDiv = 1,
-    HasNegate = 1,
-    HasAbs = 0,
-    HasAbs2 = 0,
-    HasMin = 0,
-    HasMax = 0,
-    HasSetLinear = 0
-  };
-};
-
-template <>
-struct unpacket_traits<Packet1cd> {
-  typedef std::complex<double> type;
-  enum { size = 1, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
-  typedef Packet1cd half;
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet1cd(from);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
-  EIGEN_MSA_DEBUG;
-
-  return a + b;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
-  EIGEN_MSA_DEBUG;
-
-  return a - b;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
-  EIGEN_MSA_DEBUG;
-
-  return -a;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
-  EIGEN_MSA_DEBUG;
-
-  return a.conjugate();
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
-  EIGEN_MSA_DEBUG;
-
-  return a * b;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet1cd(pand(a.v, b.v));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet1cd(por(a.v, b.v));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet1cd(pxor(a.v, b.v));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet1cd(pandnot(a.v, b.v));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
-  EIGEN_MSA_DEBUG;
-
-  return pset1<Packet1cd>(*from);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to,
-                                                       const Packet1cd& from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_ALIGNED_STORE pstore<double>((double*)to, from.v);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to,
-                                                        const Packet1cd& from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>((double*)to, from.v);
-}
-
-template <>
-EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
-  EIGEN_MSA_DEBUG;
-
-  prefetch(reinterpret_cast<const double*>(addr));
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
-    const std::complex<double>* from, Index stride __attribute__((unused))) {
-  EIGEN_MSA_DEBUG;
-
-  Packet1cd res;
-  res.v[0] = std::real(from[0]);
-  res.v[1] = std::imag(from[0]);
-  return res;
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
-                                                                        const Packet1cd& from,
-                                                                        Index stride
-                                                                        __attribute__((unused))) {
-  EIGEN_MSA_DEBUG;
-
-  pstore(to, from);
-}
-
-template <>
-EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
-  EIGEN_MSA_DEBUG;
-
-  return std::complex<double>(a.v[0], a.v[1]);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
-  EIGEN_MSA_DEBUG;
-
-  return a;
-}
-
-template <>
-EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
-  EIGEN_MSA_DEBUG;
-
-  return pfirst(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
-  EIGEN_MSA_DEBUG;
-
-  return pfirst(a);
-}
-
-template <>
-struct conj_helper<Packet1cd, Packet1cd, false, true> {
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
-                                      const Packet1cd& c) const {
-    return padd(pmul(x, y), c);
-  }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template <>
-struct conj_helper<Packet1cd, Packet1cd, true, false> {
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
-                                      const Packet1cd& c) const {
-    return padd(pmul(x, y), c);
-  }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template <>
-struct conj_helper<Packet1cd, Packet1cd, true, true> {
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y,
-                                      const Packet1cd& c) const {
-    return padd(pmul(x, y), c);
-  }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
-
-template <>
-EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
-  EIGEN_MSA_DEBUG;
-
-  return a / b;
-}
-
-EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
-  EIGEN_MSA_DEBUG;
-
-  return Packet1cd(preverse(Packet2d(x.v)));
-}
-
-inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
-  os << "[ " << value.packet[0] << ", " << std::endl << "  " << value.packet[1] << " ]";
-  return os;
-}
-
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
-  EIGEN_MSA_DEBUG;
-
-  Packet2d v1, v2;
-
-  v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
-  // Get the imag values of a
-  v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
-
-  kernel.packet[0].v = v1;
-  kernel.packet[1].v = v2;
-}
-
-}  // end namespace internal
-
-}  // end namespace Eigen
-
-#endif  // EIGEN_COMPLEX_MSA_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/MathFunctions.h
deleted file mode 100644
index f5181b90e..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/MathFunctions.h
+++ /dev/null
@@ -1,387 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2007 Julien Pommier
-// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
-// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// Copyright (C) 2018 Wave Computing, Inc.
-// Written by:
-//   Chris Larsen
-//   Alexey Frunze (afrunze@wavecomp.com)
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/* The sin, cos, exp, and log functions of this file come from
- * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
- */
-
-/* The tanh function of this file is an adaptation of
- * template<typename T> T generic_fast_tanh_float(const T&)
- * from MathFunctionsImpl.h.
- */
-
-#ifndef EIGEN_MATH_FUNCTIONS_MSA_H
-#define EIGEN_MATH_FUNCTIONS_MSA_H
-
-namespace Eigen {
-
-namespace internal {
-
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-plog<Packet4f>(const Packet4f& _x) {
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
-
-  // Convert negative argument into NAN (quiet negative, to be specific).
-  Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
-  Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
-  Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
-  Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask);  // Add 0.0 or NAN.
-  Packet4f x = non_neg_x_or_nan;
-
-  // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
-  // N.B. the exponent is one less of what frexpf() would return.
-  Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
-  // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
-  x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
-
-  /*
-     if (x < SQRTHF) {
-       x = x + x - 1.0;
-     } else {
-       e += 1;
-       x = x - 1.0;
-     }
-  */
-  Packet4f xx = padd(x, x);
-  Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
-  e_int = psub(e_int, ge_mask);
-  x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
-  x = psub(x, p4f_1);
-  Packet4f e = __builtin_msa_ffint_s_w(e_int);
-
-  Packet4f x2 = pmul(x, x);
-  Packet4f x3 = pmul(x2, x);
-
-  Packet4f y, y1, y2;
-  y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
-  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
-  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
-  y = pmadd(y, x, p4f_cephes_log_p2);
-  y1 = pmadd(y1, x, p4f_cephes_log_p5);
-  y2 = pmadd(y2, x, p4f_cephes_log_p8);
-  y = pmadd(y, x3, y1);
-  y = pmadd(y, x3, y2);
-  y = pmul(y, x3);
-
-  y = pmadd(e, p4f_cephes_log_q1, y);
-  x = __builtin_msa_fmsub_w(x, x2, p4f_half);
-  x = padd(x, y);
-  x = pmadd(e, p4f_cephes_log_q2, x);
-
-  // x is now the logarithm result candidate. We still need to handle the
-  // extreme arguments of zero and positive infinity, though.
-  // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
-  // contain infinities of both signs (see the coefficients and code above).
-  // INFINITY - INFINITY is NAN.
-
-  // If the argument is +INFINITY, make it the new result candidate.
-  // To achieve that we choose the smaller of the result candidate and the
-  // argument.
-  // This is correct for all finite pairs of values (the logarithm is smaller
-  // than the argument).
-  // This is also correct in the special case when the argument is +INFINITY
-  // and the result candidate is NAN. This is because the fmin.df instruction
-  // prefers non-NANs to NANs.
-  x = __builtin_msa_fmin_w(x, non_neg_x_or_nan);
-
-  // If the argument is zero (including -0.0), the result becomes -INFINITY.
-  Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23);
-  x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs);
-
-  return x;
-}
-
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-pexp<Packet4f>(const Packet4f& _x) {
-  // Limiting single-precision pexp's argument to [-128, +128] lets pexp
-  // reach 0 and INFINITY naturally.
-  static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
-  static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f);
-  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
-
-  Packet4f x = _x;
-
-  // Clamp x.
-  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x,
-                                     (v16u8)p4f_exp_lo);
-  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x,
-                                     (v16u8)p4f_exp_hi);
-
-  // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
-  Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
-  Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add);
-  Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2);
-  Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int);
-
-  x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1);
-  x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2);
-
-  Packet4f z = pmul(x, x);
-
-  Packet4f y = p4f_cephes_exp_p0;
-  y = pmadd(y, x, p4f_cephes_exp_p1);
-  y = pmadd(y, x, p4f_cephes_exp_p2);
-  y = pmadd(y, x, p4f_cephes_exp_p3);
-  y = pmadd(y, x, p4f_cephes_exp_p4);
-  y = pmadd(y, x, p4f_cephes_exp_p5);
-  y = pmadd(y, z, x);
-  y = padd(y, p4f_1);
-
-  // y *= 2**exponent.
-  y = __builtin_msa_fexp2_w(y, x2_int);
-
-  return y;
-}
-
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-ptanh<Packet4f>(const Packet4f& _x) {
-  static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
-  static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
-  // The monomial coefficients of the numerator polynomial (odd).
-  static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f);
-  static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f);
-  static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f);
-  static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f);
-  static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
-  static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
-  static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
-  // The monomial coefficients of the denominator polynomial (even).
-  static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f);
-  static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f);
-  static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f);
-  static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f);
-
-  Packet4f x = pabs(_x);
-  Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny);
-
-  // Clamp the inputs to the range [-9, 9] since anything outside
-  // this range is -/+1.0f in single-precision.
-  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x,
-                                     (v16u8)p4f_tanh_hi);
-
-  // Since the polynomials are odd/even, we need x**2.
-  Packet4f x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial p.
-  Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
-  p = pmadd(x2, p, p4f_alpha_9);
-  p = pmadd(x2, p, p4f_alpha_7);
-  p = pmadd(x2, p, p4f_alpha_5);
-  p = pmadd(x2, p, p4f_alpha_3);
-  p = pmadd(x2, p, p4f_alpha_1);
-  p = pmul(x, p);
-
-  // Evaluate the denominator polynomial q.
-  Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
-  q = pmadd(x2, q, p4f_beta_2);
-  q = pmadd(x2, q, p4f_beta_0);
-
-  // Divide the numerator by the denominator.
-  p = pdiv(p, q);
-
-  // Reinstate the sign.
-  p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0);
-
-  // When the argument is very small in magnitude it's more accurate to just return it.
-  p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x);
-
-  return p;
-}
-
-template <bool sine>
-Packet4f psincos_inner_msa_float(const Packet4f& _x) {
-  static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f);  // Approx. (2**24) / (4/Pi).
-  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
-  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
-  static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
-  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f);
-  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f);
-  static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f);
-  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f);
-  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f);
-  static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f);
-  static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f);  // 4/Pi.
-  static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
-
-  Packet4f x = pabs(_x);
-
-  // Translate infinite arguments into NANs.
-  Packet4f zero_or_nan_if_inf = psub(_x, _x);
-  x = padd(x, zero_or_nan_if_inf);
-  // Prevent sin/cos from generating values larger than 1.0 in magnitude
-  // for very large arguments by setting x to 0.0.
-  Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg);
-  x = pand(x, (Packet4f)small_or_nan_mask);
-
-  // Scale x by 4/Pi to find x's octant.
-  Packet4f y = pmul(x, p4f_cephes_FOPI);
-  // Get the octant. We'll reduce x by this number of octants or by one more than it.
-  Packet4i y_int = __builtin_msa_ftrunc_s_w(y);
-  // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
-  // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
-  // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
-  Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
-  Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear
-  y = __builtin_msa_ffint_s_w(y_int2);
-
-  // Compute the sign to apply to the polynomial.
-  Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x)
-                            : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29);
-
-  // Get the polynomial selection mask.
-  // We'll calculate both (sin and cos) polynomials and then select from the two.
-  Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0);
-
-  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
-  // The magic pass: "Extended precision modular arithmetic"
-  // x = ((x - y * DP1) - y * DP2) - y * DP3
-  Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1);
-  Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2);
-  Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3);
-  x = padd(x, tmp1);
-  x = padd(x, tmp2);
-  x = padd(x, tmp3);
-
-  // Evaluate the cos(x) polynomial.
-  y = p4f_coscof_p0;
-  Packet4f z = pmul(x, x);
-  y = pmadd(y, z, p4f_coscof_p1);
-  y = pmadd(y, z, p4f_coscof_p2);
-  y = pmul(y, z);
-  y = pmul(y, z);
-  y = __builtin_msa_fmsub_w(y, z, p4f_half);
-  y = padd(y, p4f_1);
-
-  // Evaluate the sin(x) polynomial.
-  Packet4f y2 = p4f_sincof_p0;
-  y2 = pmadd(y2, z, p4f_sincof_p1);
-  y2 = pmadd(y2, z, p4f_sincof_p2);
-  y2 = pmul(y2, z);
-  y2 = pmadd(y2, x, x);
-
-  // Select the correct result from the two polynomials.
-  y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2)
-           : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y);
-
-  // Update the sign.
-  sign_mask = pxor(sign_mask, (Packet4i)y);
-  y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left
-  return y;
-}
-
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-psin<Packet4f>(const Packet4f& x) {
-  return psincos_inner_msa_float</* sine */ true>(x);
-}
-
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-pcos<Packet4f>(const Packet4f& x) {
-  return psincos_inner_msa_float</* sine */ false>(x);
-}
-
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d
-pexp<Packet2d>(const Packet2d& _x) {
-  // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
-  // reach 0 and INFINITY naturally.
-  static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
-  static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0);
-  static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-  static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-  static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-  static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
-  static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
-
-  Packet2d x = _x;
-
-  // Clamp x.
-  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x,
-                                     (v16u8)p2d_exp_lo);
-  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x,
-                                     (v16u8)p2d_exp_hi);
-
-  // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
-  Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
-  Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add);
-  Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2);
-  Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long);
-
-  x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1);
-  x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2);
-
-  x2 = pmul(x, x);
-
-  Packet2d px = p2d_cephes_exp_p0;
-  px = pmadd(px, x2, p2d_cephes_exp_p1);
-  px = pmadd(px, x2, p2d_cephes_exp_p2);
-  px = pmul(px, x);
-
-  Packet2d qx = p2d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
-  x = pdiv(px, psub(qx, px));
-  x = pmadd(p2d_2, x, p2d_1);
-
-  // x *= 2**exponent.
-  x = __builtin_msa_fexp2_d(x, x2_long);
-
-  return x;
-}
-
-}  // end namespace internal
-
-}  // end namespace Eigen
-
-#endif  // EIGEN_MATH_FUNCTIONS_MSA_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/PacketMath.h
deleted file mode 100644
index f03cf61ff..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/PacketMath.h
+++ /dev/null
@@ -1,1237 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Wave Computing, Inc.
-// Written by:
-//   Chris Larsen
-//   Alexey Frunze (afrunze@wavecomp.com)
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PACKET_MATH_MSA_H
-#define EIGEN_PACKET_MATH_MSA_H
-
-#include <iostream>
-#include <string>
-
-namespace Eigen {
-
-namespace internal {
-
-#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
-#endif
-
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
-#endif
-
-#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD
-#endif
-
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
-#endif
-
-#if 0
-#define EIGEN_MSA_DEBUG                                                             \
-  static bool firstTime = true;                                                     \
-  do {                                                                              \
-    if (firstTime) {                                                                \
-      std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
-      firstTime = false;                                                            \
-    }                                                                               \
-  } while (0)
-#else
-#define EIGEN_MSA_DEBUG
-#endif
-
-#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))
-
-typedef v4f32 Packet4f;
-typedef v4i32 Packet4i;
-typedef v4u32 Packet4ui;
-
-#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X }
-#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X }
-#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X }
-
-inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
-  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
-  return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) {
-  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
-  return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) {
-  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
-  return os;
-}
-
-template <>
-struct packet_traits<float> : default_packet_traits {
-  typedef Packet4f type;
-  typedef Packet4f half;  // Packet2f intrinsics not implemented yet
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
-    HasHalfPacket = 0,  // Packet2f intrinsics not implemented yet
-    // FIXME check the Has*
-    HasDiv = 1,
-    HasSin = EIGEN_FAST_MATH,
-    HasCos = EIGEN_FAST_MATH,
-    HasTanh = EIGEN_FAST_MATH,
-    HasErf = EIGEN_FAST_MATH,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasBlend = 1
-  };
-};
-
-template <>
-struct packet_traits<int32_t> : default_packet_traits {
-  typedef Packet4i type;
-  typedef Packet4i half;  // Packet2i intrinsics not implemented yet
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
-    HasHalfPacket = 0,  // Packet2i intrinsics not implemented yet
-    // FIXME check the Has*
-    HasDiv = 1,
-    HasBlend = 1
-  };
-};
-
-template <>
-struct unpacket_traits<Packet4f> {
-  typedef float type;
-  enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
-  typedef Packet4f half;
-};
-
-template <>
-struct unpacket_traits<Packet4i> {
-  typedef int32_t type;
-  enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
-  typedef Packet4i half;
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
-  EIGEN_MSA_DEBUG;
-
-  Packet4f v = { from, from, from, from };
-  return v;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fill_w(from);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
-  EIGEN_MSA_DEBUG;
-
-  float f = *from;
-  Packet4f v = { f, f, f, f };
-  return v;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fill_w(*from);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fadd_w(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_addv_w(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
-  EIGEN_MSA_DEBUG;
-
-  static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f };
-  return padd(pset1<Packet4f>(a), countdown);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
-  EIGEN_MSA_DEBUG;
-
-  static const Packet4i countdown = { 0, 1, 2, 3 };
-  return padd(pset1<Packet4i>(a), countdown);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fsub_w(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_subv_w(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
-  EIGEN_MSA_DEBUG;
-
-  return a;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
-  EIGEN_MSA_DEBUG;
-
-  return a;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fmul_w(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_mulv_w(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fdiv_w(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_div_s_w(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fmadd_w(c, a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
-  EIGEN_MSA_DEBUG;
-
-  // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug.
-  Packet4i value = c;
-  __asm__("maddv.w %w[value], %w[a], %w[b]\n"
-          // Outputs
-          : [value] "+f"(value)
-          // Inputs
-          : [a] "f"(a), [b] "f"(b));
-  return value;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  EIGEN_MSA_DEBUG;
-
-  return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
-  EIGEN_MSA_DEBUG;
-
-  return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  EIGEN_MSA_DEBUG;
-
-#if EIGEN_FAST_MATH
-  // This prefers numbers to NaNs.
-  return __builtin_msa_fmin_w(a, b);
-#else
-  // This prefers NaNs to numbers.
-  Packet4i aNaN = __builtin_msa_fcun_w(a, a);
-  Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
-  return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
-#endif
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_min_s_w(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
-  EIGEN_MSA_DEBUG;
-
-#if EIGEN_FAST_MATH
-  // This prefers numbers to NaNs.
-  return __builtin_msa_fmax_w(a, b);
-#else
-  // This prefers NaNs to numbers.
-  Packet4i aNaN = __builtin_msa_fcun_w(a, a);
-  Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
-  return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
-#endif
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_max_s_w(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
-  EIGEN_MSA_DEBUG;
-
-  float f0 = from[0], f1 = from[1];
-  Packet4f v0 = { f0, f0, f0, f0 };
-  Packet4f v1 = { f1, f1, f1, f1 };
-  return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
-  EIGEN_MSA_DEBUG;
-
-  int32_t i0 = from[0], i1 = from[1];
-  Packet4i v0 = { i0, i0, i0, i0 };
-  Packet4i v1 = { i1, i1, i1, i1 };
-  return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
-  EIGEN_MSA_DEBUG;
-
-  float f = *from;
-  Packet4f v = { f, f, f, f };
-  v[1] = from[stride];
-  v[2] = from[2 * stride];
-  v[3] = from[3 * stride];
-  return v;
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
-  EIGEN_MSA_DEBUG;
-
-  int32_t i = *from;
-  Packet4i v = { i, i, i, i };
-  v[1] = from[stride];
-  v[2] = from[2 * stride];
-  v[3] = from[3 * stride];
-  return v;
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from,
-                                                        Index stride) {
-  EIGEN_MSA_DEBUG;
-
-  *to = from[0];
-  to += stride;
-  *to = from[1];
-  to += stride;
-  *to = from[2];
-  to += stride;
-  *to = from[3];
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
-                                                          Index stride) {
-  EIGEN_MSA_DEBUG;
-
-  *to = from[0];
-  to += stride;
-  *to = from[1];
-  to += stride;
-  *to = from[2];
-  to += stride;
-  *to = from[3];
-}
-
-template <>
-EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
-  EIGEN_MSA_DEBUG;
-
-  __builtin_prefetch(addr);
-}
-
-template <>
-EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
-  EIGEN_MSA_DEBUG;
-
-  __builtin_prefetch(addr);
-}
-
-template <>
-EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
-  EIGEN_MSA_DEBUG;
-
-  return a[0];
-}
-
-template <>
-EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
-  EIGEN_MSA_DEBUG;
-
-  return a[0];
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
-  EIGEN_MSA_DEBUG;
-
-  Packet4i zero = __builtin_msa_ldi_w(0);
-  return __builtin_msa_add_a_w(zero, a);
-}
-
-template <>
-EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
-  EIGEN_MSA_DEBUG;
-
-  Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
-  s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
-  return s[0];
-}
-
-
-template <>
-EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
-  EIGEN_MSA_DEBUG;
-
-  Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
-  s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
-  return s[0];
-}
-
-// Other reduction functions:
-// mul
-template <>
-EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
-  EIGEN_MSA_DEBUG;
-
-  Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
-  p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
-  return p[0];
-}
-
-template <>
-EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
-  EIGEN_MSA_DEBUG;
-
-  Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
-  p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
-  return p[0];
-}
-
-// min
-template <>
-EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
-  EIGEN_MSA_DEBUG;
-
-  // Swap 64-bit halves of a.
-  Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
-#if !EIGEN_FAST_MATH
-  // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
-  // masks of all zeroes/ones in low 64 bits.
-  v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
-  // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
-  unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
-#endif
-  // Continue with min computation.
-  Packet4f v = __builtin_msa_fmin_w(a, swapped);
-  v = __builtin_msa_fmin_w(
-      v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
-#if !EIGEN_FAST_MATH
-  // Based on the mask select between v and 4 qNaNs.
-  v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
-  v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
-#endif
-  return v[0];
-}
-
-template <>
-EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
-  EIGEN_MSA_DEBUG;
-
-  Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
-  m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
-  return m[0];
-}
-
-// max
-template <>
-EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
-  EIGEN_MSA_DEBUG;
-
-  // Swap 64-bit halves of a.
-  Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
-#if !EIGEN_FAST_MATH
-  // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
-  // masks of all zeroes/ones in low 64 bits.
-  v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
-  // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
-  unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
-#endif
-  // Continue with max computation.
-  Packet4f v = __builtin_msa_fmax_w(a, swapped);
-  v = __builtin_msa_fmax_w(
-      v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
-#if !EIGEN_FAST_MATH
-  // Based on the mask select between v and 4 qNaNs.
-  v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
-  v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
-#endif
-  return v[0];
-}
-
-template <>
-EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
-  EIGEN_MSA_DEBUG;
-
-  Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
-  m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
-  return m[0];
-}
-
-inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
-  os << "[ " << value.packet[0] << "," << std::endl
-     << "  " << value.packet[1] << "," << std::endl
-     << "  " << value.packet[2] << "," << std::endl
-     << "  " << value.packet[3] << " ]";
-  return os;
-}
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
-  EIGEN_MSA_DEBUG;
-
-  v4i32 tmp1, tmp2, tmp3, tmp4;
-
-  tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
-  tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
-  tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
-  tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
-
-  kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
-  kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
-  kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
-  kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
-}
-
-inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
-  os << "[ " << value.packet[0] << "," << std::endl
-     << "  " << value.packet[1] << "," << std::endl
-     << "  " << value.packet[2] << "," << std::endl
-     << "  " << value.packet[3] << " ]";
-  return os;
-}
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
-  EIGEN_MSA_DEBUG;
-
-  v4i32 tmp1, tmp2, tmp3, tmp4;
-
-  tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
-  tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
-  tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
-  tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
-
-  kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
-  kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
-  kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
-  kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fsqrt_w(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
-  EIGEN_MSA_DEBUG;
-
-#if EIGEN_FAST_MATH
-  return __builtin_msa_frsqrt_w(a);
-#else
-  Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
-  return pdiv(ones, psqrt(a));
-#endif
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
-  Packet4f v = a;
-  int32_t old_mode, new_mode;
-  asm volatile(
-      "cfcmsa  %[old_mode], $1\n"
-      "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
-      "ctcmsa  $1, %[new_mode]\n"
-      "frint.w %w[v], %w[v]\n"
-      "ctcmsa  $1, %[old_mode]\n"
-      :  // outputs
-      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
-      [v] "+f"(v)
-      :  // inputs
-      :  // clobbers
-  );
-  return v;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
-  Packet4f v = a;
-  int32_t old_mode, new_mode;
-  asm volatile(
-      "cfcmsa  %[old_mode], $1\n"
-      "ori     %[new_mode], %[old_mode], 3\n"
-      "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
-      "ctcmsa  $1, %[new_mode]\n"
-      "frint.w %w[v], %w[v]\n"
-      "ctcmsa  $1, %[old_mode]\n"
-      :  // outputs
-      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
-      [v] "+f"(v)
-      :  // inputs
-      :  // clobbers
-  );
-  return v;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
-  Packet4f v = a;
-  int32_t old_mode, new_mode;
-  asm volatile(
-      "cfcmsa  %[old_mode], $1\n"
-      "ori     %[new_mode], %[old_mode], 3\n"
-      "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
-      "ctcmsa  $1, %[new_mode]\n"
-      "frint.w %w[v], %w[v]\n"
-      "ctcmsa  $1, %[old_mode]\n"
-      :  // outputs
-      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
-      [v] "+f"(v)
-      :  // inputs
-      :  // clobbers
-  );
-  return v;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
-                                    const Packet4f& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
-                       ifPacket.select[3] };
-  Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
-  return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
-                                    const Packet4i& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2],
-                       ifPacket.select[3] };
-  Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
-  return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
-}
-
-//---------- double ----------
-
-typedef v2f64 Packet2d;
-typedef v2i64 Packet2l;
-typedef v2u64 Packet2ul;
-
-#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X }
-#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X }
-#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X }
-
-inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
-  os << "[ " << value[0] << ", " << value[1] << " ]";
-  return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) {
-  os << "[ " << value[0] << ", " << value[1] << " ]";
-  return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) {
-  os << "[ " << value[0] << ", " << value[1] << " ]";
-  return os;
-}
-
-template <>
-struct packet_traits<double> : default_packet_traits {
-  typedef Packet2d type;
-  typedef Packet2d half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 2,
-    HasHalfPacket = 0,
-    // FIXME check the Has*
-    HasDiv = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasRound = 1,
-    HasFloor = 1,
-    HasCeil = 1,
-    HasBlend = 1
-  };
-};
-
-template <>
-struct unpacket_traits<Packet2d> {
-  typedef double type;
-  enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false };
-  typedef Packet2d half;
-};
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
-  EIGEN_MSA_DEBUG;
-
-  Packet2d value = { from, from };
-  return value;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fadd_d(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
-  EIGEN_MSA_DEBUG;
-
-  static const Packet2d countdown = { 0.0, 1.0 };
-  return padd(pset1<Packet2d>(a), countdown);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fsub_d(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
-  EIGEN_MSA_DEBUG;
-
-  return a;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fmul_d(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fdiv_d(a, b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fmadd_d(c, a, b);
-}
-
-// Logical Operations are not supported for float, so we have to reinterpret casts using MSA
-// intrinsics
-template <>
-EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
-  EIGEN_MSA_DEBUG;
-
-  return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
-  EIGEN_MSA_DEBUG;
-
-#if EIGEN_FAST_MATH
-  // This prefers numbers to NaNs.
-  return __builtin_msa_fmin_d(a, b);
-#else
-  // This prefers NaNs to numbers.
-  v2i64 aNaN = __builtin_msa_fcun_d(a, a);
-  v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
-  return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
-#endif
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
-  EIGEN_MSA_DEBUG;
-
-#if EIGEN_FAST_MATH
-  // This prefers numbers to NaNs.
-  return __builtin_msa_fmax_d(a, b);
-#else
-  // This prefers NaNs to numbers.
-  v2i64 aNaN = __builtin_msa_fcun_d(a, a);
-  v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
-  return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
-#endif
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
-  EIGEN_MSA_DEBUG;
-
-  Packet2d value = { *from, *from };
-  return value;
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
-}
-
-template <>
-EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
-  EIGEN_MSA_DEBUG;
-
-  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
-  EIGEN_MSA_DEBUG;
-
-  Packet2d value;
-  value[0] = *from;
-  from += stride;
-  value[1] = *from;
-  return value;
-}
-
-template <>
-EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from,
-                                                         Index stride) {
-  EIGEN_MSA_DEBUG;
-
-  *to = from[0];
-  to += stride;
-  *to = from[1];
-}
-
-template <>
-EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
-  EIGEN_MSA_DEBUG;
-
-  __builtin_prefetch(addr);
-}
-
-template <>
-EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
-  EIGEN_MSA_DEBUG;
-
-  return a[0];
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
-  EIGEN_MSA_DEBUG;
-
-  return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
-}
-
-template <>
-EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
-  EIGEN_MSA_DEBUG;
-
-  Packet2d s = padd(a, preverse(a));
-  return s[0];
-}
-
-// Other reduction functions:
-// mul
-template <>
-EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
-  EIGEN_MSA_DEBUG;
-
-  Packet2d p = pmul(a, preverse(a));
-  return p[0];
-}
-
-// min
-template <>
-EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
-  EIGEN_MSA_DEBUG;
-
-#if EIGEN_FAST_MATH
-  Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
-  Packet2d v = __builtin_msa_fmin_d(a, swapped);
-  return v[0];
-#else
-  double a0 = a[0], a1 = a[1];
-  return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
-#endif
-}
-
-// max
-template <>
-EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
-  EIGEN_MSA_DEBUG;
-
-#if EIGEN_FAST_MATH
-  Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
-  Packet2d v = __builtin_msa_fmax_d(a, swapped);
-  return v[0];
-#else
-  double a0 = a[0], a1 = a[1];
-  return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
-#endif
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
-  EIGEN_MSA_DEBUG;
-
-  return __builtin_msa_fsqrt_d(a);
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
-  EIGEN_MSA_DEBUG;
-
-#if EIGEN_FAST_MATH
-  return __builtin_msa_frsqrt_d(a);
-#else
-  Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
-  return pdiv(ones, psqrt(a));
-#endif
-}
-
-inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
-  os << "[ " << value.packet[0] << "," << std::endl << "  " << value.packet[1] << " ]";
-  return os;
-}
-
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
-  EIGEN_MSA_DEBUG;
-
-  Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
-  Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
-  kernel.packet[0] = trn1;
-  kernel.packet[1] = trn2;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
-  Packet2d v = a;
-  int32_t old_mode, new_mode;
-  asm volatile(
-      "cfcmsa  %[old_mode], $1\n"
-      "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
-      "ctcmsa  $1, %[new_mode]\n"
-      "frint.d %w[v], %w[v]\n"
-      "ctcmsa  $1, %[old_mode]\n"
-      :  // outputs
-      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
-      [v] "+f"(v)
-      :  // inputs
-      :  // clobbers
-  );
-  return v;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
-  Packet2d v = a;
-  int32_t old_mode, new_mode;
-  asm volatile(
-      "cfcmsa  %[old_mode], $1\n"
-      "ori     %[new_mode], %[old_mode], 3\n"
-      "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
-      "ctcmsa  $1, %[new_mode]\n"
-      "frint.d %w[v], %w[v]\n"
-      "ctcmsa  $1, %[old_mode]\n"
-      :  // outputs
-      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
-      [v] "+f"(v)
-      :  // inputs
-      :  // clobbers
-  );
-  return v;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
-  Packet2d v = a;
-  int32_t old_mode, new_mode;
-  asm volatile(
-      "cfcmsa  %[old_mode], $1\n"
-      "ori     %[new_mode], %[old_mode], 3\n"
-      "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
-      "ctcmsa  $1, %[new_mode]\n"
-      "frint.d %w[v], %w[v]\n"
-      "ctcmsa  $1, %[old_mode]\n"
-      :  // outputs
-      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
-      [v] "+f"(v)
-      :  // inputs
-      :  // clobbers
-  );
-  return v;
-}
-
-template <>
-EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
-                                    const Packet2d& elsePacket) {
-  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
-  return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
-}
-
-}  // end namespace internal
-
-}  // end namespace Eigen
-
-#endif  // EIGEN_PACKET_MATH_MSA_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/Complex.h
index 8cd2a5ebe..306a309be 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/Complex.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/Complex.h
@@ -15,8 +15,7 @@ namespace Eigen {
 
 namespace internal {
 
-inline uint32x4_t p4ui_CONJ_XOR()
-{
+inline uint32x4_t p4ui_CONJ_XOR() {
 // See bug 1325, clang fails to call vld1q_u64.
 #if EIGEN_COMP_CLANG
   uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
@@ -27,134 +26,61 @@ inline uint32x4_t p4ui_CONJ_XOR()
 #endif
 }
 
-inline uint32x2_t p2ui_CONJ_XOR()
-{
+inline uint32x2_t p2ui_CONJ_XOR() {
   static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 };
   return vld1_u32( conj_XOR_DATA );
 }
 
 //---------- float ----------
-
-struct Packet1cf
-{
-  EIGEN_STRONG_INLINE Packet1cf() {}
-  EIGEN_STRONG_INLINE explicit Packet1cf(const Packet2f& a) : v(a) {}
-  Packet2f v;
-};
 struct Packet2cf
 {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
-  Packet4f v;
+  Packet4f  v;
 };
 
-template<> struct packet_traits<std::complex<float> > : default_packet_traits
+template<> struct packet_traits<std::complex<float> >  : default_packet_traits
 {
   typedef Packet2cf type;
-  typedef Packet1cf half;
-  enum
-  {
+  typedef Packet2cf half;
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 1,
+    HasHalfPacket = 0,
 
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasMul       = 1,
-    HasDiv       = 1,
-    HasNegate    = 1,
-    HasAbs       = 0,
-    HasAbs2      = 0,
-    HasMin       = 0,
-    HasMax       = 0,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 1,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet1cf>
-{
-  typedef std::complex<float> type;
-  typedef Packet1cf half;
-  enum
-  {
-    size = 1,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet2cf>
-{
-  typedef std::complex<float> type;
-  typedef Packet1cf half;
-  enum
-  {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
 
-template<> EIGEN_STRONG_INLINE Packet1cf pcast<float,Packet1cf>(const float& a)
-{ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pcast<Packet2f,Packet2cf>(const Packet2f& a)
-{ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pset1<Packet1cf>(const std::complex<float>& from)
-{ return Packet1cf(vld1_f32(reinterpret_cast<const float*>(&from))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from)
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
 {
-  const float32x2_t r64 = vld1_f32(reinterpret_cast<const float*>(&from));
+  float32x2_t r64;
+  r64 = vld1_f32((const float *)&from);
+
   return Packet2cf(vcombine_f32(r64, r64));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cf padd<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(padd<Packet2f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(padd<Packet4f>(a.v, b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf psub<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(psub<Packet2f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(psub<Packet4f>(a.v, b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate<Packet2f>(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a)
-{
-  const Packet2ui b = vreinterpret_u32_f32(a.v);
-  return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR())));
-}
 template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
 {
-  const Packet4ui b = vreinterpretq_u32_f32(a.v);
+  Packet4ui b = vreinterpretq_u32_f32(a.v);
   return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{
-  Packet2f v1, v2;
-
-  // Get the real values of a | a1_re | a1_re |
-  v1 = vdup_lane_f32(a.v, 0);
-  // Get the imag values of a | a1_im | a1_im |
-  v2 = vdup_lane_f32(a.v, 1);
-  // Multiply the real a with b
-  v1 = vmul_f32(v1, b.v);
-  // Multiply the imag a with b
-  v2 = vmul_f32(v2, b.v);
-  // Conjugate v2
-  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
-  // Swap real/imag elements in v2.
-  v2 = vrev64_f32(v2);
-  // Add and return the result
-  return Packet1cf(vadd_f32(v1, v2));
-}
 template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   Packet4f v1, v2;
@@ -167,7 +93,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
   v1 = vmulq_f32(v1, b.v);
   // Multiply the imag a with b
   v2 = vmulq_f32(v2, b.v);
-  // Conjugate v2
+  // Conjugate v2 
   v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64q_f32(v2);
@@ -175,144 +101,98 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
   return Packet2cf(vaddq_f32(v1, v2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b)
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // Compare real and imaginary parts of a and b to get the mask vector:
-  // [re(a[0])==re(b[0]), im(a[0])==im(b[0])]
-  Packet2f eq = pcmp_eq<Packet2f>(a.v, b.v);
-  // Swap real/imag elements in the mask in to get:
-  // [im(a[0])==im(b[0]), re(a[0])==re(b[0])]
-  Packet2f eq_swapped = vrev64_f32(eq);
-  // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
-  return Packet1cf(pand<Packet2f>(eq, eq_swapped));
+  return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
-  // Compare real and imaginary parts of a and b to get the mask vector:
-  // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])]
-  Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
-  // Swap real/imag elements in the mask in to get:
-  // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])]
-  Packet4f eq_swapped = vrev64q_f32(eq);
-  // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
-  return Packet2cf(pand<Packet4f>(eq, eq_swapped));
+  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
 }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pand<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf por<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
-template<> EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pxor<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pandnot<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{ return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); }
 template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload<Packet2f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(reinterpret_cast<const float*>(from))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf ploadu<Packet1cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu<Packet2f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(reinterpret_cast<const float*>(from))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf ploaddup<Packet1cf>(const std::complex<float>* from)
-{ return pset1<Packet1cf>(*from); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from)
-{ return pset1<Packet2cf>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
-{ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
-{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<float*>(to), from.v); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet1cf& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *to, const Packet2cf& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<float*>(to), from.v); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet1cf pgather<std::complex<float>, Packet1cf>(
-    const std::complex<float>* from, Index stride)
 {
-  const Packet2f tmp = vdup_n_f32(std::real(from[0*stride]));
-  return Packet1cf(vset_lane_f32(std::imag(from[0*stride]), tmp, 1));
+  return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
 }
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(
-    const std::complex<float>* from, Index stride)
+
+template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
 {
-  Packet4f res = vdupq_n_f32(std::real(from[0*stride]));
+  Packet4f res = pset1<Packet4f>(0.f);
+  res = vsetq_lane_f32(std::real(from[0*stride]), res, 0);
   res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1);
   res = vsetq_lane_f32(std::real(from[1*stride]), res, 2);
   res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3);
   return Packet2cf(res);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet1cf>(
-    std::complex<float>* to, const Packet1cf& from, Index stride)
-{ to[stride*0] = std::complex<float>(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1)); }
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(
-    std::complex<float>* to, const Packet2cf& from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
 {
   to[stride*0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
   to[stride*1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *addr)
-{ EIGEN_ARM_PREFETCH(reinterpret_cast<const float*>(addr)); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((const float *)addr); }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet1cf>(const Packet1cf& a)
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
 {
-  EIGEN_ALIGN16 std::complex<float> x;
-  vst1_f32(reinterpret_cast<float*>(&x), a.v);
-  return x;
-}
-template<> EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a)
-{
-  EIGEN_ALIGN16 std::complex<float> x[2];
-  vst1q_f32(reinterpret_cast<float*>(x), a.v);
+  std::complex<float> EIGEN_ALIGN16 x[2];
+  vst1q_f32((float *)x, a.v);
   return x[0];
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cf preverse(const Packet1cf& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cf pcplxflip<Packet1cf>(const Packet1cf& a)
-{ return Packet1cf(vrev64_f32(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
-{ return Packet2cf(vrev64q_f32(a.v)); }
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet1cf>(const Packet1cf& a)
 {
-  std::complex<float> s;
-  vst1_f32((float *)&s, a.v);
-  return s;
+  float32x2_t a_lo, a_hi;
+  Packet4f a_r128;
+
+  a_lo = vget_low_f32(a.v);
+  a_hi = vget_high_f32(a.v);
+  a_r128 = vcombine_f32(a_hi, a_lo);
+
+  return Packet2cf(a_r128);
 }
+
+template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
+{
+  return Packet2cf(vrev64q_f32(a.v));
+}
+
 template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
 {
+  float32x2_t a1, a2;
   std::complex<float> s;
-  vst1_f32(reinterpret_cast<float*>(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v)));
+
+  a1 = vget_low_f32(a.v);
+  a2 = vget_high_f32(a.v);
+  a2 = vadd_f32(a1, a2);
+  vst1_f32((float *)&s, a2);
+
   return s;
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet1cf>(const Packet1cf& a)
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
 {
-  std::complex<float> s;
-  vst1_f32((float *)&s, a.v);
-  return s;
+  Packet4f sum1, sum2, sum;
+
+  // Add the first two 64-bit float32x2_t of vecs[0]
+  sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
+  sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
+  sum = vaddq_f32(sum1, sum2);
+
+  return Packet2cf(sum);
 }
+
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
 {
   float32x2_t a1, a2, v1, v2, prod;
@@ -328,103 +208,80 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
   v1 = vmul_f32(v1, a2);
   // Multiply the imag a with b
   v2 = vmul_f32(v2, a2);
-  // Conjugate v2
+  // Conjugate v2 
   v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64_f32(v2);
   // Add v1, v2
   prod = vadd_f32(v1, v2);
 
-  vst1_f32(reinterpret_cast<float*>(&s), prod);
+  vst1_f32((float *)&s, prod);
 
   return s;
 }
 
-template<> struct conj_helper<Packet1cf,Packet1cf,false,true>
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
 {
-  EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const
-  { return internal::pmul(a, pconj(b)); }
+  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1)
+    {
+      first.v = vextq_f32(first.v, second.v, 2);
+    }
+  }
 };
 
-template<> struct conj_helper<Packet1cf,Packet1cf,true,false>
-{
-  EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const
-  { return internal::pmul(pconj(a), b); }
-};
-
-template<> struct conj_helper<Packet1cf,Packet1cf,true,true>
-{
-  EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const
-  { return pconj(internal::pmul(a,b)); }
-};
-
-template<> struct conj_helper<Packet2cf,Packet2cf,false,true>
+template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
 {
   EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
   { return padd(pmul(x,y),c); }
 
   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  { return internal::pmul(a, pconj(b)); }
+  {
+    return internal::pmul(a, pconj(b));
+  }
 };
 
-template<> struct conj_helper<Packet2cf,Packet2cf,true,false>
+template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
 {
   EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
   { return padd(pmul(x,y),c); }
 
   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  { return internal::pmul(pconj(a), b); }
+  {
+    return internal::pmul(pconj(a), b);
+  }
 };
 
-template<> struct conj_helper<Packet2cf,Packet2cf,true,true>
+template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
 {
   EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
   { return padd(pmul(x,y),c); }
 
   EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  { return pconj(internal::pmul(a,b)); }
+  {
+    return pconj(internal::pmul(a, b));
+  }
 };
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f)
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
 
-template<> EIGEN_STRONG_INLINE Packet1cf pdiv<Packet1cf>(const Packet1cf& a, const Packet1cf& b)
-{
-  // TODO optimize it for NEON
-  Packet1cf res = conj_helper<Packet1cf, Packet1cf, false, true>().pmul(a,b);
-  Packet2f s, rev_s;
-
-  // this computes the norm
-  s = vmul_f32(b.v, b.v);
-  rev_s = vrev64_f32(s);
-
-  return Packet1cf(pdiv<Packet2f>(res.v, vadd_f32(s, rev_s)));
-}
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
   // TODO optimize it for NEON
-  Packet2cf res = conj_helper<Packet2cf, Packet2cf, false, true>().pmul(a,b);
+  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
   Packet4f s, rev_s;
 
   // this computes the norm
   s = vmulq_f32(b.v, b.v);
   rev_s = vrev64q_f32(s);
 
-  return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s, rev_s)));
+  return Packet2cf(pdiv<Packet4f>(res.v, vaddq_f32(s,rev_s)));
 }
 
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1cf, 1>& /*kernel*/) {}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel)
-{
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2cf,2>& kernel) {
   Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
   kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
   kernel.packet[1].v = tmp;
@@ -452,8 +309,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 {
   typedef Packet1cd type;
   typedef Packet1cd half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
@@ -472,49 +328,24 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet1cd>
-{
-  typedef std::complex<double> type;
-  enum
-  {
-    size=1,
-    alignment=Aligned16,
-    vectorizable=true,
-    masked_load_available=false,
-    masked_store_available=false
-  };
-  typedef Packet1cd half;
-};
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
 
-template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>(reinterpret_cast<const double*>(from))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(reinterpret_cast<const double*>(from))); }
+template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
+{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from)
-{
-  /* here we really have to use unaligned loads :( */
-  return ploadu<Packet1cd>(&from);
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(padd<Packet2d>(a.v, b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(psub<Packet2d>(a.v, b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a)
-{ return Packet1cd(pnegate<Packet2d>(a.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
-{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
+template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd<Packet2d>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub<Packet2d>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate<Packet2d>(a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); }
 
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   Packet2d v1, v2;
 
-  // Get the real values of a
+  // Get the real values of a 
   v1 = vdupq_lane_f64(vget_low_f64(a.v), 0);
   // Get the imag values of a
   v2 = vdupq_lane_f64(vget_high_f64(a.v), 0);
@@ -522,7 +353,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
   v1 = vmulq_f64(v1, b.v);
   // Multiply the imag a with b
   v2 = vmulq_f64(v2, b.v);
-  // Conjugate v2
+  // Conjugate v2 
   v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR));
   // Swap real/imag elements in v2.
   v2 = preverse<Packet2d>(v2);
@@ -530,44 +361,31 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
   return Packet1cd(vaddq_f64(v1, v2));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
-  // Compare real and imaginary parts of a and b to get the mask vector:
-  // [re(a)==re(b), im(a)==im(b)]
-  Packet2d eq = pcmp_eq<Packet2d>(a.v, b.v);
-  // Swap real/imag elements in the mask in to get:
-  // [im(a)==im(b), re(a)==re(b)]
-  Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq)));
-  // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
-  return Packet1cd(pand<Packet2d>(eq, eq_swapped));
+  return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
+}
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ARM_PREFETCH((const double *)addr); }
 
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); }
-
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from)
-{ return pset1<Packet1cd>(*from); }
-
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
-{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast<double*>(to), from.v); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *to, const Packet1cd& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), from.v); }
-
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *addr)
-{ EIGEN_ARM_PREFETCH(reinterpret_cast<const double*>(addr)); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(
-    const std::complex<double>* from, Index stride)
+template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride)
 {
   Packet2d res = pset1<Packet2d>(0.0);
   res = vsetq_lane_f64(std::real(from[0*stride]), res, 0);
@@ -575,14 +393,17 @@ template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Pack
   return Packet1cd(res);
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(
-    std::complex<double>* to, const Packet1cd& from, Index stride)
-{ to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); }
-
-template<> EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a)
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride)
 {
-  EIGEN_ALIGN16 std::complex<double> res;
+  to[stride*0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
+}
+
+
+template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
+{
+  std::complex<double> EIGEN_ALIGN16 res;
   pstore<std::complex<double> >(&res, a);
+
   return res;
 }
 
@@ -590,15 +411,29 @@ template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a
 
 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs) { return vecs[0]; }
+
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) { return pfirst(a); }
 
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // FIXME: are we sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> occupies 16 bytes, it is not necessarily aligned on a 16-byte boundary...
+  }
+};
+
 template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
 {
   EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
   { return padd(pmul(x,y),c); }
 
   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  { return internal::pmul(a, pconj(b)); }
+  {
+    return internal::pmul(a, pconj(b));
+  }
 };
 
 template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
@@ -607,7 +442,9 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
   { return padd(pmul(x,y),c); }
 
   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  { return internal::pmul(pconj(a), b); }
+  {
+    return internal::pmul(pconj(a), b);
+  }
 };
 
 template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
@@ -616,7 +453,9 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
   { return padd(pmul(x,y),c); }
 
   EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  { return pconj(internal::pmul(a,b)); }
+  {
+    return pconj(internal::pmul(a, b));
+  }
 };
 
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
@@ -632,7 +471,9 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, con
 }
 
 EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{ return Packet1cd(preverse(Packet2d(x.v))); }
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
 
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
 {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/MathFunctions.h
index 1c025618e..6bb05bb92 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/MathFunctions.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -5,6 +5,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
 #ifndef EIGEN_MATH_FUNCTIONS_NEON_H
 #define EIGEN_MATH_FUNCTIONS_NEON_H
 
@@ -12,31 +16,73 @@ namespace Eigen {
 
 namespace internal {
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pexp<Packet2f>(const Packet2f& x)
-{ return pexp_float(x); }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp<Packet4f>(const Packet4f& x)
-{ return pexp_float(x); }
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet4f pexp<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  Packet4f tmp, fx;
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f plog<Packet2f>(const Packet2f& x)
-{ return plog_float(x); }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog<Packet4f>(const Packet4f& x)
-{ return plog_float(x); }
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f psin<Packet2f>(const Packet2f& x)
-{ return psin_float(x); }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin<Packet4f>(const Packet4f& x)
-{ return psin_float(x); }
+  x = vminq_f32(x, p4f_exp_hi);
+  x = vmaxq_f32(x, p4f_exp_lo);
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pcos<Packet2f>(const Packet2f& x)
-{ return pcos_float(x); }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos<Packet4f>(const Packet4f& x)
-{ return pcos_float(x); }
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF);
 
-// Hyperbolic Tangent function.
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f ptanh<Packet2f>(const Packet2f& x)
-{ return internal::generic_fast_tanh_float(x); }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh<Packet4f>(const Packet4f& x)
-{ return internal::generic_fast_tanh_float(x); }
+  /* perform a floorf */
+  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+
+  /* if greater, subtract 1 */
+  Packet4ui mask = vcgtq_f32(tmp, fx);
+  mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1));
+
+  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+
+  tmp = vmulq_f32(fx, p4f_cephes_exp_C1);
+  Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2);
+  x = vsubq_f32(x, tmp);
+  x = vsubq_f32(x, z);
+
+  Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x);
+  z = vmulq_f32(x, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p1);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p2);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p3);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p4);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, p4f_cephes_exp_p5);
+
+  y = vmulq_f32(y, z);
+  y = vaddq_f32(y, x);
+  y = vaddq_f32(y, p4f_1);
+
+  /* build 2^n */
+  int32x4_t mm;
+  mm = vcvtq_s32_f32(fx);
+  mm = vaddq_s32(mm, p4i_0x7f);
+  mm = vshlq_n_s32(mm, 23);
+  Packet4f pow2n = vreinterpretq_f32_s32(mm);
+
+  y = vmulq_f32(y, pow2n);
+  return y;
+}
 
 } // end namespace internal
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/PacketMath.h
index ee5a938b9..3d5ed0d24 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -32,7 +32,7 @@ namespace internal {
 #if EIGEN_ARCH_ARM64
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
 #else
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 
 #endif
 #endif
 
@@ -42,45 +42,34 @@ namespace internal {
 // are aliases to the same underlying type __n128.
 // We thus have to wrap them to make them different C++ types.
 // (See also bug 1428)
-typedef eigen_packet_wrapper<float32x2_t,0>  Packet2f;
-typedef eigen_packet_wrapper<float32x4_t,1>  Packet4f;
-typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
-typedef eigen_packet_wrapper<int8x8_t   ,3>  Packet8c;
-typedef eigen_packet_wrapper<int8x16_t  ,4>  Packet16c;
-typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
-typedef eigen_packet_wrapper<uint8x8_t  ,6>  Packet8uc;
-typedef eigen_packet_wrapper<uint8x16_t ,7>  Packet16uc;
-typedef eigen_packet_wrapper<int16x4_t  ,8>  Packet4s;
-typedef eigen_packet_wrapper<int16x8_t  ,9>  Packet8s;
-typedef eigen_packet_wrapper<uint16x4_t ,10> Packet4us;
-typedef eigen_packet_wrapper<uint16x8_t ,11> Packet8us;
-typedef eigen_packet_wrapper<int32x2_t  ,12> Packet2i;
-typedef eigen_packet_wrapper<int32x4_t  ,13> Packet4i;
-typedef eigen_packet_wrapper<uint32x2_t ,14> Packet2ui;
-typedef eigen_packet_wrapper<uint32x4_t ,15> Packet4ui;
-typedef eigen_packet_wrapper<int64x2_t  ,16> Packet2l;
-typedef eigen_packet_wrapper<uint64x2_t ,17> Packet2ul;
+
+template<typename T,int unique_id>
+struct eigen_packet_wrapper
+{
+  operator T&() { return m_val; }
+  operator const T&() const { return m_val; }
+  eigen_packet_wrapper() {}
+  eigen_packet_wrapper(const T &v) : m_val(v) {}
+  eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+
+  T m_val;
+};
+typedef eigen_packet_wrapper<float32x2_t,0> Packet2f;
+typedef eigen_packet_wrapper<float32x4_t,1> Packet4f;
+typedef eigen_packet_wrapper<int32x4_t  ,2> Packet4i;
+typedef eigen_packet_wrapper<int32x2_t  ,3> Packet2i;
+typedef eigen_packet_wrapper<uint32x4_t ,4> Packet4ui;
 
 #else
 
-typedef float32x2_t                          Packet2f;
-typedef float32x4_t                          Packet4f;
-typedef eigen_packet_wrapper<int32_t    ,2>  Packet4c;
-typedef int8x8_t                             Packet8c;
-typedef int8x16_t                            Packet16c;
-typedef eigen_packet_wrapper<uint32_t   ,5>  Packet4uc;
-typedef uint8x8_t                            Packet8uc;
-typedef uint8x16_t                           Packet16uc;
-typedef int16x4_t                            Packet4s;
-typedef int16x8_t                            Packet8s;
-typedef uint16x4_t                           Packet4us;
-typedef uint16x8_t                           Packet8us;
-typedef int32x2_t                            Packet2i;
-typedef int32x4_t                            Packet4i;
-typedef uint32x2_t                           Packet2ui;
-typedef uint32x4_t                           Packet4ui;
-typedef int64x2_t                            Packet2l;
-typedef uint64x2_t                           Packet2ul;
+typedef float32x2_t Packet2f;
+typedef float32x4_t Packet4f;
+typedef int32x4_t   Packet4i;
+typedef int32x2_t   Packet2i;
+typedef uint32x4_t  Packet4ui;
 
 #endif // EIGEN_COMP_MSVC
 
@@ -109,809 +98,81 @@ typedef uint64x2_t                           Packet2ul;
   #define EIGEN_ARM_PREFETCH(ADDR)
 #endif
 
-template <>
-struct packet_traits<float> : default_packet_traits
+template<> struct packet_traits<float>  : default_packet_traits
 {
   typedef Packet4f type;
-  typedef Packet2f half;
-  enum
-  {
+  typedef Packet4f half; // Packet2f intrinsics not implemented yet
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 4,
-    HasHalfPacket = 1,
-
-    HasCast      = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
-    HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
-
-    HasDiv   = 1,
-    HasFloor = 1,
-
-    HasSin  = EIGEN_FAST_MATH,
-    HasCos  = EIGEN_FAST_MATH,
-    HasLog  = 1,
+    HasHalfPacket=0, // Packet2f intrinsics not implemented yet
+   
+    HasDiv  = 1,
+    // FIXME check the Has*
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
     HasExp  = 1,
-    HasSqrt = 0,
-    HasTanh = EIGEN_FAST_MATH,
-    HasErf  = EIGEN_FAST_MATH
+    HasSqrt = 0
   };
 };
-
-template <>
-struct packet_traits<int8_t> : default_packet_traits
-{
-  typedef Packet16c type;
-  typedef Packet8c half;
-  enum
-  {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 16,
-    HasHalfPacket = 1,
-
-    HasCast      = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasAbsDiff   = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
-    HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
-  };
-};
-
-template <>
-struct packet_traits<uint8_t> : default_packet_traits
-{
-  typedef Packet16uc type;
-  typedef Packet8uc half;
-  enum
-  {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 16,
-    HasHalfPacket = 1,
-
-    HasCast      = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 0,
-    HasAbs       = 1,
-    HasAbsDiff   = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
-    HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
-
-    HasSqrt = 1
-  };
-};
-
-template <>
-struct packet_traits<int16_t> : default_packet_traits
-{
-  typedef Packet8s type;
-  typedef Packet4s half;
-  enum
-  {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-    HasHalfPacket = 1,
-
-    HasCast      = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasAbsDiff   = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
-    HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
-  };
-};
-
-template <>
-struct packet_traits<uint16_t> : default_packet_traits
-{
-  typedef Packet8us type;
-  typedef Packet4us half;
-  enum
-  {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 8,
-    HasHalfPacket = 1,
-
-    HasCast      = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 0,
-    HasAbs       = 0,
-    HasAbsDiff   = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
-    HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
-
-    HasSqrt = 1
-  };
-};
-
-template <>
-struct packet_traits<int32_t> : default_packet_traits
+template<> struct packet_traits<int32_t>    : default_packet_traits
 {
   typedef Packet4i type;
-  typedef Packet2i half;
-  enum
-  {
+  typedef Packet4i half; // Packet2i intrinsics not implemented yet
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size = 4,
-    HasHalfPacket = 1,
-
-    HasCast      = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
-    HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
+    size=4,
+    HasHalfPacket=0 // Packet2i intrinsics not implemented yet
+    // FIXME check the Has*
   };
 };
 
-template <>
-struct packet_traits<uint32_t> : default_packet_traits
-{
-  typedef Packet4ui type;
-  typedef Packet2ui half;
-  enum
-  {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
-    HasHalfPacket = 1,
-
-    HasCast      = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 0,
-    HasAbs       = 0,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
-    HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
-
-    HasSqrt = 1
-  };
-};
-
-template <>
-struct packet_traits<int64_t> : default_packet_traits
-{
-  typedef Packet2l type;
-  typedef Packet2l half;
-  enum
-  {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 2,
-    HasHalfPacket = 1,
-
-    HasCast      = 1,
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
-    HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
-  };
-};
-
-template <>
-struct packet_traits<uint64_t> : default_packet_traits
-{
-  typedef Packet2ul type;
-  typedef Packet2ul half;
-  enum
-  {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 2,
-    HasHalfPacket = 1,
-
-    HasCast      = 1,
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 0,
-    HasAbs       = 0,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
-    HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
-  };
-};
-
-#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM
+#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM
 // workaround gcc 4.2, 4.3 and 4.4 compilatin issue
 EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
-EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); }
-EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
-EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
-EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
+EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); }
+EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
+EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
 #endif
 
-template<> struct unpacket_traits<Packet2f>
-{
-  typedef float type;
-  typedef Packet2f half;
-  typedef Packet2i integer_packet;
-  enum
-  {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet4f>
-{
-  typedef float type;
-  typedef Packet2f half;
-  typedef Packet4i integer_packet;
-  enum
-  {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet4c>
-{
-  typedef int8_t type;
-  typedef Packet4c half;
-  enum
-  {
-    size = 4,
-    alignment = Unaligned,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet8c>
-{
-  typedef int8_t type;
-  typedef Packet4c half;
-  enum
-  {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet16c>
-{
-  typedef int8_t type;
-  typedef Packet8c half;
-  enum
-  {
-    size = 16,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet4uc>
-{
-  typedef uint8_t type;
-  typedef Packet4uc half;
-  enum
-  {
-    size = 4,
-    alignment = Unaligned,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet8uc>
-{
-  typedef uint8_t type;
-  typedef Packet4uc half;
-  enum
-  {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet16uc>
-{
-  typedef uint8_t type;
-  typedef Packet8uc half;
-  enum
-  {
-    size = 16,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false};
-};
-template<> struct unpacket_traits<Packet4s>
-{
-  typedef int16_t type;
-  typedef Packet4s half;
-  enum
-  {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet8s>
-{
-  typedef int16_t type;
-  typedef Packet4s half;
-  enum
-  {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet4us>
-{
-  typedef uint16_t type;
-  typedef Packet4us half;
-  enum
-  {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet8us>
-{
-  typedef uint16_t type;
-  typedef Packet4us half;
-  enum
-  {
-    size = 8,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet2i>
-{
-  typedef int32_t type;
-  typedef Packet2i half;
-  enum
-  {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet4i>
-{
-  typedef int32_t type;
-  typedef Packet2i half;
-  enum
-  {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet2ui>
-{
-  typedef uint32_t type;
-  typedef Packet2ui half;
-  enum
-  {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet4ui>
-{
-  typedef uint32_t type;
-  typedef Packet2ui half;
-  enum
-  {
-    size = 4,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet2l>
-{
-  typedef int64_t type;
-  typedef Packet2l half;
-  enum
-  {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
-template<> struct unpacket_traits<Packet2ul>
-{
-  typedef uint64_t type;
-  typedef Packet2ul half;
-  enum
-  {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+template<> struct unpacket_traits<Packet4f> { typedef float   type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet4i> { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
 
-template<> EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) { return vdup_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from)
-{ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) { return vdup_n_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) { return vdupq_n_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from)
-{ return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) { return vdup_n_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) { return vdupq_n_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) { return vdup_n_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) { return vdupq_n_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) { return vdup_n_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) { return vdupq_n_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) { return vdup_n_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) { return vdupq_n_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) { return vdup_n_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) { return vdupq_n_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) { return vdupq_n_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) { return vdupq_n_u64(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); } // broadcast `from` into every float lane
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t&    from)   { return vdupq_n_s32(from); } // broadcast `from` into every int32 lane
 
-template<> EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(unsigned int from)
-{ return vreinterpret_f32_u32(vdup_n_u32(from)); }
-template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from)
-{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); }
-
-template<> EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a)
-{
-  const float c[] = {0.0f,1.0f};
-  return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
-}
 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)
 {
-  const float c[] = {0.0f,1.0f,2.0f,3.0f};
-  return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
-}
-template<> EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a)
-{
-  const int8_t c[] = {0,1,2,3,4,5,6,7};
-  return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
-}
-template<> EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a)
-{
-  const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-  return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
-}
-template<> EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a)
-{
-  const uint8_t c[] = {0,1,2,3,4,5,6,7};
-  return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
-}
-template<> EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a)
-{
-  const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
-  return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
-}
-template<> EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a)
-{
-  const int16_t c[] = {0,1,2,3};
-  return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
-}
-template<> EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a)
-{
-  const uint16_t c[] = {0,1,2,3};
-  return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
-}
-template<> EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a)
-{
-  const int16_t c[] = {0,1,2,3,4,5,6,7};
-  return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
-}
-template<> EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a)
-{
-  const uint16_t c[] = {0,1,2,3,4,5,6,7};
-  return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
-}
-template<> EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a)
-{
-  const int32_t c[] = {0,1};
-  return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
+  const float lane_index[] = {0.0f, 1.0f, 2.0f, 3.0f};  // ascending per-lane offsets 0..3
+  const Packet4f offsets = vld1q_f32(lane_index);
+  return vaddq_f32(pset1<Packet4f>(a), offsets);        // yields {a, a+1, a+2, a+3}
 }
 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a)
 {
-  const int32_t c[] = {0,1,2,3};
-  return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
-}
-template<> EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a)
-{
-  const uint32_t c[] = {0,1};
-  return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
-}
-template<> EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a)
-{
-  const uint32_t c[] = {0,1,2,3};
-  return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
-}
-template<> EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a)
-{
-  const int64_t c[] = {0,1};
-  return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
-}
-template<> EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a)
-{
-  const uint64_t c[] = {0,1};
-  return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
+  const int32_t lane_index[] = {0, 1, 2, 3};            // ascending per-lane offsets 0..3
+  const Packet4i offsets = vld1q_s32(lane_index);
+  return vaddq_s32(pset1<Packet4i>(a), offsets);        // yields {a, a+1, a+2, a+3}
 }
 
-template<> EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vsub_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vsub_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
-#if EIGEN_ARCH_ARM64
-  return vnegq_s64(a);
-#else
-  return vcombine_s64(
-      vdup_n_s64(-vgetq_lane_s64(a, 0)),
-      vdup_n_s64(-vgetq_lane_s64(a, 1)));
-#endif
-}
 
-template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; }
 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { return a; }
 
-template<> EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vmul_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vmul_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b)
-{
-#if EIGEN_ARCH_ARM64
-  return vdiv_f32(a,b);
-#else
-  Packet2f inv, restep, div;
-
-  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
-  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
-  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
-  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
-  // Newton-Raphson and vrecpsq_f32()
-  inv = vrecpe_f32(b);
-
-  // This returns a differential, by which we will have to multiply inv to get a better
-  // approximation of 1/b.
-  restep = vrecps_f32(b, inv);
-  inv = vmul_f32(restep, inv);
-
-  // Finally, multiply a by 1/b and get the wanted result of the division.
-  div = vmul_f32(a, inv);
-
-  return div;
-#endif
-}
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
 #if EIGEN_ARCH_ARM64
@@ -938,86 +199,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet4c>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet8c>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet16c>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet4uc>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet8uc>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet16uc>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet4s>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet8s>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet4us>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet8us>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet2i>(0);
-}
 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
+{ eigen_assert(false && "packet integer division are not supported by NEON"); // unconditional assert: this specialization is a stub, not a real division
   return pset1<Packet4i>(0);
 }
-template<> EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet2ui>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet4ui>(0);
-}
-template<> EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet2l>(0LL);
-}
-template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/)
-{
-  eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet2ul>(0ULL);
-}
 
 // Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available,
 // then implements a slow software scalar fallback calling fmaf()!
@@ -1030,11 +215,9 @@ template<> EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/,
 // MLA is not fused i.e. does 2 roundings.
 // In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4):
 // MLA: 10 GFlop/s ; FMA: 12 GFlops/s.
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{ return vfmaq_f32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } // FMA path: a*b + c per lane; note the accumulator c is the first intrinsic argument
 #else
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
-{
+template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
 #if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM
   // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu,
   // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on
@@ -1058,2112 +241,316 @@ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f&
 #endif
 
 // No FMA instruction for int, so use MLA unconditionally.
-template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vmla_s8(
-      vreinterpret_s8_s32(vdup_n_s32(c)),
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c)
-{ return vmla_s8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c)
-{ return vmlaq_s8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vmla_u8(
-      vreinterpret_u8_u32(vdup_n_u32(c)),
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c)
-{ return vmla_u8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c)
-{ return vmlaq_u8(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c)
-{ return vmla_s16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c)
-{ return vmlaq_s16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c)
-{ return vmla_u16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c)
-{ return vmlaq_u16(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c)
-{ return vmla_s32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c)
-{ return vmlaq_s32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c)
-{ return vmla_u32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c)
-{ return vmlaq_u32(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } // MLA: a*b + c per lane; the accumulator c is the first intrinsic argument
 
-template<> EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vabd_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vabdq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vabd_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vabd_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vabdq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vabd_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vabd_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vabdq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vabd_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vabdq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vabd_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vabdq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vabd_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vabdq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vabd_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vabdq_u32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vmin_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vmin_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
-  return vcombine_s64(
-      vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
-      vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
-}
-template<> EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
-  return vcombine_u64(
-      vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
-      vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
-}
 
-template<> EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vmax_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vmax_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
-  return vcombine_s64(
-      vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
-      vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
-}
-template<> EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
-  return vcombine_u64(
-      vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
-      vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vcle_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_u8(vcle_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vreinterpret_s8_u8(vcle_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vreinterpretq_s8_u8(vcleq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vcle_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vcle_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vcleq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vreinterpret_s16_u16(vcle_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vreinterpretq_s16_u16(vcleq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vcle_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vcleq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vreinterpret_s32_u32(vcle_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vcle_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vcleq_u32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vclt_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_u8(vclt_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vreinterpret_s8_u8(vclt_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vreinterpretq_s8_u8(vcltq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vclt_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vclt_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vcltq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vreinterpret_s16_u16(vclt_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vreinterpretq_s16_u16(vcltq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vclt_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vcltq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vreinterpret_s32_u32(vclt_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vclt_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vcltq_u32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vceq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b)
-{
-  return vget_lane_s32(vreinterpret_s32_u8(vceq_s8(
-      vreinterpret_s8_s32(vdup_n_s32(a)),
-      vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vreinterpret_s8_u8(vceq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vreinterpretq_s8_u8(vceqq_s8(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{
-  return vget_lane_u32(vreinterpret_u32_u8(vceq_u8(
-      vreinterpret_u8_u32(vdup_n_u32(a)),
-      vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vceq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vceqq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vreinterpret_s16_u16(vceq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vreinterpretq_s16_u16(vceqq_s16(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vceq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vceqq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vreinterpret_s32_u32(vceq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vceq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vceqq_u32(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a)
-{
-  const Packet2f cst_1 = pset1<Packet2f>(1.0f);
-  /* perform a floorf */
-  Packet2f tmp = vcvt_f32_s32(vcvt_s32_f32(a));
-
-  /* if greater, substract 1 */
-  Packet2ui mask = vcgt_f32(tmp, a);
-  mask = vand_u32(mask, vreinterpret_u32_f32(cst_1));
-  return vsub_f32(tmp, vreinterpret_f32_u32(mask));
-}
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
-  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  /* perform a floorf */
-  Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a));
-
-  /* if greater, substract 1 */
-  Packet4ui mask = vcgtq_f32(tmp, a);
-  mask = vandq_u32(mask, vreinterpretq_u32_f32(cst_1));
-  return vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
-}
 
 // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
-template<> EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a & b; }
-template<> EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return vand_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vandq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a & b; }
-template<> EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vand_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vandq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vand_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vandq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) { return vand_s32(a,b); }
+{
+  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vand_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vandq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return vandq_u64(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a | b; }
-template<> EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return vorrq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a | b; }
-template<> EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vorr_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vorrq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vorr_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vorrq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vorr_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vorrq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); }
+{
+  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vorr_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vorrq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b)
-{ return vorrq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return vorrq_u64(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a ^ b; }
-template<> EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b)
-{ return veor_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b)
-{ return veorq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a ^ b; }
-template<> EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return veor_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return veorq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return veor_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return veorq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); }
+{
+  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+}
 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return veor_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return veorq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b)
-{ return veorq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return veorq_u64(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b)
-{ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); }
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
-{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); }
-template<> EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b)
-{ return a & ~b; }
-template<> EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b)
-{ return a & ~b; }
-template<> EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b)
-{ return vbic_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b)
-{ return vbicq_u8(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b)
-{ return vbic_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b)
-{ return vbicq_s16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b)
-{ return vbic_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b)
-{ return vbicq_u16(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b)
-{ return vbic_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b)
-{ return vbicq_s32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b)
-{ return vbic_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b)
-{ return vbicq_u32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b)
-{ return vbicq_s64(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
-{ return vbicq_u64(a,b); }
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pnot<Packet2f>(const Packet2f& a)
-{ return vreinterpret_f32_u32(vmvn_u32(vreinterpret_u32_f32(a))); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pnot<Packet4f>(const Packet4f& a)
-{ return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(a))); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pnot<Packet4c>(const Packet4c& a)
-{ return ~a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pnot<Packet8c>(const Packet8c& a)
-{ return vmvn_s8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pnot<Packet16c>(const Packet16c& a)
-{ return vmvnq_s8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pnot<Packet4uc>(const Packet4uc& a)
-{ return ~a; }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pnot<Packet8uc>(const Packet8uc& a)
-{ return vmvn_u8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pnot<Packet16uc>(const Packet16uc& a)
-{ return vmvnq_u8(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pnot<Packet4s>(const Packet4s& a)
-{ return vmvn_s16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pnot<Packet8s>(const Packet8s& a)
-{ return vmvnq_s16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pnot<Packet4us>(const Packet4us& a)
-{ return vmvn_u16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pnot<Packet8us>(const Packet8us& a)
-{ return vmvnq_u16(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pnot<Packet2i>(const Packet2i& a)
-{ return vmvn_s32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pnot<Packet4i>(const Packet4i& a)
-{ return vmvnq_s32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pnot<Packet2ui>(const Packet2ui& a)
-{ return vmvn_u32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pnot<Packet4ui>(const Packet4ui& a)
-{ return vmvnq_u32(a); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pnot<Packet2l>(const Packet2l& a)
-{ return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a))); }
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pnot<Packet2ul>(const Packet2ul& a)
-{ return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(a))); }
-
-template<int N> EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
-
-template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a)
-{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a)
-{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a)
-{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a)
-{ return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a)
-{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a)
-{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a)
-{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); }
-template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); }
-
-template<int N> EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); }
-template<int N> EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from)
 {
-  Packet4c res;
-  memcpy(&res, from, sizeof(Packet4c));
-  return res;
+  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
 }
-template<> EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from)
-{
-  Packet4uc res;
-  memcpy(&res, from, sizeof(Packet4uc));
-  return res;
-}
-template<> EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from)
-{
-  Packet4c res;
-  memcpy(&res, from, sizeof(Packet4c));
-  return res;
-}
-template<> EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); }
-template<> EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from)
-{
-  Packet4uc res;
-  memcpy(&res, from, sizeof(Packet4uc));
-  return res;
-}
-template<> EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); }
-template<> EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); }
-template<> EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); }
-template<> EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); }
+template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*    from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t*  from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*   from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
 
-template<> EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from)
-{ return vld1_dup_f32(from); }
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from)
 {
-  const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
-  return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0);
+  float32x2_t lo, hi;             // the two 64-bit halves of the result
+  lo = vld1_dup_f32(from);        // from[0] duplicated into both lanes of lo
+  hi = vld1_dup_f32(from+1);      // from[1] duplicated into both lanes of hi
+  return vcombine_f32(lo, hi);    // result lanes: {from[0], from[0], from[1], from[1]}
 }
-template<> EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from)
-{
-  const int8x8_t a = vld1_s8(from);
-  return vzip_s8(a,a).val[0];
-}
-template<> EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from)
-{
-  const int8x8_t a = vld1_s8(from);
-  const int8x8x2_t b = vzip_s8(a,a);
-  return vcombine_s8(b.val[0], b.val[1]);
-}
-template<> EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from)
-{
-  const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
-  return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from)
-{
-  const uint8x8_t a = vld1_u8(from);
-  return vzip_u8(a,a).val[0];
-}
-template<> EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from)
-{
-  const uint8x8_t a = vld1_u8(from);
-  const uint8x8x2_t b = vzip_u8(a,a);
-  return vcombine_u8(b.val[0], b.val[1]);
-}
-template<> EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from)
-{
-  return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)),
-      vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]);
-}
-template<> EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from)
-{
-  const int16x4_t a = vld1_s16(from);
-  const int16x4x2_t b = vzip_s16(a,a);
-  return vcombine_s16(b.val[0], b.val[1]);
-}
-template<> EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from)
-{
-  return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)),
-      vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]);
-}
-template<> EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from)
-{
-  const uint16x4_t a = vld1_u16(from);
-  const uint16x4x2_t b = vzip_u16(a,a);
-  return vcombine_u16(b.val[0], b.val[1]);
-}
-template<> EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from)
-{ return vld1_dup_s32(from); }
 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from)
-{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from)
-{ return vld1_dup_u32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from)
-{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from)
-{ return vld1q_dup_s64(from); }
-template<> EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from)
-{ return vld1q_dup_u64(from); }
+{                                 // reads exactly two elements: from[0] and from[1]
+  int32x2_t lo, hi;               // the two 64-bit halves of the result
+  lo = vld1_dup_s32(from);        // from[0] duplicated into both lanes of lo
+  hi = vld1_dup_s32(from+1);      // from[1] duplicated into both lanes of hi
+  return vcombine_s32(lo, hi);    // result lanes: {from[0], from[0], from[1], from[1]}
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) { return vld1q_dup_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from)
-{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from)
-{
-  return vreinterpret_s8_u32(vzip_u32(
-      vreinterpret_u32_s8(vld1_dup_s8(from)),
-      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
-}
-template<> EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from)
-{
-  const int8x8_t a = vreinterpret_s8_u32(vzip_u32(
-      vreinterpret_u32_s8(vld1_dup_s8(from)),
-      vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]);
-  const int8x8_t b = vreinterpret_s8_u32(vzip_u32(
-      vreinterpret_u32_s8(vld1_dup_s8(from+2)),
-      vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]);
-  return vcombine_s8(a,b);
-}
-template<> EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from)
-{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from)
-{
-  return vreinterpret_u8_u32(vzip_u32(
-      vreinterpret_u32_u8(vld1_dup_u8(from)),
-      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
-}
-template<> EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from)
-{
-  const uint8x8_t a = vreinterpret_u8_u32(vzip_u32(
-      vreinterpret_u32_u8(vld1_dup_u8(from)),
-      vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]);
-  const uint8x8_t b = vreinterpret_u8_u32(vzip_u32(
-      vreinterpret_u32_u8(vld1_dup_u8(from+2)),
-      vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]);
-  return vcombine_u8(a,b);
-}
-template<> EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from)
-{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from)
-{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) { return vld1q_dup_s32(from); }
-template<> EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) { return vld1q_dup_u32(from); }
+template<> EIGEN_STRONG_INLINE void pstore<float>  (float*    to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }  // same vst1q_f32 intrinsic as pstoreu<float>; only the debug hook macro differs
+template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t*  to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }  // same vst1q_s32 intrinsic as pstoreu<int32_t>; only the debug hook macro differs
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); }
-template<> EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>  (float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }  // runs the unaligned-store debug hook, then writes the four lanes of from starting at to
+template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }  // runs the unaligned-store debug hook, then writes the four lanes of from starting at to
 
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from)
-{ memcpy(to, &from, sizeof(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet2f pgather<float, Packet2f>(const float* from, Index stride)
-{
-  Packet2f res = vld1_dup_f32(from);
-  res = vld1_lane_f32(from + 1*stride, res, 1);
-  return res;
-}
 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
-  Packet4f res = vld1q_dup_f32(from);
-  res = vld1q_lane_f32(from + 1*stride, res, 1);
-  res = vld1q_lane_f32(from + 2*stride, res, 2);
-  res = vld1q_lane_f32(from + 3*stride, res, 3);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride)
-{
-  Packet4c res;
-  for (int i = 0; i != 4; i++)
-    reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride)
-{
-  Packet8c res = vld1_dup_s8(from);
-  res = vld1_lane_s8(from + 1*stride, res, 1);
-  res = vld1_lane_s8(from + 2*stride, res, 2);
-  res = vld1_lane_s8(from + 3*stride, res, 3);
-  res = vld1_lane_s8(from + 4*stride, res, 4);
-  res = vld1_lane_s8(from + 5*stride, res, 5);
-  res = vld1_lane_s8(from + 6*stride, res, 6);
-  res = vld1_lane_s8(from + 7*stride, res, 7);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride)
-{
-  Packet16c res = vld1q_dup_s8(from);
-  res = vld1q_lane_s8(from + 1*stride, res, 1);
-  res = vld1q_lane_s8(from + 2*stride, res, 2);
-  res = vld1q_lane_s8(from + 3*stride, res, 3);
-  res = vld1q_lane_s8(from + 4*stride, res, 4);
-  res = vld1q_lane_s8(from + 5*stride, res, 5);
-  res = vld1q_lane_s8(from + 6*stride, res, 6);
-  res = vld1q_lane_s8(from + 7*stride, res, 7);
-  res = vld1q_lane_s8(from + 8*stride, res, 8);
-  res = vld1q_lane_s8(from + 9*stride, res, 9);
-  res = vld1q_lane_s8(from + 10*stride, res, 10);
-  res = vld1q_lane_s8(from + 11*stride, res, 11);
-  res = vld1q_lane_s8(from + 12*stride, res, 12);
-  res = vld1q_lane_s8(from + 13*stride, res, 13);
-  res = vld1q_lane_s8(from + 14*stride, res, 14);
-  res = vld1q_lane_s8(from + 15*stride, res, 15);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride)
-{
-  Packet4uc res;
-  for (int i = 0; i != 4; i++)
-    reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride)
-{
-  Packet8uc res = vld1_dup_u8(from);
-  res = vld1_lane_u8(from + 1*stride, res, 1);
-  res = vld1_lane_u8(from + 2*stride, res, 2);
-  res = vld1_lane_u8(from + 3*stride, res, 3);
-  res = vld1_lane_u8(from + 4*stride, res, 4);
-  res = vld1_lane_u8(from + 5*stride, res, 5);
-  res = vld1_lane_u8(from + 6*stride, res, 6);
-  res = vld1_lane_u8(from + 7*stride, res, 7);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride)
-{
-  Packet16uc res = vld1q_dup_u8(from);
-  res = vld1q_lane_u8(from + 1*stride, res, 1);
-  res = vld1q_lane_u8(from + 2*stride, res, 2);
-  res = vld1q_lane_u8(from + 3*stride, res, 3);
-  res = vld1q_lane_u8(from + 4*stride, res, 4);
-  res = vld1q_lane_u8(from + 5*stride, res, 5);
-  res = vld1q_lane_u8(from + 6*stride, res, 6);
-  res = vld1q_lane_u8(from + 7*stride, res, 7);
-  res = vld1q_lane_u8(from + 8*stride, res, 8);
-  res = vld1q_lane_u8(from + 9*stride, res, 9);
-  res = vld1q_lane_u8(from + 10*stride, res, 10);
-  res = vld1q_lane_u8(from + 11*stride, res, 11);
-  res = vld1q_lane_u8(from + 12*stride, res, 12);
-  res = vld1q_lane_u8(from + 13*stride, res, 13);
-  res = vld1q_lane_u8(from + 14*stride, res, 14);
-  res = vld1q_lane_u8(from + 15*stride, res, 15);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride)
-{
-  Packet4s res = vld1_dup_s16(from);
-  res = vld1_lane_s16(from + 1*stride, res, 1);
-  res = vld1_lane_s16(from + 2*stride, res, 2);
-  res = vld1_lane_s16(from + 3*stride, res, 3);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride)
-{
-  Packet8s res = vld1q_dup_s16(from);
-  res = vld1q_lane_s16(from + 1*stride, res, 1);
-  res = vld1q_lane_s16(from + 2*stride, res, 2);
-  res = vld1q_lane_s16(from + 3*stride, res, 3);
-  res = vld1q_lane_s16(from + 4*stride, res, 4);
-  res = vld1q_lane_s16(from + 5*stride, res, 5);
-  res = vld1q_lane_s16(from + 6*stride, res, 6);
-  res = vld1q_lane_s16(from + 7*stride, res, 7);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride)
-{
-  Packet4us res = vld1_dup_u16(from);
-  res = vld1_lane_u16(from + 1*stride, res, 1);
-  res = vld1_lane_u16(from + 2*stride, res, 2);
-  res = vld1_lane_u16(from + 3*stride, res, 3);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride)
-{
-  Packet8us res = vld1q_dup_u16(from);
-  res = vld1q_lane_u16(from + 1*stride, res, 1);
-  res = vld1q_lane_u16(from + 2*stride, res, 2);
-  res = vld1q_lane_u16(from + 3*stride, res, 3);
-  res = vld1q_lane_u16(from + 4*stride, res, 4);
-  res = vld1q_lane_u16(from + 5*stride, res, 5);
-  res = vld1q_lane_u16(from + 6*stride, res, 6);
-  res = vld1q_lane_u16(from + 7*stride, res, 7);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride)
-{
-  Packet2i res = vld1_dup_s32(from);
-  res = vld1_lane_s32(from + 1*stride, res, 1);
+  Packet4f res = pset1<Packet4f>(0.f);              // placeholder value; all four lanes are overwritten below
+  res = vsetq_lane_f32(from[0*stride], res, 0);     // lane 0 <- from[0]
+  res = vsetq_lane_f32(from[1*stride], res, 1);     // lane 1 <- from[stride]   (stride is in elements, not bytes)
+  res = vsetq_lane_f32(from[2*stride], res, 2);     // lane 2 <- from[2*stride]
+  res = vsetq_lane_f32(from[3*stride], res, 3);     // lane 3 <- from[3*stride]
   return res;
 }
 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride)
 {
-  Packet4i res = vld1q_dup_s32(from);
-  res = vld1q_lane_s32(from + 1*stride, res, 1);
-  res = vld1q_lane_s32(from + 2*stride, res, 2);
-  res = vld1q_lane_s32(from + 3*stride, res, 3);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride)
-{
-  Packet2ui res = vld1_dup_u32(from);
-  res = vld1_lane_u32(from + 1*stride, res, 1);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride)
-{
-  Packet4ui res = vld1q_dup_u32(from);
-  res = vld1q_lane_u32(from + 1*stride, res, 1);
-  res = vld1q_lane_u32(from + 2*stride, res, 2);
-  res = vld1q_lane_u32(from + 3*stride, res, 3);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride)
-{
-  Packet2l res = vld1q_dup_s64(from);
-  res = vld1q_lane_s64(from + 1*stride, res, 1);
-  return res;
-}
-template<> EIGEN_DEVICE_FUNC inline Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride)
-{
-  Packet2ul res = vld1q_dup_u64(from);
-  res = vld1q_lane_u64(from + 1*stride, res, 1);
+  Packet4i res = pset1<Packet4i>(0);                // placeholder value; all four lanes are overwritten below
+  res = vsetq_lane_s32(from[0*stride], res, 0);     // lane 0 <- from[0]
+  res = vsetq_lane_s32(from[1*stride], res, 1);     // lane 1 <- from[stride]   (stride is in elements, not bytes)
+  res = vsetq_lane_s32(from[2*stride], res, 2);     // lane 2 <- from[2*stride]
+  res = vsetq_lane_s32(from[3*stride], res, 3);     // lane 3 <- from[3*stride]
   return res;
 }
 
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride)
-{
-  vst1_lane_f32(to + stride*0, from, 0);
-  vst1_lane_f32(to + stride*1, from, 1);
-}
 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
-  vst1q_lane_f32(to + stride*0, from, 0);
-  vst1q_lane_f32(to + stride*1, from, 1);
-  vst1q_lane_f32(to + stride*2, from, 2);
-  vst1q_lane_f32(to + stride*3, from, 3);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride)
-{
-  for (int i = 0; i != 4; i++)
-    *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride)
-{
-  vst1_lane_s8(to + stride*0, from, 0);
-  vst1_lane_s8(to + stride*1, from, 1);
-  vst1_lane_s8(to + stride*2, from, 2);
-  vst1_lane_s8(to + stride*3, from, 3);
-  vst1_lane_s8(to + stride*4, from, 4);
-  vst1_lane_s8(to + stride*5, from, 5);
-  vst1_lane_s8(to + stride*6, from, 6);
-  vst1_lane_s8(to + stride*7, from, 7);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from, Index stride)
-{
-  vst1q_lane_s8(to + stride*0, from, 0);
-  vst1q_lane_s8(to + stride*1, from, 1);
-  vst1q_lane_s8(to + stride*2, from, 2);
-  vst1q_lane_s8(to + stride*3, from, 3);
-  vst1q_lane_s8(to + stride*4, from, 4);
-  vst1q_lane_s8(to + stride*5, from, 5);
-  vst1q_lane_s8(to + stride*6, from, 6);
-  vst1q_lane_s8(to + stride*7, from, 7);
-  vst1q_lane_s8(to + stride*8, from, 8);
-  vst1q_lane_s8(to + stride*9, from, 9);
-  vst1q_lane_s8(to + stride*10, from, 10);
-  vst1q_lane_s8(to + stride*11, from, 11);
-  vst1q_lane_s8(to + stride*12, from, 12);
-  vst1q_lane_s8(to + stride*13, from, 13);
-  vst1q_lane_s8(to + stride*14, from, 14);
-  vst1q_lane_s8(to + stride*15, from, 15);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from, Index stride)
-{
-  for (int i = 0; i != 4; i++)
-    *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from, Index stride)
-{
-  vst1_lane_u8(to + stride*0, from, 0);
-  vst1_lane_u8(to + stride*1, from, 1);
-  vst1_lane_u8(to + stride*2, from, 2);
-  vst1_lane_u8(to + stride*3, from, 3);
-  vst1_lane_u8(to + stride*4, from, 4);
-  vst1_lane_u8(to + stride*5, from, 5);
-  vst1_lane_u8(to + stride*6, from, 6);
-  vst1_lane_u8(to + stride*7, from, 7);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from, Index stride)
-{
-  vst1q_lane_u8(to + stride*0, from, 0);
-  vst1q_lane_u8(to + stride*1, from, 1);
-  vst1q_lane_u8(to + stride*2, from, 2);
-  vst1q_lane_u8(to + stride*3, from, 3);
-  vst1q_lane_u8(to + stride*4, from, 4);
-  vst1q_lane_u8(to + stride*5, from, 5);
-  vst1q_lane_u8(to + stride*6, from, 6);
-  vst1q_lane_u8(to + stride*7, from, 7);
-  vst1q_lane_u8(to + stride*8, from, 8);
-  vst1q_lane_u8(to + stride*9, from, 9);
-  vst1q_lane_u8(to + stride*10, from, 10);
-  vst1q_lane_u8(to + stride*11, from, 11);
-  vst1q_lane_u8(to + stride*12, from, 12);
-  vst1q_lane_u8(to + stride*13, from, 13);
-  vst1q_lane_u8(to + stride*14, from, 14);
-  vst1q_lane_u8(to + stride*15, from, 15);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from, Index stride)
-{
-  vst1_lane_s16(to + stride*0, from, 0);
-  vst1_lane_s16(to + stride*1, from, 1);
-  vst1_lane_s16(to + stride*2, from, 2);
-  vst1_lane_s16(to + stride*3, from, 3);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from, Index stride)
-{
-  vst1q_lane_s16(to + stride*0, from, 0);
-  vst1q_lane_s16(to + stride*1, from, 1);
-  vst1q_lane_s16(to + stride*2, from, 2);
-  vst1q_lane_s16(to + stride*3, from, 3);
-  vst1q_lane_s16(to + stride*4, from, 4);
-  vst1q_lane_s16(to + stride*5, from, 5);
-  vst1q_lane_s16(to + stride*6, from, 6);
-  vst1q_lane_s16(to + stride*7, from, 7);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from, Index stride)
-{
-  vst1_lane_u16(to + stride*0, from, 0);
-  vst1_lane_u16(to + stride*1, from, 1);
-  vst1_lane_u16(to + stride*2, from, 2);
-  vst1_lane_u16(to + stride*3, from, 3);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from, Index stride)
-{
-  vst1q_lane_u16(to + stride*0, from, 0);
-  vst1q_lane_u16(to + stride*1, from, 1);
-  vst1q_lane_u16(to + stride*2, from, 2);
-  vst1q_lane_u16(to + stride*3, from, 3);
-  vst1q_lane_u16(to + stride*4, from, 4);
-  vst1q_lane_u16(to + stride*5, from, 5);
-  vst1q_lane_u16(to + stride*6, from, 6);
-  vst1q_lane_u16(to + stride*7, from, 7);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from, Index stride)
-{
-  vst1_lane_s32(to + stride*0, from, 0);
-  vst1_lane_s32(to + stride*1, from, 1);
+  to[stride*0] = vgetq_lane_f32(from, 0);   // lane 0 -> to[0]
+  to[stride*1] = vgetq_lane_f32(from, 1);   // lane 1 -> to[stride]   (stride is in elements, not bytes)
+  to[stride*2] = vgetq_lane_f32(from, 2);   // lane 2 -> to[2*stride]
+  to[stride*3] = vgetq_lane_f32(from, 3);   // lane 3 -> to[3*stride]
 }
 template<> EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride)
 {
-  vst1q_lane_s32(to + stride*0, from, 0);
-  vst1q_lane_s32(to + stride*1, from, 1);
-  vst1q_lane_s32(to + stride*2, from, 2);
-  vst1q_lane_s32(to + stride*3, from, 3);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from, Index stride)
-{
-  vst1_lane_u32(to + stride*0, from, 0);
-  vst1_lane_u32(to + stride*1, from, 1);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride)
-{
-  vst1q_lane_u32(to + stride*0, from, 0);
-  vst1q_lane_u32(to + stride*1, from, 1);
-  vst1q_lane_u32(to + stride*2, from, 2);
-  vst1q_lane_u32(to + stride*3, from, 3);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride)
-{
-  vst1q_lane_s64(to + stride*0, from, 0);
-  vst1q_lane_s64(to + stride*1, from, 1);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from, Index stride)
-{
-  vst1q_lane_u64(to + stride*0, from, 0);
-  vst1q_lane_u64(to + stride*1, from, 1);
+  to[stride*0] = vgetq_lane_s32(from, 0);   // lane 0 -> to[0]
+  to[stride*1] = vgetq_lane_s32(from, 1);   // lane 1 -> to[stride]   (stride is in elements, not bytes)
+  to[stride*2] = vgetq_lane_s32(from, 2);   // lane 2 -> to[2*stride]
+  to[stride*3] = vgetq_lane_s32(from, 3);   // lane 3 -> to[3*stride]
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>  (const float*    addr) { EIGEN_ARM_PREFETCH(addr); }  // forwards addr to EIGEN_ARM_PREFETCH (macro defined outside this hunk)
+template<> EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t*  addr) { EIGEN_ARM_PREFETCH(addr); }  // forwards addr to EIGEN_ARM_PREFETCH (macro defined outside this hunk)
 
-template<> EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) { return vget_lane_f32(a,0); }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { return vgetq_lane_f32(a,0); }
-template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) { return static_cast<int8_t>(a & 0xff); }
-template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) { return vget_lane_s8(a,0); }
-template<> EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) { return vgetq_lane_s8(a,0); }
-template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) { return static_cast<uint8_t>(a & 0xff); }
-template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) { return vget_lane_u8(a,0); }
-template<> EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) { return vgetq_lane_u8(a,0); }
-template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) { return vget_lane_s16(a,0); }
-template<> EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) { return vgetq_lane_s16(a,0); }
-template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) { return vget_lane_u16(a,0); }
-template<> EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) { return vgetq_lane_u16(a,0); }
-template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) { return vget_lane_s32(a,0); }
-template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { return vgetq_lane_s32(a,0); }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(a,0); }
-template<> EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) { return vgetq_lane_u32(a,0); }
-template<> EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) { return vgetq_lane_s64(a,0); }
-template<> EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) { return vgetq_lane_u64(a,0); }
+// FIXME: only x[0] is returned, yet all four lanes are stored to the scratch buffer. Store only the first two elements? NOTE(review): vgetq_lane_*(a, 0), as pscatter uses, would skip the store entirely.
+template<> EIGEN_STRONG_INLINE float   pfirst<Packet4f>(const Packet4f& a) { float   EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }  // store a into scratch x[4], return element 0
+template<> EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }  // store a into scratch x[4], return element 0
 
-template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
-  const float32x4_t a_r64 = vrev64q_f32(a);
-  return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
-}
-template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a)
-{
-  const int8x16_t a_r64 = vrev64q_s8(a);
-  return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
-}
-template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); }
-template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a)
-{
-  const uint8x16_t a_r64 = vrev64q_u8(a);
-  return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
-}
-template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a)
-{
-  const int16x8_t a_r64 = vrev64q_s16(a);
-  return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
-}
-template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); }
-template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a)
-{
-  const uint16x8_t a_r64 = vrev64q_u16(a);
-  return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
-}
-template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{
-  const int32x4_t a_r64 = vrev64q_s32(a);
-  return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
-}
-template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a)
-{
-  const uint32x4_t a_r64 = vrev64q_u32(a);
-  return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
-}
-template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a)
-{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); }
-template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a)
-{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); }
+template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {  // returns the four lanes of a in reverse order
+  float32x2_t a_lo, a_hi;
+  Packet4f a_r64;
+
+  a_r64 = vrev64q_f32(a);              // swap the two floats inside each 64-bit half: {a1, a0, a3, a2}
+  a_lo = vget_low_f32(a_r64);          // {a1, a0}
+  a_hi = vget_high_f32(a_r64);         // {a3, a2}
+  return vcombine_f32(a_hi, a_lo);     // halves exchanged: {a3, a2, a1, a0}
+}
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {  // returns the four lanes of a in reverse order
+  int32x2_t a_lo, a_hi;
+  Packet4i a_r64;
+
+  a_r64 = vrev64q_s32(a);              // swap the two int32 inside each 64-bit half: {a1, a0, a3, a2}
+  a_lo = vget_low_s32(a_r64);          // {a1, a0}
+  a_hi = vget_high_s32(a_r64);         // {a3, a2}
+  return vcombine_s32(a_hi, a_lo);     // halves exchanged: {a3, a2, a1, a0}
+}
 
-template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); }
 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
-#if EIGEN_ARCH_ARM64
-  return vabsq_s64(a);
-#else
-  return vcombine_s64(
-      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))),
-      vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; }
 
-template<> EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent)
-{ return pfrexp_float(a,exponent); }
-template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent)
-{ return pfrexp_float(a,exponent); }
-
-template<> EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent)
-{ return pldexp_float(a,exponent); }
-template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent)
-{ return pldexp_float(a,exponent); }
-
-template<> EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); }
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
-  const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
-  return vget_lane_f32(vpadd_f32(sum, sum), 0);
+  float32x2_t a_lo, a_hi, sum;
+
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  sum = vpadd_f32(a_lo, a_hi);
+  sum = vpadd_f32(sum, sum);
+  return vget_lane_f32(sum, 0);
 }
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a)
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
 {
-  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
-  int8x8_t sum = vpadd_s8(a_dup, a_dup);
-  sum = vpadd_s8(sum, sum);
-  return vget_lane_s8(sum, 0);
+  float32x4x2_t vtrn1, vtrn2, res1, res2;
+  Packet4f sum1, sum2, sum;
+
+  // NEON zip performs interleaving of the supplied vectors.
+  // We perform two interleaves in a row to acquire the transposed vector
+  vtrn1 = vzipq_f32(vecs[0], vecs[2]);
+  vtrn2 = vzipq_f32(vecs[1], vecs[3]);
+  res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
+  res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
+
+  // Do the addition of the resulting vectors
+  sum1 = vaddq_f32(res1.val[0], res1.val[1]);
+  sum2 = vaddq_f32(res2.val[0], res2.val[1]);
+  sum = vaddq_f32(sum1, sum2);
+
+  return sum;
 }
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a)
-{
-  int8x8_t sum = vpadd_s8(a,a);
-  sum = vpadd_s8(sum, sum);
-  sum = vpadd_s8(sum, sum);
-  return vget_lane_s8(sum, 0);
-}
-template<> EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a)
-{
-  int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
-  sum = vpadd_s8(sum, sum);
-  sum = vpadd_s8(sum, sum);
-  sum = vpadd_s8(sum, sum);
-  return vget_lane_s8(sum, 0);
-}
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a)
-{
-  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
-  uint8x8_t sum = vpadd_u8(a_dup, a_dup);
-  sum = vpadd_u8(sum, sum);
-  return vget_lane_u8(sum, 0);
-}
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a)
-{
-  uint8x8_t sum = vpadd_u8(a,a);
-  sum = vpadd_u8(sum, sum);
-  sum = vpadd_u8(sum, sum);
-  return vget_lane_u8(sum, 0);
-}
-template<> EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a)
-{
-  uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
-  sum = vpadd_u8(sum, sum);
-  sum = vpadd_u8(sum, sum);
-  sum = vpadd_u8(sum, sum);
-  return vget_lane_u8(sum, 0);
-}
-template<> EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a)
-{
-  const int16x4_t sum = vpadd_s16(a,a);
-  return vget_lane_s16(vpadd_s16(sum, sum), 0);
-}
-template<> EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a)
-{
-  int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
-  sum = vpadd_s16(sum, sum);
-  sum = vpadd_s16(sum, sum);
-  return vget_lane_s16(sum, 0);
-}
-template<> EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a)
-{
-  const uint16x4_t sum = vpadd_u16(a,a);
-  return vget_lane_u16(vpadd_u16(sum, sum), 0);
-}
-template<> EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a)
-{
-  uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
-  sum = vpadd_u16(sum, sum);
-  sum = vpadd_u16(sum, sum);
-  return vget_lane_u16(sum, 0);
-}
-template<> EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); }
+
 template<> EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a)
 {
-  const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
-  return vget_lane_s32(vpadd_s32(sum, sum), 0);
-}
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a)
-{
-  const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
-  return vget_lane_u32(vpadd_u32(sum, sum), 0);
-}
-template<> EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a)
-{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); }
-template<> EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a)
-{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); }
+  int32x2_t a_lo, a_hi, sum;
 
-template<> EIGEN_DEVICE_FUNC inline Packet4c predux_half_dowto4(const Packet8c& a)
-{
-  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a,
-      vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  sum = vpadd_s32(a_lo, a_hi);
+  sum = vpadd_s32(sum, sum);
+  return vget_lane_s32(sum, 0);
 }
-template<> EIGEN_DEVICE_FUNC inline Packet8c predux_half_dowto4(const Packet16c& a)
-{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); }
-template<> EIGEN_DEVICE_FUNC inline Packet4uc predux_half_dowto4(const Packet8uc& a)
+
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
 {
-  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a,
-      vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
+  int32x4x2_t vtrn1, vtrn2, res1, res2;
+  Packet4i sum1, sum2, sum;
+
+  // NEON zip performs interleaving of the supplied vectors.
+  // We perform two interleaves in a row to acquire the transposed vector
+  vtrn1 = vzipq_s32(vecs[0], vecs[2]);
+  vtrn2 = vzipq_s32(vecs[1], vecs[3]);
+  res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
+  res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
+
+  // Do the addition of the resulting vectors
+  sum1 = vaddq_s32(res1.val[0], res1.val[1]);
+  sum2 = vaddq_s32(res2.val[0], res2.val[1]);
+  sum = vaddq_s32(sum1, sum2);
+
+  return sum;
 }
-template<> EIGEN_DEVICE_FUNC inline Packet8uc predux_half_dowto4(const Packet16uc& a)
-{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); }
-template<> EIGEN_DEVICE_FUNC inline Packet4s predux_half_dowto4(const Packet8s& a)
-{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); }
-template<> EIGEN_DEVICE_FUNC inline Packet4us predux_half_dowto4(const Packet8us& a)
-{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); }
 
 // Other reduction functions:
 // mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
-{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
-template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
 {
-  int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
-  prod = vmul_s8(prod, vrev16_s8(prod));
-  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
-}
-template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
-{
-  int8x8_t prod = vmul_s8(a, vrev16_s8(a));
-  prod = vmul_s8(prod, vrev32_s8(prod));
-  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
-}
-template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
-{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
-template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
-{
-  uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
-  prod = vmul_u8(prod, vrev16_u8(prod));
-  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
-}
-template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
-{
-  uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
-  prod = vmul_u8(prod, vrev32_u8(prod));
-  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
-}
-template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
-{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
-template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
-{
-  const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
-  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
-}
-template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a)
-{
-  int16x4_t prod;
+  float32x2_t a_lo, a_hi, prod;
 
-  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
-  prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
-  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
-  prod = vmul_s16(prod, vrev32_s16(prod));
-  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
-  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
-}
-template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a)
-{
-  const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
-  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
-}
-template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a)
-{
-  uint16x4_t prod;
+  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
+  prod = vmul_f32(a_lo, a_hi);
+  // Multiply prod with its swapped value |a2*a4|a1*a3|
+  prod = vmul_f32(prod, vrev64_f32(prod));
 
-  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
-  prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
-  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
-  prod = vmul_u16(prod, vrev32_u16(prod));
-  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
-  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
+  return vget_lane_f32(prod, 0);
 }
-template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
-{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
 template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
-{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
-{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
-{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
-template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
-{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
-template<> EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a)
-{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); }
+{
+  int32x2_t a_lo, a_hi, prod;
+
+  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
+  prod = vmul_s32(a_lo, a_hi);
+  // Multiply prod with its swapped value |a2*a4|a1*a3|
+  prod = vmul_s32(prod, vrev64_s32(prod));
+
+  return vget_lane_s32(prod, 0);
+}
 
 // min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a)
-{ return vget_lane_f32(vpmin_f32(a,a), 0); }
 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
 {
-  const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
-  return vget_lane_f32(vpmin_f32(min, min), 0);
+  float32x2_t a_lo, a_hi, min;
+
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  min = vpmin_f32(a_lo, a_hi);
+  min = vpmin_f32(min, min);
+
+  return vget_lane_f32(min, 0);
 }
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a)
-{
-  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
-  int8x8_t min = vpmin_s8(a_dup, a_dup);
-  min = vpmin_s8(min, min);
-  return vget_lane_s8(min, 0);
-}
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a)
-{
-  int8x8_t min = vpmin_s8(a,a);
-  min = vpmin_s8(min, min);
-  min = vpmin_s8(min, min);
-  return vget_lane_s8(min, 0);
-}
-template<> EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a)
-{
-  int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
-  min = vpmin_s8(min, min);
-  min = vpmin_s8(min, min);
-  min = vpmin_s8(min, min);
-  return vget_lane_s8(min, 0);
-}
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a)
-{
-  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
-  uint8x8_t min = vpmin_u8(a_dup, a_dup);
-  min = vpmin_u8(min, min);
-  return vget_lane_u8(min, 0);
-}
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a)
-{
-  uint8x8_t min = vpmin_u8(a,a);
-  min = vpmin_u8(min, min);
-  min = vpmin_u8(min, min);
-  return vget_lane_u8(min, 0);
-}
-template<> EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a)
-{
-  uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
-  min = vpmin_u8(min, min);
-  min = vpmin_u8(min, min);
-  min = vpmin_u8(min, min);
-  return vget_lane_u8(min, 0);
-}
-template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a)
-{
-  const int16x4_t min = vpmin_s16(a,a);
-  return vget_lane_s16(vpmin_s16(min, min), 0);
-}
-template<> EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a)
-{
-  int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
-  min = vpmin_s16(min, min);
-  min = vpmin_s16(min, min);
-  return vget_lane_s16(min, 0);
-}
-template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a)
-{
-  const uint16x4_t min = vpmin_u16(a,a);
-  return vget_lane_u16(vpmin_u16(min, min), 0);
-}
-template<> EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a)
-{
-  uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
-  min = vpmin_u16(min, min);
-  min = vpmin_u16(min, min);
-  return vget_lane_u16(min, 0);
-}
-template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a)
-{ return vget_lane_s32(vpmin_s32(a,a), 0); }
+
 template<> EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a)
 {
-  const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
-  return vget_lane_s32(vpmin_s32(min, min), 0);
+  int32x2_t a_lo, a_hi, min;
+
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  min = vpmin_s32(a_lo, a_hi);
+  min = vpmin_s32(min, min);
+  
+  return vget_lane_s32(min, 0);
 }
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a)
-{ return vget_lane_u32(vpmin_u32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a)
-{
-  const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
-  return vget_lane_u32(vpmin_u32(min, min), 0);
-}
-template<> EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a)
-{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
-template<> EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a)
-{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
 
 // max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a)
-{ return vget_lane_f32(vpmax_f32(a,a), 0); }
 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
 {
-  const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
-  return vget_lane_f32(vpmax_f32(max, max), 0);
+  float32x2_t a_lo, a_hi, max;
+
+  a_lo = vget_low_f32(a);
+  a_hi = vget_high_f32(a);
+  max = vpmax_f32(a_lo, a_hi);
+  max = vpmax_f32(max, max);
+
+  return vget_lane_f32(max, 0);
 }
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a)
-{
-  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
-  int8x8_t max = vpmax_s8(a_dup, a_dup);
-  max = vpmax_s8(max, max);
-  return vget_lane_s8(max, 0);
-}
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a)
-{
-  int8x8_t max = vpmax_s8(a,a);
-  max = vpmax_s8(max, max);
-  max = vpmax_s8(max, max);
-  return vget_lane_s8(max, 0);
-}
-template<> EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a)
-{
-  int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
-  max = vpmax_s8(max, max);
-  max = vpmax_s8(max, max);
-  max = vpmax_s8(max, max);
-  return vget_lane_s8(max, 0);
-}
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a)
-{
-  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
-  uint8x8_t max = vpmax_u8(a_dup, a_dup);
-  max = vpmax_u8(max, max);
-  return vget_lane_u8(max, 0);
-}
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a)
-{
-  uint8x8_t max = vpmax_u8(a,a);
-  max = vpmax_u8(max, max);
-  max = vpmax_u8(max, max);
-  return vget_lane_u8(max, 0);
-}
-template<> EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a)
-{
-  uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
-  max = vpmax_u8(max, max);
-  max = vpmax_u8(max, max);
-  max = vpmax_u8(max, max);
-  return vget_lane_u8(max, 0);
-}
-template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a)
-{
-  const int16x4_t max = vpmax_s16(a,a);
-  return vget_lane_s16(vpmax_s16(max, max), 0);
-}
-template<> EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a)
-{
-  int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
-  max = vpmax_s16(max, max);
-  max = vpmax_s16(max, max);
-  return vget_lane_s16(max, 0);
-}
-template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a)
-{
-  const uint16x4_t max = vpmax_u16(a,a);
-  return vget_lane_u16(vpmax_u16(max, max), 0);
-}
-template<> EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a)
-{
-  uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
-  max = vpmax_u16(max, max);
-  max = vpmax_u16(max, max);
-  return vget_lane_u16(max, 0);
-}
-template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a)
-{ return vget_lane_s32(vpmax_s32(a,a), 0); }
+
 template<> EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a)
 {
-  const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
-  return vget_lane_s32(vpmax_s32(max, max), 0);
-}
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a)
-{ return vget_lane_u32(vpmax_u32(a,a), 0); }
-template<> EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a)
-{
-  const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
-  return vget_lane_u32(vpmax_u32(max, max), 0);
-}
-template<> EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a)
-{ return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); }
-template<> EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a)
-{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); }
+  int32x2_t a_lo, a_hi, max;
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
-{
-  uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)),
-                            vget_high_u32(vreinterpretq_u32_f32(x)));
-  return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
+  a_lo = vget_low_s32(a);
+  a_hi = vget_high_s32(a);
+  max = vpmax_s32(a_lo, a_hi);
+  max = vpmax_s32(max, max);
+
+  return vget_lane_s32(max, 0);
 }
 
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2f, 2>& kernel)
-{
-  const float32x2x2_t z = vzip_f32(kernel.packet[0], kernel.packet[1]);
-  kernel.packet[0] = z.val[0];
-  kernel.packet[1] = z.val[1];
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel)
-{
-  const float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
-  const float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
+// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
+// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
+#define PALIGN_NEON(Offset,Type,Command) \
+template<>\
+struct palign_impl<Offset,Type>\
+{\
+    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
+    {\
+        if (Offset!=0)\
+            first = Command(first, second, Offset);\
+    }\
+};\
+
+PALIGN_NEON(0,Packet4f,vextq_f32)
+PALIGN_NEON(1,Packet4f,vextq_f32)
+PALIGN_NEON(2,Packet4f,vextq_f32)
+PALIGN_NEON(3,Packet4f,vextq_f32)
+PALIGN_NEON(0,Packet4i,vextq_s32)
+PALIGN_NEON(1,Packet4i,vextq_s32)
+PALIGN_NEON(2,Packet4i,vextq_s32)
+PALIGN_NEON(3,Packet4i,vextq_s32)
+
+#undef PALIGN_NEON
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4f,4>& kernel) {
+  float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]);
+  float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]);
 
   kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0]));
   kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0]));
   kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1]));
   kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1]));
 }
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4c, 4>& kernel)
-{
-  const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
-  const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
-
-  const int8x8x2_t zip8 = vzip_s8(a,b);
-  const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
-
-  kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
-  kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
-  kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
-  kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8c, 8>& kernel)
-{
-  int8x8x2_t zip8[4];
-  uint16x4x2_t zip16[4];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 4; i++)
-    zip8[i] = vzip_s8(kernel.packet[i*2], kernel.packet[i*2+1]);
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 2; i++)
-  {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j != 2; j++)
-      zip16[i*2+j] = vzip_u16(vreinterpret_u16_s8(zip8[i*2].val[j]), vreinterpret_u16_s8(zip8[i*2+1].val[j]));
-  }
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 2; i++)
-  {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j != 2; j++)
-    {
-      const uint32x2x2_t z = vzip_u32(vreinterpret_u32_u16(zip16[i].val[j]), vreinterpret_u32_u16(zip16[i+2].val[j]));
-      EIGEN_UNROLL_LOOP
-      for (int k = 0; k != 2; k++)
-        kernel.packet[i*4+j*2+k] = vreinterpret_s8_u32(z.val[k]);
-    }
-  }
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel)
-{
-  int8x16x2_t zip8[8];
-  uint16x8x2_t zip16[8];
-  uint32x4x2_t zip32[8];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 8; i++)
-    zip8[i] = vzipq_s8(kernel.packet[i*2], kernel.packet[i*2+1]);
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 4; i++)
-  {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j != 2; j++)
-    {
-      zip16[i*2+j] = vzipq_u16(vreinterpretq_u16_s8(zip8[i*2].val[j]),
-          vreinterpretq_u16_s8(zip8[i*2+1].val[j]));
-    }
-  }
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 2; i++)
-  {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j != 2; j++)
-    {
-      EIGEN_UNROLL_LOOP
-      for (int k = 0; k != 2; k++)
-        zip32[i*4+j*2+k] = vzipq_u32(vreinterpretq_u32_u16(zip16[i*4+j].val[k]),
-            vreinterpretq_u32_u16(zip16[i*4+j+2].val[k]));
-    }
-  }
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 4; i++)
-  {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j != 2; j++)
-    {
-      kernel.packet[i*4+j*2] = vreinterpretq_s8_u32(vcombine_u32(vget_low_u32(zip32[i].val[j]),
-          vget_low_u32(zip32[i+4].val[j])));
-      kernel.packet[i*4+j*2+1] = vreinterpretq_s8_u32(vcombine_u32(vget_high_u32(zip32[i].val[j]),
-          vget_high_u32(zip32[i+4].val[j])));
-    }
-  }
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4uc, 4>& kernel)
-{
-  const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
-  const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
-
-  const uint8x8x2_t zip8 = vzip_u8(a,b);
-  const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
-
-  kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
-  kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
-  kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
-  kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8uc, 8>& kernel)
-{
-  uint8x8x2_t zip8[4];
-  uint16x4x2_t zip16[4];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 4; i++)
-    zip8[i] = vzip_u8(kernel.packet[i*2], kernel.packet[i*2+1]);
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 2; i++)
-  {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j != 2; j++)
-      zip16[i*2+j] = vzip_u16(vreinterpret_u16_u8(zip8[i*2].val[j]), vreinterpret_u16_u8(zip8[i*2+1].val[j]));
-  }
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 2; i++)
-  {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j != 2; j++)
-    {
-      const uint32x2x2_t z = vzip_u32(vreinterpret_u32_u16(zip16[i].val[j]), vreinterpret_u32_u16(zip16[i+2].val[j]));
-      EIGEN_UNROLL_LOOP
-      for (int k = 0; k != 2; k++)
-        kernel.packet[i*4+j*2+k] = vreinterpret_u8_u32(z.val[k]);
-    }
-  }
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel)
-{
-  uint8x16x2_t zip8[8];
-  uint16x8x2_t zip16[8];
-  uint32x4x2_t zip32[8];
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 8; i++)
-    zip8[i] = vzipq_u8(kernel.packet[i*2], kernel.packet[i*2+1]);
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 4; i++)
-  {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j != 2; j++)
-      zip16[i*2+j] = vzipq_u16(vreinterpretq_u16_u8(zip8[i*2].val[j]),
-          vreinterpretq_u16_u8(zip8[i*2+1].val[j]));
-  }
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 2; i++)
-  {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j != 2; j++)
-    {
-      EIGEN_UNROLL_LOOP
-      for (int k = 0; k != 2; k++)
-        zip32[i*4+j*2+k] = vzipq_u32(vreinterpretq_u32_u16(zip16[i*4+j].val[k]),
-            vreinterpretq_u32_u16(zip16[i*4+j+2].val[k]));
-    }
-  }
-
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i != 4; i++)
-  {
-    EIGEN_UNROLL_LOOP
-    for (int j = 0; j != 2; j++)
-    {
-      kernel.packet[i*4+j*2] = vreinterpretq_u8_u32(vcombine_u32(vget_low_u32(zip32[i].val[j]),
-          vget_low_u32(zip32[i+4].val[j])));
-      kernel.packet[i*4+j*2+1] = vreinterpretq_u8_u32(vcombine_u32(vget_high_u32(zip32[i].val[j]),
-          vget_high_u32(zip32[i+4].val[j])));
-    }
-  }
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4s, 4>& kernel)
-{
-  const int16x4x2_t zip16_1 = vzip_s16(kernel.packet[0], kernel.packet[1]);
-  const int16x4x2_t zip16_2 = vzip_s16(kernel.packet[2], kernel.packet[3]);
-
-  const uint32x2x2_t zip32_1 = vzip_u32(vreinterpret_u32_s16(zip16_1.val[0]), vreinterpret_u32_s16(zip16_2.val[0]));
-  const uint32x2x2_t zip32_2 = vzip_u32(vreinterpret_u32_s16(zip16_1.val[1]), vreinterpret_u32_s16(zip16_2.val[1]));
-
-  kernel.packet[0] = vreinterpret_s16_u32(zip32_1.val[0]);
-  kernel.packet[1] = vreinterpret_s16_u32(zip32_1.val[1]);
-  kernel.packet[2] = vreinterpret_s16_u32(zip32_2.val[0]);
-  kernel.packet[3] = vreinterpret_s16_u32(zip32_2.val[1]);
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel)
-{
-  const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]);
-  const int16x8x2_t zip16_2 = vzipq_s16(kernel.packet[2], kernel.packet[3]);
-  const int16x8x2_t zip16_3 = vzipq_s16(kernel.packet[4], kernel.packet[5]);
-  const int16x8x2_t zip16_4 = vzipq_s16(kernel.packet[6], kernel.packet[7]);
-
-  const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[0]), vreinterpretq_u32_s16(zip16_2.val[0]));
-  const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[1]), vreinterpretq_u32_s16(zip16_2.val[1]));
-  const uint32x4x2_t zip32_3 = vzipq_u32(vreinterpretq_u32_s16(zip16_3.val[0]), vreinterpretq_u32_s16(zip16_4.val[0]));
-  const uint32x4x2_t zip32_4 = vzipq_u32(vreinterpretq_u32_s16(zip16_3.val[1]), vreinterpretq_u32_s16(zip16_4.val[1]));
-
-  kernel.packet[0] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_1.val[0]), vget_low_u32(zip32_3.val[0])));
-  kernel.packet[1] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_1.val[0]), vget_high_u32(zip32_3.val[0])));
-  kernel.packet[2] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_1.val[1]), vget_low_u32(zip32_3.val[1])));
-  kernel.packet[3] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_1.val[1]), vget_high_u32(zip32_3.val[1])));
-  kernel.packet[4] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_2.val[0]), vget_low_u32(zip32_4.val[0])));
-  kernel.packet[5] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_2.val[0]), vget_high_u32(zip32_4.val[0])));
-  kernel.packet[6] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1])));
-  kernel.packet[7] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1])));
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4us, 4>& kernel)
-{
-  const uint16x4x2_t zip16_1 = vzip_u16(kernel.packet[0], kernel.packet[1]);
-  const uint16x4x2_t zip16_2 = vzip_u16(kernel.packet[2], kernel.packet[3]);
-
-  const uint32x2x2_t zip32_1 = vzip_u32(vreinterpret_u32_u16(zip16_1.val[0]), vreinterpret_u32_u16(zip16_2.val[0]));
-  const uint32x2x2_t zip32_2 = vzip_u32(vreinterpret_u32_u16(zip16_1.val[1]), vreinterpret_u32_u16(zip16_2.val[1]));
-
-  kernel.packet[0] = vreinterpret_u16_u32(zip32_1.val[0]);
-  kernel.packet[1] = vreinterpret_u16_u32(zip32_1.val[1]);
-  kernel.packet[2] = vreinterpret_u16_u32(zip32_2.val[0]);
-  kernel.packet[3] = vreinterpret_u16_u32(zip32_2.val[1]);
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel)
-{
-  const uint16x8x2_t zip16_1 = vzipq_u16(kernel.packet[0], kernel.packet[1]);
-  const uint16x8x2_t zip16_2 = vzipq_u16(kernel.packet[2], kernel.packet[3]);
-  const uint16x8x2_t zip16_3 = vzipq_u16(kernel.packet[4], kernel.packet[5]);
-  const uint16x8x2_t zip16_4 = vzipq_u16(kernel.packet[6], kernel.packet[7]);
-
-  const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_u16(zip16_1.val[0]), vreinterpretq_u32_u16(zip16_2.val[0]));
-  const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_u16(zip16_1.val[1]), vreinterpretq_u32_u16(zip16_2.val[1]));
-  const uint32x4x2_t zip32_3 = vzipq_u32(vreinterpretq_u32_u16(zip16_3.val[0]), vreinterpretq_u32_u16(zip16_4.val[0]));
-  const uint32x4x2_t zip32_4 = vzipq_u32(vreinterpretq_u32_u16(zip16_3.val[1]), vreinterpretq_u32_u16(zip16_4.val[1]));
-
-  kernel.packet[0] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_1.val[0]), vget_low_u32(zip32_3.val[0])));
-  kernel.packet[1] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_1.val[0]), vget_high_u32(zip32_3.val[0])));
-  kernel.packet[2] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_1.val[1]), vget_low_u32(zip32_3.val[1])));
-  kernel.packet[3] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_1.val[1]), vget_high_u32(zip32_3.val[1])));
-  kernel.packet[4] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_2.val[0]), vget_low_u32(zip32_4.val[0])));
-  kernel.packet[5] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_2.val[0]), vget_high_u32(zip32_4.val[0])));
-  kernel.packet[6] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1])));
-  kernel.packet[7] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1])));
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2i, 2>& kernel)
-{
-  const int32x2x2_t z = vzip_s32(kernel.packet[0], kernel.packet[1]);
-  kernel.packet[0] = z.val[0];
-  kernel.packet[1] = z.val[1];
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel)
-{
-  const int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
-  const int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]);
+  int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]);
   kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0]));
   kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0]));
   kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1]));
   kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1]));
 }
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2ui, 2>& kernel)
-{
-  const uint32x2x2_t z = vzip_u32(kernel.packet[0], kernel.packet[1]);
-  kernel.packet[0] = z.val[0];
-  kernel.packet[1] = z.val[1];
-}
-EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4ui, 4>& kernel)
-{
-  const uint32x4x2_t tmp1 = vzipq_u32(kernel.packet[0], kernel.packet[1]);
-  const uint32x4x2_t tmp2 = vzipq_u32(kernel.packet[2], kernel.packet[3]);
-
-  kernel.packet[0] = vcombine_u32(vget_low_u32(tmp1.val[0]), vget_low_u32(tmp2.val[0]));
-  kernel.packet[1] = vcombine_u32(vget_high_u32(tmp1.val[0]), vget_high_u32(tmp2.val[0]));
-  kernel.packet[2] = vcombine_u32(vget_low_u32(tmp1.val[1]), vget_low_u32(tmp2.val[1]));
-  kernel.packet[3] = vcombine_u32(vget_high_u32(tmp1.val[1]), vget_high_u32(tmp2.val[1]));
-}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2l, 2>& kernel)
-{
-#if EIGEN_ARCH_ARM64
-  const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
-  const int64x2_t tmp2 = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
-
-  kernel.packet[0] = tmp1;
-  kernel.packet[1] = tmp2;
-#else
-  const int64x1_t tmp[2][2] = {
-    { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
-    { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) }
-  };
-
-  kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
-  kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
-#endif
-}
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2ul, 2>& kernel)
-{
-#if EIGEN_ARCH_ARM64
-  const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
-  const uint64x2_t tmp2 = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
-
-  kernel.packet[0] = tmp1;
-  kernel.packet[1] = tmp2;
-#else
-  const uint64x1_t tmp[2][2] = {
-    { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
-    { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) }
-  };
-
-  kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
-  kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
-#endif
-}
-
-template<> EIGEN_DEVICE_FUNC inline Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b)
-{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b)
-{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b)
-{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b)
-{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b)
-{ return vbsl_u8(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b)
-{ return vbslq_u8(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b)
-{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b)
-{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b)
-{ return vbsl_u16(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b)
-{ return vbslq_u16(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b)
-{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b)
-{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b)
-{ return vbsl_u32(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b)
-{ return vbslq_u32(mask, a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b)
-{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); }
-template<> EIGEN_DEVICE_FUNC inline Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b)
-{ return vbslq_u64(mask, a, b); }
-
-/**
- * Computes the integer square root
- * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result
- *   and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument
- *   value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf .
- */
-template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
-  uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
-  uint8x8_t res = vdup_n_u8(0);
-  uint8x8_t add = vdup_n_u8(0x8);
-  for (int i = 0; i < 4; i++)
-  {
-    const uint8x8_t temp = vorr_u8(res, add);
-    res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
-    add = vshr_n_u8(add, 1);
-  }
-  return vget_lane_u32(vreinterpret_u32_u8(res), 0);
-}
-/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
-  uint8x8_t res = vdup_n_u8(0);
-  uint8x8_t add = vdup_n_u8(0x8);
-  for (int i = 0; i < 4; i++)
-  {
-    const uint8x8_t temp = vorr_u8(res, add);
-    res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
-    add = vshr_n_u8(add, 1);
-  }
-  return res;
-}
-/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
-  uint8x16_t res = vdupq_n_u8(0);
-  uint8x16_t add = vdupq_n_u8(0x8);
-  for (int i = 0; i < 4; i++)
-  {
-    const uint8x16_t temp = vorrq_u8(res, add);
-    res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
-    add = vshrq_n_u8(add, 1);
-  }
-  return res;
-}
-/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
-  uint16x4_t res = vdup_n_u16(0);
-  uint16x4_t add = vdup_n_u16(0x80);
-  for (int i = 0; i < 8; i++)
-  {
-    const uint16x4_t temp = vorr_u16(res, add);
-    res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
-    add = vshr_n_u16(add, 1);
-  }
-  return res;
-}
-/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
-  uint16x8_t res = vdupq_n_u16(0);
-  uint16x8_t add = vdupq_n_u16(0x80);
-  for (int i = 0; i < 8; i++)
-  {
-    const uint16x8_t temp = vorrq_u16(res, add);
-    res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
-    add = vshrq_n_u16(add, 1);
-  }
-  return res;
-}
-/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
-  uint32x2_t res = vdup_n_u32(0);
-  uint32x2_t add = vdup_n_u32(0x8000);
-  for (int i = 0; i < 16; i++)
-  {
-    const uint32x2_t temp = vorr_u32(res, add);
-    res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
-    add = vshr_n_u32(add, 1);
-  }
-  return res;
-}
-/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
-template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
-  uint32x4_t res = vdupq_n_u32(0);
-  uint32x4_t add = vdupq_n_u32(0x8000);
-  for (int i = 0; i < 16; i++)
-  {
-    const uint32x4_t temp = vorrq_u32(res, add);
-    res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
-    add = vshrq_n_u32(add, 1);
-  }
-  return res;
-}
 
 //---------- double ----------
 
@@ -3184,9 +571,17 @@ template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
 // Defining these functions as templates ensures that if these intrinsics are
 // already defined in arm_neon.h, then our workaround doesn't cause a conflict
 // and has lower priority in overload resolution.
-template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
+template <typename T>
+uint64x2_t vreinterpretq_u64_f64(T a)
+{
+  return (uint64x2_t) a;
+}
 
-template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
+template <typename T>
+float64x2_t vreinterpretq_f64_u64(T a)
+{
+  return (float64x2_t) a;
+}
 
 typedef float64x2_t Packet2d;
 typedef float64x1_t Packet1d;
@@ -3195,67 +590,32 @@ template<> struct packet_traits<double>  : default_packet_traits
 {
   typedef Packet2d type;
   typedef Packet2d half;
-  enum
-  {
+  enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
-    HasHalfPacket = 0,
-
-    HasCast      = 1,
-    HasCmp       = 1,
-    HasAdd       = 1,
-    HasSub       = 1,
-    HasShift     = 1,
-    HasMul       = 1,
-    HasNegate    = 1,
-    HasAbs       = 1,
-    HasArg       = 0,
-    HasAbs2      = 1,
-    HasAbsDiff   = 1,
-    HasMin       = 1,
-    HasMax       = 1,
-    HasConj      = 1,
-    HasSetLinear = 0,
-    HasBlend     = 0,
-    HasInsert    = 1,
-
-    HasDiv   = 1,
-    HasFloor = 1,
-
+    HasHalfPacket=0,
+   
+    HasDiv  = 1,
+    // FIXME check the Has*
     HasSin  = 0,
     HasCos  = 0,
     HasLog  = 0,
     HasExp  = 0,
-    HasSqrt = 0,
-    HasTanh = 0,
-    HasErf  = 0
+    HasSqrt = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2d>
-{
-  typedef double type;
-  typedef Packet2d half;
-  typedef Packet2l integer_packet;
-  enum
-  {
-    size = 2,
-    alignment = Aligned16,
-    vectorizable = true,
-    masked_load_available = false,
-    masked_store_available = false
-  };
-};
+template<> struct unpacket_traits<Packet2d> { typedef double  type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
 
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double&  from) { return vdupq_n_f64(from); }
 
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a)
 {
-  const double c[] = {0.0,1.0};
-  return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
+  const double countdown_raw[] = {0.0,1.0};
+  const Packet2d countdown = vld1q_f64(countdown_raw);
+  return vaddq_f64(pset1<Packet2d>(a), countdown);
 }
-
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); }
 
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); }
@@ -3270,130 +630,128 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const
 
 #ifdef __ARM_FEATURE_FMA
 // See bug 936. See above comment about FMA for float.
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
-{ return vfmaq_f64(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); }
 #else
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c)
-{ return vmlaq_f64(c,a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); }
 #endif
 
 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
-{
-  const Packet2d cst_1 = pset1<Packet2d>(1.0);
-  /* perform a floorf */
-  const Packet2d tmp = vcvtq_f64_s64(vcvtq_s64_f64(a));
-
-  /* if greater, substract 1 */
-  uint64x2_t mask = vcgtq_f64(tmp, a);
-  mask = vandq_u64(mask, vreinterpretq_u64_f64(cst_1));
-  return vsubq_f64(tmp, vreinterpretq_f64_u64(mask));
-}
-
 // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+{
+  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
 
 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+{
+  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
 
 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+{
+  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
 
 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); }
+{
+  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
 
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
 
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b)
-{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); }
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
+{
+  return vld1q_dup_f64(from);
+}
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); }
 
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); }
-
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); }
-
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) { return vld1q_dup_f64(from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from)
-{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from)
-{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); }
 
 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
 {
   Packet2d res = pset1<Packet2d>(0.0);
-  res = vld1q_lane_f64(from + 0*stride, res, 0);
-  res = vld1q_lane_f64(from + 1*stride, res, 1);
+  res = vsetq_lane_f64(from[0*stride], res, 0);
+  res = vsetq_lane_f64(from[1*stride], res, 1);
   return res;
 }
-
 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
 {
-  vst1q_lane_f64(to + stride*0, from, 0);
-  vst1q_lane_f64(to + stride*1, from, 1);
+  to[stride*0] = vgetq_lane_f64(from, 0);
+  to[stride*1] = vgetq_lane_f64(from, 1);
 }
-
 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ARM_PREFETCH(addr); }
 
 // FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a,0); }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(a, 0); }
 
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); }
 
 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); }
 
 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
 // workaround ICE, see bug 907
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{ return (vget_low_f64(a) + vget_high_f64(a))[0]; }
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; }
 #else
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{ return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); }
 #endif
 
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  float64x2_t trn1, trn2;
+
+  // NEON zip performs interleaving of the supplied vectors.
+  // We perform two interleaves in a row to acquire the transposed vector
+  trn1 = vzip1q_f64(vecs[0], vecs[1]);
+  trn2 = vzip2q_f64(vecs[0], vecs[1]);
+
+  // Do the addition of the resulting vectors
+  return vaddq_f64(trn1, trn2);
+}
 // Other reduction functions:
 // mul
 #if EIGEN_COMP_CLANG && defined(__apple_build_version__)
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{ return (vget_low_f64(a) * vget_high_f64(a))[0]; }
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
 #else
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
 #endif
 
 // min
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{ return vgetq_lane_f64(vpminq_f64(a,a), 0); }
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); }
 
 // max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); }
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); }
 
+// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
+// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
+#define PALIGN_NEON(Offset,Type,Command) \
+template<>\
+struct palign_impl<Offset,Type>\
+{\
+    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
+    {\
+        if (Offset!=0)\
+            first = Command(first, second, Offset);\
+    }\
+};\
+
+PALIGN_NEON(0,Packet2d,vextq_f64)
+PALIGN_NEON(1,Packet2d,vextq_f64)
+#undef PALIGN_NEON
 
 EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d, 2>& kernel)
-{
-  const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
-  const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
+  float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
 
-  kernel.packet[0] = tmp1;
-  kernel.packet[1] = tmp2;
+  kernel.packet[0] = trn1;
+  kernel.packet[1] = trn2;
 }
-
-template<> EIGEN_DEVICE_FUNC inline Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b)
-{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); }
-
-#endif // EIGEN_ARCH_ARM64
+#endif // EIGEN_ARCH_ARM64 
 
 } // end namespace internal
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/TypeCasting.h
deleted file mode 100644
index 68d24dc5c..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/TypeCasting.h
+++ /dev/null
@@ -1,278 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_TYPE_CASTING_NEON_H
-#define EIGEN_TYPE_CASTING_NEON_H
-
-namespace Eigen {
-
-namespace internal {
-
-template<> struct type_casting_traits<float,numext::int32_t>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-template<> struct type_casting_traits<float,numext::uint32_t>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-template<> struct type_casting_traits<numext::int8_t,numext::uint8_t>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-template<> struct type_casting_traits<numext::uint8_t,numext::int8_t>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-template<> struct type_casting_traits<numext::int16_t,numext::uint16_t>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-template<> struct type_casting_traits<numext::uint16_t,numext::int16_t>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-template<> struct type_casting_traits<numext::int32_t,float>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-template<> struct type_casting_traits<numext::int32_t,numext::uint32_t>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-template<> struct type_casting_traits<numext::uint32_t,float>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-template<> struct type_casting_traits<numext::uint32_t,numext::int32_t>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-template<> struct type_casting_traits<numext::int64_t,numext::uint64_t>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-template<> struct type_casting_traits<numext::uint64_t,numext::int64_t>
-{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; };
-
-template<> EIGEN_STRONG_INLINE Packet2f pcast<Packet2i,Packet2f>(const Packet2i& a) { return vcvt_f32_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2f pcast<Packet2ui,Packet2f>(const Packet2ui& a) { return vcvt_f32_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet2f pcast<Packet2l,Packet2f>(const Packet2l& a)
-{ return vcvt_f32_s32(vmovn_s64(a)); }
-template<> EIGEN_STRONG_INLINE Packet2f pcast<Packet2ul,Packet2f>(const Packet2ul& a)
-{ return vcvt_f32_u32(vmovn_u64(a)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4c,Packet4f>(const Packet4c& a)
-{ return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a)))))); }
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4uc,Packet4f>(const Packet4uc& a)
-{ return vcvtq_f32_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))))))); }
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4s,Packet4f>(const Packet4s& a)
-{ return vcvtq_f32_s32(vmovl_s16(a)); }
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4us,Packet4f>(const Packet4us& a)
-{ return vcvtq_f32_s32(vreinterpretq_s32_u32(vmovl_u16(a))); }
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4i,Packet4f>(const Packet4i& a) { return vcvtq_f32_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui,Packet4f>(const Packet4ui& a) { return vcvtq_f32_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet4c pcast<Packet4f,Packet4c>(const Packet4f& a)
-{
-  const int16x4_t b = vmovn_s32(vcvtq_s32_f32(a));
-  return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet4c pcast<Packet4uc,Packet4c>(const Packet4uc& a)
-{ return static_cast<Packet4c>(a); }
-template<> EIGEN_STRONG_INLINE Packet4c pcast<Packet4s,Packet4c>(const Packet4s& a)
-{ return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(a, a))), 0); }
-template<> EIGEN_STRONG_INLINE Packet4c pcast<Packet4us,Packet4c>(const Packet4us& a)
-{
-  const int16x4_t b = vreinterpret_s16_u16(a);
-  return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet4c pcast<Packet4i,Packet4c>(const Packet4i& a)
-{
-  const int16x4_t b = vmovn_s32(a);
-  return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet4c pcast<Packet4ui,Packet4c>(const Packet4ui& a)
-{
-  const int16x4_t b = vmovn_s32(vreinterpretq_s32_u32(a));
-  return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet8c pcast<Packet8uc,Packet8c>(const Packet8uc& a) { return vreinterpret_s8_u8(a); }
-template<> EIGEN_STRONG_INLINE Packet8c pcast<Packet8s,Packet8c>(const Packet8s& a) { return vmovn_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet8c pcast<Packet8us,Packet8c>(const Packet8us& a)
-{ return vreinterpret_s8_u8(vmovn_u16(a)); }
-template<> EIGEN_STRONG_INLINE Packet16c pcast<Packet16uc,Packet16c>(const Packet16uc& a)
-{ return vreinterpretq_s8_u8(a); }
-template<> EIGEN_STRONG_INLINE Packet4uc pcast<Packet4f,Packet4uc>(const Packet4f& a)
-{
-  const uint16x4_t b = vmovn_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(a)));
-  return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet4uc pcast<Packet4i,Packet4uc>(const Packet4i& a)
-{
-  const uint16x4_t b = vmovn_u32(vreinterpretq_u32_s32(a));
-  return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet4uc pcast<Packet4ui,Packet4uc>(const Packet4ui& a)
-{
-  const uint16x4_t b = vmovn_u32(a);
-  return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet4uc pcast<Packet4c,Packet4uc>(const Packet4c& a)
-{ return static_cast<Packet4uc>(a); }
-template<> EIGEN_STRONG_INLINE Packet4uc pcast<Packet4s,Packet4uc>(const Packet4s& a)
-{
-  const uint16x4_t b = vreinterpret_u16_s16(a);
-  return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0);
-}
-template<> EIGEN_STRONG_INLINE Packet4uc pcast<Packet4us,Packet4uc>(const Packet4us& a)
-{ return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(a, a))), 0); }
-template<> EIGEN_STRONG_INLINE Packet8uc pcast<Packet8c,Packet8uc>(const Packet8c& a) { return vreinterpret_u8_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet8uc pcast<Packet8s,Packet8uc>(const Packet8s& a)
-{ return vreinterpret_u8_s8(vmovn_s16(a)); }
-template<> EIGEN_STRONG_INLINE Packet8uc pcast<Packet8us,Packet8uc>(const Packet8us& a) { return vmovn_u16(a); }
-template<> EIGEN_STRONG_INLINE Packet16uc pcast<Packet16c,Packet16uc>(const Packet16c& a)
-{ return vreinterpretq_u8_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet4s pcast<Packet4f,Packet4s>(const Packet4f& a)
-{ return vmovn_s32(vcvtq_s32_f32(a)); }
-template<> EIGEN_STRONG_INLINE Packet4s pcast<Packet4c,Packet4s>(const Packet4c& a)
-{ return vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a)))); }
-template<> EIGEN_STRONG_INLINE Packet4s pcast<Packet4uc,Packet4s>(const Packet4uc& a)
-{ return vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))))); }
-template<> EIGEN_STRONG_INLINE Packet4s pcast<Packet4us,Packet4s>(const Packet4us& a)
-{ return vreinterpret_s16_u16(a); }
-template<> EIGEN_STRONG_INLINE Packet4s pcast<Packet4i,Packet4s>(const Packet4i& a) { return vmovn_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet4s pcast<Packet4ui,Packet4s>(const Packet4ui& a)
-{ return vmovn_s32(vreinterpretq_s32_u32(a)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcast<Packet8uc,Packet8s>(const Packet8uc& a)
-{ return vreinterpretq_s16_u16(vmovl_u8(a)); }
-template<> EIGEN_STRONG_INLINE Packet8s pcast<Packet8c,Packet8s>(const Packet8c& a) { return vmovl_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet8s pcast<Packet8us,Packet8s>(const Packet8us& a)
-{ return vreinterpretq_s16_u16(a); }
-template<> EIGEN_STRONG_INLINE Packet4us pcast<Packet4f,Packet4us>(const Packet4f& a)
-{ return vmovn_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(a))); }
-template<> EIGEN_STRONG_INLINE Packet4us pcast<Packet4c,Packet4us>(const Packet4c& a)
-{ return vget_low_u16(vreinterpretq_u16_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a))))); }
-template<> EIGEN_STRONG_INLINE Packet4us pcast<Packet4uc,Packet4us>(const Packet4uc& a)
-{ return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a)))); }
-template<> EIGEN_STRONG_INLINE Packet4us pcast<Packet4s,Packet4us>(const Packet4s& a)
-{ return vreinterpret_u16_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet4us pcast<Packet4i,Packet4us>(const Packet4i& a)
-{ return vmovn_u32(vreinterpretq_u32_s32(a)); }
-template<> EIGEN_STRONG_INLINE Packet4us pcast<Packet4ui,Packet4us>(const Packet4ui& a) { return vmovn_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8c,Packet8us>(const Packet8c& a)
-{ return vreinterpretq_u16_s16(vmovl_s8(a)); }
-template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8uc,Packet8us>(const Packet8uc& a) { return vmovl_u8(a); }
-template<> EIGEN_STRONG_INLINE Packet8us pcast<Packet8s,Packet8us>(const Packet8s& a)
-{ return vreinterpretq_u16_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet2i pcast<Packet2f,Packet2i>(const Packet2f& a) { return vcvt_s32_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet2i pcast<Packet2ui,Packet2i>(const Packet2ui& a)
-{ return vreinterpret_s32_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet2i pcast<Packet2l,Packet2i>(const Packet2l& a)
-{ return vmovn_s64(a); }
-template<> EIGEN_STRONG_INLINE Packet2i pcast<Packet2ul,Packet2i>(const Packet2ul& a)
-{ return vmovn_s64(vreinterpretq_s64_u64(a)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4f,Packet4i>(const Packet4f& a) { return vcvtq_s32_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4c,Packet4i>(const Packet4c& a)
-{ return vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a))))); }
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4uc,Packet4i>(const Packet4uc& a)
-{ return vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a)))))); }
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4s,Packet4i>(const Packet4s& a) { return vmovl_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4us,Packet4i>(const Packet4us& a)
-{ return vreinterpretq_s32_u32(vmovl_u16(a)); }
-template<> EIGEN_STRONG_INLINE Packet4i pcast<Packet4ui,Packet4i>(const Packet4ui& a)
-{ return vreinterpretq_s32_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcast<Packet2f,Packet2ui>(const Packet2f& a) { return vcvt_u32_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcast<Packet2i,Packet2ui>(const Packet2i& a)
-{ return vreinterpret_u32_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcast<Packet2l,Packet2ui>(const Packet2l& a)
-{ return vmovn_u64(vreinterpretq_u64_s64(a)); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcast<Packet2ul,Packet2ui>(const Packet2ul& a)
-{ return vmovn_u64(a); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f,Packet4ui>(const Packet4f& a) { return vcvtq_u32_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4c,Packet4ui>(const Packet4c& a)
-{ return vreinterpretq_u32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a)))))); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4uc,Packet4ui>(const Packet4uc& a)
-{ return vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))))); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4s,Packet4ui>(const Packet4s& a)
-{ return vreinterpretq_u32_s32(vmovl_s16(a)); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4us,Packet4ui>(const Packet4us& a) { return vmovl_u16(a); }
-template<> EIGEN_STRONG_INLINE Packet4ui pcast<Packet4i,Packet4ui>(const Packet4i& a)
-{ return vreinterpretq_u32_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2l pcast<Packet2f,Packet2l>(const Packet2f& a)
-{ return vmovl_s32(vcvt_s32_f32(a)); }
-template<> EIGEN_STRONG_INLINE Packet2l pcast<Packet2i,Packet2l>(const Packet2i& a)
-{ return vmovl_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2l pcast<Packet2ui,Packet2l>(const Packet2ui& a)
-{ return vreinterpretq_s64_u64(vmovl_u32(a)); }
-template<> EIGEN_STRONG_INLINE Packet2l pcast<Packet2ul,Packet2l>(const Packet2ul& a)
-{ return vreinterpretq_s64_u64(a); }
-template<> EIGEN_STRONG_INLINE Packet2ul pcast<Packet2f,Packet2ul>(const Packet2f& a)
-{ return vmovl_u32(vcvt_u32_f32(a)); }
-template<> EIGEN_STRONG_INLINE Packet2ul pcast<Packet2i,Packet2ul>(const Packet2i& a)
-{ return vreinterpretq_u64_s64(vmovl_s32(a)); }
-template<> EIGEN_STRONG_INLINE Packet2ul pcast<Packet2ui,Packet2ul>(const Packet2ui& a)
-{ return vmovl_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet2ul pcast<Packet2l,Packet2ul>(const Packet2l& a)
-{ return vreinterpretq_u64_s64(a); }
-
-template<> EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f,Packet2i>(const Packet2i& a)
-{ return vreinterpret_f32_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f,Packet2ui>(const Packet2ui& a)
-{ return vreinterpret_f32_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a)
-{ return vreinterpretq_f32_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4ui>(const Packet4ui& a)
-{ return vreinterpretq_f32_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c,Packet4uc>(const Packet4uc& a)
-{ return static_cast<Packet4c>(a); }
-template<> EIGEN_STRONG_INLINE Packet8c preinterpret<Packet8c,Packet8uc>(const Packet8uc& a)
-{ return vreinterpret_s8_u8(a); }
-template<> EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c,Packet16uc>(const Packet16uc& a)
-{ return vreinterpretq_s8_u8(a); }
-template<> EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc,Packet4c>(const Packet4c& a)
-{ return static_cast<Packet4uc>(a); }
-template<> EIGEN_STRONG_INLINE Packet8uc preinterpret<Packet8uc,Packet8c>(const Packet8c& a)
-{ return vreinterpret_u8_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc,Packet16c>(const Packet16c& a)
-{ return vreinterpretq_u8_s8(a); }
-template<> EIGEN_STRONG_INLINE Packet4s preinterpret<Packet4s,Packet4us>(const Packet4us& a)
-{ return vreinterpret_s16_u16(a); }
-template<> EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s,Packet8us>(const Packet8us& a)
-{ return vreinterpretq_s16_u16(a); }
-template<> EIGEN_STRONG_INLINE Packet4us preinterpret<Packet4us,Packet4s>(const Packet4s& a)
-{ return vreinterpret_u16_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us,Packet8s>(const Packet8s& a)
-{ return vreinterpretq_u16_s16(a); }
-template<> EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i,Packet2f>(const Packet2f& a)
-{ return vreinterpret_s32_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i,Packet2ui>(const Packet2ui& a)
-{ return vreinterpret_s32_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a)
-{ return vreinterpretq_s32_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4ui>(const Packet4ui& a)
-{ return vreinterpretq_s32_u32(a); }
-template<> EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui,Packet2f>(const Packet2f& a)
-{ return vreinterpret_u32_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui,Packet2i>(const Packet2i& a)
-{ return vreinterpret_u32_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui,Packet4f>(const Packet4f& a)
-{ return vreinterpretq_u32_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui,Packet4i>(const Packet4i& a)
-{ return vreinterpretq_u32_s32(a); }
-template<> EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l,Packet2ul>(const Packet2ul& a)
-{ return vreinterpretq_s64_u64(a); }
-template<> EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul,Packet2l>(const Packet2l& a)
-{ return vreinterpretq_u64_s64(a); }
-
-#if EIGEN_ARCH_ARM64
-
-template<> EIGEN_STRONG_INLINE Packet2f pcast<Packet2d,Packet2f>(const Packet2d& a) { return vcvt_f32_f64(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet2f,Packet2d>(const Packet2f& a) { return vcvt_f64_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet2i,Packet2d>(const Packet2i& a) { return vcvtq_f64_s64(vmovl_s32(a)); }
-template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet2ui,Packet2d>(const Packet2ui& a) { return vcvtq_f64_u64(vmovl_u32(a)); }
-template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet2l,Packet2d>(const Packet2l& a) { return vcvtq_f64_s64(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet2ul,Packet2d>(const Packet2ul& a) { return vcvtq_f64_u64(a); }
-template<> EIGEN_STRONG_INLINE Packet2i pcast<Packet2d,Packet2i>(const Packet2d& a) { return vcvt_s32_f32(vcvt_f32_f64(a)); }
-template<> EIGEN_STRONG_INLINE Packet2ui pcast<Packet2d,Packet2ui>(const Packet2d& a) { return vcvt_u32_f32(vcvt_f32_f64(a)); }
-template<> EIGEN_STRONG_INLINE Packet2l pcast<Packet2d,Packet2l>(const Packet2d& a) { return vcvtq_s64_f64(a); }
-template<> EIGEN_STRONG_INLINE Packet2ul pcast<Packet2d,Packet2ul>(const Packet2d& a) { return vcvtq_u64_f64(a); }
-
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d,Packet2l>(const Packet2l& a)
-{ return vreinterpretq_f64_s64(a); }
-template<> EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d,Packet2ul>(const Packet2ul& a)
-{ return vreinterpretq_f64_u64(a); }
-template<> EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l,Packet2d>(const Packet2d& a)
-{ return vreinterpretq_s64_f64(a); }
-template<> EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul,Packet2d>(const Packet2d& a)
-{ return vreinterpretq_u64_f64(a); }
-
-#endif // EIGEN_ARCH_ARM64
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_TYPE_CASTING_NEON_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/Complex.h
index 8bf8bfe85..d075043ce 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/Complex.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/Complex.h
@@ -45,13 +45,12 @@ template<> struct packet_traits<std::complex<float> >  : default_packet_traits
     HasMin    = 0,
     HasMax    = 0,
     HasSetLinear = 0,
-    HasBlend  = 1,
-    HasInsert = 1
+    HasBlend = 1
   };
 };
 #endif
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
 
 template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
@@ -83,13 +82,10 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, con
   #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf ptrue  <Packet2cf>(const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnot   <Packet2cf>(const Packet2cf& a) { return Packet2cf(pnot(Packet4f(a.v))); }
-
 template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
 
 template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
 template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
@@ -156,11 +152,29 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packe
   return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v)));
+}
+
 template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
 {
   return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
 }
 
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset==1)
+    {
+      first.v = _mm_movehl_ps(first.v, first.v);
+      first.v = _mm_movelh_ps(first.v, second.v);
+    }
+  }
+};
+
 template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
 {
   EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
@@ -266,7 +280,7 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
 };
 #endif
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
 
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
@@ -291,12 +305,10 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
   #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd ptrue  <Packet1cd>(const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnot   <Packet1cd>(const Packet1cd& a) { return Packet1cd(pnot(Packet2d(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
 
 // FIXME force unaligned load, this is a temporary fix
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
@@ -328,11 +340,26 @@ template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Pack
   return pfirst(a);
 }
 
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
+{
+  return vecs[0];
+}
+
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
 {
   return pfirst(a);
 }
 
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // FIXME: are we sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> occupies 16 bytes, it is not necessarily aligned on a 16-byte boundary...
+  }
+};
+
 template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
 {
   EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
@@ -412,23 +439,31 @@ ptranspose(PacketBlock<Packet2cf,2>& kernel) {
   kernel.packet[1].v = tmp;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b)
-{
-  __m128 eq = _mm_cmpeq_ps(a.v, b.v);
-  return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b)
-{
-  __m128d eq = _mm_cmpeq_pd(a.v, b.v);
-  return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
-}
-
 template<>  EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
   __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
   return Packet2cf(_mm_castpd_ps(result));
 }
 
+template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex<float> b)
+{
+  return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast<const __m64*>(&b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex<double> b)
+{
+  return pset1<Packet1cd>(b);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex<float> b)
+{
+  return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast<const __m64*>(&b)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex<double> b)
+{
+  return pset1<Packet1cd>(b);
+}
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/MathFunctions.h
index 92c1eecc7..7b5f948e1 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -8,7 +8,7 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* The sin and cos and functions of this file come from
+/* The sin, cos, exp, and log functions of this file come from
  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
  */
 
@@ -20,42 +20,426 @@ namespace Eigen {
 namespace internal {
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f plog<Packet4f>(const Packet4f& _x) {
-  return plog_float(_x);
-}
+Packet4f plog<Packet4f>(const Packet4f& _x)
+{
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f plog1p<Packet4f>(const Packet4f& _x) {
-  return generic_plog1p(_x);
-}
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexpm1<Packet4f>(const Packet4f& _x) {
-  return generic_expm1(_x);
+  /* the smallest non denormalized float number */
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000);//-1.f/0.f);
+
+  /* natural logarithm computed for 4 simultaneous float
+    returns NaN for x < 0 (and for NaN input), -INF for x == 0
+  */
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+
+  Packet4i emm0;
+
+  Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN
+  Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
+
+  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
+  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
+
+  /* keep only the fractional part */
+  x = _mm_and_ps(x, p4f_inv_mant_mask);
+  x = _mm_or_ps(x, p4f_half);
+
+  emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
+  Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1);
+
+  /* part2:
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
+  Packet4f tmp = pand(x, mask);
+  x = psub(x, p4f_1);
+  e = psub(e, pand(p4f_1, mask));
+  x = padd(x, tmp);
+
+  Packet4f x2 = pmul(x,x);
+  Packet4f x3 = pmul(x2,x);
+
+  Packet4f y, y1, y2;
+  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+  y  = pmadd(y , x, p4f_cephes_log_p2);
+  y1 = pmadd(y1, x, p4f_cephes_log_p5);
+  y2 = pmadd(y2, x, p4f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  y1 = pmul(e, p4f_cephes_log_q1);
+  tmp = pmul(x2, p4f_half);
+  y = padd(y, y1);
+  x = psub(x, tmp);
+  y2 = pmul(e, p4f_cephes_log_q2);
+  x = padd(x, y);
+  x = padd(x, y2);
+  // negative arg will be NAN, 0 will be -INF
+  return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
+                   _mm_and_ps(iszero_mask, p4f_minus_inf));
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f pexp<Packet4f>(const Packet4f& _x)
 {
-  return pexp_float(_x);
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+
+
+  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
+  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+
+  Packet4f tmp, fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
+
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  fx = _mm_floor_ps(fx);
+#else
+  emm0 = _mm_cvttps_epi32(fx);
+  tmp  = _mm_cvtepi32_ps(emm0);
+  /* if greater, subtract 1 */
+  Packet4f mask = _mm_cmpgt_ps(tmp, fx);
+  mask = _mm_and_ps(mask, p4f_1);
+  fx = psub(tmp, mask);
+#endif
+
+  tmp = pmul(fx, p4f_cephes_exp_C1);
+  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  z = pmul(x,x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // build 2^n
+  emm0 = _mm_cvttps_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, p4i_0x7f);
+  emm0 = _mm_slli_epi32(emm0, 23);
+  return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
+}
+template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
+Packet2d pexp<Packet2d>(const Packet2d& _x)
+{
+  Packet2d x = _x;
+
+  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
+  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
+  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
+  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+  static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
+
+  Packet2d tmp, fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  fx = _mm_floor_pd(fx);
+#else
+  emm0 = _mm_cvttpd_epi32(fx);
+  tmp  = _mm_cvtepi32_pd(emm0);
+  /* if greater, subtract 1 */
+  Packet2d mask = _mm_cmpgt_pd(tmp, fx);
+  mask = _mm_and_pd(mask, p2d_1);
+  fx = psub(tmp, mask);
+#endif
+
+  tmp = pmul(fx, p2d_cephes_exp_C1);
+  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet2d x2 = pmul(x,x);
+
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul (px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px,psub(qx,px));
+  x = pmadd(p2d_2,x,p2d_1);
+
+  // build 2^n
+  emm0 = _mm_cvttpd_epi32(fx);
+  emm0 = _mm_add_epi32(emm0, p4i_1023_0);
+  emm0 = _mm_slli_epi32(emm0, 20);
+  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
+  return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
 }
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& x)
-{
-  return pexp_double(x);
-}
+/* evaluation of 4 sines at once, using SSE2 intrinsics.
+
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+*/
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psin<Packet4f>(const Packet4f& _x)
 {
-  return psin_float(_x);
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+
+  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
+  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
+  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
+  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
+
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
+
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
+
+  Packet4f xmm1, xmm2, xmm3, sign_bit, y;
+
+  Packet4i emm0, emm2;
+  sign_bit = x;
+  /* take the absolute value */
+  x = pabs(x);
+
+  /* take the modulo */
+
+  /* extract the sign bit (upper one) */
+  sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
+
+  /* scale by 4/Pi */
+  y = pmul(x, p4f_cephes_FOPI);
+
+  /* store the integer part of y in emm2 */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, p4i_1);
+  emm2 = _mm_and_si128(emm2, p4i_not1);
+  y = _mm_cvtepi32_ps(emm2);
+  /* get the swap sign flag */
+  emm0 = _mm_and_si128(emm2, p4i_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+
+     Both branches will be computed.
+  */
+  emm2 = _mm_and_si128(emm2, p4i_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+
+  Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
+  Packet4f poly_mask = _mm_castsi128_ps(emm2);
+  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = pmul(y, p4f_minus_cephes_DP1);
+  xmm2 = pmul(y, p4f_minus_cephes_DP2);
+  xmm3 = pmul(y, p4f_minus_cephes_DP3);
+  x = padd(x, xmm1);
+  x = padd(x, xmm2);
+  x = padd(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = p4f_coscof_p0;
+  Packet4f z = _mm_mul_ps(x,x);
+
+  y = pmadd(y, z, p4f_coscof_p1);
+  y = pmadd(y, z, p4f_coscof_p2);
+  y = pmul(y, z);
+  y = pmul(y, z);
+  Packet4f tmp = pmul(z, p4f_half);
+  y = psub(y, tmp);
+  y = padd(y, p4f_1);
+
+  /* Evaluate the second polynom (the one built from the sincof_p* coefficients) */
+
+  Packet4f y2 = p4f_sincof_p0;
+  y2 = pmadd(y2, z, p4f_sincof_p1);
+  y2 = pmadd(y2, z, p4f_sincof_p2);
+  y2 = pmul(y2, z);
+  y2 = pmul(y2, x);
+  y2 = padd(y2, x);
+
+  /* select the correct result from the two polynoms */
+  y2 = _mm_and_ps(poly_mask, y2);
+  y = _mm_andnot_ps(poly_mask, y);
+  y = _mm_or_ps(y,y2);
+  /* update the sign */
+  return _mm_xor_ps(y, sign_bit);
 }
 
+/* almost the same as psin */
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f pcos<Packet4f>(const Packet4f& _x)
 {
-  return pcos_float(_x);
+  Packet4f x = _x;
+  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
+  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+
+  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
+  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
+  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
+  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
+
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
+  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
+  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
+  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
+
+  Packet4f xmm1, xmm2, xmm3, y;
+  Packet4i emm0, emm2;
+
+  x = pabs(x);
+
+  /* scale by 4/Pi */
+  y = pmul(x, p4f_cephes_FOPI);
+
+  /* get the integer part of y */
+  emm2 = _mm_cvttps_epi32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = _mm_add_epi32(emm2, p4i_1);
+  emm2 = _mm_and_si128(emm2, p4i_not1);
+  y = _mm_cvtepi32_ps(emm2);
+
+  emm2 = _mm_sub_epi32(emm2, p4i_2);
+
+  /* get the swap sign flag */
+  emm0 = _mm_andnot_si128(emm2, p4i_4);
+  emm0 = _mm_slli_epi32(emm0, 29);
+  /* get the polynom selection mask */
+  emm2 = _mm_and_si128(emm2, p4i_2);
+  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
+
+  Packet4f sign_bit = _mm_castsi128_ps(emm0);
+  Packet4f poly_mask = _mm_castsi128_ps(emm2);
+
+  /* The magic pass: "Extended precision modular arithmetic"
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = pmul(y, p4f_minus_cephes_DP1);
+  xmm2 = pmul(y, p4f_minus_cephes_DP2);
+  xmm3 = pmul(y, p4f_minus_cephes_DP3);
+  x = padd(x, xmm1);
+  x = padd(x, xmm2);
+  x = padd(x, xmm3);
+
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
+  y = p4f_coscof_p0;
+  Packet4f z = pmul(x,x);
+
+  y = pmadd(y,z,p4f_coscof_p1);
+  y = pmadd(y,z,p4f_coscof_p2);
+  y = pmul(y, z);
+  y = pmul(y, z);
+  Packet4f tmp = _mm_mul_ps(z, p4f_half);
+  y = psub(y, tmp);
+  y = padd(y, p4f_1);
+
+  /* Evaluate the second polynom (the one built from the sincof_p* coefficients) */
+  Packet4f y2 = p4f_sincof_p0;
+  y2 = pmadd(y2, z, p4f_sincof_p1);
+  y2 = pmadd(y2, z, p4f_sincof_p2);
+  y2 = pmul(y2, z);
+  y2 = pmadd(y2, x, x);
+
+  /* select the correct result from the two polynoms */
+  y2 = _mm_and_ps(poly_mask, y2);
+  y  = _mm_andnot_ps(poly_mask, y);
+  y  = _mm_or_ps(y,y2);
+
+  /* update the sign */
+  return _mm_xor_ps(y, sign_bit);
 }
 
 #if EIGEN_FAST_MATH
@@ -98,34 +482,30 @@ Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& _x) {
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000);
   _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f);
   _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u);
+  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000);
 
   Packet4f neg_half = pmul(_x, p4f_minus_half);
 
-  // Identity infinite, zero, negative and denormal arguments.
-  Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min);
-  Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf);
-  Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask);
+  // select only the inverse sqrt of positive normal inputs (denormals are
+  // flushed to zero and cause infs as well).
+  Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min);
+  Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x));
 
-  // Compute an approximate result using the rsqrt intrinsic.
-  Packet4f y_approx = _mm_rsqrt_ps(_x);
+  // Fill in NaNs and Infs for the negative/zero entries.
+  Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps());
+  Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask);
+  Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan),
+                                     _mm_and_ps(zero_mask, p4f_inf));
 
-  // Do a single step of Newton-Raphson iteration to improve the approximation.
-  // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n).
-  // It is essential to evaluate the inner term like this because forming
-  // y_n^2 may over- or underflow.
-  Packet4f y_newton = pmul(
-      y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five));
+  // Do a single step of Newton's iteration.
+  x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five));
 
-  // Select the result of the Newton-Raphson step for positive normal arguments.
-  // For other arguments, choose the output of the intrinsic. This will
-  // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if
-  // x is zero or a positive denormalized float (equivalent to flushing positive
-  // denormalized inputs to zero).
-  return pselect<Packet4f>(not_normal_finite_mask, y_approx, y_newton);
+  // Insert NaNs and Infs in all the right places.
+  return _mm_or_ps(x, infs_and_nans);
 }
 
 #else
@@ -168,7 +548,7 @@ double sqrt(const double &x)
 {
 #if EIGEN_COMP_GNUC_STRICT
   // This works around a GCC bug generating poor code for _mm_sqrt_pd
-  // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970
+  // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b
   return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
 #else
   return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/PacketMath.h
index 645aee0cd..60e2517e4 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -18,13 +18,11 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
-#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
-// 32 bits =>  8 registers
-// 64 bits => 16 registers
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
 #endif
 
-#ifdef EIGEN_VECTORIZE_FMA
+#ifdef __FMA__
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1
 #endif
@@ -36,37 +34,47 @@ namespace internal {
 // One solution is to increase ABI version using -fabi-version=4 (or greater).
 // Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper
 // structure:
+template<typename T>
+struct eigen_packet_wrapper
+{
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) {
+    m_val = v;
+    return *this;
+  }
+  
+  T m_val;
+};
 typedef eigen_packet_wrapper<__m128>  Packet4f;
+typedef eigen_packet_wrapper<__m128i> Packet4i;
 typedef eigen_packet_wrapper<__m128d> Packet2d;
 #else
 typedef __m128  Packet4f;
+typedef __m128i Packet4i;
 typedef __m128d Packet2d;
 #endif
 
-typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
-typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
-
 template<> struct is_arithmetic<__m128>  { enum { value = true }; };
 template<> struct is_arithmetic<__m128i> { enum { value = true }; };
 template<> struct is_arithmetic<__m128d> { enum { value = true }; };
-template<> struct is_arithmetic<Packet16b>  { enum { value = true }; };
-
-#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p))
 
 #define vec4f_swizzle1(v,p,q,r,s) \
-  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))))
+  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
 
 #define vec4i_swizzle1(v,p,q,r,s) \
-  (_mm_shuffle_epi32( v, EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
+  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
 
 #define vec2d_swizzle1(v,p,q) \
-  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), EIGEN_SSE_SHUFFLE_MASK(2*p,2*p+1,2*q,2*q+1))))
-
+  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
+  
 #define vec4f_swizzle2(a,b,p,q,r,s) \
-  (_mm_shuffle_ps( (a), (b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))
+  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
 
 #define vec4i_swizzle2(a,b,p,q,r,s) \
-  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))))
+  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
 
 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
   const Packet4f p4f_##NAME = pset1<Packet4f>(X)
@@ -75,7 +83,7 @@ template<> struct is_arithmetic<Packet16b>  { enum { value = true }; };
   const Packet2d p2d_##NAME = pset1<Packet2d>(X)
 
 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
+  const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
 
 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
   const Packet4i p4i_##NAME = pset1<Packet4i>(X)
@@ -84,43 +92,36 @@ template<> struct is_arithmetic<Packet16b>  { enum { value = true }; };
 // Use the packet_traits defined in AVX/PacketMath.h instead if we're going
 // to leverage AVX instructions.
 #ifndef EIGEN_VECTORIZE_AVX
-template <>
-struct packet_traits<float> : default_packet_traits {
+template<> struct packet_traits<float>  : default_packet_traits
+{
   typedef Packet4f type;
   typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size = 4,
+    size=4,
     HasHalfPacket = 0,
 
-    HasDiv = 1,
-    HasSin = EIGEN_FAST_MATH,
-    HasCos = EIGEN_FAST_MATH,
-    HasLog = 1,
-    HasLog1p = 1,
-    HasExpm1 = 1,
-    HasNdtri = 1,
-    HasExp = 1,
-    HasBessel = 1,
+    HasDiv  = 1,
+    HasSin  = EIGEN_FAST_MATH,
+    HasCos  = EIGEN_FAST_MATH,
+    HasLog  = 1,
+    HasExp  = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasTanh = EIGEN_FAST_MATH,
-    HasErf = EIGEN_FAST_MATH,
-    HasBlend = 1,
-    HasInsert = 1,
-    HasFloor = 1
+    HasTanh  = EIGEN_FAST_MATH,
+    HasBlend = 1
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
     ,
-    HasRint = 1,
     HasRound = 1,
+    HasFloor = 1,
     HasCeil = 1
 #endif
   };
 };
-template <>
-struct packet_traits<double> : default_packet_traits {
+template<> struct packet_traits<double> : default_packet_traits
+{
   typedef Packet2d type;
   typedef Packet2d half;
   enum {
@@ -133,13 +134,11 @@ struct packet_traits<double> : default_packet_traits {
     HasExp  = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasBlend = 1,
-    HasInsert = 1
+    HasBlend = 1
 
 #ifdef EIGEN_VECTORIZE_SSE4_1
     ,
     HasRound = 1,
-    HasRint = 1,
     HasFloor = 1,
     HasCeil = 1
 #endif
@@ -155,55 +154,13 @@ template<> struct packet_traits<int>    : default_packet_traits
     AlignedOnScalar = 1,
     size=4,
 
-    HasShift = 1,
     HasBlend = 1
   };
 };
 
-template<> struct packet_traits<bool> : default_packet_traits
-{
-  typedef Packet16b type;
-  typedef Packet16b half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    HasHalfPacket = 0,
-    size=16,
-
-    HasAdd       = 1,
-    HasSub       = 0,
-    HasShift     = 0,
-    HasMul       = 1,
-    HasNegate    = 0,
-    HasAbs       = 0,
-    HasAbs2      = 0,
-    HasMin       = 0,
-    HasMax       = 0,
-    HasConj      = 0
-  };
-};
-
-template<> struct unpacket_traits<Packet4f> {
-  typedef float     type;
-  typedef Packet4f  half;
-  typedef Packet4i  integer_packet;
-  enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
-};
-template<> struct unpacket_traits<Packet2d> {
-  typedef double    type;
-  typedef Packet2d  half;
-  enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
-};
-template<> struct unpacket_traits<Packet4i> {
-  typedef int       type;
-  typedef Packet4i  half;
-  enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false};
-};
-template<> struct unpacket_traits<Packet16b> {
-  typedef bool       type;
-  typedef Packet16b  half;
-  enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false};
-};
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
 
 #ifndef EIGEN_VECTORIZE_AVX
 template<> struct scalar_div_cost<float,true> { enum { value = 7 }; };
@@ -222,13 +179,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { re
 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
 #endif
-template<> EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool&    from) { return _mm_set1_epi8(static_cast<char>(from)); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) { return _mm_castsi128_ps(pset1<Packet4i>(from)); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); }
-template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); }
-template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); }
 
 // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
 // However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
@@ -240,7 +190,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float *from) {
   return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0);
 }
 #endif
-
+  
 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
@@ -249,8 +199,6 @@ template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const
 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
-
 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
@@ -292,62 +240,18 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
-
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
 
 // for some weird raisons, it has to be overloaded for packet of integers
 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
-#ifdef EIGEN_VECTORIZE_FMA
+#ifdef __FMA__
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); }
 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); }
 #endif
 
-#ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {  return _mm_blendv_ps(b,a,mask); }
-
-template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {  return _mm_blendv_pd(b,a,mask); }
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
-  // There appears to be a bug in GCC, by which the optimizer may
-  // flip the argument order in calls to _mm_min_ps, so we have to
-  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
-  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
-  #ifdef EIGEN_VECTORIZE_AVX
-  Packet4f res;
-  asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
-  #else
-  Packet4f res = b;
-  asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
-  #endif
-  return res;
-#else
-  // Arguments are reversed to match NaN propagation behavior of std::min.
-  return _mm_min_ps(b, a);
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
-  // There appears to be a bug in GCC, by which the optimizer may
-  // flip the argument order in calls to _mm_min_pd, so we have to
-  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
-  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
-  #ifdef EIGEN_VECTORIZE_AVX
-  Packet2d res;
-  asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
-  #else
-  Packet2d res = b;
-  asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
-  #endif
-  return res;
-#else
-  // Arguments are reversed to match NaN propagation behavior of std::min.
-  return _mm_min_pd(b, a);
-#endif
-}
+template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
@@ -359,44 +263,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
-  // There appears to be a bug in GCC, by which the optimizer may
-  // flip the argument order in calls to _mm_max_ps, so we have to
-  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
-  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
-  #ifdef EIGEN_VECTORIZE_AVX
-  Packet4f res;
-  asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
-  #else
-  Packet4f res = b;
-  asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
-  #endif
-  return res;
-#else
-  // Arguments are reversed to match NaN propagation behavior of std::max.
-  return _mm_max_ps(b, a);
-#endif
-}
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
-#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63
-  // There appears to be a bug in GCC, by which the optimizer may
-  // flip the argument order in calls to _mm_max_pd, so we have to
-  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
-  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
-  #ifdef EIGEN_VECTORIZE_AVX
-  Packet2d res;
-  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b));
-  #else
-  Packet2d res = b;
-  asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a));
-  #endif
-  return res;
-#else
-  // Arguments are reversed to match NaN propagation behavior of std::max.
-  return _mm_max_pd(b, a);
-#endif
-}
+template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
 {
 #ifdef EIGEN_VECTORIZE_SSE4_1
@@ -408,112 +276,36 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const
 #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); }
-
-
-template<> EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); }
-template<> EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); }
-template<> EIGEN_STRONG_INLINE Packet4f
-ptrue<Packet4f>(const Packet4f& a) {
-  Packet4i b = _mm_castps_si128(a);
-  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
-}
-template<> EIGEN_STRONG_INLINE Packet2d
-ptrue<Packet2d>(const Packet2d& a) {
-  Packet4i b = _mm_castpd_si128(a);
-  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
-}
-
-
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
-template<> EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); }
-
-template<int N> EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return _mm_srai_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) { return _mm_srli_epi32(a,N); }
-template<int N> EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return _mm_slli_epi32(a,N); }
-
 #ifdef EIGEN_VECTORIZE_SSE4_1
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
-{
-  // Unfortunatly _mm_round_ps doesn't have a rounding mode to implement numext::round.
-  const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
-  const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
-  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a)
-{
-  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
-  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
-  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); }
-template<> EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); }
+template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return _mm_round_ps(a, 0); }
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return _mm_round_pd(a, 0); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return _mm_ceil_ps(a); }
 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return _mm_ceil_pd(a); }
 
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return _mm_floor_ps(a); }
 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return _mm_floor_pd(a); }
-#else
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
-{
-  const Packet4f cst_1 = pset1<Packet4f>(1.0f);
-  Packet4i emm0 = _mm_cvttps_epi32(a);
-  Packet4f tmp  = _mm_cvtepi32_ps(emm0);
-  /* if greater, substract 1 */
-  Packet4f mask = _mm_cmpgt_ps(tmp, a);
-  mask = pand(mask, cst_1);
-  return psub(tmp, mask);
-}
-
-// WARNING: this pfloor implementation makes sense for small inputs only,
-// It is currently only used by pexp and not exposed through HasFloor.
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a)
-{
-  const Packet2d cst_1 = pset1<Packet2d>(1.0);
-  Packet4i emm0 = _mm_cvttpd_epi32(a);
-  Packet2d tmp  = _mm_cvtepi32_pd(emm0);
-  /* if greater, substract 1 */
-  Packet2d mask = _mm_cmpgt_pd(tmp, a);
-  mask = pand(mask, cst_1);
-  return psub(tmp, mask);
-}
 #endif
 
+template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
+
+template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
+
 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
-template<> EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool*     from) { EIGEN_DEBUG_ALIGNED_LOAD return  _mm_load_si128(reinterpret_cast<const __m128i*>(from)); }
 
 #if EIGEN_COMP_MSVC
   template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
@@ -548,10 +340,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
   EIGEN_DEBUG_UNALIGNED_LOAD
   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
 }
-template<> EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool*     from) {
-  EIGEN_DEBUG_UNALIGNED_LOAD
-  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
-}
 
 
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
@@ -570,12 +358,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
 template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstore<bool>(bool*     to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); }
 
 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); }
 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<bool>(bool*     to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); }
 
 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
@@ -623,7 +409,7 @@ template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double&
   pstore(to, Packet2d(vec2d_swizzle1(pa,0,0)));
 }
 
-#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
+#if EIGEN_COMP_PGI
 typedef const void * SsePrefetchPtrType;
 #else
 typedef const char * SsePrefetchPtrType;
@@ -650,7 +436,6 @@ template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
 template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
-template<> EIGEN_STRONG_INLINE bool   pfirst<Packet16b>(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast<bool>(x & 1); }
 #endif
 
 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
@@ -680,23 +465,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
   #endif
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
-  return pfrexp_float(a,exponent);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
-  return pldexp_float(a,exponent);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
-  const Packet4i cst_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
-  Packet4i emm0 = _mm_cvttpd_epi32(exponent);
-  emm0 = padd(emm0, cst_1023_0);
-  emm0 = _mm_slli_epi32(emm0, 20);
-  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
-  return pmul(a, Packet2d(_mm_castsi128_pd(emm0)));
-}
-
 // with AVX, the default implementations based on pload1 are faster
 #ifndef __AVX__
 template<> EIGEN_STRONG_INLINE void
@@ -737,6 +505,38 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
   vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
 }
 
+#ifdef EIGEN_VECTORIZE_SSE3
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  return _mm_hadd_pd(vecs[0], vecs[1]);
+}
+
+#else
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  Packet4f tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
+  tmp0 = _mm_add_ps(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
+  tmp1 = _mm_add_ps(tmp1, tmp2);
+  tmp2 = _mm_movehl_ps(tmp1, tmp0);
+  tmp0 = _mm_movelh_ps(tmp0, tmp1);
+  return _mm_add_ps(tmp0, tmp2);
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
+}
+#endif  // SSE3
+
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
   // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures
@@ -762,28 +562,38 @@ template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
 }
 
 #ifdef EIGEN_VECTORIZE_SSSE3
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
+}
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i tmp0 = _mm_hadd_epi32(a,a);
   return pfirst<Packet4i>(_mm_hadd_epi32(tmp0,tmp0));
 }
-
 #else
 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
 {
   Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
   return pfirst(tmp) + pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1));
 }
-#endif
 
-template<> EIGEN_STRONG_INLINE bool predux<Packet16b>(const Packet16b& a) {
-Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a));
-  return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  Packet4i tmp0, tmp1, tmp2;
+  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
+  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
+  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
+  tmp0 = _mm_add_epi32(tmp0, tmp1);
+  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
+  tmp1 = _mm_add_epi32(tmp1, tmp2);
+  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
+  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
+  return _mm_add_epi32(tmp0, tmp2);
 }
-
+#endif
 // Other reduction functions:
 
-
 // mul
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
@@ -801,7 +611,7 @@ template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
   // TODO try to call _mm_mul_epu32 directly
   EIGEN_ALIGN16 int aux[4];
   pstore(aux, a);
-  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);
+  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);;
 }
 
 // min
@@ -856,16 +666,113 @@ template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
 #endif // EIGEN_VECTORIZE_SSE4_1
 }
 
-// not needed yet
-// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x)
+#if EIGEN_COMP_GNUC
+// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
 // {
-//   return _mm_movemask_ps(x) == 0xF;
+//   Packet4f res = b;
+//   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
+//   return res;
 // }
+// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i&  a, const Packet4i&  b, const int i)
+// {
+//   Packet4i res = a;
+//   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
+//   return res;
+// }
+#endif
 
-template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x)
+#ifdef EIGEN_VECTORIZE_SSSE3
+// SSSE3 versions
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
 {
-  return _mm_movemask_ps(x) != 0x0;
-}
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    if (Offset!=0)
+      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
+  {
+    if (Offset!=0)
+      first = _mm_alignr_epi8(second,first, Offset*4);
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
+  {
+    if (Offset==1)
+      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
+  }
+};
+#else
+// SSE2 versions
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
+    }
+    else if (Offset==2)
+    {
+      first = _mm_movehl_ps(first,first);
+      first = _mm_movelh_ps(first,second);
+    }
+    else if (Offset==3)
+    {
+      first = _mm_move_ss(first,second);
+      first = _mm_shuffle_ps(first,second,0x93);
+    }
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_shuffle_epi32(first,0x39);
+    }
+    else if (Offset==2)
+    {
+      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
+      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+    }
+    else if (Offset==3)
+    {
+      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
+      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
+    }
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
+  {
+    if (Offset==1)
+    {
+      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
+      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
+    }
+  }
+};
+#endif
 
 EIGEN_DEVICE_FUNC inline void
 ptranspose(PacketBlock<Packet4f,4>& kernel) {
@@ -892,19 +799,6 @@ ptranspose(PacketBlock<Packet4i,4>& kernel) {
   kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
 }
 
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet16b,4>& kernel) {
-  __m128i T0 =  _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
-  __m128i T1 =  _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
-  __m128i T2 =  _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
-  __m128i T3 =  _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
-  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
-  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
-  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
-  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
-}
-
-
 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]);
@@ -936,8 +830,46 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons
 #endif
 }
 
+template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_ps(a,pset1<Packet4f>(b),1);
+#else
+  return _mm_move_ss(a, _mm_load_ss(&b));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_pd(a,pset1<Packet2d>(b),1);
+#else
+  return _mm_move_sd(a, _mm_load_sd(&b));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_ps(a,pset1<Packet4f>(b),(1<<3));
+#else
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF));
+  return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1<Packet4f>(b)));
+#endif
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b)
+{
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_blend_pd(a,pset1<Packet2d>(b),(1<<1));
+#else
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF));
+  return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1<Packet2d>(b)));
+#endif
+}
+
 // Scalar path for pmadd with FMA to ensure consistency with vectorized path.
-#ifdef EIGEN_VECTORIZE_FMA
+#ifdef __FMA__
 template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
   return ::fmaf(a,b,c);
 }
@@ -946,219 +878,11 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co
 }
 #endif
 
-
-// Packet math for Eigen::half
-// Disable the following code since it's broken on too many platforms / compilers.
-//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
-#if 0
-
-typedef struct {
-  __m64 x;
-} Packet4h;
-
-
-template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
-
-template <>
-struct packet_traits<Eigen::half> : default_packet_traits {
-  typedef Packet4h type;
-  // There is no half-size packet for Packet4h.
-  typedef Packet4h half;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = 4,
-    HasHalfPacket = 0,
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
-    HasNegate = 0,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasConj   = 0,
-    HasSetLinear = 0,
-    HasSqrt = 0,
-    HasRsqrt = 0,
-    HasExp = 0,
-    HasLog = 0,
-    HasBlend = 0
-  };
-};
-
-
-template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
-
-template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
-  Packet4h result;
-  result.x = _mm_set1_pi16(from.x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
-  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  __int64_t b64 = _mm_cvtm64_si64(b.x);
-
-  Eigen::half h[4];
-
-  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
-  h[0] = ha + hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
-  h[1] = ha + hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
-  h[2] = ha + hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
-  h[3] = ha + hb;
-  Packet4h result;
-  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  __int64_t b64 = _mm_cvtm64_si64(b.x);
-
-  Eigen::half h[4];
-
-  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
-  h[0] = ha - hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
-  h[1] = ha - hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
-  h[2] = ha - hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
-  h[3] = ha - hb;
-  Packet4h result;
-  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  __int64_t b64 = _mm_cvtm64_si64(b.x);
-
-  Eigen::half h[4];
-
-  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
-  h[0] = ha * hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
-  h[1] = ha * hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
-  h[2] = ha * hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
-  h[3] = ha * hb;
-  Packet4h result;
-  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  __int64_t b64 = _mm_cvtm64_si64(b.x);
-
-  Eigen::half h[4];
-
-  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
-  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
-  h[0] = ha / hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
-  h[1] = ha / hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
-  h[2] = ha / hb;
-  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
-  h[3] = ha / hb;
-  Packet4h result;
-  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
-  Packet4h result;
-  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
-  Packet4h result;
-  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
-  __int64_t r = _mm_cvtm64_si64(from.x);
-  *(reinterpret_cast<__int64_t*>(to)) = r;
-}
-
-template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
-  __int64_t r = _mm_cvtm64_si64(from.x);
-  *(reinterpret_cast<__int64_t*>(to)) = r;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h
-ploadquad<Packet4h>(const Eigen::half* from) {
-  return pset1<Packet4h>(*from);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
-{
-  Packet4h result;
-  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
-  return result;
-}
-
-template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
-{
-  __int64_t a = _mm_cvtm64_si64(from.x);
-  to[stride*0].x = static_cast<unsigned short>(a);
-  to[stride*1].x = static_cast<unsigned short>(a >> 16);
-  to[stride*2].x = static_cast<unsigned short>(a >> 32);
-  to[stride*3].x = static_cast<unsigned short>(a >> 48);
-}
-
-EIGEN_STRONG_INLINE void
-ptranspose(PacketBlock<Packet4h,4>& kernel) {
-  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
-  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
-  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
-  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
-
-  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
-  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
-  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
-  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
-}
-
-#endif
-
-
 } // end namespace internal
 
 } // end namespace Eigen
 
-#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
+#if EIGEN_COMP_PGI
 // PGI++ does not define the following intrinsics in C++ mode.
 static inline __m128  _mm_castpd_ps   (__m128d x) { return reinterpret_cast<__m128&>(x);  }
 static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/TypeCasting.h
index 1b8e9a550..c6ca8c716 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/TypeCasting.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -69,64 +69,6 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f
   return _mm_cvtps_pd(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i,Packet4f>(const Packet4f& a) {
-  return _mm_castps_si128(a);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f,Packet4i>(const Packet4i& a) {
-  return _mm_castsi128_ps(a);
-}
-
-
-// Disable the following code since it's broken on too many platforms / compilers.
-//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
-#if 0
-
-template <>
-struct type_casting_traits<Eigen::half, float> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
-  __int64_t a64 = _mm_cvtm64_si64(a.x);
-  Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
-  float f1 = static_cast<float>(h);
-  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
-  float f2 = static_cast<float>(h);
-  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
-  float f3 = static_cast<float>(h);
-  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
-  float f4 = static_cast<float>(h);
-  return _mm_set_ps(f4, f3, f2, f1);
-}
-
-template <>
-struct type_casting_traits<float, Eigen::half> {
-  enum {
-    VectorizedCast = 1,
-    SrcCoeffRatio = 1,
-    TgtCoeffRatio = 1
-  };
-};
-
-template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
-  EIGEN_ALIGN16 float aux[4];
-  pstore(aux, a);
-  Eigen::half h0(aux[0]);
-  Eigen::half h1(aux[1]);
-  Eigen::half h2(aux[2]);
-  Eigen::half h3(aux[3]);
-
-  Packet4h result;
-  result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
-  return result;
-}
-
-#endif
 
 } // end namespace internal
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h
deleted file mode 100644
index 710059d50..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h
+++ /dev/null
@@ -1,229 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * InteropHeaders.h
- *
- * \brief:
- *  InteropHeaders
- *
- *****************************************************************/
-
-#ifndef EIGEN_INTEROP_HEADERS_SYCL_H
-#define EIGEN_INTEROP_HEADERS_SYCL_H
-
-namespace Eigen {
-
-#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
-
-namespace internal {
-
-template <int has_blend, int lengths>
-struct sycl_packet_traits : default_packet_traits {
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size = lengths,
-    HasHalfPacket = 0,
-    HasDiv = 1,
-    HasLog = 1,
-    HasExp = 1,
-    HasSqrt = 1,
-    HasRsqrt = 1,
-    HasSin = 1,
-    HasCos = 1,
-    HasTan = 1,
-    HasASin = 1,
-    HasACos = 1,
-    HasATan = 1,
-    HasSinh = 1,
-    HasCosh = 1,
-    HasTanh = 1,
-    HasLGamma = 0,
-    HasDiGamma = 0,
-    HasZeta = 0,
-    HasPolygamma = 0,
-    HasErf = 0,
-    HasErfc = 0,
-    HasNdtri = 0,
-    HasIGamma = 0,
-    HasIGammac = 0,
-    HasBetaInc = 0,
-    HasBlend = has_blend,
-    HasMax = 1,
-    HasMin = 1,
-    HasMul = 1,
-    HasAdd = 1,
-    HasFloor = 1,
-    HasRound = 1,
-    HasRint = 1,
-    HasLog1p = 1,
-    HasExpm1 = 1,
-    HasCeil = 1,
-  };
-};
-
-#ifdef SYCL_DEVICE_ONLY
-#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths) \
-  template <>                                                              \
-  struct packet_traits<unpacket_type>                                      \
-      : sycl_packet_traits<has_blend, lengths> {                           \
-    typedef packet_type type;                                              \
-    typedef packet_type half;                                              \
-  };
-
-SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
-SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
-SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
-SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
-#undef SYCL_PACKET_TRAITS
-
-// Make sure this is only available when targeting a GPU: we don't want to
-// introduce conflicts between these packet_traits definitions and the ones
-// we'll use on the host side (SSE, AVX, ...)
-#define SYCL_ARITHMETIC(packet_type)  \
-  template <>                         \
-  struct is_arithmetic<packet_type> { \
-    enum { value = true };            \
-  };
-SYCL_ARITHMETIC(cl::sycl::cl_float4)
-SYCL_ARITHMETIC(cl::sycl::cl_double2)
-#undef SYCL_ARITHMETIC
-
-#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)        \
-  template <>                                                            \
-  struct unpacket_traits<packet_type> {                                  \
-    typedef unpacket_type type;                                          \
-    enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \
-    typedef packet_type half;                                            \
-  };
-SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
-SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
-
-#undef SYCL_UNPACKET_TRAITS
-#endif
-
-}  // end namespace internal
-
-#endif
-
-namespace TensorSycl {
-namespace internal {
-
-template <typename PacketReturnType, int PacketSize>
-struct PacketWrapper;
-// This function should never get called on the device
-#ifndef SYCL_DEVICE_ONLY
-template <typename PacketReturnType, int PacketSize>
-struct PacketWrapper {
-  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
-      Scalar;
-  template <typename Index>
-  EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType &) {
-    eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR  THE CHOSEN TYPE");
-    abort();
-  }
-  EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in,
-                                                                   Scalar) {
-    return ::Eigen::internal::template plset<PacketReturnType>(in);
-  }
-  EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar *) {
-    eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR  THE CHOSEN TYPE");
-    abort();
-  }
-};
-
-#elif defined(SYCL_DEVICE_ONLY)
-template <typename PacketReturnType>
-struct PacketWrapper<PacketReturnType, 4> {
-  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
-      Scalar;
-  template <typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
-    switch (index) {
-      case 0:
-        return in.x();
-      case 1:
-        return in.y();
-      case 2:
-        return in.z();
-      case 3:
-        return in.w();
-      default:
-      //INDEX MUST BE BETWEEN 0 and 3.There is no abort function in SYCL kernel. so we cannot use abort here. 
-      // The code will never reach here
-      __builtin_unreachable();
-    }
-    __builtin_unreachable();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(
-      Scalar in, Scalar other) {
-    return PacketReturnType(in, other, other, other);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
-    lhs = PacketReturnType(rhs[0], rhs[1], rhs[2], rhs[3]);
-  }
-};
-
-template <typename PacketReturnType>
-struct PacketWrapper<PacketReturnType, 1> {
-  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
-      Scalar;
-  template <typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType &in) {
-    return in;
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in,
-                                                                   Scalar) {
-    return PacketReturnType(in);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
-    lhs = rhs[0];
-  }
-};
-
-template <typename PacketReturnType>
-struct PacketWrapper<PacketReturnType, 2> {
-  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type
-      Scalar;
-  template <typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
-    switch (index) {
-      case 0:
-        return in.x();
-      case 1:
-        return in.y();
-      default:
-        //INDEX MUST BE BETWEEN 0 and 1.There is no abort function in SYCL kernel. so we cannot use abort here. 
-      // The code will never reach here
-        __builtin_unreachable();
-    }
-    __builtin_unreachable();
-  }
-  
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(
-      Scalar in, Scalar other) {
-    return PacketReturnType(in, other);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
-    lhs = PacketReturnType(rhs[0], rhs[1]);
-  }
-};
-
-#endif
-
-}  // end namespace internal
-}  // end namespace TensorSycl
-}  // end namespace Eigen
-
-#endif  // EIGEN_INTEROP_HEADERS_SYCL_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h
deleted file mode 100644
index a96625e2c..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h
+++ /dev/null
@@ -1,289 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * MathFunctions.h
- *
- * \brief:
- *  MathFunctions
- *
- *****************************************************************/
-
-#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H
-#define EIGEN_MATH_FUNCTIONS_SYCL_H
-
-namespace Eigen {
-
-namespace internal {
-
-// Make sure this is only available when targeting a GPU: we don't want to
-// introduce conflicts between these packet_traits definitions and the ones
-// we'll use on the host side (SSE, AVX, ...)
-#if defined(SYCL_DEVICE_ONLY)
-#define SYCL_PLOG(packet_type)                                         \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog<packet_type>( \
-      const packet_type& a) {                                          \
-    return cl::sycl::log(a);                                           \
-  }
-
-SYCL_PLOG(cl::sycl::cl_float4)
-SYCL_PLOG(cl::sycl::cl_double2)
-#undef SYCL_PLOG
-
-#define SYCL_PLOG1P(packet_type)                                         \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::log1p(a);                                           \
-  }
-
-SYCL_PLOG1P(cl::sycl::cl_float4)
-SYCL_PLOG1P(cl::sycl::cl_double2)
-#undef SYCL_PLOG1P
-
-#define SYCL_PLOG10(packet_type)                                         \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::log10(a);                                           \
-  }
-
-SYCL_PLOG10(cl::sycl::cl_float4)
-SYCL_PLOG10(cl::sycl::cl_double2)
-#undef SYCL_PLOG10
-
-#define SYCL_PEXP(packet_type)                                         \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp<packet_type>( \
-      const packet_type& a) {                                          \
-    return cl::sycl::exp(a);                                           \
-  }
-
-SYCL_PEXP(cl::sycl::cl_float4)
-SYCL_PEXP(cl::sycl::cl_double2)
-#undef SYCL_PEXP
-
-#define SYCL_PEXPM1(packet_type)                                         \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::expm1(a);                                           \
-  }
-
-SYCL_PEXPM1(cl::sycl::cl_float4)
-SYCL_PEXPM1(cl::sycl::cl_double2)
-#undef SYCL_PEXPM1
-
-#define SYCL_PSQRT(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::sqrt(a);                                           \
-  }
-
-SYCL_PSQRT(cl::sycl::cl_float4)
-SYCL_PSQRT(cl::sycl::cl_double2)
-#undef SYCL_PSQRT
-
-#define SYCL_PRSQRT(packet_type)                                         \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::rsqrt(a);                                           \
-  }
-
-SYCL_PRSQRT(cl::sycl::cl_float4)
-SYCL_PRSQRT(cl::sycl::cl_double2)
-#undef SYCL_PRSQRT
-
-/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
-#define SYCL_PSIN(packet_type)                                         \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin<packet_type>( \
-      const packet_type& a) {                                          \
-    return cl::sycl::sin(a);                                           \
-  }
-
-SYCL_PSIN(cl::sycl::cl_float4)
-SYCL_PSIN(cl::sycl::cl_double2)
-#undef SYCL_PSIN
-
-/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
-#define SYCL_PCOS(packet_type)                                         \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos<packet_type>( \
-      const packet_type& a) {                                          \
-    return cl::sycl::cos(a);                                           \
-  }
-
-SYCL_PCOS(cl::sycl::cl_float4)
-SYCL_PCOS(cl::sycl::cl_double2)
-#undef SYCL_PCOS
-
-/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
-#define SYCL_PTAN(packet_type)                                         \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan<packet_type>( \
-      const packet_type& a) {                                          \
-    return cl::sycl::tan(a);                                           \
-  }
-
-SYCL_PTAN(cl::sycl::cl_float4)
-SYCL_PTAN(cl::sycl::cl_double2)
-#undef SYCL_PTAN
-
-/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
-#define SYCL_PASIN(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::asin(a);                                           \
-  }
-
-SYCL_PASIN(cl::sycl::cl_float4)
-SYCL_PASIN(cl::sycl::cl_double2)
-#undef SYCL_PASIN
-
-/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
-#define SYCL_PACOS(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::acos(a);                                           \
-  }
-
-SYCL_PACOS(cl::sycl::cl_float4)
-SYCL_PACOS(cl::sycl::cl_double2)
-#undef SYCL_PACOS
-
-/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
-#define SYCL_PATAN(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::atan(a);                                           \
-  }
-
-SYCL_PATAN(cl::sycl::cl_float4)
-SYCL_PATAN(cl::sycl::cl_double2)
-#undef SYCL_PATAN
-
-/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
-#define SYCL_PSINH(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::sinh(a);                                           \
-  }
-
-SYCL_PSINH(cl::sycl::cl_float4)
-SYCL_PSINH(cl::sycl::cl_double2)
-#undef SYCL_PSINH
-
-/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
-#define SYCL_PCOSH(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::cosh(a);                                           \
-  }
-
-SYCL_PCOSH(cl::sycl::cl_float4)
-SYCL_PCOSH(cl::sycl::cl_double2)
-#undef SYCL_PCOSH
-
-/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
-#define SYCL_PTANH(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::tanh(a);                                           \
-  }
-
-SYCL_PTANH(cl::sycl::cl_float4)
-SYCL_PTANH(cl::sycl::cl_double2)
-#undef SYCL_PTANH
-
-#define SYCL_PCEIL(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::ceil(a);                                           \
-  }
-
-SYCL_PCEIL(cl::sycl::cl_float4)
-SYCL_PCEIL(cl::sycl::cl_double2)
-#undef SYCL_PCEIL
-
-#define SYCL_PROUND(packet_type)                                         \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::round(a);                                           \
-  }
-
-SYCL_PROUND(cl::sycl::cl_float4)
-SYCL_PROUND(cl::sycl::cl_double2)
-#undef SYCL_PROUND
-
-#define SYCL_PRINT(packet_type)                                         \
-  template<>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print<packet_type>( \
-      const packet_type& a) {                                           \
-    return cl::sycl::rint(a);                                           \
-  }
-
-SYCL_PRINT(cl::sycl::cl_float4)
-SYCL_PRINT(cl::sycl::cl_double2)
-#undef SYCL_PRINT
-
-#define SYCL_FLOOR(packet_type)                                          \
-  template <>                                                            \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor<packet_type>( \
-      const packet_type& a) {                                            \
-    return cl::sycl::floor(a);                                           \
-  }
-
-SYCL_FLOOR(cl::sycl::cl_float4)
-SYCL_FLOOR(cl::sycl::cl_double2)
-#undef SYCL_FLOOR
-
-#define SYCL_PMIN(packet_type, expr)                                   \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin<packet_type>( \
-      const packet_type& a, const packet_type& b) {                    \
-    return expr;                                                       \
-  }
-
-SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
-SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
-#undef SYCL_PMIN
-
-#define SYCL_PMAX(packet_type, expr)                                   \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax<packet_type>( \
-      const packet_type& a, const packet_type& b) {                    \
-    return expr;                                                       \
-  }
-
-SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
-SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
-#undef SYCL_PMAX
-
-#endif
-
-}  // end namespace internal
-
-}  // end namespace Eigen
-
-#endif  // EIGEN_MATH_FUNCTIONS_SYCL_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/PacketMath.h
deleted file mode 100644
index b11b5af9d..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/PacketMath.h
+++ /dev/null
@@ -1,670 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * PacketMath.h
- *
- * \brief:
- *  PacketMath
- *
- *****************************************************************/
-
-#ifndef EIGEN_PACKET_MATH_SYCL_H
-#define EIGEN_PACKET_MATH_SYCL_H
-#include <type_traits>
-namespace Eigen {
-
-namespace internal {
-#ifdef SYCL_DEVICE_ONLY
-
-#define SYCL_PLOADT_RO(address_space_target)                                 \
-  template <typename packet_type, int Alignment>                             \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro(               \
-      typename cl::sycl::multi_ptr<                                          \
-          const typename unpacket_traits<packet_type>::type,                 \
-          cl::sycl::access::address_space::address_space_target>::pointer_t  \
-          from) {                                                            \
-    typedef typename unpacket_traits<packet_type>::type scalar;              \
-    typedef cl::sycl::multi_ptr<                                             \
-        scalar, cl::sycl::access::address_space::address_space_target>       \
-        multi_ptr;                                                           \
-    auto res = packet_type(                                                  \
-        static_cast<typename unpacket_traits<packet_type>::type>(0));        \
-    res.load(0, multi_ptr(const_cast<typename multi_ptr::pointer_t>(from))); \
-    return res;                                                              \
-  }
-
-SYCL_PLOADT_RO(global_space)
-SYCL_PLOADT_RO(local_space)
-#undef SYCL_PLOADT_RO
-#endif
-
-template <typename packet_type, int Alignment, typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
-ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess<
-          cl::sycl::access::mode::read_write, T>& from) {
-  return ploadt_ro<packet_type, Alignment>(from.get_pointer());
-}
-
-#ifdef SYCL_DEVICE_ONLY
-#define SYCL_PLOAD(address_space_target, Alignment, AlignedType)            \
-  template <typename packet_type>                                           \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType(     \
-      typename cl::sycl::multi_ptr<                                         \
-          const typename unpacket_traits<packet_type>::type,                \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          from) {                                                           \
-    return ploadt_ro<packet_type, Alignment>(from);                         \
-  }
-
-// global space
-SYCL_PLOAD(global_space, Unaligned, u)
-SYCL_PLOAD(global_space, Aligned, )
-// local space
-SYCL_PLOAD(local_space, Unaligned, u)
-SYCL_PLOAD(local_space, Aligned, )
-
-#undef SYCL_PLOAD
-#endif
-
-#define SYCL_PLOAD(Alignment, AlignedType)                              \
-  template <typename packet_type>                                       \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \
-      const Eigen::TensorSycl::internal::RangeAccess<                   \
-          cl::sycl::access::mode::read_write,                           \
-          typename unpacket_traits<packet_type>::type>                  \
-          from) {                                                       \
-    return ploadt_ro<packet_type, Alignment>(from);                     \
-  }
-SYCL_PLOAD(Unaligned, u)
-SYCL_PLOAD(Aligned, )
-#undef SYCL_PLOAD
-
-#ifdef SYCL_DEVICE_ONLY
-/** \internal \returns a packet version of \a *from.
- * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
-#define SYCL_PLOADT(address_space_target)                                   \
-  template <typename packet_type, int Alignment>                            \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt(                 \
-      typename cl::sycl::multi_ptr<                                         \
-          const typename unpacket_traits<packet_type>::type,                \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          from) {                                                           \
-    if (Alignment >= unpacket_traits<packet_type>::alignment)               \
-      return pload<packet_type>(from);                                      \
-    else                                                                    \
-      return ploadu<packet_type>(from);                                     \
-  }
-
-// global space
-SYCL_PLOADT(global_space)
-// local space
-SYCL_PLOADT(local_space)
-#undef SYCL_PLOADT
-#endif
-
-template <typename packet_type, int Alignment>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type
-ploadt(const Eigen::TensorSycl::internal::RangeAccess<
-       cl::sycl::access::mode::read_write,
-       typename unpacket_traits<packet_type>::type>& from) {
-  return ploadt<packet_type, Alignment>(from.get_pointer());
-}
-#ifdef SYCL_DEVICE_ONLY
-
-// private_space
-#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment)                 \
-  template <>                                                          \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type                    \
-  ploadt_ro<packet_type, Alignment>(                                   \
-      const typename unpacket_traits<packet_type>::type* from) {       \
-    typedef typename unpacket_traits<packet_type>::type scalar;        \
-    auto res = packet_type(static_cast<scalar>(0));                    \
-    res.template load<cl::sycl::access::address_space::private_space>( \
-        0, const_cast<scalar*>(from));                                 \
-    return res;                                                        \
-  }
-
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned)
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned)
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned)
-SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned)
-
-#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type)                    \
-  template <>                                                              \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \
-      const typename unpacket_traits<packet_type>::type* from) {           \
-    typedef typename unpacket_traits<packet_type>::type scalar;            \
-    auto res = packet_type(static_cast<scalar>(0));                        \
-    res.template load<cl::sycl::access::address_space::private_space>(     \
-        0, const_cast<scalar*>(from));                                     \
-    return res;                                                            \
-  }
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, )
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, )
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u)
-SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u)
-
-#undef SYCL_PLOAD_SPECIAL
-
-#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment)   \
-  template <>                                                               \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment(             \
-      typename cl::sycl::multi_ptr<                                         \
-          scalar,                                                           \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          to,                                                               \
-      const packet_type& from) {                                            \
-    typedef cl::sycl::multi_ptr<                                            \
-        scalar, cl::sycl::access::address_space::address_space_target>      \
-        multi_ptr;                                                          \
-    from.store(0, multi_ptr(to));                                           \
-  }
-
-// global space
-SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, )
-SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u)
-SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, )
-SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u)
-SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, )
-SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u)
-SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, )
-SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u)
-
-SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, )
-SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u)
-SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, )
-SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u)
-#undef SYCL_PSTORE
-
-#define SYCL_PSTORE_T(address_space_target)                                 \
-  template <typename scalar, typename packet_type, int Alignment>           \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(                       \
-      typename cl::sycl::multi_ptr<                                         \
-          scalar,                                                           \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          to,                                                               \
-      const packet_type& from) {                                            \
-    if (Alignment)                                                          \
-      pstore(to, from);                                                     \
-    else                                                                    \
-      pstoreu(to, from);                                                    \
-  }
-
-SYCL_PSTORE_T(global_space)
-
-SYCL_PSTORE_T(local_space)
-
-#undef SYCL_PSTORE_T
-
-#define SYCL_PSET1(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
-      const typename unpacket_traits<packet_type>::type& from) {        \
-    return packet_type(from);                                           \
-  }
-
-// global space
-SYCL_PSET1(cl::sycl::cl_float4)
-SYCL_PSET1(cl::sycl::cl_double2)
-
-#undef SYCL_PSET1
-
-template <typename packet_type>
-struct get_base_packet {
-  template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type
-  get_ploaddup(sycl_multi_pointer) {}
-
-  template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type
-  get_pgather(sycl_multi_pointer, Index) {}
-};
-
-template <>
-struct get_base_packet<cl::sycl::cl_float4> {
-  template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(
-      sycl_multi_pointer from) {
-    return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]);
-  }
-  template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(
-      sycl_multi_pointer from, Index stride) {
-    return cl::sycl::cl_float4(from[0 * stride], from[1 * stride],
-                               from[2 * stride], from[3 * stride]);
-  }
-
-  template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
-      sycl_multi_pointer to, const cl::sycl::cl_float4& from, Index stride) {
-    auto tmp = stride;
-    to[0] = from.x();
-    to[tmp] = from.y();
-    to[tmp += stride] = from.z();
-    to[tmp += stride] = from.w();
-  }
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(
-      const float& a) {
-    return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a + 1),
-                               static_cast<float>(a + 2),
-                               static_cast<float>(a + 3));
-  }
-};
-
-template <>
-struct get_base_packet<cl::sycl::cl_double2> {
-  template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2
-  get_ploaddup(const sycl_multi_pointer from) {
-    return cl::sycl::cl_double2(from[0], from[0]);
-  }
-
-  template <typename sycl_multi_pointer, typename Index>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(
-      const sycl_multi_pointer from, Index stride) {
-    return cl::sycl::cl_double2(from[0 * stride], from[1 * stride]);
-  }
-
-  template <typename sycl_multi_pointer>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(
-      sycl_multi_pointer to, const cl::sycl::cl_double2& from, Index stride) {
-    to[0] = from.x();
-    to[stride] = from.y();
-  }
-
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(
-      const double& a) {
-    return cl::sycl::cl_double2(static_cast<double>(a),
-                                static_cast<double>(a + 1));
-  }
-};
-
-#define SYCL_PLOAD_DUP(address_space_target)                                \
-  template <typename packet_type>                                           \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup(               \
-      typename cl::sycl::multi_ptr<                                         \
-          const typename unpacket_traits<packet_type>::type,                \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          from) {                                                           \
-    return get_base_packet<packet_type>::get_ploaddup(from);                \
-  }
-
-// global space
-SYCL_PLOAD_DUP(global_space)
-// local_space
-SYCL_PLOAD_DUP(local_space)
-#undef SYCL_PLOAD_DUP
-
-#define SYCL_PLOAD_DUP_SPECILIZE(packet_type)                              \
-  template <>                                                              \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup<packet_type>( \
-      const typename unpacket_traits<packet_type>::type* from) {           \
-    return get_base_packet<packet_type>::get_ploaddup(from);               \
-  }
-
-SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
-SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
-
-#undef SYCL_PLOAD_DUP_SPECILIZE
-
-#define SYCL_PLSET(packet_type)                                         \
-  template <>                                                           \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset<packet_type>( \
-      const typename unpacket_traits<packet_type>::type& a) {           \
-    return get_base_packet<packet_type>::set_plset(a);                  \
-  }
-
-SYCL_PLSET(cl::sycl::cl_float4)
-SYCL_PLSET(cl::sycl::cl_double2)
-
-#undef SYCL_PLSET
-
-#define SYCL_PGATHER(address_space_target)                                  \
-  template <typename Scalar, typename packet_type>                          \
-  EIGEN_DEVICE_FUNC inline packet_type pgather(                             \
-      typename cl::sycl::multi_ptr<                                         \
-          const typename unpacket_traits<packet_type>::type,                \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          from,                                                             \
-      Index stride) {                                                       \
-    return get_base_packet<packet_type>::get_pgather(from, stride);         \
-  }
-
-// global space
-SYCL_PGATHER(global_space)
-// local space
-SYCL_PGATHER(local_space)
-
-#undef SYCL_PGATHER
-
-#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)                            \
-  template <>                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type                            \
-  pgather<scalar, packet_type>(                                                \
-      const typename unpacket_traits<packet_type>::type* from, Index stride) { \
-    return get_base_packet<packet_type>::get_pgather(from, stride);            \
-  }
-
-SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
-SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
-
-#undef SYCL_PGATHER_SPECILIZE
-
-#define SYCL_PSCATTER(address_space_target)                                 \
-  template <typename Scalar, typename packet_type>                          \
-  EIGEN_DEVICE_FUNC inline void pscatter(                                   \
-      typename cl::sycl::multi_ptr<                                         \
-          typename unpacket_traits<packet_type>::type,                      \
-          cl::sycl::access::address_space::address_space_target>::pointer_t \
-          to,                                                               \
-      const packet_type& from, Index stride) {                              \
-    get_base_packet<packet_type>::set_pscatter(to, from, stride);           \
-  }
-
-// global space
-SYCL_PSCATTER(global_space)
-// local space
-SYCL_PSCATTER(local_space)
-
-#undef SYCL_PSCATTER
-
-#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)                        \
-  template <>                                                               \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>( \
-      typename unpacket_traits<packet_type>::type * to,                     \
-      const packet_type& from, Index stride) {                              \
-    get_base_packet<packet_type>::set_pscatter(to, from, stride);           \
-  }
-
-SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
-SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
-
-#undef SYCL_PSCATTER_SPECILIZE
-
-#define SYCL_PMAD(packet_type)                                            \
-  template <>                                                             \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd(                \
-      const packet_type& a, const packet_type& b, const packet_type& c) { \
-    return cl::sycl::mad(a, b, c);                                        \
-  }
-
-SYCL_PMAD(cl::sycl::cl_float4)
-SYCL_PMAD(cl::sycl::cl_double2)
-#undef SYCL_PMAD
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(
-    const cl::sycl::cl_float4& a) {
-  return a.x();
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(
-    const cl::sycl::cl_double2& a) {
-  return a.x();
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(
-    const cl::sycl::cl_float4& a) {
-  return a.x() + a.y() + a.z() + a.w();
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(
-    const cl::sycl::cl_double2& a) {
-  return a.x() + a.y();
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(
-    const cl::sycl::cl_float4& a) {
-  return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()),
-                        cl::sycl::fmax(a.z(), a.w()));
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(
-    const cl::sycl::cl_double2& a) {
-  return cl::sycl::fmax(a.x(), a.y());
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(
-    const cl::sycl::cl_float4& a) {
-  return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()),
-                        cl::sycl::fmin(a.z(), a.w()));
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(
-    const cl::sycl::cl_double2& a) {
-  return cl::sycl::fmin(a.x(), a.y());
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(
-    const cl::sycl::cl_float4& a) {
-  return a.x() * a.y() * a.z() * a.w();
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(
-    const cl::sycl::cl_double2& a) {
-  return a.x() * a.y();
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
-pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
-  return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()),
-                             cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w()));
-}
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2
-pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
-  return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()));
-}
-
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet &a,
-                                                          const Packet &b) {
-  return ((a <= b)
-              .template convert<typename unpacket_traits<Packet>::type,
-                                cl::sycl::rounding_mode::automatic>());
-}
-
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet &a,
-                                                          const Packet &b) {
-  return ((a < b)
-              .template convert<typename unpacket_traits<Packet>::type,
-                                cl::sycl::rounding_mode::automatic>());
-}
-
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a,
-                                                          const Packet &b) {
-  return ((a == b)
-              .template convert<typename unpacket_traits<Packet>::type,
-                                cl::sycl::rounding_mode::automatic>());
-}
-
-#define SYCL_PCMP(OP, TYPE)                                                    \
-  template <>                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TYPE pcmp_##OP<TYPE>(const TYPE &a,    \
-                                                             const TYPE &b) {  \
-    return sycl_pcmp_##OP<TYPE>(a, b);                                         \
-  }
-
-SYCL_PCMP(le, cl::sycl::cl_float4)
-SYCL_PCMP(lt, cl::sycl::cl_float4)
-SYCL_PCMP(eq, cl::sycl::cl_float4)
-SYCL_PCMP(le, cl::sycl::cl_double2)
-SYCL_PCMP(lt, cl::sycl::cl_double2)
-SYCL_PCMP(eq, cl::sycl::cl_double2)
-#undef SYCL_PCMP
-
-template <typename T> struct convert_to_integer;
-
-template <> struct convert_to_integer<float> {
-  using type = int;
-  using packet_type = cl::sycl::cl_int4;
-};
-template <> struct convert_to_integer<double> {
-  using type = long;
-  using packet_type = cl::sycl::cl_long2;
-};
-
-template <typename PacketIn>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename convert_to_integer<
-    typename unpacket_traits<PacketIn>::type>::packet_type
-vector_as_int(const PacketIn &p) {
-  return (
-      p.template convert<typename convert_to_integer<
-                             typename unpacket_traits<PacketIn>::type>::type,
-                         cl::sycl::rounding_mode::automatic>());
-}
-
-template <typename packetOut, typename PacketIn>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packetOut
-convert_vector(const PacketIn &p) {
-  return (p.template convert<typename unpacket_traits<packetOut>::type,
-                             cl::sycl::rounding_mode::automatic>());
-}
-
-#define SYCL_PAND(TYPE)                                                        \
-  template <>                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pand<TYPE>(const TYPE &a,         \
-                                                        const TYPE &b) {       \
-    return convert_vector<TYPE>(vector_as_int(a) & vector_as_int(b));          \
-  }
-SYCL_PAND(cl::sycl::cl_float4)
-SYCL_PAND(cl::sycl::cl_double2)
-#undef SYCL_PAND
-
-#define SYCL_POR(TYPE)                                                         \
-  template <>                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE por<TYPE>(const TYPE &a,          \
-                                                       const TYPE &b) {        \
-    return convert_vector<TYPE>(vector_as_int(a) | vector_as_int(b));          \
-  }
-
-SYCL_POR(cl::sycl::cl_float4)
-SYCL_POR(cl::sycl::cl_double2)
-#undef SYCL_POR
-
-#define SYCL_PXOR(TYPE)                                                        \
-  template <>                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pxor<TYPE>(const TYPE &a,         \
-                                                        const TYPE &b) {       \
-    return convert_vector<TYPE>(vector_as_int(a) ^ vector_as_int(b));          \
-  }
-
-SYCL_PXOR(cl::sycl::cl_float4)
-SYCL_PXOR(cl::sycl::cl_double2)
-#undef SYCL_PXOR
-
-#define SYCL_PANDNOT(TYPE)                                                     \
-  template <>                                                                  \
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pandnot<TYPE>(const TYPE &a,      \
-                                                           const TYPE &b) {    \
-    return convert_vector<TYPE>(vector_as_int(a) & (~vector_as_int(b)));       \
-  }
-SYCL_PANDNOT(cl::sycl::cl_float4)
-SYCL_PANDNOT(cl::sycl::cl_double2)
-#undef SYCL_PANDNOT
-
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
-    PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
-  float tmp = kernel.packet[0].y();
-  kernel.packet[0].y() = kernel.packet[1].x();
-  kernel.packet[1].x() = tmp;
-
-  tmp = kernel.packet[0].z();
-  kernel.packet[0].z() = kernel.packet[2].x();
-  kernel.packet[2].x() = tmp;
-
-  tmp = kernel.packet[0].w();
-  kernel.packet[0].w() = kernel.packet[3].x();
-  kernel.packet[3].x() = tmp;
-
-  tmp = kernel.packet[1].z();
-  kernel.packet[1].z() = kernel.packet[2].y();
-  kernel.packet[2].y() = tmp;
-
-  tmp = kernel.packet[1].w();
-  kernel.packet[1].w() = kernel.packet[3].y();
-  kernel.packet[3].y() = tmp;
-
-  tmp = kernel.packet[2].w();
-  kernel.packet[2].w() = kernel.packet[3].z();
-  kernel.packet[3].z() = tmp;
-}
-
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(
-    PacketBlock<cl::sycl::cl_double2, 2>& kernel) {
-  double tmp = kernel.packet[0].y();
-  kernel.packet[0].y() = kernel.packet[1].x();
-  kernel.packet[1].x() = tmp;
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
-    const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket,
-    const cl::sycl::cl_float4& thenPacket,
-    const cl::sycl::cl_float4& elsePacket) {
-  cl::sycl::cl_int4 condition(
-      ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1,
-      ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1);
-  return cl::sycl::select(thenPacket, elsePacket, condition);
-}
-
-template <>
-inline cl::sycl::cl_double2 pblend(
-    const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
-    const cl::sycl::cl_double2& thenPacket,
-    const cl::sycl::cl_double2& elsePacket) {
-  cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1,
-                               ifPacket.select[1] ? 0 : -1);
-  return cl::sycl::select(thenPacket, elsePacket, condition);
-}
-#endif  // SYCL_DEVICE_ONLY
-
-#define SYCL_PSTORE(alignment)                                  \
-  template <typename packet_type>                               \
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \
-      const Eigen::TensorSycl::internal::RangeAccess<           \
-          cl::sycl::access::mode::read_write,                   \
-          typename unpacket_traits<packet_type>::type>& to,     \
-      const packet_type& from) {                                \
-    pstore##alignment(to.get_pointer(), from);                  \
-  }
-
-// global space
-SYCL_PSTORE()
-SYCL_PSTORE(u)
-
-#undef SYCL_PSTORE
-
-template <typename scalar, typename packet_type, int Alignment>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(
-    Eigen::TensorSycl::internal::RangeAccess<
-        cl::sycl::access::mode::read_write,
-        typename unpacket_traits<packet_type>::type>
-        to,
-    const packet_type& from) {
-  pstoret<scalar, packet_type, Alignment>(to.get_pointer(), from);
-}
-
-}  // end namespace internal
-
-}  // end namespace Eigen
-
-#endif  // EIGEN_PACKET_MATH_SYCL_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
deleted file mode 100644
index f81e59db5..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h
+++ /dev/null
@@ -1,694 +0,0 @@
-/***************************************************************************
- *  Copyright (C) 2017 Codeplay Software Limited
- *  This Source Code Form is subject to the terms of the Mozilla
- *  Public License v. 2.0. If a copy of the MPL was not distributed
- *  with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
- *
- *
- *  SyclMemoryModel.h
- *
- *  Description:
- *    Interface for SYCL buffers to behave as a non-dereferenceable pointer
- *    Interface for Placeholder accessor to behave as a pointer on both host
- *    and device
- *
- * Authors:
- *
- *    Ruyman Reyes   Codeplay Software Ltd.
- *    Mehdi Goli     Codeplay Software Ltd.
- *    Vanya Yaneva   Codeplay Software Ltd.
- *
- **************************************************************************/
-
-#if defined(EIGEN_USE_SYCL) && \
-    !defined(EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H)
-#define EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H
-
-#include <CL/sycl.hpp>
-#ifdef EIGEN_EXCEPTIONS
-#include <stdexcept>
-#endif
-#include <cstddef>
-#include <queue>
-#include <set>
-#include <unordered_map>
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-using sycl_acc_target = cl::sycl::access::target;
-using sycl_acc_mode = cl::sycl::access::mode;
-
-/**
- * Default values for template arguments
- */
-using buffer_data_type_t = uint8_t;
-const sycl_acc_target default_acc_target = sycl_acc_target::global_buffer;
-const sycl_acc_mode default_acc_mode = sycl_acc_mode::read_write;
-
-/**
- * PointerMapper
- *  Associates fake pointers with buffers.
- *
- */
-class PointerMapper {
- public:
-  using base_ptr_t = std::intptr_t;
-
-  /* Structure of a virtual pointer
-   *
-   * |================================================|
-   * |               POINTER ADDRESS                  |
-   * |================================================|
-   */
-  struct virtual_pointer_t {
-    /* Type for the pointers
-     */
-    base_ptr_t m_contents;
-
-    /** Conversions from virtual_pointer_t to
-     * void * should just reinterpret_cast the integer number
-     */
-    operator void *() const { return reinterpret_cast<void *>(m_contents); }
-
-    /**
-     * Convert back to the integer number.
-     */
-    operator base_ptr_t() const { return m_contents; }
-
-    /**
-     * Add a certain value to the pointer to create a
-     * new pointer to that offset
-     */
-    virtual_pointer_t operator+(size_t off) { return m_contents + off; }
-
-    /* Numerical order for sorting pointers in containers. */
-    bool operator<(virtual_pointer_t rhs) const {
-      return (static_cast<base_ptr_t>(m_contents) <
-              static_cast<base_ptr_t>(rhs.m_contents));
-    }
-
-    bool operator>(virtual_pointer_t rhs) const {
-      return (static_cast<base_ptr_t>(m_contents) >
-              static_cast<base_ptr_t>(rhs.m_contents));
-    }
-
-    /**
-     * Numerical order for sorting pointers in containers
-     */
-    bool operator==(virtual_pointer_t rhs) const {
-      return (static_cast<base_ptr_t>(m_contents) ==
-              static_cast<base_ptr_t>(rhs.m_contents));
-    }
-
-    /**
-     * Simple forward to the equality overload.
-     */
-    bool operator!=(virtual_pointer_t rhs) const {
-      return !(this->operator==(rhs));
-    }
-
-    /**
-     * Converts a void * into a virtual pointer structure.
-     * Note that this will only work if the void * was
-     * already a virtual_pointer_t, but we have no way of
-     * checking
-     */
-    virtual_pointer_t(const void *ptr)
-        : m_contents(reinterpret_cast<base_ptr_t>(ptr)){};
-
-    /**
-     * Creates a virtual_pointer_t from the given integer
-     * number
-     */
-    virtual_pointer_t(base_ptr_t u) : m_contents(u){};
-  };
-
-  /* Definition of a null pointer
-   */
-  const virtual_pointer_t null_virtual_ptr = nullptr;
-
-  /**
-   * Whether if a pointer is null or not.
-   * A pointer is nullptr if the value is of null_virtual_ptr
-   */
-  static inline bool is_nullptr(virtual_pointer_t ptr) {
-    return (static_cast<void *>(ptr) == nullptr);
-  }
-
-  /* basic type for all buffers
-   */
-  using buffer_t = cl::sycl::buffer_mem;
-
-  /**
-   * Node that stores information about a device allocation.
-   * Nodes are sorted by size to organise a free list of nodes
-   * that can be recovered.
-   */
-  struct pMapNode_t {
-    buffer_t m_buffer;
-    size_t m_size;
-    bool m_free;
-
-    pMapNode_t(buffer_t b, size_t size, bool f)
-        : m_buffer{b}, m_size{size}, m_free{f} {
-      m_buffer.set_final_data(nullptr);
-    }
-
-    bool operator<=(const pMapNode_t &rhs) { return (m_size <= rhs.m_size); }
-  };
-
-  /** Storage of the pointer / buffer tree
-   */
-  using pointerMap_t = std::map<virtual_pointer_t, pMapNode_t>;
-
-  /**
-   * Obtain the insertion point in the pointer map for
-   * a pointer of the given size.
-   * \param requiredSize Size attemted to reclaim
-   */
-  typename pointerMap_t::iterator get_insertion_point(size_t requiredSize) {
-    typename pointerMap_t::iterator retVal;
-    bool reuse = false;
-    if (!m_freeList.empty()) {
-      // try to re-use an existing block
-      for (auto freeElem : m_freeList) {
-        if (freeElem->second.m_size >= requiredSize) {
-          retVal = freeElem;
-          reuse = true;
-          // Element is not going to be free anymore
-          m_freeList.erase(freeElem);
-          break;
-        }
-      }
-    }
-    if (!reuse) {
-      retVal = std::prev(m_pointerMap.end());
-    }
-    return retVal;
-  }
-
-  /**
-   * Returns an iterator to the node that stores the information
-   * of the given virtual pointer from the given pointer map structure.
-   * If pointer is not found, throws std::out_of_range.
-   * If the pointer map structure is empty, throws std::out_of_range
-   *
-   * \param pMap the pointerMap_t structure storing all the pointers
-   * \param virtual_pointer_ptr The virtual pointer to obtain the node of
-   * \throws std::out:of_range if the pointer is not found or pMap is empty
-   */
-  typename pointerMap_t::iterator get_node(const virtual_pointer_t ptr) {
-    if (this->count() == 0) {
-      m_pointerMap.clear();
-      EIGEN_THROW_X(std::out_of_range("There are no pointers allocated\n"));
-
-    }
-    if (is_nullptr(ptr)) {
-      m_pointerMap.clear();
-      EIGEN_THROW_X(std::out_of_range("Cannot access null pointer\n"));
-    }
-    // The previous element to the lower bound is the node that
-    // holds this memory address
-    auto node = m_pointerMap.lower_bound(ptr);
-    // If the value of the pointer is not the one of the node
-    // then we return the previous one
-    if (node == std::end(m_pointerMap)) {
-      --node;
-    } else if (node->first != ptr) {
-      if (node == std::begin(m_pointerMap)) {
-        m_pointerMap.clear();
-        EIGEN_THROW_X(
-            std::out_of_range("The pointer is not registered in the map\n"));
-
-      }
-      --node;
-    }
-
-    return node;
-  }
-
-  /* get_buffer.
-   * Returns a buffer from the map using the pointer address
-   */
-  template <typename buffer_data_type = buffer_data_type_t>
-  cl::sycl::buffer<buffer_data_type, 1> get_buffer(
-      const virtual_pointer_t ptr) {
-    using sycl_buffer_t = cl::sycl::buffer<buffer_data_type, 1>;
-
-    // get_node() returns a `buffer_mem`, so we need to cast it to a `buffer<>`.
-    // We can do this without the `buffer_mem` being a pointer, as we
-    // only declare member variables in the base class (`buffer_mem`) and not in
-    // the child class (`buffer<>).
-    auto node = get_node(ptr);
-    eigen_assert(node->first == ptr || node->first < ptr);
-    eigen_assert(ptr < static_cast<virtual_pointer_t>(node->second.m_size +
-                                                      node->first));
-    return *(static_cast<sycl_buffer_t *>(&node->second.m_buffer));
-  }
-
-  /**
-   * @brief Returns an accessor to the buffer of the given virtual pointer
-   * @param accessMode
-   * @param accessTarget
-   * @param ptr The virtual pointer
-   */
-  template <sycl_acc_mode access_mode = default_acc_mode,
-            sycl_acc_target access_target = default_acc_target,
-            typename buffer_data_type = buffer_data_type_t>
-  cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
-  get_access(const virtual_pointer_t ptr) {
-    auto buf = get_buffer<buffer_data_type>(ptr);
-    return buf.template get_access<access_mode, access_target>();
-  }
-
-  /**
-   * @brief Returns an accessor to the buffer of the given virtual pointer
-   *        in the given command group scope
-   * @param accessMode
-   * @param accessTarget
-   * @param ptr The virtual pointer
-   * @param cgh Reference to the command group scope
-   */
-  template <sycl_acc_mode access_mode = default_acc_mode,
-            sycl_acc_target access_target = default_acc_target,
-            typename buffer_data_type = buffer_data_type_t>
-  cl::sycl::accessor<buffer_data_type, 1, access_mode, access_target>
-  get_access(const virtual_pointer_t ptr, cl::sycl::handler &cgh) {
-    auto buf = get_buffer<buffer_data_type>(ptr);
-    return buf.template get_access<access_mode, access_target>(cgh);
-  }
-
-  /*
-   * Returns the offset from the base address of this pointer.
-   */
-  inline std::ptrdiff_t get_offset(const virtual_pointer_t ptr) {
-    // The previous element to the lower bound is the node that
-    // holds this memory address
-    auto node = get_node(ptr);
-    auto start = node->first;
-    eigen_assert(start == ptr || start < ptr);
-    eigen_assert(ptr < start + node->second.m_size);
-    return (ptr - start);
-  }
-
-  /*
-   * Returns the number of elements by which the given pointer is offset from
-   * the base address.
-   */
-  template <typename buffer_data_type>
-  inline size_t get_element_offset(const virtual_pointer_t ptr) {
-    return get_offset(ptr) / sizeof(buffer_data_type);
-  }
-
-  /**
-   * Constructs the PointerMapper structure.
-   */
-  PointerMapper(base_ptr_t baseAddress = 4096)
-      : m_pointerMap{}, m_freeList{}, m_baseAddress{baseAddress} {
-    if (m_baseAddress == 0) {
-      EIGEN_THROW_X(std::invalid_argument("Base address cannot be zero\n"));
-    }
-  };
-
-  /**
-   * PointerMapper cannot be copied or moved
-   */
-  PointerMapper(const PointerMapper &) = delete;
-
-  /**
-   * Empty the pointer list
-   */
-  inline void clear() {
-    m_freeList.clear();
-    m_pointerMap.clear();
-  }
-
-  /* add_pointer.
-   * Adds an existing pointer to the map and returns the virtual pointer id.
-   */
-  inline virtual_pointer_t add_pointer(const buffer_t &b) {
-    return add_pointer_impl(b);
-  }
-
-  /* add_pointer.
-   * Adds a pointer to the map and returns the virtual pointer id.
-   */
-  inline virtual_pointer_t add_pointer(buffer_t &&b) {
-    return add_pointer_impl(b);
-  }
-
-  /**
-   * @brief Fuses the given node with the previous nodes in the
-   *        pointer map if they are free
-   *
-   * @param node A reference to the free node to be fused
-   */
-  void fuse_forward(typename pointerMap_t::iterator &node) {
-    while (node != std::prev(m_pointerMap.end())) {
-      // if following node is free
-      // remove it and extend the current node with its size
-      auto fwd_node = std::next(node);
-      if (!fwd_node->second.m_free) {
-        break;
-      }
-      auto fwd_size = fwd_node->second.m_size;
-      m_freeList.erase(fwd_node);
-      m_pointerMap.erase(fwd_node);
-
-      node->second.m_size += fwd_size;
-    }
-  }
-
-  /**
-   * @brief Fuses the given node with the following nodes in the
-   *        pointer map if they are free
-   *
-   * @param node A reference to the free node to be fused
-   */
-  void fuse_backward(typename pointerMap_t::iterator &node) {
-    while (node != m_pointerMap.begin()) {
-      // if previous node is free, extend it
-      // with the size of the current one
-      auto prev_node = std::prev(node);
-      if (!prev_node->second.m_free) {
-        break;
-      }
-      prev_node->second.m_size += node->second.m_size;
-
-      // remove the current node
-      m_freeList.erase(node);
-      m_pointerMap.erase(node);
-
-      // point to the previous node
-      node = prev_node;
-    }
-  }
-
-  /* remove_pointer.
-   * Removes the given pointer from the map.
-   * The pointer is allowed to be reused only if ReUse if true.
-   */
-  template <bool ReUse = true>
-  void remove_pointer(const virtual_pointer_t ptr) {
-    if (is_nullptr(ptr)) {
-      return;
-    }
-    auto node = this->get_node(ptr);
-
-    node->second.m_free = true;
-    m_freeList.emplace(node);
-
-    // Fuse the node
-    // with free nodes before and after it
-    fuse_forward(node);
-    fuse_backward(node);
-
-    // If after fusing the node is the last one
-    // simply remove it (since it is free)
-    if (node == std::prev(m_pointerMap.end())) {
-      m_freeList.erase(node);
-      m_pointerMap.erase(node);
-    }
-  }
-
-  /* count.
-   * Return the number of active pointers (i.e, pointers that
-   * have been malloc but not freed).
-   */
-  size_t count() const { return (m_pointerMap.size() - m_freeList.size()); }
-
- private:
-  /* add_pointer_impl.
-   * Adds a pointer to the map and returns the virtual pointer id.
-   * BufferT is either a const buffer_t& or a buffer_t&&.
-   */
-  template <class BufferT>
-  virtual_pointer_t add_pointer_impl(BufferT b) {
-    virtual_pointer_t retVal = nullptr;
-    size_t bufSize = b.get_count();
-    pMapNode_t p{b, bufSize, false};
-    // If this is the first pointer:
-    if (m_pointerMap.empty()) {
-      virtual_pointer_t initialVal{m_baseAddress};
-      m_pointerMap.emplace(initialVal, p);
-      return initialVal;
-    }
-
-    auto lastElemIter = get_insertion_point(bufSize);
-    // We are recovering an existing free node
-    if (lastElemIter->second.m_free) {
-      lastElemIter->second.m_buffer = b;
-      lastElemIter->second.m_free = false;
-
-      // If the recovered node is bigger than the inserted one
-      // add a new free node with the remaining space
-      if (lastElemIter->second.m_size > bufSize) {
-        // create a new node with the remaining space
-        auto remainingSize = lastElemIter->second.m_size - bufSize;
-        pMapNode_t p2{b, remainingSize, true};
-
-        // update size of the current node
-        lastElemIter->second.m_size = bufSize;
-
-        // add the new free node
-        auto newFreePtr = lastElemIter->first + bufSize;
-        auto freeNode = m_pointerMap.emplace(newFreePtr, p2).first;
-        m_freeList.emplace(freeNode);
-      }
-
-      retVal = lastElemIter->first;
-    } else {
-      size_t lastSize = lastElemIter->second.m_size;
-      retVal = lastElemIter->first + lastSize;
-      m_pointerMap.emplace(retVal, p);
-    }
-    return retVal;
-  }
-
-  /**
-   * Compare two iterators to pointer map entries according to
-   * the size of the allocation on the device.
-   */
-  struct SortBySize {
-    bool operator()(typename pointerMap_t::iterator a,
-                    typename pointerMap_t::iterator b) const {
-      return ((a->first < b->first) && (a->second <= b->second)) ||
-             ((a->first < b->first) && (b->second <= a->second));
-    }
-  };
-
-  /* Maps the pointer addresses to buffer and size pairs.
-   */
-  pointerMap_t m_pointerMap;
-
-  /* List of free nodes available for re-using
-   */
-  std::set<typename pointerMap_t::iterator, SortBySize> m_freeList;
-
-  /* Base address used when issuing the first virtual pointer, allows users
-   * to specify alignment. Cannot be zero. */
-  std::intptr_t m_baseAddress;
-};
-
-/* remove_pointer.
- * Removes the given pointer from the map.
- * The pointer is allowed to be reused only if ReUse if true.
- */
-template <>
-inline void PointerMapper::remove_pointer<false>(const virtual_pointer_t ptr) {
-  if (is_nullptr(ptr)) {
-    return;
-  }
-  m_pointerMap.erase(this->get_node(ptr));
-}
-
-/**
- * Malloc-like interface to the pointer-mapper.
- * Given a size, creates a byte-typed buffer and returns a
- * fake pointer to keep track of it.
- * \param size Size in bytes of the desired allocation
- * \throw cl::sycl::exception if error while creating the buffer
- */
-inline void *SYCLmalloc(size_t size, PointerMapper &pMap) {
-  if (size == 0) {
-    return nullptr;
-  }
-  // Create a generic buffer of the given size
-  using buffer_t = cl::sycl::buffer<buffer_data_type_t, 1>;
-  auto thePointer = pMap.add_pointer(buffer_t(cl::sycl::range<1>{size}));
-  // Store the buffer on the global list
-  return static_cast<void *>(thePointer);
-}
-
-/**
- * Free-like interface to the pointer mapper.
- * Given a fake-pointer created with the virtual-pointer malloc,
- * destroys the buffer and remove it from the list.
- * If ReUse is false, the pointer is not added to the freeList,
- * it should be false only for sub-buffers.
- */
-template <bool ReUse = true, typename PointerMapper>
-inline void SYCLfree(void *ptr, PointerMapper &pMap) {
-  pMap.template remove_pointer<ReUse>(ptr);
-}
-
-/**
- * Clear all the memory allocated by SYCL.
- */
-template <typename PointerMapper>
-inline void SYCLfreeAll(PointerMapper &pMap) {
-  pMap.clear();
-}
-
-template <cl::sycl::access::mode AcMd, typename T>
-struct RangeAccess {
-  static const auto global_access = cl::sycl::access::target::global_buffer;
-  static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
-  typedef T scalar_t;
-  typedef scalar_t &ref_t;
-  typedef typename cl::sycl::global_ptr<scalar_t>::pointer_t ptr_t;
-
-  // the accessor type does not necessarily the same as T
-  typedef cl::sycl::accessor<scalar_t, 1, AcMd, global_access, is_place_holder>
-      accessor;
-
-  typedef RangeAccess<AcMd, T> self_t;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RangeAccess(accessor access,
-                                                    size_t offset,
-                                                    std::intptr_t virtual_ptr)
-      : access_(access), offset_(offset), virtual_ptr_(virtual_ptr) {}
-
-  RangeAccess(cl::sycl::buffer<scalar_t, 1> buff =
-                  cl::sycl::buffer<scalar_t, 1>(cl::sycl::range<1>(1)))
-      : access_{accessor{buff}}, offset_(0), virtual_ptr_(-1) {}
-
-  // This should be only used for null constructor on the host side
-  RangeAccess(std::nullptr_t) : RangeAccess() {}
-  // This template parameter must be removed and scalar_t should be replaced
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const {
-    return (access_.get_pointer().get() + offset_);
-  }
-  template <typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) {
-    offset_ += (offset);
-    return *this;
-  }
-  template <typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator+(Index offset) const {
-    return self_t(access_, offset_ + offset, virtual_ptr_);
-  }
-  template <typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator-(Index offset) const {
-    return self_t(access_, offset_ - offset, virtual_ptr_);
-  }
-  template <typename Index>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator-=(Index offset) {
-    offset_ -= offset;
-    return *this;
-  }
-
-  // THIS IS FOR NULL COMPARISON ONLY
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
-      const RangeAccess &lhs, std::nullptr_t) {
-    return ((lhs.virtual_ptr_ == -1));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
-      const RangeAccess &lhs, std::nullptr_t i) {
-    return !(lhs == i);
-  }
-
-  // THIS IS FOR NULL COMPARISON ONLY
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==(
-      std::nullptr_t, const RangeAccess &rhs) {
-    return ((rhs.virtual_ptr_ == -1));
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=(
-      std::nullptr_t i, const RangeAccess &rhs) {
-    return !(i == rhs);
-  }
-  // Prefix operator (Increment and return value)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator++() {
-    offset_++;
-    return (*this);
-  }
-
-  // Postfix operator (Return value and increment)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator++(int i) {
-    EIGEN_UNUSED_VARIABLE(i);
-    self_t temp_iterator(*this);
-    offset_++;
-    return temp_iterator;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_size() const {
-    return (access_.get_count() - offset_);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_offset() const {
-    return offset_;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_offset(std::ptrdiff_t offset) {
-    offset_ = offset;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() const {
-    return *get_pointer();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() {
-    return *get_pointer();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t operator->() = delete;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) {
-    return *(get_pointer() + x);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) const {
-    return *(get_pointer() + x);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_t *get_virtual_pointer() const {
-    return reinterpret_cast<scalar_t *>(virtual_ptr_ +
-                                        (offset_ * sizeof(scalar_t)));
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit operator bool() const {
-    return (virtual_ptr_ != -1);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator RangeAccess<AcMd, const T>() {
-    return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  operator RangeAccess<AcMd, const T>() const {
-    return RangeAccess<AcMd, const T>(access_, offset_, virtual_ptr_);
-  }
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
-      cl::sycl::handler &cgh) const {
-    cgh.require(access_);
-  }
-
- private:
-  accessor access_;
-  size_t offset_;
-  std::intptr_t virtual_ptr_;  // the location of the buffer in the map
-};
-
-template <cl::sycl::access::mode AcMd, typename T>
-struct RangeAccess<AcMd, const T> : RangeAccess<AcMd, T> {
-  typedef RangeAccess<AcMd, T> Base;
-  using Base::Base;
-};
-
-}  // namespace internal
-}  // namespace TensorSycl
-}  // namespace Eigen
-
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h
deleted file mode 100644
index 9208ab21d..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h
+++ /dev/null
@@ -1,85 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TypeCasting.h
- *
- * \brief:
- *  TypeCasting
- *
- *****************************************************************/
-
-#ifndef EIGEN_TYPE_CASTING_SYCL_H
-#define EIGEN_TYPE_CASTING_SYCL_H
-
-namespace Eigen {
-
-namespace internal {
-#ifdef SYCL_DEVICE_ONLY
-template <>
-struct type_casting_traits<float, int> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4
-pcast<cl::sycl::cl_float4, cl::sycl::cl_int4>(const cl::sycl::cl_float4& a) {
-  return a
-      .template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>();
-}
-
-template <>
-struct type_casting_traits<int, float> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
-pcast<cl::sycl::cl_int4, cl::sycl::cl_float4>(const cl::sycl::cl_int4& a) {
-  return a.template convert<cl::sycl::cl_float,
-                            cl::sycl::rounding_mode::automatic>();
-}
-
-template <>
-struct type_casting_traits<double, float> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
-};
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4
-pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
-    const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) {
-  auto a1 = a.template convert<cl::sycl::cl_float,
-                               cl::sycl::rounding_mode::automatic>();
-  auto b1 = b.template convert<cl::sycl::cl_float,
-                               cl::sycl::rounding_mode::automatic>();
-  return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y());
-}
-
-template <>
-struct type_casting_traits<float, double> {
-  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
-};
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2
-pcast<cl::sycl::cl_float4, cl::sycl::cl_double2>(const cl::sycl::cl_float4& a) {
-  // Simply discard the second half of the input
-  return cl::sycl::cl_double2(a.x(), a.y());
-}
-
-#endif
-}  // end namespace internal
-
-}  // end namespace Eigen
-
-#endif  // EIGEN_TYPE_CASTING_SYCL_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/Complex.h
index d3e41b43e..1bfb73397 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/Complex.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/Complex.h
@@ -15,10 +15,6 @@ namespace Eigen {
 
 namespace internal {
 
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-static Packet4ui  p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
-#endif
-
 static Packet2ul  p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 static Packet2ul  p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO,  (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 };
 
@@ -33,14 +29,10 @@ struct Packet2cf
 {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
   union {
     Packet4f v;
     Packet1cd cd[2];
   };
-#else
-  Packet4f v;
-#endif
 };
 
 template<> struct packet_traits<std::complex<float> >  : default_packet_traits
@@ -91,33 +83,69 @@ template<> struct packet_traits<std::complex<double> >  : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float>  type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; };
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; };
+template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float>  type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; };
+template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; };
 
 /* Forward declaration */
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel);
 
-/* complex<double> first */
+template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
 template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
+template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
+template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
 template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
 
 template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
 { /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
 
+template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
+{
+  Packet2cf res;
+  res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
+  res.cd[1] = res.cd[0];
+  return res;
+}
+template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+  return pload<Packet2cf>(af);
+}
 template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index stride EIGEN_UNUSED)
 {
   return pload<Packet1cd>(from);
 }
+template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
+{
+  std::complex<float> EIGEN_ALIGN16 af[2];
+  pstore<std::complex<float> >((std::complex<float> *) af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
 template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from, Index stride EIGEN_UNUSED)
 {
   pstore<std::complex<double> >(to, from);
 }
+
+template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); }
+template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
 template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); }
 template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
 template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
+  res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
+  return res;
+}
+
 template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
 {
   Packet2d a_re, a_im, v1, v2;
@@ -135,12 +163,27 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, con
 
   return Packet1cd(v1 + v2);
 }
-template<> EIGEN_STRONG_INLINE Packet1cd pand    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por     <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from) {  return pset1<Packet1cd>(*from); }
+template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
+{
+  Packet2cf res;
+  res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+  res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+  return res;
+}
 
+template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
+template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); }
+template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
+
+template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>*     from) {  return pset1<Packet1cd>(*from); }
+template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*      from) {  return pset1<Packet2cf>(*from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *     addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
 
 template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
@@ -150,16 +193,83 @@ template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Pac
 
   return res;
 }
+template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> EIGEN_ALIGN16 res[2];
+  pstore<std::complex<float> >(res, a);
+
+  return res[0];
+}
 
 template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
+{
+  Packet2cf res;
+  res.cd[0] = a.cd[1];
+  res.cd[1] = a.cd[0];
+  return res;
+}
+
 template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
 {
   return pfirst(a);
 }
+template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}
+
+template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
+{
+  return vecs[0];
+}
+template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
+{
+  PacketBlock<Packet2cf,2> transpose;
+  transpose.packet[0] = vecs[0];
+  transpose.packet[1] = vecs[1];
+  ptranspose(transpose);
+
+  return padd<Packet2cf>(transpose.packet[0], transpose.packet[1]);
+} 
+
 template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
 {
   return pfirst(a);
 }
+template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
+{
+  std::complex<float> res;
+  Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}
+
+template<int Offset>
+struct palign_impl<Offset,Packet1cd>
+{
+  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
+  {
+    // FIXME is it sure we never have to align a Packet1cd?
+    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
+  }
+};
+
+template<int Offset>
+struct palign_impl<Offset,Packet2cf>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
+  {
+    if (Offset == 1) {
+      first.cd[0] = first.cd[1];
+      first.cd[1] = second.cd[0];
+    }
+  }
+};
+
 template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
 {
   EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
@@ -193,134 +303,6 @@ template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
   }
 };
 
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
-
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  // TODO optimize it for AltiVec
-  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
-  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
-  return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
-}
-
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
-  return Packet1cd(preverse(Packet2d(x.v)));
-}
-
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
-{
-  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
-  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
-  kernel.packet[0].v = tmp;
-}
-
-/* complex<float> follows */
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from)  { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *     to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
-
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
-{
-  std::complex<float> EIGEN_ALIGN16 res[2];
-  pstore<std::complex<float> >(res, a);
-
-  return res[0];
-}
-
-
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
-{
-  Packet2cf res;
-  res.cd[0] = Packet1cd(vec_ld2f((const float *)&from));
-  res.cd[1] = res.cd[0];
-  return res;
-}
-#else
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
-{
-  Packet2cf res;
-  if((std::ptrdiff_t(&from) % 16) == 0)
-    res.v = pload<Packet4f>((const float *)&from);
-  else
-    res.v = ploadu<Packet4f>((const float *)&from);
-  res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
-  return res;
-}
-#endif
-
-template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride)
-{
-  std::complex<float> EIGEN_ALIGN16 af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  return pload<Packet2cf>(af);
-}
-template<> EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index stride)
-{
-  std::complex<float> EIGEN_ALIGN16 af[2];
-  pstore<std::complex<float> >((std::complex<float> *) af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v, b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot<Packet4f>(a.v,b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*      from) {  return pset1<Packet2cf>(*from); }
-
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *     addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-
-
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
-  Packet2cf res;
-  res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
-  res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
-  return res;
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  Packet2cf res;
-  res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
-  res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
-  return res;
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
-  Packet2cf res;
-  res.cd[0] = a.cd[1];
-  res.cd[1] = a.cd[0];
-  return res;
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
-  std::complex<float> res;
-  Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
-  vec_st2f(b.v, (float*)&res);
-  return res;
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
-  std::complex<float> res;
-  Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
-  vec_st2f(b.v, (float*)&res);
-  return res;
-}
-
 template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
 {
   EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
@@ -355,6 +337,15 @@ template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
 };
 
 EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d)
+
+template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
+{
+  // TODO optimize it for AltiVec
+  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
+  Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_);
+  return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64)));
+}
 
 template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
 {
@@ -365,6 +356,11 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, con
   return res;
 }
 
+EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
+{
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
 EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
 {
   Packet2cf res;
@@ -373,6 +369,13 @@ EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
   return res;
 }
 
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd,2>& kernel)
+{
+  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
 EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
 {
   Packet1cd tmp = kernel.packet[0].cd[1];
@@ -386,116 +389,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con
   result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
   return result;
 }
-#else
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR))); }
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  Packet4f a_re, a_im, prod, prod_im;
-
-  // Permute and multiply the real parts of a and b
-  a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
-  
-  // Get the imaginary parts of a
-  a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
-
-  // multiply a_im * b and get the conjugate result
-  prod_im = a_im * b.v;
-  prod_im = pxor<Packet4f>(prod_im, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR));
-  // permute back to a proper order
-  prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV);
-
-  // multiply a_re * b, add prod_im
-  prod = pmadd<Packet4f>(a_re, b.v, prod_im);
- 
-  return Packet2cf(prod);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
-  Packet4f rev_a;
-  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
-  return Packet2cf(rev_a);
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
-  Packet4f b;
-  b = vec_sld(a.v, a.v, 8);
-  b = padd<Packet4f>(a.v, b);
-  return pfirst<Packet2cf>(Packet2cf(b));
-}
-
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
-  Packet4f b;
-  Packet2cf prod;
-  b = vec_sld(a.v, a.v, 8);
-  prod = pmul<Packet2cf>(a, Packet2cf(b));
-
-  return pfirst<Packet2cf>(prod);
-}
-
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
-};
-
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
-};
-
-EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f)
-
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  // TODO optimize it for AltiVec
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a, b);
-  Packet4f s = pmul<Packet4f>(b.v, b.v);
-  return Packet2cf(pdiv(res.v, padd<Packet4f>(s, vec_perm(s, s, p16uc_COMPLEX32_REV))));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
-{
-  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
-}
-
-EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf,2>& kernel)
-{
-  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
-  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
-  kernel.packet[0].v = tmp;
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) {
-  Packet2cf result;
-  result.v = reinterpret_cast<Packet4f>(pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
-  return result;
-}
-#endif
 
 } // end namespace internal
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h
index 689ecc702..5c7aa7256 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -20,50 +20,6 @@ namespace Eigen {
 
 namespace internal {
 
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-static _EIGEN_DECLARE_CONST_Packet4i(23, 23);
-
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-/* the smallest non denormalized float number */
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000); // -1.f/0.f
-static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan,     0xffffffff);
-  
-/* natural logarithm computed for 4 simultaneous float
-  return NaN for x <= 0
-*/
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
-static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-#endif
-
 static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
 static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
 static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
@@ -137,101 +93,43 @@ Packet2d pexp<Packet2d>(const Packet2d& _x)
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& _x)
+Packet4f pexp<Packet4f>(const Packet4f& x)
 {
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-/*
-  Packet4f x = _x;
-
-  Packet4f tmp, fx;
-  Packet4i emm0;
-
-  // clamp x
-  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-
-  // express exp(x) as exp(g + n*log(2))
-  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
-  fx = pfloor(fx);
-
-  tmp = pmul(fx, p4f_cephes_exp_C1);
-  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  z = pmul(x,x);
-
-  Packet4f y = p4f_cephes_exp_p0;
-  y = pmadd(y, x, p4f_cephes_exp_p1);
-  y = pmadd(y, x, p4f_cephes_exp_p2);
-  y = pmadd(y, x, p4f_cephes_exp_p3);
-  y = pmadd(y, x, p4f_cephes_exp_p4);
-  y = pmadd(y, x, p4f_cephes_exp_p5);
-  y = pmadd(y, z, x);
-  y = padd(y, p4f_1);
-
-  // build 2^n
-  emm0 = vec_cts(fx, 0);
-  emm0 = emm0 + p4i_0x7f;
-  emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
-
-  // Altivec's max & min operators just drop silent NaNs. Check NaNs in 
-  // inputs and return them unmodified.
-  Packet4ui isnumber_mask = reinterpret_cast<Packet4ui>(vec_cmpeq(_x, _x));
-  return vec_sel(_x, pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x),
-                 isnumber_mask);*/
-  return _x;
-#else
   Packet4f res;
-  res.v4f[0] = pexp<Packet2d>(_x.v4f[0]);
-  res.v4f[1] = pexp<Packet2d>(_x.v4f[1]);
+  res.v4f[0] = pexp<Packet2d>(x.v4f[0]);
+  res.v4f[1] = pexp<Packet2d>(x.v4f[1]);
   return res;
-#endif
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d psqrt<Packet2d>(const Packet2d& x)
 {
-  return vec_sqrt(x);
+  return  __builtin_s390_vfsqdb(x);
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f psqrt<Packet4f>(const Packet4f& x)
 {
   Packet4f res;
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-  res = vec_sqrt(x);
-#else
   res.v4f[0] = psqrt<Packet2d>(x.v4f[0]);
   res.v4f[1] = psqrt<Packet2d>(x.v4f[1]);
-#endif
   return res;
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  // Computed as an exact 1/sqrt(x) with a full divide; a faster reciprocal-sqrt estimate would only provide an approximation.
   return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
 }
 
 template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
 Packet4f prsqrt<Packet4f>(const Packet4f& x) {
   Packet4f res;
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-  res = pset1<Packet4f>(1.0) / psqrt<Packet4f>(x);
-#else
   res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
   res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
-#endif
   return res;
 }
 
-// Hyperbolic Tangent function.
-template <>
-EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f
-ptanh<Packet4f>(const Packet4f& x) {
-  return internal::generic_fast_tanh_float(x);
-}
-
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/PacketMath.h
index 3fb642a38..57b01fc63 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/PacketMath.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -17,7 +17,7 @@ namespace Eigen {
 namespace internal {
 
 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
 #endif
 
 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
@@ -29,7 +29,7 @@ namespace internal {
 #endif
 
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  32
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS  16
 #endif
 
 typedef __vector int                 Packet4i;
@@ -41,14 +41,9 @@ typedef __vector double              Packet2d;
 typedef __vector unsigned long long  Packet2ul;
 typedef __vector long long           Packet2l;
 
-// Z14 has builtin support for float vectors
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-typedef __vector float               Packet4f;
-#else
 typedef struct {
 	Packet2d  v4f[2];
 } Packet4f;
-#endif
 
 typedef union {
   int32_t   i[4];
@@ -56,15 +51,11 @@ typedef union {
   int64_t   l[2];
   uint64_t ul[2];
   double    d[2];
-  float     f[4];
   Packet4i  v4i;
   Packet4ui v4ui;
   Packet2l  v2l;
   Packet2ul v2ul;
   Packet2d  v2d;
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-  Packet4f  v4f;
-#endif
 } Packet;
 
 // We don't want to write the same code all the time, but we need to reuse the constants
@@ -89,7 +80,7 @@ typedef union {
   Packet2l p2l_##NAME = pset1<Packet2l>(X)
 
 // These constants are endian-agnostic
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
+//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,}
 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1}
 
 static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
@@ -99,21 +90,6 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
 static Packet2d p2d_ONE = { 1.0, 1.0 }; 
 static Packet2d p2d_ZERO_ = { -0.0, -0.0 };
 
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
-
-#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = pset1<Packet4f>(X)
-
-#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
-
-static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0}
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1}
-static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000};
-#endif
-
 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 };
 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 };
 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
@@ -144,9 +120,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0
 static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
 static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
 
-static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);                                         //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
 
-static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);                                            //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
 
 
 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
@@ -173,35 +149,29 @@ template<> struct packet_traits<int>    : default_packet_traits
   };
 };
 
-template <>
-struct packet_traits<float> : default_packet_traits {
+template<> struct packet_traits<float> : default_packet_traits
+{
   typedef Packet4f type;
   typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size = 4,
+    size=4,
     HasHalfPacket = 0,
 
-    HasAdd = 1,
-    HasSub = 1,
-    HasMul = 1,
-    HasDiv = 1,
-    HasMin = 1,
-    HasMax = 1,
-    HasAbs = 1,
-    HasSin = 0,
-    HasCos = 0,
-    HasLog = 0,
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-    HasExp = 0,
-#else
-    HasExp = 1,
-#endif
+    HasAdd  = 1,
+    HasSub  = 1,
+    HasMul  = 1,
+    HasDiv  = 1,
+    HasMin  = 1,
+    HasMax  = 1,
+    HasAbs  = 1,
+    HasSin  = 0,
+    HasCos  = 0,
+    HasLog  = 0,
+    HasExp  = 1,
     HasSqrt = 1,
     HasRsqrt = 1,
-    HasTanh = 1,
-    HasErf = 1,
     HasRound = 1,
     HasFloor = 1,
     HasCeil = 1,
@@ -241,9 +211,9 @@ template<> struct packet_traits<double> : default_packet_traits
   };
 };
 
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; };
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; };
+template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; };
+template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; };
+template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; };
 
 /* Forward declaration */
 EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f,4>& kernel);
@@ -288,301 +258,6 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v)
   return s;
 }
 
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
-inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
-{
-  Packet vt;
-  vt.v4f = v;
-  s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
-  return s;
-}
-#endif
-
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
-{
-  // FIXME: No intrinsic yet
-  EIGEN_DEBUG_ALIGNED_LOAD
-  Packet *vfrom;
-  vfrom = (Packet *) from;
-  return vfrom->v4i;
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
-{
-  // FIXME: No intrinsic yet
-  EIGEN_DEBUG_ALIGNED_LOAD
-  Packet *vfrom;
-  vfrom = (Packet *) from;
-  return vfrom->v2d;
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
-{
-  // FIXME: No intrinsic yet
-  EIGEN_DEBUG_ALIGNED_STORE
-  Packet *vto;
-  vto = (Packet *) to;
-  vto->v4i = from;
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
-{
-  // FIXME: No intrinsic yet
-  EIGEN_DEBUG_ALIGNED_STORE
-  Packet *vto;
-  vto = (Packet *) to;
-  vto->v2d = from;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)
-{
-  return vec_splats(from);
-}
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
-  return vec_splats(from);
-}
-
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4i>(const int *a,
-                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
-{
-  a3 = pload<Packet4i>(a);
-  a0 = vec_splat(a3, 0);
-  a1 = vec_splat(a3, 1);
-  a2 = vec_splat(a3, 2);
-  a3 = vec_splat(a3, 3);
-}
-
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet2d>(const double *a,
-                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
-{
-  a1 = pload<Packet2d>(a);
-  a0 = vec_splat(a1, 0);
-  a1 = vec_splat(a1, 1);
-  a3 = pload<Packet2d>(a+2);
-  a2 = vec_splat(a3, 0);
-  a3 = vec_splat(a3, 1);
-}
-
-template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
-{
-  int EIGEN_ALIGN16 ai[4];
-  ai[0] = from[0*stride];
-  ai[1] = from[1*stride];
-  ai[2] = from[2*stride];
-  ai[3] = from[3*stride];
- return pload<Packet4i>(ai);
-}
-
-template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
-{
-  double EIGEN_ALIGN16 af[2];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
- return pload<Packet2d>(af);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
-{
-  int EIGEN_ALIGN16 ai[4];
-  pstore<int>((int *)ai, from);
-  to[0*stride] = ai[0];
-  to[1*stride] = ai[1];
-  to[2*stride] = ai[2];
-  to[3*stride] = ai[3];
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
-{
-  double EIGEN_ALIGN16 af[2];
-  pstore<double>(af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
-
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
-
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
-
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
-
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
-
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
-
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
-template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
-
-template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)    { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
-
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
-
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
-
-template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
-
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*       from) { return pload<Packet4i>(from); }
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double*    from) { return pload<Packet2d>(from); }
-
-
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
-{
-  Packet4i p = pload<Packet4i>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
-{
-  Packet2d p = pload<Packet2d>(from);
-  return vec_perm(p, p, p16uc_PSET64_HI);
-}
-
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*        to, const Packet4i& from) { pstore<int>(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { pstore<double>(to, from); }
-
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int    EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
-
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{
-  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{
-  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
-
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
-  Packet4i b, sum;
-  b   = vec_sld(a, a, 8);
-  sum = padd<Packet4i>(a, b);
-  b   = vec_sld(sum, sum, 4);
-  sum = padd<Packet4i>(sum, b);
-  return pfirst(sum);
-}
-
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
-  Packet2d b, sum;
-  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
-  sum = padd<Packet2d>(a, b);
-  return pfirst(sum);
-}
-
-// Other reduction functions:
-// mul
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  return aux[0] * aux[1] * aux[2] * aux[3];
-}
-
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
-  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
-}
-
-// min
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
-  Packet4i b, res;
-  b   = pmin<Packet4i>(a, vec_sld(a, a, 8));
-  res = pmin<Packet4i>(b, vec_sld(b, b, 4));
-  return pfirst(res);
-}
-
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
-  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
-}
-
-// max
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
-  Packet4i b, res;
-  b = pmax<Packet4i>(a, vec_sld(a, a, 8));
-  res = pmax<Packet4i>(b, vec_sld(b, b, 4));
-  return pfirst(res);
-}
-
-// max
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
-  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4i,4>& kernel) {
-  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
-  Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
-  Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
-  Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
-  kernel.packet[0] = vec_mergeh(t0, t2);
-  kernel.packet[1] = vec_mergel(t0, t2);
-  kernel.packet[2] = vec_mergeh(t1, t3);
-  kernel.packet[3] = vec_mergel(t1, t3);
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet2d,2>& kernel) {
-  Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
-  Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
-  kernel.packet[0] = t0;
-  kernel.packet[1] = t1;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
-  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
-
-template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
-  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
-  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
-  return vec_sel(elsePacket, thenPacket, mask);
-}
-
-/* z13 has no vector float support so we emulate that with double
-   z14 has proper vector float support.
-*/
-#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
 /* Helper function to simulate a vec_splat_packet4f
  */
 template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f&   from)
@@ -609,6 +284,66 @@ template<int element> EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Pack
   return splat;
 }
 
+template<int Offset>
+struct palign_impl<Offset,Packet4i>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)  // overwrites first in place; second is read-only
+  {
+    switch (Offset % 4) {  // a remainder of 0 matches no case, so first is left unchanged
+    case 1:
+      first = vec_sld(first, second, 4); break;  // vec_sld with a byte count of 4
+    case 2:
+      first = vec_sld(first, second, 8); break;  // vec_sld with a byte count of 8
+    case 3:
+      first = vec_sld(first, second, 12); break;  // vec_sld with a byte count of 12
+    }
+  }
+};
+
+/* Tricky case: this code accesses Packet4f as two Packet2d halves (v4f[0], v4f[1]), so a float
+ * offset is translated into 8-byte (sizeof double) vec_sld shifts / whole-half moves across the halves. */
+template<int Offset>
+struct palign_impl<Offset,Packet4f>
+{
+  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)  // overwrites first in place; second is read-only
+  {
+    switch (Offset % 4) {  // a remainder of 0 matches no case, so first is left unchanged
+    case 1:
+      first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8);  // 8-byte vec_sld across first's own two halves
+      first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8);  // reads first.v4f[1], which the line above did not modify
+      break;
+    case 2:
+      first.v4f[0] = first.v4f[1];  // whole-half move, no vec_sld needed
+      first.v4f[1] = second.v4f[0];
+      break;
+    case 3:
+      first.v4f[0] = vec_sld(first.v4f[1],  second.v4f[0], 8);  // 8-byte vec_sld across first's upper half and second's lower half
+      first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8);  // taken entirely from second
+      break;
+    }
+  }
+};
+
+
+template<int Offset>
+struct palign_impl<Offset,Packet2d>
+{
+  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)  // overwrites first in place; second is read-only
+  {
+    if (Offset == 1)  // any other Offset leaves first unchanged
+      first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(first), reinterpret_cast<Packet4i>(second), 8));  // 8-byte vec_sld done on Packet4i views, result cast back to Packet2d
+  }
+};
+
+template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from)
+{
+  // FIXME: No intrinsic yet -- the load is done by reading through the Packet union instead
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet *vfrom;
+  vfrom = (Packet *) from;  // view the source address as the Packet union (the C-style cast also drops const)
+  return vfrom->v4i;  // read the union's Packet4i member
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from)
 {
   // FIXME: No intrinsic yet
@@ -619,6 +354,24 @@ template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from)
   return vfrom;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from)
+{
+  // FIXME: No intrinsic yet -- the load is done by reading through the Packet union instead
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet *vfrom;
+  vfrom = (Packet *) from;  // view the source address as the Packet union (the C-style cast also drops const)
+  return vfrom->v2d;  // read the union's Packet2d member
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from)
+{
+  // FIXME: No intrinsic yet -- the store is done by assigning through the Packet union instead
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;  // view the destination address as the Packet union
+  vto->v4i = from;  // write the whole Packet4i through the union's v4i member
+}
+
 template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from)
 {
   // FIXME: No intrinsic yet
@@ -627,6 +380,23 @@ template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& f
   vec_st2f(from.v4f[1], &to[2]);
 }
 
+
+template<> EIGEN_STRONG_INLINE void pstore<double>(double*   to, const Packet2d& from)
+{
+  // FIXME: No intrinsic yet -- the store is done by assigning through the Packet union instead
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet *vto;
+  vto = (Packet *) to;  // view the destination address as the Packet union
+  vto->v2d = from;  // write the whole Packet2d through the union's v2d member
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)
+{
+  return vec_splats(from);  // replicate the scalar into every lane of the result
+}
+template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  return vec_splats(from);  // replicate the scalar into every lane of the result
+}
 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&    from)
 {
   Packet4f to;
@@ -635,6 +405,17 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&    from)
   return to;
 }
 
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet4i>(const int *a,
+                      Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3)
+{
+  a3 = pload<Packet4i>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
 template<> EIGEN_STRONG_INLINE void
 pbroadcast4<Packet4f>(const float *a,
                       Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
@@ -646,6 +427,28 @@ pbroadcast4<Packet4f>(const float *a,
   a3 = vec_splat_packet4f<3>(a3);
 }
 
+template<> EIGEN_STRONG_INLINE void
+pbroadcast4<Packet2d>(const double *a,
+                      Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3)
+{
+  a1 = pload<Packet2d>(a);
+  a0 = vec_splat(a1, 0);
+  a1 = vec_splat(a1, 1);
+  a3 = pload<Packet2d>(a+2);
+  a2 = vec_splat(a3, 0);
+  a3 = vec_splat(a3, 1);
+}
+
+template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  ai[0] = from[0*stride];
+  ai[1] = from[1*stride];
+  ai[2] = from[2*stride];
+  ai[3] = from[3*stride];
+ return pload<Packet4i>(ai);
+}
+
 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
 {
   float EIGEN_ALIGN16 ai[4];
@@ -656,6 +459,24 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const floa
  return pload<Packet4f>(ai);
 }
 
+template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride)
+{
+  double EIGEN_ALIGN16 af[2];
+  af[0] = from[0*stride];
+  af[1] = from[1*stride];
+ return pload<Packet2d>(af);
+}
+
+template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride)
+{
+  int EIGEN_ALIGN16 ai[4];
+  pstore<int>((int *)ai, from);
+  to[0*stride] = ai[0];
+  to[1*stride] = ai[1];
+  to[2*stride] = ai[2];
+  to[3*stride] = ai[3];
+}
+
 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
 {
   float EIGEN_ALIGN16 ai[4];
@@ -666,6 +487,15 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, co
   to[3*stride] = ai[3];
 }
 
+template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride)
+{
+  double EIGEN_ALIGN16 af[2];
+  pstore<double>(af, from);
+  to[0*stride] = af[0];
+  to[1*stride] = af[1];
+}
+
+template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a + b); }
 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   Packet4f c;
@@ -673,7 +503,9 @@ template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const
   c.v4f[1] = a.v4f[1] + b.v4f[1];
   return c;
 }
+template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a + b); }
 
+template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a - b); }
 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   Packet4f c;
@@ -681,7 +513,9 @@ template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const
   c.v4f[1] = a.v4f[1] - b.v4f[1];
   return c;
 }
+template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a - b); }
 
+template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a * b); }
 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   Packet4f c;
@@ -689,7 +523,9 @@ template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const
   c.v4f[1] = a.v4f[1] * b.v4f[1];
   return c;
 }
+template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a * b); }
 
+template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) { return (a / b); }
 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   Packet4f c;
@@ -697,7 +533,9 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const
   c.v4f[1] = a.v4f[1] / b.v4f[1];
   return c;
 }
+template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return (a / b); }
 
+template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); }
 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
 {
   Packet4f c;
@@ -705,7 +543,13 @@ template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
   c.v4f[1] = -a.v4f[1];
   return c;
 }
+template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); }
 
+template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
+template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd<Packet4i>(pmul<Packet4i>(a, b), c); }
 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c)
 {
   Packet4f res;
@@ -713,7 +557,14 @@ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f&
   res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
   return res;
 }
+template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); }
 
+template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a)    { return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
+template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN); }
+
+template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   Packet4f res;
@@ -722,6 +573,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const
   return res;
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   Packet4f res;
@@ -730,6 +583,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const
   return res;
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   Packet4f res;
@@ -738,6 +593,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const
   return res;
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   Packet4f res;
@@ -746,6 +603,8 @@ template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const P
   return res;
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
+template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); }
 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   Packet4f res;
@@ -754,6 +613,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const
   return res;
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return pand<Packet4i>(a, vec_nor(b, b)); }
+template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); }
 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
   Packet4f res;
@@ -769,7 +630,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a)
   res.v4f[1] = vec_round(a.v4f[1]);
   return res;
 }
-
+template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a)
 {
   Packet4f res;
@@ -777,7 +638,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const  Packet4f& a)
   res.v4f[1] = vec_ceil(a.v4f[1]);
   return res;
 }
-
+template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const  Packet2d& a) { return vec_ceil(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
 {
   Packet4f res;
@@ -785,6 +646,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a)
   res.v4f[1] = vec_floor(a.v4f[1]);
   return res;
 }
+template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); }
+
+template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*       from) { return pload<Packet4i>(from); }
+template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*     from) { return pload<Packet4f>(from); }
+template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double*    from) { return pload<Packet2d>(from); }
+
+
+template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
+{
+  Packet4i p = pload<Packet4i>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*    from)
 {
@@ -794,7 +667,33 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*    from)
   return p;
 }
 
+template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*   from)
+{
+  Packet2d p = pload<Packet2d>(from);
+  return vec_perm(p, p, p16uc_PSET64_HI);
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*        to, const Packet4i& from) { pstore<int>(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*    to, const Packet4f& from) { pstore<float>(to, from); }
+template<> EIGEN_STRONG_INLINE void pstoreu<double>(double*  to, const Packet2d& from) { pstore<double>(to, from); }
+
+template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
+
+template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int    EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
 template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float  EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; }
+template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; }
+
+template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
+{
+  return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
+{
+  return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
 
 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
 {
@@ -804,6 +703,8 @@ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
   return rev;
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) { return vec_abs(a); }
+template<> EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) { return vec_abs(a); }
 template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
 {
   Packet4f res;
@@ -812,6 +713,23 @@ template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a)
   return res;
 }
 
+template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, sum;
+  b   = vec_sld(a, a, 8);
+  sum = padd<Packet4i>(a, b);
+  b   = vec_sld(sum, sum, 4);
+  sum = padd<Packet4i>(sum, b);
+  return pfirst(sum);
+}
+
+template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
+{
+  Packet2d b, sum;
+  b   = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
+  sum = padd<Packet2d>(a, b);
+  return pfirst(sum);
+}
 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
 {
   Packet2d sum;
@@ -820,12 +738,94 @@ template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
   return static_cast<float>(first);
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
+{
+  Packet4i v[4], sum[4];
+
+  // It's easier and faster to transpose then add as columns
+  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
+  // Do the transpose, first set of moves
+  v[0] = vec_mergeh(vecs[0], vecs[2]);
+  v[1] = vec_mergel(vecs[0], vecs[2]);
+  v[2] = vec_mergeh(vecs[1], vecs[3]);
+  v[3] = vec_mergel(vecs[1], vecs[3]);
+  // Get the resulting vectors
+  sum[0] = vec_mergeh(v[0], v[2]);
+  sum[1] = vec_mergel(v[0], v[2]);
+  sum[2] = vec_mergeh(v[1], v[3]);
+  sum[3] = vec_mergel(v[1], v[3]);
+
+  // Now do the summation:
+  // Lines 0+1
+  sum[0] = padd<Packet4i>(sum[0], sum[1]);
+  // Lines 2+3
+  sum[1] = padd<Packet4i>(sum[2], sum[3]);
+  // Add the results
+  sum[0] = padd<Packet4i>(sum[0], sum[1]);
+
+  return sum[0];
+}
+
+template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
+{
+  Packet2d v[2], sum;
+  v[0] = padd<Packet2d>(vecs[0], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[0]), reinterpret_cast<Packet4ui>(vecs[0]), 8)));
+  v[1] = padd<Packet2d>(vecs[1], reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(vecs[1]), reinterpret_cast<Packet4ui>(vecs[1]), 8)));
+ 
+  sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v[0]), reinterpret_cast<Packet4ui>(v[1]), 8));
+
+  return sum;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
+{
+  PacketBlock<Packet4f,4> transpose;
+  transpose.packet[0] = vecs[0];
+  transpose.packet[1] = vecs[1];
+  transpose.packet[2] = vecs[2];
+  transpose.packet[3] = vecs[3];
+  ptranspose(transpose);
+
+  Packet4f sum = padd(transpose.packet[0], transpose.packet[1]);
+  sum = padd(sum, transpose.packet[2]);
+  sum = padd(sum, transpose.packet[3]);
+  return sum;
+}
+
+// Other reduction functions:
+// mul
+template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
+{
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
+}
+
+template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
 {
   // Return predux_mul<Packet2d> of the subvectors product
   return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
 }
 
+// min
+template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b   = pmin<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmin<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmin<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
 {
   Packet2d b, res;
@@ -834,6 +834,21 @@ template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
   return static_cast<float>(pfirst(res));
 }
 
+// max
+template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
+{
+  Packet4i b, res;
+  b = pmax<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmax<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+// max
+template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
+{
+  return pfirst(pmax<Packet2d>(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
 {
   Packet2d b, res;
@@ -842,6 +857,26 @@ template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
   return static_cast<float>(pfirst(res));
 }
 
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet4i,4>& kernel) {
+  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void
+ptranspose(PacketBlock<Packet2d,2>& kernel) {
+  Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
+  Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
+  kernel.packet[0] = t0;
+  kernel.packet[1] = t1;
+}
+
 /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
  */
 EIGEN_DEVICE_FUNC inline void
@@ -880,6 +915,12 @@ ptranspose(PacketBlock<Packet4f,4>& kernel) {
   kernel.packet[3].v4f[1] = t3.packet[1];
 }
 
+template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) {
+  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
+  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
   Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] };
   Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] };
@@ -890,153 +931,13 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons
   result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
   return result;
 }
-#else
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from)
-{
-  // FIXME: No intrinsic yet
-  EIGEN_DEBUG_ALIGNED_LOAD
-  Packet *vfrom;
-  vfrom = (Packet *) from;
-  return vfrom->v4f;
-}
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from)
-{
-  // FIXME: No intrinsic yet
-  EIGEN_DEBUG_ALIGNED_STORE
-  Packet *vto;
-  vto = (Packet *) to;
-  vto->v4f = from;
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from)
-{
-  return vec_splats(from);
-}
-
-template<> EIGEN_STRONG_INLINE void
-pbroadcast4<Packet4f>(const float *a,
-                      Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3)
-{
-  a3 = pload<Packet4f>(a);
-  a0 = vec_splat(a3, 0);
-  a1 = vec_splat(a3, 1);
-  a2 = vec_splat(a3, 2);
-  a3 = vec_splat(a3, 3);
-}
-
-template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride)
-{
-  float EIGEN_ALIGN16 af[4];
-  af[0] = from[0*stride];
-  af[1] = from[1*stride];
-  af[2] = from[2*stride];
-  af[3] = from[3*stride];
- return pload<Packet4f>(af);
-}
-
-template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride)
-{
-  float EIGEN_ALIGN16 af[4];
-  pstore<float>((float*)af, from);
-  to[0*stride] = af[0];
-  to[1*stride] = af[1];
-  to[2*stride] = af[2];
-  to[3*stride] = af[3];
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a + b); }
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a - b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a * b); }
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return (a / b); }
-template<> EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) { return (-a); }
-template<> EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>  (const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>  (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>    (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>   (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f> (const Packet4f& a) { return vec_round(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>  (const Packet4f& a) { return vec_ceil(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f> (const Packet4f& a) { return vec_floor(a); }
-template<> EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>   (const Packet4f& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; }
-
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from)
-{
-  Packet4f p = pload<Packet4f>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{
-  return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
-}
-
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  Packet4f b, sum;
-  b   = vec_sld(a, a, 8);
-  sum = padd<Packet4f>(a, b);
-  b   = vec_sld(sum, sum, 4);
-  sum = padd<Packet4f>(sum, b);
-  return pfirst(sum);
-}
-
-// Other reduction functions:
-// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
-  Packet4f prod;
-  prod = pmul(a, vec_sld(a, a, 8));
-  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
-}
-
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
-  Packet4f b, res;
-  b   = pmin<Packet4f>(a, vec_sld(a, a, 8));
-  res = pmin<Packet4f>(b, vec_sld(b, b, 4));
-  return pfirst(res);
-}
-
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
-  Packet4f b, res;
-  b = pmax<Packet4f>(a, vec_sld(a, a, 8));
-  res = pmax<Packet4f>(b, vec_sld(b, b, 4));
-  return pfirst(res);
-}
-
-EIGEN_DEVICE_FUNC inline void
-ptranspose(PacketBlock<Packet4f,4>& kernel) {
-  Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
-  Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
-  Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
-  Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
-  kernel.packet[0] = vec_mergeh(t0, t2);
-  kernel.packet[1] = vec_mergel(t0, t2);
-  kernel.packet[2] = vec_mergeh(t1, t3);
-  kernel.packet[3] = vec_mergel(t1, t3);
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) {
-  Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] };
-  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) {
+  Packet2ul select = { ifPacket.select[0], ifPacket.select[1] };
+  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
   return vec_sel(elsePacket, thenPacket, mask);
 }
 
-#endif
-
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { EIGEN_ZVECTOR_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f> (const float* from) { return pload<Packet4f>(from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) { pstore<float>(to, from); }
-template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>  (const float& a)  { return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN); }
-
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/AssignmentFunctors.h b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/AssignmentFunctors.h
index bf64ef4ed..4153b877c 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/AssignmentFunctors.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -144,7 +144,7 @@ template<typename Scalar> struct swap_assign_op {
   EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const
   {
-#ifdef EIGEN_GPUCC
+#ifdef __CUDACC__
     // FIXME is there some kind of cuda::swap?
     Scalar t=b; const_cast<Scalar&>(b)=a; a=t;
 #else
@@ -157,16 +157,7 @@ template<typename Scalar>
 struct functor_traits<swap_assign_op<Scalar> > {
   enum {
     Cost = 3 * NumTraits<Scalar>::ReadCost,
-    PacketAccess = 
-    #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__))
-    // This is a partial workaround for a bug in clang generating bad code
-    // when mixing 256/512 bits loads and 128 bits moves.
-    // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684
-    //     https://bugs.llvm.org/show_bug.cgi?id=40815
-    0
-    #else
-    packet_traits<Scalar>::Vectorizable
-    #endif
+    PacketAccess = packet_traits<Scalar>::Vectorizable
   };
 };
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/BinaryFunctors.h b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/BinaryFunctors.h
index 697816663..3eae6b8ca 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/BinaryFunctors.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/BinaryFunctors.h
@@ -39,12 +39,12 @@ struct scalar_sum_op : binary_op_base<LhsScalar,RhsScalar>
     EIGEN_SCALAR_BINARY_OP_PLUGIN
   }
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::padd(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
   { return internal::predux(a); }
 };
 template<typename LhsScalar,typename RhsScalar>
@@ -56,9 +56,15 @@ struct functor_traits<scalar_sum_op<LhsScalar,RhsScalar> > {
   };
 };
 
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op<bool,bool>::operator() (const bool& a, const bool& b) const { return a || b; }
+/** \internal
+  * \brief Template specialization to deprecate the summation of boolean expressions.
+  * This is required to solve Bug 426.
+  * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast()
+  */
+template<> struct scalar_sum_op<bool,bool> : scalar_sum_op<int,int> {
+  EIGEN_DEPRECATED
+  scalar_sum_op() {}
+};
 
 
 /** \internal
@@ -77,12 +83,12 @@ struct scalar_product_op  : binary_op_base<LhsScalar,RhsScalar>
     EIGEN_SCALAR_BINARY_OP_PLUGIN
   }
 #endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::pmul(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
   { return internal::predux_mul(a); }
 };
 template<typename LhsScalar,typename RhsScalar>
@@ -94,10 +100,6 @@ struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
   };
 };
 
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op<bool,bool>::operator() (const bool& a, const bool& b) const { return a && b; }
-
-
 /** \internal
   * \brief Template functor to compute the conjugate product of two scalars
   *
@@ -114,11 +116,11 @@ struct scalar_conj_product_op  : binary_op_base<LhsScalar,RhsScalar>
   typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_conj_product_op>::ReturnType result_type;
   
   EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
   { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
   
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
 };
 template<typename LhsScalar,typename RhsScalar>
@@ -139,12 +141,12 @@ struct scalar_min_op : binary_op_base<LhsScalar,RhsScalar>
 {
   typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_min_op>::ReturnType result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::pmin(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
   { return internal::predux_min(a); }
 };
 template<typename LhsScalar,typename RhsScalar>
@@ -165,12 +167,12 @@ struct scalar_max_op  : binary_op_base<LhsScalar,RhsScalar>
 {
   typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_max_op>::ReturnType result_type;
   EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
   { return internal::pmax(a,b); }
   template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
   { return internal::predux_max(a); }
 };
 template<typename LhsScalar,typename RhsScalar>
@@ -380,14 +382,11 @@ struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
 struct scalar_boolean_and_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pand(a,b); }
 };
 template<> struct functor_traits<scalar_boolean_and_op> {
   enum {
     Cost = NumTraits<bool>::AddCost,
-    PacketAccess = true
+    PacketAccess = false
   };
 };
 
@@ -399,14 +398,11 @@ template<> struct functor_traits<scalar_boolean_and_op> {
 struct scalar_boolean_or_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::por(a,b); }
 };
 template<> struct functor_traits<scalar_boolean_or_op> {
   enum {
     Cost = NumTraits<bool>::AddCost,
-    PacketAccess = true
+    PacketAccess = false
   };
 };
 
@@ -418,44 +414,11 @@ template<> struct functor_traits<scalar_boolean_or_op> {
 struct scalar_boolean_xor_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op)
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pxor(a,b); }
 };
 template<> struct functor_traits<scalar_boolean_xor_op> {
   enum {
     Cost = NumTraits<bool>::AddCost,
-    PacketAccess = true
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the absolute difference of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::absolute_difference
-  */
-template<typename LhsScalar,typename RhsScalar>
-struct scalar_absolute_difference_op : binary_op_base<LhsScalar,RhsScalar>
-{
-  typedef typename ScalarBinaryOpTraits<LhsScalar,RhsScalar,scalar_absolute_difference_op>::ReturnType result_type;
-#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op)
-#else
-  scalar_absolute_difference_op() {
-    EIGEN_SCALAR_BINARY_OP_PLUGIN
-  }
-#endif
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
-  { return numext::absdiff(a,b); }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pabsdiff(a,b); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_absolute_difference_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = (NumTraits<LhsScalar>::AddCost+NumTraits<RhsScalar>::AddCost)/2,
-    PacketAccess = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasAbsDiff
+    PacketAccess = false
   };
 };
 
@@ -473,7 +436,7 @@ template<typename BinaryOp> struct bind1st_op : BinaryOp {
   typedef typename BinaryOp::second_argument_type second_argument_type;
   typedef typename BinaryOp::result_type          result_type;
 
-  EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {}
+  bind1st_op(const first_argument_type &val) : m_value(val) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); }
 
@@ -492,7 +455,7 @@ template<typename BinaryOp> struct bind2nd_op : BinaryOp {
   typedef typename BinaryOp::second_argument_type second_argument_type;
   typedef typename BinaryOp::result_type          result_type;
 
-  EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {}
+  bind2nd_op(const second_argument_type &val) : m_value(val) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/NullaryFunctors.h b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/NullaryFunctors.h
index 4aa33a19f..b03be0269 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/NullaryFunctors.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/NullaryFunctors.h
@@ -37,27 +37,26 @@ template<typename Scalar>
 struct functor_traits<scalar_identity_op<Scalar> >
 { enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
 
-template <typename Scalar, bool IsInteger> struct linspaced_op_impl;
+template <typename Scalar, typename Packet, bool IsInteger> struct linspaced_op_impl;
 
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,/*IsInteger*/false>
+template <typename Scalar, typename Packet>
+struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/false>
 {
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-
   linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
-    m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : Scalar((high-low)/RealScalar(num_steps-1))),
+    m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)),
     m_flip(numext::abs(high)<numext::abs(low))
   {}
 
   template<typename IndexType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const {
+    typedef typename NumTraits<Scalar>::Real RealScalar;
     if(m_flip)
-      return (i==0)? m_low : Scalar(m_high - RealScalar(m_size1-i)*m_step);
+      return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step);
     else
-      return (i==m_size1)? m_high : Scalar(m_low + RealScalar(i)*m_step);
+      return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step);
   }
 
-  template<typename Packet, typename IndexType>
+  template<typename IndexType>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const
   {
     // Principle:
@@ -66,17 +65,17 @@ struct linspaced_op_impl<Scalar,/*IsInteger*/false>
     {
       Packet pi = plset<Packet>(Scalar(i-m_size1));
       Packet res = padd(pset1<Packet>(m_high), pmul(pset1<Packet>(m_step), pi));
-      if (EIGEN_PREDICT_TRUE(i != 0)) return res;
-      Packet mask = pcmp_lt(pset1<Packet>(0), plset<Packet>(0));
-      return pselect<Packet>(mask, res, pset1<Packet>(m_low));
+      if(i==0)
+        res = pinsertfirst(res, m_low);
+      return res;
     }
     else
     {
       Packet pi = plset<Packet>(Scalar(i));
       Packet res = padd(pset1<Packet>(m_low), pmul(pset1<Packet>(m_step), pi));
-      if(EIGEN_PREDICT_TRUE(i != m_size1-unpacket_traits<Packet>::size+1)) return res;
-      Packet mask = pcmp_lt(plset<Packet>(0), pset1<Packet>(unpacket_traits<Packet>::size-1));
-      return pselect<Packet>(mask, res, pset1<Packet>(m_high));
+      if(i==m_size1-unpacket_traits<Packet>::size+1)
+        res = pinsertlast(res, m_high);
+      return res;
     }
   }
 
@@ -87,8 +86,8 @@ struct linspaced_op_impl<Scalar,/*IsInteger*/false>
   const bool m_flip;
 };
 
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,/*IsInteger*/true>
+template <typename Scalar, typename Packet>
+struct linspaced_op_impl<Scalar,Packet,/*IsInteger*/true>
 {
   linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) :
     m_low(low),
@@ -116,8 +115,8 @@ struct linspaced_op_impl<Scalar,/*IsInteger*/true>
 // Forward declaration (we default to random access which does not really give
 // us a speed gain when using packet access but it allows to use the functor in
 // nested expressions).
-template <typename Scalar> struct linspaced_op;
-template <typename Scalar> struct functor_traits< linspaced_op<Scalar> >
+template <typename Scalar, typename PacketType> struct linspaced_op;
+template <typename Scalar, typename PacketType> struct functor_traits< linspaced_op<Scalar,PacketType> >
 {
   enum
   {
@@ -127,7 +126,7 @@ template <typename Scalar> struct functor_traits< linspaced_op<Scalar> >
     IsRepeatable = true
   };
 };
-template <typename Scalar> struct linspaced_op
+template <typename Scalar, typename PacketType> struct linspaced_op
 {
   linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
     : impl((num_steps==1 ? high : low),high,num_steps)
@@ -137,11 +136,11 @@ template <typename Scalar> struct linspaced_op
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); }
 
   template<typename Packet,typename IndexType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.template packetOp<Packet>(i); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); }
 
   // This proxy object handles the actual required temporaries and the different
   // implementations (integer vs. floating point).
-  const linspaced_op_impl<Scalar,NumTraits<Scalar>::IsInteger> impl;
+  const linspaced_op_impl<Scalar,PacketType,NumTraits<Scalar>::IsInteger> impl;
 };
 
 // Linear access is automatically determined from the operator() prototypes available for the given functor.
@@ -167,12 +166,12 @@ struct has_unary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value =
 template<typename Scalar,typename IndexType>
 struct has_binary_operator<scalar_identity_op<Scalar>,IndexType> { enum { value = 1}; };
 
-template<typename Scalar,typename IndexType>
-struct has_nullary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 0}; };
-template<typename Scalar,typename IndexType>
-struct has_unary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 1}; };
-template<typename Scalar,typename IndexType>
-struct has_binary_operator<linspaced_op<Scalar>,IndexType> { enum { value = 0}; };
+template<typename Scalar, typename PacketType,typename IndexType>
+struct has_nullary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
+template<typename Scalar, typename PacketType,typename IndexType>
+struct has_unary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 1}; };
+template<typename Scalar, typename PacketType,typename IndexType>
+struct has_binary_operator<linspaced_op<Scalar,PacketType>,IndexType> { enum { value = 0}; };
 
 template<typename Scalar,typename IndexType>
 struct has_nullary_operator<scalar_random_op<Scalar>,IndexType> { enum { value = 1}; };
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/UnaryFunctors.h b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/UnaryFunctors.h
index a55d7b74e..b56e7afd2 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/UnaryFunctors.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/UnaryFunctors.h
@@ -117,15 +117,7 @@ template<typename Scalar>
 struct functor_traits<scalar_conjugate_op<Scalar> >
 {
   enum {
-    Cost = 0,
-    // Yes the cost is zero even for complexes because in most cases for which
-    // the cost is used, conjugation turns to be a no-op. Some examples:
-    //   cost(a*conj(b)) == cost(a*b)
-    //   cost(a+conj(b)) == cost(a+b)
-    //   <etc.
-    // If we don't set it to zero, then:
-    //   A.conjugate().lazyProduct(B.conjugate())
-    // will bake its operands. We definitely don't want that!
+    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
     PacketAccess = packet_traits<Scalar>::HasConj
   };
 };
@@ -166,44 +158,6 @@ template<typename Scalar, typename NewType>
 struct functor_traits<scalar_cast_op<Scalar,NewType> >
 { enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
 
-/** \internal
-  * \brief Template functor to arithmetically shift a scalar right by a number of bits
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::shift_right()
-  */
-template<typename Scalar, int N>
-struct scalar_shift_right_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_right_op)
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const
-  { return a >> N; }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::parithmetic_shift_right<N>(a); }
-};
-template<typename Scalar, int N>
-struct functor_traits<scalar_shift_right_op<Scalar,N> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasShift }; };
-
-/** \internal
-  * \brief Template functor to logically shift a scalar left by a number of bits
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::shift_left()
-  */
-template<typename Scalar, int N>
-struct scalar_shift_left_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_left_op)
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const
-  { return a << N; }
-  template<typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::plogical_shift_left<N>(a); }
-};
-template<typename Scalar, int N>
-struct functor_traits<scalar_shift_left_op<Scalar,N> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasShift }; };
-
 /** \internal
   * \brief Template functor to extract the real part of a complex
   *
@@ -308,26 +262,6 @@ struct functor_traits<scalar_exp_op<Scalar> > {
   };
 };
 
-/** \internal
-  *
-  * \brief Template functor to compute the exponential of a scalar - 1.
-  *
-  * \sa class CwiseUnaryOp, ArrayBase::expm1()
-  */
-template<typename Scalar> struct scalar_expm1_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_expm1_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::expm1(a); }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexpm1(a); }
-};
-template <typename Scalar>
-struct functor_traits<scalar_expm1_op<Scalar> > {
-  enum {
-    PacketAccess = packet_traits<Scalar>::HasExpm1,
-    Cost = functor_traits<scalar_exp_op<Scalar> >::Cost // TODO measure cost of expm1
-  };
-};
-
 /** \internal
   *
   * \brief Template functor to compute the logarithm of a scalar
@@ -594,23 +528,6 @@ struct functor_traits<scalar_tanh_op<Scalar> > {
   };
 };
 
-#if EIGEN_HAS_CXX11_MATH
-/** \internal
-  * \brief Template functor to compute the atanh of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::atanh()
-  */
-template <typename Scalar>
-struct scalar_atanh_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
-};
-
-template <typename Scalar>
-struct functor_traits<scalar_atanh_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
-};
-#endif
-
 /** \internal
   * \brief Template functor to compute the sinh of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::sinh()
@@ -630,23 +547,6 @@ struct functor_traits<scalar_sinh_op<Scalar> >
   };
 };
 
-#if EIGEN_HAS_CXX11_MATH
-/** \internal
-  * \brief Template functor to compute the asinh of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::asinh()
-  */
-template <typename Scalar>
-struct scalar_asinh_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); }
-};
-
-template <typename Scalar>
-struct functor_traits<scalar_asinh_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
-};
-#endif
-
 /** \internal
   * \brief Template functor to compute the cosh of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::cosh()
@@ -666,23 +566,6 @@ struct functor_traits<scalar_cosh_op<Scalar> >
   };
 };
 
-#if EIGEN_HAS_CXX11_MATH
-/** \internal
-  * \brief Template functor to compute the acosh of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::acosh()
-  */
-template <typename Scalar>
-struct scalar_acosh_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op)
-  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); }
-};
-
-template <typename Scalar>
-struct functor_traits<scalar_acosh_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
-};
-#endif
-
 /** \internal
   * \brief Template functor to compute the inverse of a scalar
   * \sa class CwiseUnaryOp, Cwise::inverse()
@@ -695,13 +578,9 @@ struct scalar_inverse_op {
   EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const
   { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
 };
-template <typename Scalar>
-struct functor_traits<scalar_inverse_op<Scalar> > {
-  enum {
-    PacketAccess = packet_traits<Scalar>::HasDiv,
-    Cost = scalar_div_cost<Scalar, PacketAccess>::value
-  };
-};
+template<typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar> >
+{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
 
 /** \internal
   * \brief Template functor to compute the square of a scalar
@@ -773,25 +652,6 @@ struct functor_traits<scalar_floor_op<Scalar> >
   };
 };
 
-/** \internal
-  * \brief Template functor to compute the rounded (with current rounding mode)  value of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::rint()
-  */
-template<typename Scalar> struct scalar_rint_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_rint_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::rint(a); }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::print(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_rint_op<Scalar> >
-{
-  enum {
-    Cost = NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasRint
-  };
-};
-
 /** \internal
   * \brief Template functor to compute the ceil of a scalar
   * \sa class CwiseUnaryOp, ArrayBase::ceil()
@@ -818,13 +678,7 @@ struct functor_traits<scalar_ceil_op<Scalar> >
 template<typename Scalar> struct scalar_isnan_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op)
   typedef bool result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
-#if defined(SYCL_DEVICE_ONLY)
-    return numext::isnan(a);
-#else
-    return (numext::isnan)(a);
-#endif
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_isnan_op<Scalar> >
@@ -842,13 +696,7 @@ struct functor_traits<scalar_isnan_op<Scalar> >
 template<typename Scalar> struct scalar_isinf_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op)
   typedef bool result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
-#if defined(SYCL_DEVICE_ONLY)
-    return numext::isinf(a);
-#else
-    return (numext::isinf)(a);
-#endif
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_isinf_op<Scalar> >
@@ -866,13 +714,7 @@ struct functor_traits<scalar_isinf_op<Scalar> >
 template<typename Scalar> struct scalar_isfinite_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op)
   typedef bool result_type;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const {
-#if defined(SYCL_DEVICE_ONLY)
-    return numext::isfinite(a);
-#else
-    return (numext::isfinite)(a);
-#endif
-  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_isfinite_op<Scalar> >
@@ -935,7 +777,7 @@ struct scalar_sign_op<Scalar,true> {
 template<typename Scalar>
 struct functor_traits<scalar_sign_op<Scalar> >
 { enum {
-    Cost =
+    Cost = 
         NumTraits<Scalar>::IsComplex
         ? ( 8*NumTraits<Scalar>::MulCost  ) // roughly
         : ( 3*NumTraits<Scalar>::AddCost),
@@ -943,130 +785,6 @@ struct functor_traits<scalar_sign_op<Scalar> >
   };
 };
 
-/** \internal
-  * \brief Template functor to compute the logistic function of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::logistic()
-  */
-template <typename T>
-struct scalar_logistic_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
-    const T one = T(1);
-    return one / (one + numext::exp(-x));
-  }
-
-  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Packet packetOp(const Packet& x) const {
-    const Packet one = pset1<Packet>(T(1));
-    return pdiv(one, padd(one, pexp(pnegate(x))));
-  }
-};
-
-#ifndef EIGEN_GPU_COMPILE_PHASE
-/** \internal
-  * \brief Template specialization of the logistic function for float.
-  *
-  *  Uses just a 9/10-degree rational interpolant which
-  *  interpolates 1/(1+exp(-x)) - 0.5 up to a couple of ulps in the range
-  *  [-9, 18]. Below -9 we use the more accurate approximation
-  *  1/(1+exp(-x)) ~= exp(x), and above 18 the logistic function is 1 withing
-  *  one ulp. The shifted logistic is interpolated because it was easier to
-  *  make the fit converge.
-  *
-  */
-template <>
-struct scalar_logistic_op<float> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const {
-    // The upper cut-off is the smallest x for which the rational approximation evaluates to 1.
-    // Choosing this value saves us a few instructions clamping the results at the end.
-#ifdef EIGEN_VECTORIZE_FMA
-    const float cutoff_upper = 15.7243833541870117f;
-#else
-    const float cutoff_upper = 15.6437711715698242f;
-#endif
-    const float cutoff_lower = -9.f;
-    if (x > cutoff_upper) return 1.0f;
-    else if (x < cutoff_lower) return numext::exp(x);
-    else return 1.0f / (1.0f + numext::exp(-x));
-  }
-
-  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  Packet packetOp(const Packet& _x) const {
-    const Packet cutoff_lower = pset1<Packet>(-9.f);
-    const Packet lt_mask = pcmp_lt<Packet>(_x, cutoff_lower);
-    const bool any_small = predux(lt_mask);
-
-    // Clamp the input to be at most 'cutoff_upper'.
-#ifdef EIGEN_VECTORIZE_FMA
-    const Packet cutoff_upper = pset1<Packet>(15.7243833541870117f);
-#else
-    const Packet cutoff_upper = pset1<Packet>(15.6437711715698242f);
-#endif
-    const Packet x = pmin(_x, cutoff_upper);
-
-    // The monomial coefficients of the numerator polynomial (odd).
-    const Packet alpha_1 = pset1<Packet>(2.48287947061529e-01f);
-    const Packet alpha_3 = pset1<Packet>(8.51377133304701e-03f);
-    const Packet alpha_5 = pset1<Packet>(6.08574864600143e-05f);
-    const Packet alpha_7 = pset1<Packet>(1.15627324459942e-07f);
-    const Packet alpha_9 = pset1<Packet>(4.37031012579801e-11f);
-
-    // The monomial coefficients of the denominator polynomial (even).
-    const Packet beta_0 = pset1<Packet>(9.93151921023180e-01f);
-    const Packet beta_2 = pset1<Packet>(1.16817656904453e-01f);
-    const Packet beta_4 = pset1<Packet>(1.70198817374094e-03f);
-    const Packet beta_6 = pset1<Packet>(6.29106785017040e-06f);
-    const Packet beta_8 = pset1<Packet>(5.76102136993427e-09f);
-    const Packet beta_10 = pset1<Packet>(6.10247389755681e-13f);
-
-    // Since the polynomials are odd/even, we need x^2.
-    const Packet x2 = pmul(x, x);
-
-    // Evaluate the numerator polynomial p.
-    Packet p = pmadd(x2, alpha_9, alpha_7);
-    p = pmadd(x2, p, alpha_5);
-    p = pmadd(x2, p, alpha_3);
-    p = pmadd(x2, p, alpha_1);
-    p = pmul(x, p);
-
-    // Evaluate the denominator polynomial q.
-    Packet q = pmadd(x2, beta_10, beta_8);
-    q = pmadd(x2, q, beta_6);
-    q = pmadd(x2, q, beta_4);
-    q = pmadd(x2, q, beta_2);
-    q = pmadd(x2, q, beta_0);
-    // Divide the numerator by the denominator and shift it up.
-    const Packet logistic = padd(pdiv(p, q), pset1<Packet>(0.5f));
-    if (EIGEN_PREDICT_FALSE(any_small)) {
-      const Packet exponential = pexp(_x);
-      return pselect(lt_mask, exponential, logistic);
-    } else {
-      return logistic;
-    }
-  }
-};
-#endif  // #ifndef EIGEN_GPU_COMPILE_PHASE
-
-template <typename T>
-struct functor_traits<scalar_logistic_op<T> > {
-  enum {
-    // The cost estimate for float here here is for the common(?) case where
-    // all arguments are greater than -9.
-    Cost = scalar_div_cost<T, packet_traits<T>::HasDiv>::value +
-           (internal::is_same<T, float>::value
-                ? NumTraits<T>::AddCost * 15 + NumTraits<T>::MulCost * 11
-                : NumTraits<T>::AddCost * 2 +
-                      functor_traits<scalar_exp_op<T> >::Cost),
-    PacketAccess =
-        packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
-        (internal::is_same<T, float>::value
-             ? packet_traits<T>::HasMul && packet_traits<T>::HasMax &&
-                   packet_traits<T>::HasMin
-             : packet_traits<T>::HasNegate && packet_traits<T>::HasExp)
-  };
-};
-
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index 64e7f79cf..681451cc3 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -15,13 +15,7 @@ namespace Eigen {
 
 namespace internal {
 
-enum GEBPPacketSizeType {
-  GEBPPacketFull = 0,
-  GEBPPacketHalf,
-  GEBPPacketQuarter
-};
-
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false, int Arch=Architecture::Target, int _PacketSize=GEBPPacketFull>
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
 class gebp_traits;
 
 
@@ -31,42 +25,16 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff
   return a<=0 ? b : a;
 }
 
-#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
-#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
-#else
-#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
-#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
-
-#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
-#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
-#else
-#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
-#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
-
-#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
-#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_SET_DEFAULT_L3_CACHE_SIZE
-#else
-#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
-#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
-  
 #if EIGEN_ARCH_i386_OR_x86_64
-const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024);
-const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024);
-const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024);
-#elif EIGEN_ARCH_PPC
-const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024);
-const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
-const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024);
+const std::ptrdiff_t defaultL1CacheSize = 32*1024;
+const std::ptrdiff_t defaultL2CacheSize = 256*1024;
+const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024;
 #else
-const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024);
-const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024);
-const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024);
+const std::ptrdiff_t defaultL1CacheSize = 16*1024;
+const std::ptrdiff_t defaultL2CacheSize = 512*1024;
+const std::ptrdiff_t defaultL3CacheSize = 512*1024;
 #endif
 
-#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
-#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
-#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
-
 /** \internal */
 struct CacheSizes {
   CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) {
@@ -82,6 +50,7 @@ struct CacheSizes {
   std::ptrdiff_t m_l3;
 };
 
+
 /** \internal */
 inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3)
 {
@@ -132,16 +101,6 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
   // at the register level. This small horizontal panel has to stay within L1 cache.
   std::ptrdiff_t l1, l2, l3;
   manage_caching_sizes(GetAction, &l1, &l2, &l3);
-  #ifdef EIGEN_VECTORIZE_AVX512
-  // We need to find a rationale for that, but without this adjustment,
-  // performance with AVX512 is pretty bad, like -20% slower.
-  // One reason is that with increasing packet-size, the blocking size k
-  // has to become pretty small if we want that 1 lhs panel fit within L1.
-  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
-  //   k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
-  // This is quite small for a good reuse of the accumulation registers.
-  l1 *= 4;
-  #endif
 
   if (num_threads > 1) {
     typedef typename Traits::ResScalar ResScalar;
@@ -156,7 +115,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n
     // registers. However once the latency is hidden there is no point in
     // increasing the value of k, so we'll cap it at 320 (value determined
     // experimentally).
-    const Index k_cache = (numext::mini<Index>)((l1-ksub)/kdiv, 320);
+    // To avoid that k vanishes, we make k_cache at least as big as kr
+    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1-ksub)/kdiv, 320));
     if (k_cache < k) {
       k = k_cache - (k_cache % kr);
       eigen_internal_assert(k > 0);
@@ -378,61 +338,6 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_
 //   #define CJMADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
 #endif
 
-template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
-struct RhsPanelHelper {
- private:
-  static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken;
- public:
-  typedef typename conditional<remaining_registers>=4, RhsPacketx4, RhsPacket>::type type;
-};
-
-template <typename Packet>
-struct QuadPacket
-{
-  Packet B_0, B1, B2, B3;
-  const Packet& get(const FixedInt<0>&) const { return B_0; }
-  const Packet& get(const FixedInt<1>&) const { return B1; }
-  const Packet& get(const FixedInt<2>&) const { return B2; }
-  const Packet& get(const FixedInt<3>&) const { return B3; }
-};
-
-template <int N, typename T1, typename T2, typename T3>
-struct packet_conditional { typedef T3 type; };
-
-template <typename T1, typename T2, typename T3>
-struct packet_conditional<GEBPPacketFull, T1, T2, T3> { typedef T1 type; };
-
-template <typename T1, typename T2, typename T3>
-struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
-
-#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)         \
-  typedef typename packet_conditional<packet_size,                 \
-                                      typename packet_traits<name ## Scalar>::type, \
-                                      typename packet_traits<name ## Scalar>::half, \
-                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
-  prefix ## name ## Packet
-
-#define PACKET_DECL_COND(name, packet_size)                        \
-  typedef typename packet_conditional<packet_size,                 \
-                                      typename packet_traits<name ## Scalar>::type, \
-                                      typename packet_traits<name ## Scalar>::half, \
-                                      typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
-  name ## Packet
-
-#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size)        \
-  typedef typename packet_conditional<packet_size,                 \
-                                      typename packet_traits<Scalar>::type, \
-                                      typename packet_traits<Scalar>::half, \
-                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
-  prefix ## ScalarPacket
-
-#define PACKET_DECL_COND_SCALAR(packet_size)                       \
-  typedef typename packet_conditional<packet_size,                 \
-                                      typename packet_traits<Scalar>::type, \
-                                      typename packet_traits<Scalar>::half, \
-                                      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type \
-  ScalarPacket
-
 /* Vectorization logic
  *  real*real: unpack rhs to constant packets, ...
  * 
@@ -443,7 +348,7 @@ struct packet_conditional<GEBPPacketHalf, T1, T2, T3> { typedef T2 type; };
  *  cplx*real : unpack rhs to constant packets, ...
  *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
  */
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
+template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
 class gebp_traits
 {
 public:
@@ -451,17 +356,13 @@ public:
   typedef _RhsScalar RhsScalar;
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
-  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
-  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
-  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
-
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = _ConjRhs,
-    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
-    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
-    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
-    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
 
@@ -470,12 +371,10 @@ public:
 
     // register block size along the M direction (currently, this one cannot be modified)
     default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize,
-#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \
-    && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914))
-    // we assume 16 registers or more
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+    // we assume 16 registers
     // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
     // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
-    // Bug 1515: MSVC prior to v19.14 yields to register spilling.
     mr = Vectorizable ? 3*LhsPacketSize : default_mr,
 #else
     mr = default_mr,
@@ -485,41 +384,37 @@ public:
     RhsProgress = 1
   };
 
+  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+  typedef typename packet_traits<ResScalar>::type  _ResPacket;
 
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-  typedef LhsPacket LhsPacket4Packing;
 
-  typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
   
   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
   {
     p = pset1<ResPacket>(ResScalar(0));
   }
-
+  
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+  {
+    pbroadcast4(b, b0, b1, b2, b3);
+  }
+  
+//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+//   {
+//     pbroadcast2(b, b0, b1);
+//   }
+  
   template<typename RhsPacketType>
   EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
   {
     dest = pset1<RhsPacketType>(*b);
   }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
-    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
-  }
-
-  template<typename RhsPacketType>
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
-  {
-    loadRhs(b, dest);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
-  {
-  }
-
+  
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
     dest = ploadquad<RhsPacket>(b);
@@ -537,8 +432,8 @@ public:
     dest = ploadu<LhsPacketType>(a);
   }
 
-  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
+  template<typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const
   {
     conj_helper<LhsPacketType,RhsPacketType,ConjLhs,ConjRhs> cj;
     // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
@@ -553,12 +448,6 @@ public:
 #endif
   }
 
-  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
-  {
-    madd(a, b.get(lane), c, tmp, lane);
-  }
-
   EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
   {
     r = pmadd(c,alpha,r);
@@ -572,25 +461,21 @@ public:
 
 };
 
-template<typename RealScalar, bool _ConjLhs, int Arch, int _PacketSize>
-class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false, Arch, _PacketSize>
+template<typename RealScalar, bool _ConjLhs>
+class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
 {
 public:
   typedef std::complex<RealScalar> LhsScalar;
   typedef RealScalar RhsScalar;
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
-  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
-  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
-  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
-
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = false,
-    Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable,
-    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
-    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
-    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
     nr = 4,
@@ -605,12 +490,13 @@ public:
     RhsProgress = 1
   };
 
+  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+  typedef typename packet_traits<ResScalar>::type  _ResPacket;
+
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-  typedef LhsPacket LhsPacket4Packing;
-
-  typedef QuadPacket<RhsPacket> RhsPacketx4;
 
   typedef ResPacket AccPacket;
 
@@ -619,42 +505,13 @@ public:
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  template<typename RhsPacketType>
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
   {
-    dest = pset1<RhsPacketType>(*b);
+    dest = pset1<RhsPacket>(*b);
   }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
-    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
-  }
-
-  template<typename RhsPacketType>
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
-  {
-    loadRhs(b, dest);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
-  {}
   
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
-    loadRhsQuad_impl(b,dest, typename conditional<RhsPacketSize==16,true_type,false_type>::type());
-  }
-
-  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const
-  {
-    // FIXME we can do better!
-    // what we want here is a ploadheight
-    RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]};
-    dest = ploadquad<RhsPacket>(tmp);
-  }
-
-  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const
-  {
-    eigen_internal_assert(RhsPacketSize<=8);
     dest = pset1<RhsPacket>(*b);
   }
 
@@ -663,20 +520,27 @@ public:
     dest = pload<LhsPacket>(a);
   }
 
-  template<typename LhsPacketType>
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
   {
-    dest = ploadu<LhsPacketType>(a);
+    dest = ploadu<LhsPacket>(a);
   }
 
-  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+  {
+    pbroadcast4(b, b0, b1, b2, b3);
+  }
+  
+//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+//   {
+//     pbroadcast2(b, b0, b1);
+//   }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
   }
 
-  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
-  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
@@ -691,20 +555,13 @@ public:
     c += a * b;
   }
 
-  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
   {
-    madd(a, b.get(lane), c, tmp, lane);
-  }
-
-  template <typename ResPacketType, typename AccPacketType>
-  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
-  {
-    conj_helper<ResPacketType,ResPacketType,ConjLhs,false> cj;
     r = cj.pmadd(c,alpha,r);
   }
 
 protected:
+  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
 };
 
 template<typename Packet>
@@ -723,57 +580,13 @@ DoublePacket<Packet> padd(const DoublePacket<Packet> &a, const DoublePacket<Pack
   return res;
 }
 
-// note that for DoublePacket<RealPacket> the "4" in "downto4"
-// corresponds to the number of complexes, so it means "8"
-// it terms of real coefficients.
-
 template<typename Packet>
-const DoublePacket<Packet>&
-predux_half_dowto4(const DoublePacket<Packet> &a,
-                   typename enable_if<unpacket_traits<Packet>::size<=8>::type* = 0)
+const DoublePacket<Packet>& predux_downto4(const DoublePacket<Packet> &a)
 {
   return a;
 }
 
-template<typename Packet>
-DoublePacket<typename unpacket_traits<Packet>::half>
-predux_half_dowto4(const DoublePacket<Packet> &a,
-                   typename enable_if<unpacket_traits<Packet>::size==16>::type* = 0)
-{
-  // yes, that's pretty hackish :(
-  DoublePacket<typename unpacket_traits<Packet>::half> res;
-  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
-  typedef typename packet_traits<Cplx>::type CplxPacket;
-  res.first  = predux_half_dowto4(CplxPacket(a.first)).v;
-  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
-  return res;
-}
-
-// same here, "quad" actually means "8" in terms of real coefficients
-template<typename Scalar, typename RealPacket>
-void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
-                            typename enable_if<unpacket_traits<RealPacket>::size<=8>::type* = 0)
-{
-  dest.first  = pset1<RealPacket>(numext::real(*b));
-  dest.second = pset1<RealPacket>(numext::imag(*b));
-}
-
-template<typename Scalar, typename RealPacket>
-void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
-                            typename enable_if<unpacket_traits<RealPacket>::size==16>::type* = 0)
-{
-  // yes, that's pretty hackish too :(
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
-  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
-  dest.first  = ploadquad<RealPacket>(r);
-  dest.second = ploadquad<RealPacket>(i);
-}
-
-
-template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
-  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
-};
+template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > { typedef DoublePacket<Packet> half; };
 // template<typename Packet>
 // DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
 // {
@@ -783,8 +596,8 @@ template<typename Packet> struct unpacket_traits<DoublePacket<Packet> > {
 //   return res;
 // }
 
-template<typename RealScalar, bool _ConjLhs, bool _ConjRhs, int Arch, int _PacketSize>
-class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs, Arch, _PacketSize >
+template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
 {
 public:
   typedef std::complex<RealScalar>  Scalar;
@@ -792,21 +605,15 @@ public:
   typedef std::complex<RealScalar>  RhsScalar;
   typedef std::complex<RealScalar>  ResScalar;
   
-  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
-  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
-  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
-  PACKET_DECL_COND(Real, _PacketSize);
-  PACKET_DECL_COND_SCALAR(_PacketSize);
-
   enum {
     ConjLhs = _ConjLhs,
     ConjRhs = _ConjRhs,
-    Vectorizable = unpacket_traits<RealPacket>::vectorizable
-                && unpacket_traits<ScalarPacket>::vectorizable,
-    ResPacketSize   = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
-    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
-    RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
-    RealPacketSize  = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
+    Vectorizable = packet_traits<RealScalar>::Vectorizable
+                && packet_traits<Scalar>::Vectorizable,
+    RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
+    ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
 
     // FIXME: should depend on NumberOfRegisters
     nr = 4,
@@ -816,16 +623,14 @@ public:
     RhsProgress = 1
   };
   
-  typedef DoublePacket<RealPacket>                 DoublePacketType;
+  typedef typename packet_traits<RealScalar>::type RealPacket;
+  typedef typename packet_traits<Scalar>::type     ScalarPacket;
+  typedef DoublePacket<RealPacket> DoublePacketType;
 
-  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type LhsPacket4Packing;
   typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
   typedef typename conditional<Vectorizable,DoublePacketType,Scalar>::type AccPacket;
-
-  // this actualy holds 8 packets!
-  typedef QuadPacket<RhsPacket> RhsPacketx4;
   
   EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
 
@@ -836,41 +641,17 @@ public:
   }
 
   // Scalar path
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const
   {
-    dest = pset1<ScalarPacket>(*b);
+    dest = pset1<ResPacket>(*b);
   }
 
   // Vectorized path
-  template<typename RealPacketType>
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const
   {
-    dest.first  = pset1<RealPacketType>(numext::real(*b));
-    dest.second = pset1<RealPacketType>(numext::imag(*b));
+    dest.first  = pset1<RealPacket>(numext::real(*b));
+    dest.second = pset1<RealPacket>(numext::imag(*b));
   }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
-    loadRhs(b, dest.B_0);
-    loadRhs(b + 1, dest.B1);
-    loadRhs(b + 2, dest.B2);
-    loadRhs(b + 3, dest.B3);
-  }
-
-  // Scalar path
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const
-  {
-    loadRhs(b, dest);
-  }
-
-  // Vectorized path
-  template<typename RealPacketType>
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const
-  {
-    loadRhs(b, dest);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
   
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const
   {
@@ -878,7 +659,33 @@ public:
   }
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const
   {
-    loadQuadToDoublePacket(b,dest);
+    eigen_internal_assert(unpacket_traits<ScalarPacket>::size<=4);
+    loadRhs(b,dest);
+  }
+  
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
+  {
+    // FIXME not sure that's the best way to implement it!
+    loadRhs(b+0, b0);
+    loadRhs(b+1, b1);
+    loadRhs(b+2, b2);
+    loadRhs(b+3, b3);
+  }
+  
+  // Vectorized path
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1)
+  {
+    // FIXME not sure that's the best way to implement it!
+    loadRhs(b+0, b0);
+    loadRhs(b+1, b1);
+  }
+  
+  // Scalar path
+  EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1)
+  {
+    // FIXME not sure that's the best way to implement it!
+    loadRhs(b+0, b0);
+    loadRhs(b+1, b1);
   }
 
   // nothing special here
@@ -887,59 +694,47 @@ public:
     dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
   }
 
-  template<typename LhsPacketType>
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
   {
-    dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
+    dest = ploadu<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
   }
 
-  template<typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType, typename LaneIdType>
-  EIGEN_STRONG_INLINE
-  typename enable_if<!is_same<RhsPacketType,RhsPacketx4>::value>::type
-  madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket<ResPacketType>& c, TmpType& /*tmp*/, const LaneIdType&) const
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const
   {
     c.first   = padd(pmul(a,b.first), c.first);
     c.second  = padd(pmul(a,b.second),c.second);
   }
 
-  template<typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
   {
     c = cj.pmadd(a,b,c);
   }
-
-  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
-  {
-    madd(a, b.get(lane), c, tmp, lane);
-  }
   
   EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
   
-  template<typename RealPacketType, typename ResPacketType>
-  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha, ResPacketType& r) const
+  EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const
   {
     // assemble c
-    ResPacketType tmp;
+    ResPacket tmp;
     if((!ConjLhs)&&(!ConjRhs))
     {
-      tmp = pcplxflip(pconj(ResPacketType(c.second)));
-      tmp = padd(ResPacketType(c.first),tmp);
+      tmp = pcplxflip(pconj(ResPacket(c.second)));
+      tmp = padd(ResPacket(c.first),tmp);
     }
     else if((!ConjLhs)&&(ConjRhs))
     {
-      tmp = pconj(pcplxflip(ResPacketType(c.second)));
-      tmp = padd(ResPacketType(c.first),tmp);
+      tmp = pconj(pcplxflip(ResPacket(c.second)));
+      tmp = padd(ResPacket(c.first),tmp);
     }
     else if((ConjLhs)&&(!ConjRhs))
     {
-      tmp = pcplxflip(ResPacketType(c.second));
-      tmp = padd(pconj(ResPacketType(c.first)),tmp);
+      tmp = pcplxflip(ResPacket(c.second));
+      tmp = padd(pconj(ResPacket(c.first)),tmp);
     }
     else if((ConjLhs)&&(ConjRhs))
     {
-      tmp = pcplxflip(ResPacketType(c.second));
-      tmp = psub(pconj(ResPacketType(c.first)),tmp);
+      tmp = pcplxflip(ResPacket(c.second));
+      tmp = psub(pconj(ResPacket(c.first)),tmp);
     }
     
     r = pmadd(tmp,alpha,r);
@@ -949,8 +744,8 @@ protected:
   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
 };
 
-template<typename RealScalar, bool _ConjRhs, int Arch, int _PacketSize>
-class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs, Arch, _PacketSize >
+template<typename RealScalar, bool _ConjRhs>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
 {
 public:
   typedef std::complex<RealScalar>  Scalar;
@@ -958,25 +753,14 @@ public:
   typedef Scalar      RhsScalar;
   typedef Scalar      ResScalar;
 
-  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
-  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
-  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
-  PACKET_DECL_COND_PREFIX(_, Real, _PacketSize);
-  PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize);
-
-#undef PACKET_DECL_COND_SCALAR_PREFIX
-#undef PACKET_DECL_COND_PREFIX
-#undef PACKET_DECL_COND_SCALAR
-#undef PACKET_DECL_COND
-
   enum {
     ConjLhs = false,
     ConjRhs = _ConjRhs,
-    Vectorizable = unpacket_traits<_RealPacket>::vectorizable
-                && unpacket_traits<_ScalarPacket>::vectorizable,
-    LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
-    RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
-    ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1,
+    Vectorizable = packet_traits<RealScalar>::Vectorizable
+                && packet_traits<Scalar>::Vectorizable,
+    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
     
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
     // FIXME: should depend on NumberOfRegisters
@@ -987,11 +771,14 @@ public:
     RhsProgress = 1
   };
 
+  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+  typedef typename packet_traits<ResScalar>::type  _ResPacket;
+
   typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
   typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
   typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-  typedef LhsPacket LhsPacket4Packing;
-  typedef QuadPacket<RhsPacket> RhsPacketx4;
+
   typedef ResPacket AccPacket;
 
   EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
@@ -999,25 +786,22 @@ public:
     p = pset1<ResPacket>(ResScalar(0));
   }
 
-  template<typename RhsPacketType>
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
   {
-    dest = pset1<RhsPacketType>(*b);
+    dest = pset1<RhsPacket>(*b);
   }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
+  
+  void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3)
   {
-    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+    pbroadcast4(b, b0, b1, b2, b3);
   }
-
-  template<typename RhsPacketType>
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const
-  {
-    loadRhs(b, dest);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
-  {}
+  
+//   EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1)
+//   {
+//     // FIXME not sure that's the best way to implement it!
+//     b0 = pload1<RhsPacket>(b+0);
+//     b1 = pload1<RhsPacket>(b+1);
+//   }
 
   EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
   {
@@ -1026,23 +810,21 @@ public:
   
   EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
   {
-    dest = ploadquad<RhsPacket>(b);
+    eigen_internal_assert(unpacket_traits<RhsPacket>::size<=4);
+    loadRhs(b,dest);
   }
 
-  template<typename LhsPacketType>
-  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const
   {
-    dest = ploaddup<LhsPacketType>(a);
+    dest = ploaddup<LhsPacket>(a);
   }
 
-  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
   {
     madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
   }
 
-  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
-  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
   {
 #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
     EIGEN_UNUSED_VARIABLE(tmp);
@@ -1058,166 +840,16 @@ public:
     c += a * b;
   }
 
-  template<typename LhsPacketType, typename AccPacketType, typename LaneIdType>
-  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
   {
-    madd(a, b.get(lane), c, tmp, lane);
-  }
-
-  template <typename ResPacketType, typename AccPacketType>
-  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const
-  {
-    conj_helper<ResPacketType,ResPacketType,false,ConjRhs> cj;
     r = cj.pmadd(alpha,c,r);
   }
 
 protected:
-
+  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
 };
 
-
-#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON
-
-template<>
-struct gebp_traits <float, float, false, false,Architecture::NEON,GEBPPacketFull>
- : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
-{
-  typedef float RhsPacket;
-
-  typedef float32x4_t RhsPacketx4;
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
-    dest = vld1q_f32(b);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {}
-
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  {
-    c = vfmaq_n_f32(c, a, b);
-  }
-
-  // NOTE: Template parameter inference failed when compiled with Android NDK:
-  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  { madd_helper<0>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
-  { madd_helper<1>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
-  { madd_helper<2>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
-  { madd_helper<3>(a, b, c); }
-
- private:
-  template<int LaneID>
-  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
-  {
-    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
-    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
-    // vfmaq_laneq_f32 is implemented through a costly dup
-         if(LaneID==0)  asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==1)  asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==2)  asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    else if(LaneID==3)  asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) :  );
-    #else
-    c = vfmaq_laneq_f32(c, a, b, LaneID);
-    #endif
-  }
-};
-
-
-template<>
-struct gebp_traits <double, double, false, false,Architecture::NEON>
- : gebp_traits<double,double,false,false,Architecture::Generic>
-{
-  typedef double RhsPacket;
-
-  struct RhsPacketx4 {
-    float64x2_t B_0, B_1;
-  };
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = *b;
-  }
-
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {
-    dest.B_0 = vld1q_f64(b);
-    dest.B_1 = vld1q_f64(b+2);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
-
-  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const
-  {}
-
-  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
-  {
-    loadRhs(b,dest);
-  }
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  {
-    c = vfmaq_n_f64(c, a, b);
-  }
-
-  // NOTE: Template parameter inference failed when compiled with Android NDK:
-  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
-
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
-  { madd_helper<0>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
-  { madd_helper<1>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
-  { madd_helper<2>(a, b, c); }
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
-  { madd_helper<3>(a, b, c); }
-
- private:
-  template <int LaneID>
-  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
-  {
-    #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
-    // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
-    // vfmaq_laneq_f64 is implemented through a costly dup
-         if(LaneID==0)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
-    else if(LaneID==1)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) :  );
-    else if(LaneID==2)  asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
-    else if(LaneID==3)  asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) :  );
-    #else
-         if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
-    else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
-    else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
-    else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
-    #endif
-  }
-};
-
-#endif
-
-/* optimized General packed Block * packed Panel product kernel
+/* optimized GEneral packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
  *  |  A  |  B  | comments
@@ -1227,47 +859,26 @@ struct gebp_traits <double, double, false, false,Architecture::NEON>
 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 struct gebp_kernel
 {
-  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
-  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketHalf> HalfTraits;
-  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target,GEBPPacketQuarter> QuarterTraits;
-  
+  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
   typedef typename Traits::ResScalar ResScalar;
   typedef typename Traits::LhsPacket LhsPacket;
   typedef typename Traits::RhsPacket RhsPacket;
   typedef typename Traits::ResPacket ResPacket;
   typedef typename Traits::AccPacket AccPacket;
-  typedef typename Traits::RhsPacketx4 RhsPacketx4;
-
-  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
-
-  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
 
+  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs> SwappedTraits;
   typedef typename SwappedTraits::ResScalar SResScalar;
   typedef typename SwappedTraits::LhsPacket SLhsPacket;
   typedef typename SwappedTraits::RhsPacket SRhsPacket;
   typedef typename SwappedTraits::ResPacket SResPacket;
   typedef typename SwappedTraits::AccPacket SAccPacket;
 
-  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
-  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
-  typedef typename HalfTraits::ResPacket ResPacketHalf;
-  typedef typename HalfTraits::AccPacket AccPacketHalf;
-
-  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
-  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
-  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
-  typedef typename QuarterTraits::AccPacket AccPacketQuarter;
-
   typedef typename DataMapper::LinearMapper LinearMapper;
 
   enum {
     Vectorizable  = Traits::Vectorizable,
     LhsProgress   = Traits::LhsProgress,
-    LhsProgressHalf      = HalfTraits::LhsProgress,
-    LhsProgressQuarter   = QuarterTraits::LhsProgress,
     RhsProgress   = Traits::RhsProgress,
-    RhsProgressHalf      = HalfTraits::RhsProgress,
-    RhsProgressQuarter   = QuarterTraits::RhsProgress,
     ResPacketSize = Traits::ResPacketSize
   };
 
@@ -1277,299 +888,6 @@ struct gebp_kernel
                   Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0);
 };
 
-template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs,
-int SwappedLhsProgress = gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target>::LhsProgress>
-struct last_row_process_16_packets
-{
-  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
-  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
-
-  typedef typename Traits::ResScalar ResScalar;
-  typedef typename SwappedTraits::LhsPacket SLhsPacket;
-  typedef typename SwappedTraits::RhsPacket SRhsPacket;
-  typedef typename SwappedTraits::ResPacket SResPacket;
-  typedef typename SwappedTraits::AccPacket SAccPacket;
-
-  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
-                  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
-                  ResScalar alpha, SAccPacket &C0)
-    {
-      EIGEN_UNUSED_VARIABLE(res);
-      EIGEN_UNUSED_VARIABLE(straits);
-      EIGEN_UNUSED_VARIABLE(blA);
-      EIGEN_UNUSED_VARIABLE(blB);
-      EIGEN_UNUSED_VARIABLE(depth);
-      EIGEN_UNUSED_VARIABLE(endk);
-      EIGEN_UNUSED_VARIABLE(i);
-      EIGEN_UNUSED_VARIABLE(j2);
-      EIGEN_UNUSED_VARIABLE(alpha);
-      EIGEN_UNUSED_VARIABLE(C0);
-    }
-};
-
-
-template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper,  mr,  nr, ConjugateLhs,  ConjugateRhs, 16> {
-  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs,Architecture::Target> Traits;
-  typedef gebp_traits<RhsScalar,LhsScalar,ConjugateRhs,ConjugateLhs,Architecture::Target> SwappedTraits;
-
-  typedef typename Traits::ResScalar ResScalar;
-  typedef typename SwappedTraits::LhsPacket SLhsPacket;
-  typedef typename SwappedTraits::RhsPacket SRhsPacket;
-  typedef typename SwappedTraits::ResPacket SResPacket;
-  typedef typename SwappedTraits::AccPacket SAccPacket;
-
-  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA,
-                  const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
-                  ResScalar alpha, SAccPacket &C0)
-  {
-    typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
-    typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
-    typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
-    typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
-
-    SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
-    SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
-
-    if (depth - endk > 0)
-      {
-	// We have to handle the last row(s) of the rhs, which
-	// correspond to a half-packet
-	SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
-
-	for (Index kk = endk; kk < depth; kk++)
-	  {
-	    SLhsPacketQuarter a0;
-	    SRhsPacketQuarter b0;
-	    straits.loadLhsUnaligned(blB, a0);
-	    straits.loadRhs(blA, b0);
-	    straits.madd(a0,b0,c0,b0, fix<0>);
-	    blB += SwappedTraits::LhsProgress/4;
-	    blA += 1;
-	  }
-	straits.acc(c0, alphav, R);
-      }
-    else
-      {
-	straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
-      }
-    res.scatterPacket(i, j2, R);
-  }
-};
-
-template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
-struct lhs_process_one_packet
-{
-  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
-
-  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
-  {
-    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
-    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
-    traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0);
-    traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel);
-    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
-    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
-    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
-    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
-    #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE)
-    __asm__  ("" : "+x,m" (*A0));
-    #endif
-    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
-  }
-
-  EIGEN_STRONG_INLINE void operator()(
-    const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha,
-    Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB,
-    int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4)
-  {
-    GEBPTraits traits;
-
-    // loops on each largest micro horizontal panel of lhs
-    // (LhsProgress x depth)
-    for(Index i=peelStart; i<peelEnd; i+=LhsProgress)
-    {
-      // loops on each largest micro vertical panel of rhs (depth * nr)
-      for(Index j2=0; j2<packet_cols4; j2+=nr)
-      {
-        // We select a LhsProgress x nr micro block of res
-        // which is entirely stored into 1 x nr registers.
-
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
-        prefetch(&blA[0]);
-
-        // gets res block as register
-        AccPacket C0, C1, C2, C3;
-        traits.initAcc(C0);
-        traits.initAcc(C1);
-        traits.initAcc(C2);
-        traits.initAcc(C3);
-        // To improve instruction pipelining, let's double the accumulation registers:
-        //  even k will accumulate in C*, while odd k will accumulate in D*.
-        // This trick is crutial to get good performance with FMA, otherwise it is 
-        // actually faster to perform separated MUL+ADD because of a naturally
-        // better instruction-level parallelism.
-        AccPacket D0, D1, D2, D3;
-        traits.initAcc(D0);
-        traits.initAcc(D1);
-        traits.initAcc(D2);
-        traits.initAcc(D3);
-
-        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
-        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
-        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
-        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
-
-        r0.prefetch(prefetch_res_offset);
-        r1.prefetch(prefetch_res_offset);
-        r2.prefetch(prefetch_res_offset);
-        r3.prefetch(prefetch_res_offset);
-
-        // performs "inner" products
-        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-        prefetch(&blB[0]);
-        LhsPacket A0, A1;
-
-        for(Index k=0; k<peeled_kc; k+=pk)
-        {
-          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
-          RhsPacketx4 rhs_panel;
-          RhsPacket T0;
-
-          internal::prefetch(blB+(48+0));
-          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
-          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
-          internal::prefetch(blB+(48+16));
-          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
-          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
-          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
-
-          blB += pk*4*RhsProgress;
-          blA += pk*LhsProgress;
-
-          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
-        }
-        C0 = padd(C0,D0);
-        C1 = padd(C1,D1);
-        C2 = padd(C2,D2);
-        C3 = padd(C3,D3);
-
-        // process remaining peeled loop
-        for(Index k=peeled_kc; k<depth; k++)
-        {
-          RhsPacketx4 rhs_panel;
-          RhsPacket T0;
-          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
-          blB += 4*RhsProgress;
-          blA += LhsProgress;
-        }
-
-        ResPacket R0, R1;
-        ResPacket alphav = pset1<ResPacket>(alpha);
-
-        R0 = r0.template loadPacket<ResPacket>(0);
-        R1 = r1.template loadPacket<ResPacket>(0);
-        traits.acc(C0, alphav, R0);
-        traits.acc(C1,  alphav, R1);
-        r0.storePacket(0, R0);
-        r1.storePacket(0, R1);
-
-        R0 = r2.template loadPacket<ResPacket>(0);
-        R1 = r3.template loadPacket<ResPacket>(0);
-        traits.acc(C2,  alphav, R0);
-        traits.acc(C3,  alphav, R1);
-        r2.storePacket(0, R0);
-        r3.storePacket(0, R1);
-      }
-
-      // Deal with remaining columns of the rhs
-      for(Index j2=packet_cols4; j2<cols; j2++)
-      {
-        // One column at a time
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*(LhsProgress)];
-        prefetch(&blA[0]);
-
-        // gets res block as register
-        AccPacket C0;
-        traits.initAcc(C0);
-
-        LinearMapper r0 = res.getLinearMapper(i, j2);
-
-        // performs "inner" products
-        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
-        LhsPacket A0;
-
-        for(Index k= 0; k<peeled_kc; k+=pk)
-        {
-          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
-          RhsPacket B_0;
-
-#define EIGEN_GEBGP_ONESTEP(K)                                          \
-	      do {                                                      \
-		EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
-		EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-    /* FIXME: why unaligned???? */ \
-		traits.loadLhsUnaligned(&blA[(0+1*K)*LhsProgress], A0); \
-		traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);		\
-		traits.madd(A0, B_0, C0, B_0, fix<0>);				\
-		EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \
-	      } while(false);
-
-          EIGEN_GEBGP_ONESTEP(0);
-          EIGEN_GEBGP_ONESTEP(1);
-          EIGEN_GEBGP_ONESTEP(2);
-          EIGEN_GEBGP_ONESTEP(3);
-          EIGEN_GEBGP_ONESTEP(4);
-          EIGEN_GEBGP_ONESTEP(5);
-          EIGEN_GEBGP_ONESTEP(6);
-          EIGEN_GEBGP_ONESTEP(7);
-
-          blB += pk*RhsProgress;
-          blA += pk*LhsProgress;
-
-          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
-        }
-
-        // process remaining peeled loop
-        for(Index k=peeled_kc; k<depth; k++)
-        {
-          RhsPacket B_0;
-          EIGEN_GEBGP_ONESTEP(0);
-          blB += RhsProgress;
-          blA += LhsProgress;
-        }
-#undef EIGEN_GEBGP_ONESTEP
-        ResPacket R0;
-        ResPacket alphav = pset1<ResPacket>(alpha);
-        R0 = r0.template loadPacket<ResPacket>(0);
-        traits.acc(C0, alphav, R0);
-        r0.storePacket(0, R0);
-      }
-    }
-  }
-};
-
-template<int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar, typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits, typename LinearMapper, typename DataMapper>
-struct lhs_process_fraction_of_packet : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper>
-{
-
-EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3)
-  {
-        EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
-        EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
-        traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0);
-        traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3);
-        traits.madd(*A0, *B_0, *C0, *B_0);
-        traits.madd(*A0, *B1,  *C1, *B1);
-        traits.madd(*A0, *B2,  *C2, *B2);
-        traits.madd(*A0, *B3,  *C3, *B3);
-        EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
-  }
-};
-
 template<typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
 EIGEN_DONT_INLINE
 void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,ConjugateRhs>
@@ -1586,12 +904,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     Index packet_cols4 = nr>=4 ? (cols/4) * 4 : 0;
     const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0;
     const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0;
-    const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0;
-    const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0;
-    const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0;
+    const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0;
     enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
     const Index peeled_kc  = depth & ~(pk-1);
-    const int prefetch_res_offset = 32/sizeof(ResScalar);    
+    const Index prefetch_res_offset = 32/sizeof(ResScalar);    
 //     const Index depth2     = depth & ~1;
 
     //---------- Process 3 * LhsProgress rows at once ----------
@@ -1649,48 +965,36 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
-            // 15 registers are taken (12 for acc, 2 for lhs).
-            RhsPanel15 rhs_panel;
-            RhsPacket T0;
+            RhsPacket B_0, T0;
             LhsPacket A2;
-            #if EIGEN_COMP_GNUC_STRICT && EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && !(EIGEN_GNUC_AT_LEAST(9,0))
-            // see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
-            // without this workaround A0, A1, and A2 are loaded in the same register,
-            // which is not good for pipelining
-            #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__  ("" : "+w,m" (A0), "+w,m" (A1), "+w,m" (A2));
-            #else
-            #define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
-            #endif
-#define EIGEN_GEBP_ONESTEP(K)                                                     \
-            do {                                                                  \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4");          \
+
+#define EIGEN_GEBP_ONESTEP(K) \
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4"); \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              internal::prefetch(blA + (3 * K + 16) * LhsProgress);               \
-              if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) {                            \
-                internal::prefetch(blB + (4 * K + 16) * RhsProgress);             \
-              } /* Bug 953 */                                                     \
-              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
-              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
-              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
-              EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND \
-              traits.loadRhs(blB + (0+4*K) * Traits::RhsProgress, rhs_panel);     \
-              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                         \
-              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                         \
-              traits.madd(A2, rhs_panel, C8, T0, fix<0>);                         \
-              traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel);   \
-              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                         \
-              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                         \
-              traits.madd(A2, rhs_panel, C9, T0, fix<1>);                         \
-              traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel);   \
-              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                         \
-              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                         \
-              traits.madd(A2, rhs_panel, C10, T0, fix<2>);                        \
-              traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel);   \
-              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                         \
-              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                         \
-              traits.madd(A2, rhs_panel, C11, T0, fix<3>);                        \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");            \
-            } while (false)
+              internal::prefetch(blA+(3*K+16)*LhsProgress); \
+              if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \
+              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
+              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
+              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
+              traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \
+              traits.madd(A0, B_0, C0, T0); \
+              traits.madd(A1, B_0, C4, T0); \
+              traits.madd(A2, B_0, C8, B_0); \
+              traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \
+              traits.madd(A0, B_0, C1, T0); \
+              traits.madd(A1, B_0, C5, T0); \
+              traits.madd(A2, B_0, C9, B_0); \
+              traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \
+              traits.madd(A0, B_0, C2,  T0); \
+              traits.madd(A1, B_0, C6,  T0); \
+              traits.madd(A2, B_0, C10, B_0); \
+              traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \
+              traits.madd(A0, B_0, C3 , T0); \
+              traits.madd(A1, B_0, C7,  T0); \
+              traits.madd(A2, B_0, C11, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \
+            } while(false)
 
             internal::prefetch(blB);
             EIGEN_GEBP_ONESTEP(0);
@@ -1710,8 +1014,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           // process remaining peeled loop
           for(Index k=peeled_kc; k<depth; k++)
           {
-            RhsPanel15 rhs_panel;
-            RhsPacket T0;
+            RhsPacket B_0, T0;
             LhsPacket A2;
             EIGEN_GEBP_ONESTEP(0);
             blB += 4*RhsProgress;
@@ -1723,9 +1026,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C8, alphav, R2);
@@ -1733,9 +1036,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           r0.storePacket(1 * Traits::ResPacketSize, R1);
           r0.storePacket(2 * Traits::ResPacketSize, R2);
 
-          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          R0 = r1.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r1.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r1.loadPacket(2 * Traits::ResPacketSize);
           traits.acc(C1, alphav, R0);
           traits.acc(C5, alphav, R1);
           traits.acc(C9, alphav, R2);
@@ -1743,9 +1046,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           r1.storePacket(1 * Traits::ResPacketSize, R1);
           r1.storePacket(2 * Traits::ResPacketSize, R2);
 
-          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r2.loadPacket(2 * Traits::ResPacketSize);
           traits.acc(C2, alphav, R0);
           traits.acc(C6, alphav, R1);
           traits.acc(C10, alphav, R2);
@@ -1753,9 +1056,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           r2.storePacket(1 * Traits::ResPacketSize, R1);
           r2.storePacket(2 * Traits::ResPacketSize, R2);
 
-          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          R0 = r3.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r3.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r3.loadPacket(2 * Traits::ResPacketSize);
           traits.acc(C3, alphav, R0);
           traits.acc(C7, alphav, R1);
           traits.acc(C11, alphav, R2);
@@ -1791,20 +1094,20 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
             RhsPacket B_0;
-#define EIGEN_GEBGP_ONESTEP(K)                                                    \
-            do {                                                                  \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");          \
+#define EIGEN_GEBGP_ONESTEP(K) \
+            do { \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1"); \
               EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
-              traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
-              traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
-              traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
-              traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                   \
-              traits.madd(A0, B_0, C0, B_0, fix<0>);                              \
-              traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
-              traits.madd(A2, B_0, C8, B_0, fix<0>);                              \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");            \
-            } while (false)
-
+              traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0);  \
+              traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1);  \
+              traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2);  \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);   \
+              traits.madd(A0, B_0, C0, B_0); \
+              traits.madd(A1, B_0, C4, B_0); \
+              traits.madd(A2, B_0, C8, B_0); \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \
+            } while(false)
+        
             EIGEN_GEBGP_ONESTEP(0);
             EIGEN_GEBGP_ONESTEP(1);
             EIGEN_GEBGP_ONESTEP(2);
@@ -1832,9 +1135,9 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r0.loadPacket(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C8, alphav, R2);
@@ -1893,8 +1196,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           for(Index k=0; k<peeled_kc; k+=pk)
           {
             EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
-            RhsPacketx4 rhs_panel;
-            RhsPacket T0;
+            RhsPacket B_0, B1, B2, B3, T0;
 
           // NOTE: the begin/end asm comments below work around bug 935!
           // but they are not enough for gcc>=6 without FMA (bug 1637)
@@ -1903,24 +1205,24 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           #else
             #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
           #endif
-#define EIGEN_GEBGP_ONESTEP(K)                                            \
-            do {                                                          \
-              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");  \
-              traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);        \
-              traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);        \
-              traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
-              traits.madd(A0, rhs_panel, C0, T0, fix<0>);                 \
-              traits.madd(A1, rhs_panel, C4, T0, fix<0>);                 \
-              traits.madd(A0, rhs_panel, C1, T0, fix<1>);                 \
-              traits.madd(A1, rhs_panel, C5, T0, fix<1>);                 \
-              traits.madd(A0, rhs_panel, C2, T0, fix<2>);                 \
-              traits.madd(A1, rhs_panel, C6, T0, fix<2>);                 \
-              traits.madd(A0, rhs_panel, C3, T0, fix<3>);                 \
-              traits.madd(A1, rhs_panel, C7, T0, fix<3>);                 \
-              EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                         \
-              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");    \
-            } while (false)
-
+          #define EIGEN_GEBGP_ONESTEP(K) \
+            do {                                                                \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");        \
+              traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                    \
+              traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                    \
+              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
+              traits.madd(A0, B_0, C0, T0);                                     \
+              traits.madd(A1, B_0, C4, B_0);                                    \
+              traits.madd(A0, B1,  C1, T0);                                     \
+              traits.madd(A1, B1,  C5, B1);                                     \
+              traits.madd(A0, B2,  C2, T0);                                     \
+              traits.madd(A1, B2,  C6, B2);                                     \
+              traits.madd(A0, B3,  C3, T0);                                     \
+              traits.madd(A1, B3,  C7, B3);                                     \
+              EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                               \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");          \
+            } while(false)
+            
             internal::prefetch(blB+(48+0));
             EIGEN_GEBGP_ONESTEP(0);
             EIGEN_GEBGP_ONESTEP(1);
@@ -1940,8 +1242,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           // process remaining peeled loop
           for(Index k=peeled_kc; k<depth; k++)
           {
-            RhsPacketx4 rhs_panel;
-            RhsPacket T0;
+            RhsPacket B_0, B1, B2, B3, T0;
             EIGEN_GEBGP_ONESTEP(0);
             blB += 4*RhsProgress;
             blA += 2*Traits::LhsProgress;
@@ -1951,10 +1252,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           ResPacket R0, R1, R2, R3;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r1.loadPacket(0 * Traits::ResPacketSize);
+          R3 = r1.loadPacket(1 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           traits.acc(C1, alphav, R2);
@@ -1964,10 +1265,10 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           r1.storePacket(0 * Traits::ResPacketSize, R2);
           r1.storePacket(1 * Traits::ResPacketSize, R3);
 
-          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
-          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r2.loadPacket(1 * Traits::ResPacketSize);
+          R2 = r3.loadPacket(0 * Traits::ResPacketSize);
+          R3 = r3.loadPacket(1 * Traits::ResPacketSize);
           traits.acc(C2,  alphav, R0);
           traits.acc(C6,  alphav, R1);
           traits.acc(C3,  alphav, R2);
@@ -2012,8 +1313,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
               traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0);                      \
               traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1);                      \
               traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                       \
-              traits.madd(A0, B_0, C0, B1, fix<0>);                               \
-              traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
+              traits.madd(A0, B_0, C0, B1);                                       \
+              traits.madd(A1, B_0, C4, B_0);                                      \
               EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");            \
             } while(false)
         
@@ -2044,8 +1345,8 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
           ResPacket R0, R1;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
-          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r0.loadPacket(1 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
           traits.acc(C4, alphav, R1);
           r0.storePacket(0 * Traits::ResPacketSize, R0);
@@ -2057,43 +1358,186 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
     //---------- Process 1 * LhsProgress rows at once ----------
     if(mr>=1*Traits::LhsProgress)
     {
-      lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket, RhsPacket, ResPacket, Traits, LinearMapper, DataMapper> p;
-      p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
-    }
-    //---------- Process LhsProgressHalf rows at once ----------
-    if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf)
-    {
-      lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf, LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper> p;
-      p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
-    }
-    //---------- Process LhsProgressQuarter rows at once ----------
-    if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter)
-    {
-      lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar, AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter, QuarterTraits, LinearMapper, DataMapper> p;
-      p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+      // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth)
+      for(Index i=peeled_mc2; i<peeled_mc1; i+=1*LhsProgress)
+      {
+        // loops on each largest micro vertical panel of rhs (depth * nr)
+        for(Index j2=0; j2<packet_cols4; j2+=nr)
+        {
+          // We select a 1*Traits::LhsProgress x nr micro block of res which is entirely
+          // stored into 1 x nr registers.
+          
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2, C3;
+          traits.initAcc(C0);
+          traits.initAcc(C1);
+          traits.initAcc(C2);
+          traits.initAcc(C3);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
+          prefetch(&blB[0]);
+          LhsPacket A0;
+
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX4");
+            RhsPacket B_0, B1, B2, B3;
+               
+#define EIGEN_GEBGP_ONESTEP(K) \
+            do {                                                                \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX4");        \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
+              traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3);  \
+              traits.madd(A0, B_0, C0, B_0);                                    \
+              traits.madd(A0, B1,  C1, B1);                                     \
+              traits.madd(A0, B2,  C2, B2);                                     \
+              traits.madd(A0, B3,  C3, B3);                                     \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX4");          \
+            } while(false)
+            
+            internal::prefetch(blB+(48+0));
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            internal::prefetch(blB+(48+16));
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*4*RhsProgress;
+            blA += pk*1*LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX4");
+          }
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPacket B_0, B1, B2, B3;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += 4*RhsProgress;
+            blA += 1*LhsProgress;
+          }
+#undef EIGEN_GEBGP_ONESTEP
+
+          ResPacket R0, R1;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r1.loadPacket(0 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C1,  alphav, R1);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r1.storePacket(0 * Traits::ResPacketSize, R1);
+
+          R0 = r2.loadPacket(0 * Traits::ResPacketSize);
+          R1 = r3.loadPacket(0 * Traits::ResPacketSize);
+          traits.acc(C2,  alphav, R0);
+          traits.acc(C3,  alphav, R1);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r3.storePacket(0 * Traits::ResPacketSize, R1);
+        }
+
+        // Deal with remaining columns of the rhs
+        for(Index j2=packet_cols4; j2<cols; j2++)
+        {
+          // One column at a time
+          const LhsScalar* blA = &blockA[i*strideA+offsetA*(1*Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0;
+          traits.initAcc(C0);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2*strideB+offsetB];
+          LhsPacket A0;
+
+          for(Index k=0; k<peeled_kc; k+=pk)
+          {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX1");
+            RhsPacket B_0;
+        
+#define EIGEN_GEBGP_ONESTEP(K) /* one k-step of the 1pX1 micro kernel; no trailing ';' so each use stays a single statement */ \
+            do {                                                                \
+              EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX1");        \
+              EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+              traits.loadLhs(&blA[(0+1*K)*LhsProgress], A0);                    \
+              traits.loadRhs(&blB[(0+K)*RhsProgress], B_0);                     \
+              traits.madd(A0, B_0, C0, B_0);                                    \
+              EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX1");          \
+            } while(false)
+
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk*RhsProgress;
+            blA += pk*1*Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX1");
+          }
+
+          // process remaining peeled loop
+          for(Index k=peeled_kc; k<depth; k++)
+          {
+            RhsPacket B_0;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += RhsProgress;
+            blA += 1*Traits::LhsProgress;
+          }
+#undef EIGEN_GEBGP_ONESTEP
+          ResPacket R0;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+          R0 = r0.loadPacket(0 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+        }
+      }
     }
     //---------- Process remaining rows, 1 at once ----------
-    if(peeled_mc_quarter<rows)
+    if(peeled_mc1<rows)
     {
       // loop on each panel of the rhs
       for(Index j2=0; j2<packet_cols4; j2+=nr)
       {
         // loop on each row of the lhs (1*LhsProgress x depth)
-        for(Index i=peeled_mc_quarter; i<rows; i+=1)
+        for(Index i=peeled_mc1; i<rows; i+=1)
         {
           const LhsScalar* blA = &blockA[i*strideA+offsetA];
           prefetch(&blA[0]);
           const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
 
-          // If LhsProgress is 8 or 16, it assumes that there is a
-          // half or quarter packet, respectively, of the same size as
-          // nr (which is currently 4) for the return type.
+          // The following piece of code won't work for 512 bit registers
+          // Moreover, if LhsProgress==8 it assumes that there is a half packet of the same size
+          // as nr (which is currently 4) for the return type.
           const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
-          const int SResPacketQuarterSize = unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
           if ((SwappedTraits::LhsProgress % 4) == 0 &&
-              (SwappedTraits::LhsProgress<=16) &&
-              (SwappedTraits::LhsProgress!=8  || SResPacketHalfSize==nr) &&
-              (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr))
+              (SwappedTraits::LhsProgress <= 8) &&
+              (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr))
           {
             SAccPacket C0, C1, C2, C3;
             straits.initAcc(C0);
@@ -2116,15 +1560,15 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 
               straits.loadRhsQuad(blA+0*spk, B_0);
               straits.loadRhsQuad(blA+1*spk, B_1);
-              straits.madd(A0,B_0,C0,B_0, fix<0>);
-              straits.madd(A1,B_1,C1,B_1, fix<0>);
+              straits.madd(A0,B_0,C0,B_0);
+              straits.madd(A1,B_1,C1,B_1);
 
               straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0);
               straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1);
               straits.loadRhsQuad(blA+2*spk, B_0);
               straits.loadRhsQuad(blA+3*spk, B_1);
-              straits.madd(A0,B_0,C2,B_0, fix<0>);
-              straits.madd(A1,B_1,C3,B_1, fix<0>);
+              straits.madd(A0,B_0,C2,B_0);
+              straits.madd(A1,B_1,C3,B_1);
 
               blB += 4*SwappedTraits::LhsProgress;
               blA += 4*spk;
@@ -2137,7 +1581,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 
               straits.loadLhsUnaligned(blB, A0);
               straits.loadRhsQuad(blA, B_0);
-              straits.madd(A0,B_0,C0,B_0, fix<0>);
+              straits.madd(A0,B_0,C0,B_0);
 
               blB += SwappedTraits::LhsProgress;
               blA += spk;
@@ -2147,7 +1591,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
               // Special case where we have to first reduce the accumulation register C0
               typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SResPacket>::half,SResPacket>::type SResPacketHalf;
               typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SLhsPacket>::half,SLhsPacket>::type SLhsPacketHalf;
-              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
+              typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SRhsPacket>::half,SRhsPacket>::type SRhsPacketHalf;
               typedef typename conditional<SwappedTraits::LhsProgress>=8,typename unpacket_traits<SAccPacket>::half,SAccPacket>::type SAccPacketHalf;
 
               SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
@@ -2160,25 +1604,16 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
                 SRhsPacketHalf b0;
                 straits.loadLhsUnaligned(blB, a0);
                 straits.loadRhs(blA, b0);
-                SAccPacketHalf c0 = predux_half_dowto4(C0);
-                straits.madd(a0,b0,c0,b0, fix<0>);
+                SAccPacketHalf c0 = predux_downto4(C0);
+                straits.madd(a0,b0,c0,b0);
                 straits.acc(c0, alphav, R);
               }
               else
               {
-                straits.acc(predux_half_dowto4(C0), alphav, R);
+                straits.acc(predux_downto4(C0), alphav, R);
               }
               res.scatterPacket(i, j2, R);
             }
-            else if (SwappedTraits::LhsProgress==16)
-            {
-              // Special case where we have to first reduce the
-              // accumulation register C0. We specialize the block in
-              // template form, so that LhsProgress < 16 paths don't
-              // fail to compile
-              last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
-	            p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0);
-            }
             else
             {
               SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
@@ -2222,7 +1657,7 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
       for(Index j2=packet_cols4; j2<cols; j2++)
       {
         // loop on each row of the lhs (1*LhsProgress x depth)
-        for(Index i=peeled_mc_quarter; i<rows; i+=1)
+        for(Index i=peeled_mc1; i<rows; i+=1)
         {
           const LhsScalar* blA = &blockA[i*strideA+offsetA];
           prefetch(&blA[0]);
@@ -2258,24 +1693,19 @@ void gebp_kernel<LhsScalar,RhsScalar,Index,DataMapper,mr,nr,ConjugateLhs,Conjuga
 //
 //  32 33 34 35 ...
 //  36 36 38 39 ...
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
 {
   typedef typename DataMapper::LinearMapper LinearMapper;
   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, ColMajor, Conjugate, PanelMode>
   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  typedef typename unpacket_traits<Packet>::half HalfPacket;
-  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
-  enum { PacketSize = unpacket_traits<Packet>::size,
-         HalfPacketSize = unpacket_traits<HalfPacket>::size,
-         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
-         HasHalf = (int)HalfPacketSize < (int)PacketSize,
-         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum { PacketSize = packet_traits<Scalar>::size };
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
   EIGEN_UNUSED_VARIABLE(stride);
@@ -2287,12 +1717,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
 
   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
-  const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
-  const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
-  const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0;
-  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
-  const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter
-                         : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0;
+  const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+  const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1
+                         : Pack2>1             ? (rows/Pack2)*Pack2 : 0;
 
   Index i=0;
 
@@ -2306,9 +1733,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
       for(Index k=0; k<depth; k++)
       {
         Packet A, B, C;
-        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
-        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
-        C = lhs.template loadPacket<Packet>(i+2*PacketSize, k);
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        B = lhs.loadPacket(i+1*PacketSize, k);
+        C = lhs.loadPacket(i+2*PacketSize, k);
         pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
         pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
         pstore(blockA+count, cj.pconj(C)); count+=PacketSize;
@@ -2326,8 +1753,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
       for(Index k=0; k<depth; k++)
       {
         Packet A, B;
-        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
-        B = lhs.template loadPacket<Packet>(i+1*PacketSize, k);
+        A = lhs.loadPacket(i+0*PacketSize, k);
+        B = lhs.loadPacket(i+1*PacketSize, k);
         pstore(blockA+count, cj.pconj(A)); count+=PacketSize;
         pstore(blockA+count, cj.pconj(B)); count+=PacketSize;
       }
@@ -2344,67 +1771,27 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
       for(Index k=0; k<depth; k++)
       {
         Packet A;
-        A = lhs.template loadPacket<Packet>(i+0*PacketSize, k);
+        A = lhs.loadPacket(i+0*PacketSize, k);
         pstore(blockA+count, cj.pconj(A));
         count+=PacketSize;
       }
       if(PanelMode) count += (1*PacketSize) * (stride-offset-depth);
     }
   }
-  // Pack half packets
-  if(HasHalf && Pack1>=HalfPacketSize)
-  {
-    for(; i<peeled_mc_half; i+=HalfPacketSize)
-    {
-      if(PanelMode) count += (HalfPacketSize) * offset;
-
-      for(Index k=0; k<depth; k++)
-      {
-        HalfPacket A;
-        A = lhs.template loadPacket<HalfPacket>(i+0*(HalfPacketSize), k);
-        pstoreu(blockA+count, cj.pconj(A));
-        count+=HalfPacketSize;
-      }
-      if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth);
-    }
-  }
-  // Pack quarter packets
-  if(HasQuarter && Pack1>=QuarterPacketSize)
-  {
-    for(; i<peeled_mc_quarter; i+=QuarterPacketSize)
-    {
-      if(PanelMode) count += (QuarterPacketSize) * offset;
-
-      for(Index k=0; k<depth; k++)
-      {
-        QuarterPacket A;
-        A = lhs.template loadPacket<QuarterPacket>(i+0*(QuarterPacketSize), k);
-        pstoreu(blockA+count, cj.pconj(A));
-        count+=QuarterPacketSize;
-      }
-      if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth);
-    }
-  }
-  // Pack2 may be *smaller* than PacketSize—that happens for
-  // products like real * complex, where we have to go half the
-  // progress on the lhs in order to duplicate those operands to
-  // address both real & imaginary parts on the rhs. This portion will
-  // pack those half ones until they match the number expected on the
-  // last peeling loop at this point (for the rhs).
+  // Pack scalars
   if(Pack2<PacketSize && Pack2>1)
   {
-    for(; i<peeled_mc0; i+=last_lhs_progress)
+    for(; i<peeled_mc0; i+=Pack2)
     {
-      if(PanelMode) count += last_lhs_progress * offset;
+      if(PanelMode) count += Pack2 * offset;
 
       for(Index k=0; k<depth; k++)
-        for(Index w=0; w<last_lhs_progress; w++)
+        for(Index w=0; w<Pack2; w++)
           blockA[count++] = cj(lhs(i+w, k));
 
-      if(PanelMode) count += last_lhs_progress * (stride-offset-depth);
+      if(PanelMode) count += Pack2 * (stride-offset-depth);
     }
   }
-  // Pack scalars
   for(; i<rows; i++)
   {
     if(PanelMode) count += offset;
@@ -2414,24 +1801,19 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
   }
 }
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
 {
   typedef typename DataMapper::LinearMapper LinearMapper;
   EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0);
 };
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, RowMajor, Conjugate, PanelMode>
   ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset)
 {
-  typedef typename unpacket_traits<Packet>::half HalfPacket;
-  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
-  enum { PacketSize = unpacket_traits<Packet>::size,
-         HalfPacketSize = unpacket_traits<HalfPacket>::size,
-         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
-         HasHalf = (int)HalfPacketSize < (int)PacketSize,
-         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
+  typedef typename packet_traits<Scalar>::type Packet;
+  enum { PacketSize = packet_traits<Scalar>::size };
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
   EIGEN_UNUSED_VARIABLE(stride);
@@ -2439,51 +1821,37 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
   eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
   Index count = 0;
-  bool gone_half = false, gone_quarter = false, gone_last = false;
 
-  Index i = 0;
+//   const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
+//   const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
+//   const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
+
   int pack = Pack1;
-  int psize = PacketSize;
+  Index i = 0;
   while(pack>0)
   {
     Index remaining_rows = rows-i;
-    Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack;
-    Index starting_pos = i;
+    Index peeled_mc = i+(remaining_rows/pack)*pack;
     for(; i<peeled_mc; i+=pack)
     {
       if(PanelMode) count += pack * offset;
 
+      const Index peeled_k = (depth/PacketSize)*PacketSize;
       Index k=0;
-      if(pack>=psize && psize >= QuarterPacketSize)
+      if(pack>=PacketSize)
       {
-        const Index peeled_k = (depth/psize)*psize;
-        for(; k<peeled_k; k+=psize)
+        for(; k<peeled_k; k+=PacketSize)
         {
-          for (Index m = 0; m < pack; m += psize)
+          for (Index m = 0; m < pack; m += PacketSize)
           {
-            if (psize == PacketSize) {
-              PacketBlock<Packet> kernel;
-              for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i+p+m, k);
-              ptranspose(kernel);
-              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
-            } else if (HasHalf && psize == HalfPacketSize) {
-              gone_half = true;
-              PacketBlock<HalfPacket> kernel_half;
-              for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i+p+m, k);
-              ptranspose(kernel_half);
-              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p]));
-            } else if (HasQuarter && psize == QuarterPacketSize) {
-              gone_quarter = true;
-              PacketBlock<QuarterPacket> kernel_quarter;
-              for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i+p+m, k);
-              ptranspose(kernel_quarter);
-              for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p]));
-	    }
+            PacketBlock<Packet> kernel;
+            for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k);
+            ptranspose(kernel);
+            for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p]));
           }
-          count += psize*pack;
+          count += PacketSize*pack;
         }
       }
-
       for(; k<depth; k++)
       {
         Index w=0;
@@ -2506,28 +1874,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Pa
       if(PanelMode) count += pack * (stride-offset-depth);
     }
 
-    pack -= psize;
-    Index left = rows - i;
-    if (pack <= 0) {
-      if (!gone_last &&
-          (starting_pos == i || left >= psize/2 || left >= psize/4) &&
-          ((psize/2 == HalfPacketSize && HasHalf && !gone_half) ||
-           (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
-        psize /= 2;
-        pack = psize;
-        continue;
-      }
-      // Pack2 may be *smaller* than PacketSize—that happens for
-      // products like real * complex, where we have to go half the
-      // progress on the lhs in order to duplicate those operands to
-      // address both real & imaginary parts on the rhs. This portion will
-      // pack those half ones until they match the number expected on the
-      // last peeling loop at this point (for the rhs).
-      if (Pack2 < PacketSize && !gone_last) {
-        gone_last = true;
-        psize = pack = left & ~1;
-      }
-    }
+    pack -= PacketSize;
+    if(pack<Pack2 && (pack+PacketSize)!=Pack2)
+      pack = Pack2;
   }
 
   for(; i<rows; i++)
@@ -2583,7 +1932,7 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
 //       const Scalar* b6 = &rhs[(j2+6)*rhsStride];
 //       const Scalar* b7 = &rhs[(j2+7)*rhsStride];
 //       Index k=0;
-//       if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
+//       if(PacketSize==8) // TODO enable vectorized transposition for PacketSize==4
 //       {
 //         for(; k<peeled_k; k+=PacketSize) {
 //           PacketBlock<Packet> kernel;
@@ -2630,10 +1979,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Co
       {
         for(; k<peeled_k; k+=PacketSize) {
           PacketBlock<Packet,(PacketSize%4)==0?4:PacketSize> kernel;
-          kernel.packet[0           ] = dm0.template loadPacket<Packet>(k);
-          kernel.packet[1%PacketSize] = dm1.template loadPacket<Packet>(k);
-          kernel.packet[2%PacketSize] = dm2.template loadPacket<Packet>(k);
-          kernel.packet[3%PacketSize] = dm3.template loadPacket<Packet>(k);
+          kernel.packet[0] = dm0.loadPacket(k);
+          kernel.packet[1%PacketSize] = dm1.loadPacket(k);
+          kernel.packet[2%PacketSize] = dm2.loadPacket(k);
+          kernel.packet[3%PacketSize] = dm3.loadPacket(k);
           ptranspose(kernel);
           pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0]));
           pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize]));
@@ -2674,14 +2023,8 @@ template<typename Scalar, typename Index, typename DataMapper, int nr, bool Conj
 struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>
 {
   typedef typename packet_traits<Scalar>::type Packet;
-  typedef typename unpacket_traits<Packet>::half HalfPacket;
-  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
   typedef typename DataMapper::LinearMapper LinearMapper;
-  enum { PacketSize = packet_traits<Scalar>::size,
-         HalfPacketSize = unpacket_traits<HalfPacket>::size,
-         QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
-         HasHalf = (int)HalfPacketSize < (int)PacketSize,
-         HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize };
+  enum { PacketSize = packet_traits<Scalar>::size };
   EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0);
 };
 
@@ -2740,17 +2083,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Co
       for(Index k=0; k<depth; k++)
       {
         if (PacketSize==4) {
-          Packet A = rhs.template loadPacket<Packet>(k, j2);
+          Packet A = rhs.loadPacket(k, j2);
           pstoreu(blockB+count, cj.pconj(A));
           count += PacketSize;
-        } else if (HasHalf && HalfPacketSize==4) {
-          HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
-          pstoreu(blockB+count, cj.pconj(A));
-          count += HalfPacketSize;
-        } else if (HasQuarter && QuarterPacketSize==4) {
-          QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
-          pstoreu(blockB+count, cj.pconj(A));
-          count += QuarterPacketSize;
         } else {
           const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
           blockB[count+0] = cj(dm0(0));
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 508c05c97..ed6234c37 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -77,7 +77,7 @@ static void run(Index rows, Index cols, Index depth,
   Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
   Index nc = (std::min)(cols,blocking.nc());  // cache block size along the N direction
 
-  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+  gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
   gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
   gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
 
@@ -110,7 +110,7 @@ static void run(Index rows, Index cols, Index depth,
       // i.e., we test that info[tid].users equals 0.
       // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
       while(info[tid].users!=0) {}
-      info[tid].users = threads;
+      info[tid].users += threads;
 
       pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length);
 
@@ -148,9 +148,7 @@ static void run(Index rows, Index cols, Index depth,
       // Release all the sub blocks A'_i of A' for the current thread,
       // i.e., we simply decrement the number of users by 1
       for(Index i=0; i<threads; ++i)
-#if !EIGEN_HAS_CXX11_ATOMIC
         #pragma omp atomic
-#endif
         info[i].users -= 1;
     }
   }
@@ -431,13 +429,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   template<typename Dst>
   static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
-    // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
-    // to determine the following heuristic.
-    // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
-    // unless it has been specialized by the user or for a given architecture.
-    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
-    // I'm not sure it is still required.
-    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
       lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar,Scalar>());
     else
     {
@@ -449,7 +441,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   template<typename Dst>
   static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
-    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
       lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst,lhs, rhs, Scalar(1));
@@ -458,7 +450,7 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
   template<typename Dst>
   static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
   {
-    if((rhs.rows()+dst.rows()+dst.cols())<EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows()>0)
+    if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0)
       lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar,Scalar>());
     else
       scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
@@ -471,20 +463,6 @@ struct generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
     if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0)
       return;
 
-    // Fallback to GEMV if either the lhs or rhs is a runtime vector
-    if (dst.cols() == 1)
-    {
-      typename Dest::ColXpr dst_vec(dst.col(0));
-      return internal::generic_product_impl<Lhs,typename Rhs::ConstColXpr,DenseShape,DenseShape,GemvProduct>
-        ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha);
-    }
-    else if (dst.rows() == 1)
-    {
-      typename Dest::RowXpr dst_vec(dst.row(0));
-      return internal::generic_product_impl<typename Lhs::ConstRowXpr,Rhs,DenseShape,DenseShape,GemvProduct>
-        ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha);
-    }
-
     typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(a_lhs);
     typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(a_rhs);
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 6ba0d9bdb..d68d2f965 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -87,7 +87,7 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
     ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
     ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
 
-    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
     gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
     tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo> sybb;
@@ -302,13 +302,13 @@ struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename ProductType>
-EIGEN_DEVICE_FUNC TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
+TriangularView<MatrixType,UpLo>& TriangularViewImpl<MatrixType,UpLo,Dense>::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta)
 {
   EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
   eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
-
+  
   general_product_to_triangular_selector<MatrixType, ProductType, UpLo, internal::traits<ProductType>::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta);
-
+  
   return derived();
 }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
index 9a650ec23..691f95d69 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -37,7 +37,7 @@ namespace Eigen {
 
 namespace internal {
 
-template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int  UpLo>
 struct general_matrix_matrix_rankupdate :
        general_matrix_matrix_triangular_product<
          Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {};
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixVector.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixVector.h
index dfb6aebce..a597c1f4e 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -14,57 +14,11 @@ namespace Eigen {
 
 namespace internal {
 
-enum GEMVPacketSizeType {
-  GEMVPacketFull = 0,
-  GEMVPacketHalf,
-  GEMVPacketQuarter
-};
-
-template <int N, typename T1, typename T2, typename T3>
-struct gemv_packet_cond { typedef T3 type; };
-
-template <typename T1, typename T2, typename T3>
-struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> { typedef T1 type; };
-
-template <typename T1, typename T2, typename T3>
-struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> { typedef T2 type; };
-
-template<typename LhsScalar, typename RhsScalar, int _PacketSize=GEMVPacketFull>
-class gemv_traits
-{
-  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-
-#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size)                        \
-  typedef typename gemv_packet_cond<packet_size,                                  \
-                                    typename packet_traits<name ## Scalar>::type, \
-                                    typename packet_traits<name ## Scalar>::half, \
-                                    typename unpacket_traits<typename packet_traits<name ## Scalar>::half>::half>::type \
-  prefix ## name ## Packet
-
-  PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize);
-  PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize);
-  PACKET_DECL_COND_PREFIX(_, Res, _PacketSize);
-#undef PACKET_DECL_COND_PREFIX
-
-public:
-  enum {
-        Vectorizable = unpacket_traits<_LhsPacket>::vectorizable &&
-        unpacket_traits<_RhsPacket>::vectorizable &&
-        int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size),
-        LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1,
-        RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1,
-        ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1
-  };
-
-  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-};
-
-
 /* Optimized col-major matrix * vector product:
- * This algorithm processes the matrix per vertical panels,
- * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments.
+ * This algorithm processes 4 columns at once, which both reduces
+ * the number of loads/stores of the result by a factor of 4 and reduces
+ * the instruction dependencies. Moreover, we know that all bands have the
+ * same alignment pattern.
  *
  * Mixing type logic: C += alpha * A * B
  *  |  A  |  B  |alpha| comments
@@ -73,30 +27,56 @@ public:
  *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
  *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
  *
+ * Accesses to the matrix coefficients follow the following logic:
+ *
+ * - if all columns have the same alignment then
+ *   - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case)
+ *   - otherwise perform unaligned loads only (-> NoneAligned case)
+ * - otherwise
+ *   - if even columns have the same alignment then
+ *     // odd columns are guaranteed to have the same alignment too
+ *     - if even or odd columns have the same alignment as the result, then
+ *       // for a register size of 2 scalars, this is guaranteed to be the case (e.g., SSE with double)
+ *       - perform half aligned and half unaligned loads (-> EvenAligned case)
+ *     - otherwise perform unaligned loads only (-> NoneAligned case)
+ *   - otherwise, if the register size is 4 scalars (e.g., SSE with float) then
+ *     - one out of every 4 consecutive columns is guaranteed to be aligned with the result vector,
+ *       perform simple aligned loads for this column and aligned loads plus re-alignment for the others. (-> FirstAligned case)
+ *       // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h
+ *   - otherwise,
+ *     // if we get here, this means the register size is greater than 4 (e.g., AVX with floats),
+ *     // we currently fall back to the NoneAligned case
+ *
  * The same reasoning apply for the transposed case.
+ *
+ * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet...
+ * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment
+ * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on an 8 byte boundary are not too slow
+ * compared to unaligned loads on a 4 byte boundary.
+ *
  */
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
-  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
-  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
-
   typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
-  typedef typename Traits::LhsPacket LhsPacket;
-  typedef typename Traits::RhsPacket RhsPacket;
-  typedef typename Traits::ResPacket ResPacket;
+enum {
+  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
+              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
+  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
+};
 
-  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
-  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
-  typedef typename HalfTraits::ResPacket ResPacketHalf;
+typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+typedef typename packet_traits<ResScalar>::type  _ResPacket;
 
-  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
-  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
-  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
 
-EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
+EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
   const LhsMapper& lhs,
   const RhsMapper& rhs,
@@ -105,187 +85,244 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
 };
 
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsMapper& alhs,
+  const LhsMapper& lhs,
   const RhsMapper& rhs,
         ResScalar* res, Index resIncr,
   RhsScalar alpha)
 {
   EIGEN_UNUSED_VARIABLE(resIncr);
   eigen_internal_assert(resIncr==1);
+  #ifdef _EIGEN_ACCUMULATE_PACKETS
+  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+  #endif
+  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \
+    pstore(&res[j], \
+      padd(pload<ResPacket>(&res[j]), \
+        padd( \
+      padd(pcj.pmul(lhs0.template load<LhsPacket, Alignment0>(j),    ptmp0), \
+      pcj.pmul(lhs1.template load<LhsPacket, Alignment13>(j),   ptmp1)),   \
+      padd(pcj.pmul(lhs2.template load<LhsPacket, Alignment2>(j),    ptmp2), \
+      pcj.pmul(lhs3.template load<LhsPacket, Alignment13>(j),   ptmp3)) )))
 
-  // The following copy tells the compiler that lhs's attributes are not modified outside this function
-  // This helps GCC to generate propoer code.
-  LhsMapper lhs(alhs);
+  typedef typename LhsMapper::VectorMapper LhsScalars;
 
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
-  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
+  if(ConjugateRhs)
+    alpha = numext::conj(alpha);
+
+  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
+  const Index columnsAtOnce = 4;
+  const Index peels = 2;
+  const Index LhsPacketAlignedMask = LhsPacketSize-1;
+  const Index ResPacketAlignedMask = ResPacketSize-1;
+//  const Index PeelAlignedMask = ResPacketSize*peels-1;
+  const Index size = rows;
 
   const Index lhsStride = lhs.stride();
-  // TODO: for padded aligned inputs, we could enable aligned reads
-  enum { LhsAlignment = Unaligned,
-         ResPacketSize = Traits::ResPacketSize,
-         ResPacketSizeHalf = HalfTraits::ResPacketSize,
-         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
-         LhsPacketSize = Traits::LhsPacketSize,
-         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
-         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
-  };
 
-  const Index n8 = rows-8*ResPacketSize+1;
-  const Index n4 = rows-4*ResPacketSize+1;
-  const Index n3 = rows-3*ResPacketSize+1;
-  const Index n2 = rows-2*ResPacketSize+1;
-  const Index n1 = rows-1*ResPacketSize+1;
-  const Index n_half = rows-1*ResPacketSizeHalf+1;
-  const Index n_quarter = rows-1*ResPacketSizeQuarter+1;
+  // How many coeffs of the result do we have to skip to be aligned.
+  // Here we assume data are at least aligned on the base scalar type.
+  Index alignedStart = internal::first_default_aligned(res,size);
+  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
+  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
 
-  // TODO: improve the following heuristic:
-  const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4);
-  ResPacket palpha = pset1<ResPacket>(alpha);
-  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
-  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
+  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
+  Index alignmentPattern = alignmentStep==0 ? AllAligned
+                       : alignmentStep==(LhsPacketSize/2) ? EvenAligned
+                       : FirstAligned;
 
-  for(Index j2=0; j2<cols; j2+=block_cols)
+  // we cannot assume the first element is aligned because of sub-matrices
+  const Index lhsAlignmentOffset = lhs.firstAligned(size);
+
+  // find how many columns we have to skip to be aligned with the result (if possible)
+  Index skipColumns = 0;
+  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
+  if( (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == size) || (UIntPtr(res)%sizeof(ResScalar)) )
   {
-    Index jend = numext::mini(j2+block_cols,cols);
-    Index i=0;
-    for(; i<n8; i+=ResPacketSize*8)
-    {
-      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
-                c1 = pset1<ResPacket>(ResScalar(0)),
-                c2 = pset1<ResPacket>(ResScalar(0)),
-                c3 = pset1<ResPacket>(ResScalar(0)),
-                c4 = pset1<ResPacket>(ResScalar(0)),
-                c5 = pset1<ResPacket>(ResScalar(0)),
-                c6 = pset1<ResPacket>(ResScalar(0)),
-                c7 = pset1<ResPacket>(ResScalar(0));
+    alignedSize = 0;
+    alignedStart = 0;
+    alignmentPattern = NoneAligned;
+  }
+  else if(LhsPacketSize > 4)
+  {
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    // Currently, it seems to be better to perform unaligned loads anyway
+    alignmentPattern = NoneAligned;
+  }
+  else if (LhsPacketSize>1)
+  {
+  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
 
-      for(Index j=j2; j<jend; j+=1)
-      {
-        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
-        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
-        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
-        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
-        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
-        c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*4,j),b0,c4);
-        c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*5,j),b0,c5);
-        c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*6,j),b0,c6);
-        c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*7,j),b0,c7);
-      }
-      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
-      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
-      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
-      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
-      pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu<ResPacket>(res+i+ResPacketSize*4)));
-      pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu<ResPacket>(res+i+ResPacketSize*5)));
-      pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu<ResPacket>(res+i+ResPacketSize*6)));
-      pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu<ResPacket>(res+i+ResPacketSize*7)));
-    }
-    if(i<n4)
+    while (skipColumns<LhsPacketSize &&
+          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
+      ++skipColumns;
+    if (skipColumns==LhsPacketSize)
     {
-      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
-                c1 = pset1<ResPacket>(ResScalar(0)),
-                c2 = pset1<ResPacket>(ResScalar(0)),
-                c3 = pset1<ResPacket>(ResScalar(0));
+      // nothing can be aligned, no need to skip any column
+      alignmentPattern = NoneAligned;
+      skipColumns = 0;
+    }
+    else
+    {
+      skipColumns = (std::min)(skipColumns,cols);
+      // note that the skipped columns are processed later.
+    }
 
-      for(Index j=j2; j<jend; j+=1)
-      {
-        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
-        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
-        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
-        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
-        c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*3,j),b0,c3);
-      }
-      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
-      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
-      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
-      pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu<ResPacket>(res+i+ResPacketSize*3)));
+    /*    eigen_internal_assert(  (alignmentPattern==NoneAligned)
+                      || (skipColumns + columnsAtOnce >= cols)
+                      || LhsPacketSize > size
+                      || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/
+  }
+  else if(Vectorizable)
+  {
+    alignedStart = 0;
+    alignedSize = size;
+    alignmentPattern = AllAligned;
+  }
 
-      i+=ResPacketSize*4;
-    }
-    if(i<n3)
-    {
-      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
-                c1 = pset1<ResPacket>(ResScalar(0)),
-                c2 = pset1<ResPacket>(ResScalar(0));
+  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
+  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
 
-      for(Index j=j2; j<jend; j+=1)
-      {
-        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
-        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
-        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
-        c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*2,j),b0,c2);
-      }
-      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
-      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
-      pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu<ResPacket>(res+i+ResPacketSize*2)));
+  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
+  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
+  {
+    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(i, 0)),
+              ptmp1 = pset1<RhsPacket>(alpha*rhs(i+offset1, 0)),
+              ptmp2 = pset1<RhsPacket>(alpha*rhs(i+2, 0)),
+              ptmp3 = pset1<RhsPacket>(alpha*rhs(i+offset3, 0));
 
-      i+=ResPacketSize*3;
-    }
-    if(i<n2)
-    {
-      ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
-                c1 = pset1<ResPacket>(ResScalar(0));
+    // this helps a lot generating better binary code
+    const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0),   lhs1 = lhs.getVectorMapper(0, i+offset1),
+                     lhs2 = lhs.getVectorMapper(0, i+2),   lhs3 = lhs.getVectorMapper(0, i+offset3);
 
-      for(Index j=j2; j<jend; j+=1)
-      {
-        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
-        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*0,j),b0,c0);
-        c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+LhsPacketSize*1,j),b0,c1);
-      }
-      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
-      pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu<ResPacket>(res+i+ResPacketSize*1)));
-      i+=ResPacketSize*2;
-    }
-    if(i<n1)
+    if (Vectorizable)
     {
-      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
-      for(Index j=j2; j<jend; j+=1)
+      /* explicit vectorization */
+      // process initial unaligned coeffs
+      for (Index j=0; j<alignedStart; ++j)
       {
-        RhsPacket b0 = pset1<RhsPacket>(rhs(j,0));
-        c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
+        res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+        res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+        res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+        res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
       }
-      pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu<ResPacket>(res+i+ResPacketSize*0)));
-      i+=ResPacketSize;
-    }
-    if(HasHalf && i<n_half)
-    {
-      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
-      for(Index j=j2; j<jend; j+=1)
+
+      if (alignedSize>alignedStart)
       {
-        RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j,0));
-        c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i+0,j),b0,c0);
+        switch(alignmentPattern)
+        {
+          case AllAligned:
+            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
+            break;
+          case EvenAligned:
+            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
+            break;
+          case FirstAligned:
+          {
+            Index j = alignedStart;
+            if(peels>1)
+            {
+              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
+              ResPacket T0, T1;
+
+              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
+
+              for (; j<peeledSize; j+=peels*ResPacketSize)
+              {
+                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
+                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
+                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
+
+                A00 = lhs0.template load<LhsPacket, Aligned>(j);
+                A10 = lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize);
+                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
+                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
+
+                T0  = pcj.pmadd(A01, ptmp1, T0);
+                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
+                T0  = pcj.pmadd(A02, ptmp2, T0);
+                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
+                T0  = pcj.pmadd(A03, ptmp3, T0);
+                pstore(&res[j],T0);
+                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
+                T1  = pcj.pmadd(A11, ptmp1, T1);
+                T1  = pcj.pmadd(A12, ptmp2, T1);
+                T1  = pcj.pmadd(A13, ptmp3, T1);
+                pstore(&res[j+ResPacketSize],T1);
+              }
+            }
+            for (; j<alignedSize; j+=ResPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
+            break;
+          }
+          default:
+            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
+            break;
+        }
       }
-      pstoreu(res+i+ResPacketSizeHalf*0, pmadd(c0,palpha_half,ploadu<ResPacketHalf>(res+i+ResPacketSizeHalf*0)));
-      i+=ResPacketSizeHalf;
-    }
-    if(HasQuarter && i<n_quarter)
+    } // end explicit vectorization
+
+    /* process remaining coeffs (or all if there is no explicit vectorization) */
+    for (Index j=alignedSize; j<size; ++j)
     {
-      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
-      for(Index j=j2; j<jend; j+=1)
-      {
-        RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j,0));
-        c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i+0,j),b0,c0);
-      }
-      pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu<ResPacketQuarter>(res+i+ResPacketSizeQuarter*0)));
-      i+=ResPacketSizeQuarter;
-    }
-    for(;i<rows;++i)
-    {
-      ResScalar c0(0);
-      for(Index j=j2; j<jend; j+=1)
-        c0 += cj.pmul(lhs(i,j), rhs(j,0));
-      res[i] += alpha*c0;
+      res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]);
+      res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]);
+      res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]);
+      res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]);
     }
   }
+
+  // process remaining first and last columns (at most columnsAtOnce-1)
+  Index end = cols;
+  Index start = columnBound;
+  do
+  {
+    for (Index k=start; k<end; ++k)
+    {
+      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs(k, 0));
+      const LhsScalars lhs0 = lhs.getVectorMapper(0, k);
+
+      if (Vectorizable)
+      {
+        /* explicit vectorization */
+        // process first unaligned result's coeffs
+        for (Index j=0; j<alignedStart; ++j)
+          res[j] += cj.pmul(lhs0(j), pfirst(ptmp0));
+        // process aligned result's coeffs
+        if (lhs0.template aligned<LhsPacket>(alignedStart))
+          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
+            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(i), ptmp0, pload<ResPacket>(&res[i])));
+        else
+          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
+            pstore(&res[i], pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(i), ptmp0, pload<ResPacket>(&res[i])));
+      }
+
+      // process remaining scalars (or all if no explicit vectorization)
+      for (Index i=alignedSize; i<size; ++i)
+        res[i] += cj.pmul(lhs0(i), pfirst(ptmp0));
+    }
+    if (skipColumns)
+    {
+      start = 0;
+      end = skipColumns;
+      skipColumns = 0;
+    }
+    else
+      break;
+  } while(Vectorizable);
+  #undef _EIGEN_ACCUMULATE_PACKETS
 }
 
 /* Optimized row-major matrix * vector product:
- * This algorithm processes 4 rows at once that allows to both reduce
+ * This algorithm processes 4 rows at once that allows to both reduce
  * the number of load/stores of the result by a factor 4 and to reduce
  * the instruction dependency. Moreover, we know that all bands have the
  * same alignment pattern.
@@ -297,25 +334,25 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,Lhs
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
 struct general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>
 {
-  typedef gemv_traits<LhsScalar,RhsScalar> Traits;
-  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketHalf> HalfTraits;
-  typedef gemv_traits<LhsScalar,RhsScalar,GEMVPacketQuarter> QuarterTraits;
+typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
 
-  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+enum {
+  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
+              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
+  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
+  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
+  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
+};
 
-  typedef typename Traits::LhsPacket LhsPacket;
-  typedef typename Traits::RhsPacket RhsPacket;
-  typedef typename Traits::ResPacket ResPacket;
+typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
+typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
+typedef typename packet_traits<ResScalar>::type  _ResPacket;
 
-  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
-  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
-  typedef typename HalfTraits::ResPacket ResPacketHalf;
+typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
+typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
+typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
 
-  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
-  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
-  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
-
-EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
+EIGEN_DONT_INLINE static void run(
   Index rows, Index cols,
   const LhsMapper& lhs,
   const RhsMapper& rhs,
@@ -324,191 +361,255 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(
 };
 
 template<typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version>
-EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
+EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,LhsMapper,RowMajor,ConjugateLhs,RhsScalar,RhsMapper,ConjugateRhs,Version>::run(
   Index rows, Index cols,
-  const LhsMapper& alhs,
+  const LhsMapper& lhs,
   const RhsMapper& rhs,
   ResScalar* res, Index resIncr,
   ResScalar alpha)
 {
-  // The following copy tells the compiler that lhs's attributes are not modified outside this function
-  // This helps GCC to generate propoer code.
-  LhsMapper lhs(alhs);
-
   eigen_internal_assert(rhs.stride()==1);
+
+  #ifdef _EIGEN_ACCUMULATE_PACKETS
+  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
+  #endif
+
+  #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\
+    RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);  \
+    ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Alignment0>(j), b, ptmp0); \
+    ptmp1 = pcj.pmadd(lhs1.template load<LhsPacket, Alignment13>(j), b, ptmp1); \
+    ptmp2 = pcj.pmadd(lhs2.template load<LhsPacket, Alignment2>(j), b, ptmp2); \
+    ptmp3 = pcj.pmadd(lhs3.template load<LhsPacket, Alignment13>(j), b, ptmp3); }
+
   conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
   conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-  conj_helper<LhsPacketHalf,RhsPacketHalf,ConjugateLhs,ConjugateRhs> pcj_half;
-  conj_helper<LhsPacketQuarter,RhsPacketQuarter,ConjugateLhs,ConjugateRhs> pcj_quarter;
 
-  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
-  //       processing 8 rows at once might be counter productive wrt cache.
-  const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7;
-  const Index n4 = rows-3;
-  const Index n2 = rows-1;
+  typedef typename LhsMapper::VectorMapper LhsScalars;
 
-  // TODO: for padded aligned inputs, we could enable aligned reads
-  enum { LhsAlignment = Unaligned,
-         ResPacketSize = Traits::ResPacketSize,
-         ResPacketSizeHalf = HalfTraits::ResPacketSize,
-         ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
-         LhsPacketSize = Traits::LhsPacketSize,
-         LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
-         LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
-         HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
-         HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
-  };
+  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
+  const Index rowsAtOnce = 4;
+  const Index peels = 2;
+  const Index RhsPacketAlignedMask = RhsPacketSize-1;
+  const Index LhsPacketAlignedMask = LhsPacketSize-1;
+  const Index depth = cols;
+  const Index lhsStride = lhs.stride();
 
-  Index i=0;
-  for(; i<n8; i+=8)
+  // How many coeffs of the result do we have to skip to be aligned.
+  // Here we assume data are at least aligned on the base scalar type
+  // if that's not the case then vectorization is discarded, see below.
+  Index alignedStart = rhs.firstAligned(depth);
+  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
+  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
+
+  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
+  Index alignmentPattern = alignmentStep==0 ? AllAligned
+                           : alignmentStep==(LhsPacketSize/2) ? EvenAligned
+                           : FirstAligned;
+
+  // we cannot assume the first element is aligned because of sub-matrices
+  const Index lhsAlignmentOffset = lhs.firstAligned(depth);
+  const Index rhsAlignmentOffset = rhs.firstAligned(rows);
+
+  // find how many rows do we have to skip to be aligned with rhs (if possible)
+  Index skipRows = 0;
+  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
+  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) ||
+      (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) ||
+      (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) )
   {
-    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
-              c1 = pset1<ResPacket>(ResScalar(0)),
-              c2 = pset1<ResPacket>(ResScalar(0)),
-              c3 = pset1<ResPacket>(ResScalar(0)),
-              c4 = pset1<ResPacket>(ResScalar(0)),
-              c5 = pset1<ResPacket>(ResScalar(0)),
-              c6 = pset1<ResPacket>(ResScalar(0)),
-              c7 = pset1<ResPacket>(ResScalar(0));
-
-    Index j=0;
-    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
-    {
-      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
-
-      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
-      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
-      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
-      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
-      c4 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+4,j),b0,c4);
-      c5 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+5,j),b0,c5);
-      c6 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+6,j),b0,c6);
-      c7 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+7,j),b0,c7);
-    }
-    ResScalar cc0 = predux(c0);
-    ResScalar cc1 = predux(c1);
-    ResScalar cc2 = predux(c2);
-    ResScalar cc3 = predux(c3);
-    ResScalar cc4 = predux(c4);
-    ResScalar cc5 = predux(c5);
-    ResScalar cc6 = predux(c6);
-    ResScalar cc7 = predux(c7);
-    for(; j<cols; ++j)
-    {
-      RhsScalar b0 = rhs(j,0);
-
-      cc0 += cj.pmul(lhs(i+0,j), b0);
-      cc1 += cj.pmul(lhs(i+1,j), b0);
-      cc2 += cj.pmul(lhs(i+2,j), b0);
-      cc3 += cj.pmul(lhs(i+3,j), b0);
-      cc4 += cj.pmul(lhs(i+4,j), b0);
-      cc5 += cj.pmul(lhs(i+5,j), b0);
-      cc6 += cj.pmul(lhs(i+6,j), b0);
-      cc7 += cj.pmul(lhs(i+7,j), b0);
-    }
-    res[(i+0)*resIncr] += alpha*cc0;
-    res[(i+1)*resIncr] += alpha*cc1;
-    res[(i+2)*resIncr] += alpha*cc2;
-    res[(i+3)*resIncr] += alpha*cc3;
-    res[(i+4)*resIncr] += alpha*cc4;
-    res[(i+5)*resIncr] += alpha*cc5;
-    res[(i+6)*resIncr] += alpha*cc6;
-    res[(i+7)*resIncr] += alpha*cc7;
+    alignedSize = 0;
+    alignedStart = 0;
+    alignmentPattern = NoneAligned;
   }
-  for(; i<n4; i+=4)
+  else if(LhsPacketSize > 4)
   {
-    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
-              c1 = pset1<ResPacket>(ResScalar(0)),
-              c2 = pset1<ResPacket>(ResScalar(0)),
-              c3 = pset1<ResPacket>(ResScalar(0));
-
-    Index j=0;
-    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
-    {
-      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
-
-      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
-      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
-      c2 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+2,j),b0,c2);
-      c3 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+3,j),b0,c3);
-    }
-    ResScalar cc0 = predux(c0);
-    ResScalar cc1 = predux(c1);
-    ResScalar cc2 = predux(c2);
-    ResScalar cc3 = predux(c3);
-    for(; j<cols; ++j)
-    {
-      RhsScalar b0 = rhs(j,0);
-
-      cc0 += cj.pmul(lhs(i+0,j), b0);
-      cc1 += cj.pmul(lhs(i+1,j), b0);
-      cc2 += cj.pmul(lhs(i+2,j), b0);
-      cc3 += cj.pmul(lhs(i+3,j), b0);
-    }
-    res[(i+0)*resIncr] += alpha*cc0;
-    res[(i+1)*resIncr] += alpha*cc1;
-    res[(i+2)*resIncr] += alpha*cc2;
-    res[(i+3)*resIncr] += alpha*cc3;
+    // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4.
+    alignmentPattern = NoneAligned;
   }
-  for(; i<n2; i+=2)
+  else if (LhsPacketSize>1)
   {
-    ResPacket c0 = pset1<ResPacket>(ResScalar(0)),
-              c1 = pset1<ResPacket>(ResScalar(0));
+  //    eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);
 
-    Index j=0;
-    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    while (skipRows<LhsPacketSize &&
+           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
+      ++skipRows;
+    if (skipRows==LhsPacketSize)
     {
-      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j,0);
-
-      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+0,j),b0,c0);
-      c1 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i+1,j),b0,c1);
+      // nothing can be aligned, no need to skip any row
+      alignmentPattern = NoneAligned;
+      skipRows = 0;
     }
-    ResScalar cc0 = predux(c0);
-    ResScalar cc1 = predux(c1);
-    for(; j<cols; ++j)
+    else
     {
-      RhsScalar b0 = rhs(j,0);
-
-      cc0 += cj.pmul(lhs(i+0,j), b0);
-      cc1 += cj.pmul(lhs(i+1,j), b0);
+      skipRows = (std::min)(skipRows,Index(rows));
+      // note that the skipped rows are processed later.
     }
-    res[(i+0)*resIncr] += alpha*cc0;
-    res[(i+1)*resIncr] += alpha*cc1;
+    /*    eigen_internal_assert(  alignmentPattern==NoneAligned
+                      || LhsPacketSize==1
+                      || (skipRows + rowsAtOnce >= rows)
+                      || LhsPacketSize > depth
+                      || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/
   }
-  for(; i<rows; ++i)
+  else if(Vectorizable)
   {
-    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
-    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
-    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
-    Index j=0;
-    for(; j+LhsPacketSize<=cols; j+=LhsPacketSize)
+    alignedStart = 0;
+    alignedSize = depth;
+    alignmentPattern = AllAligned;
+  }
+
+  const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1;
+  const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3;
+
+  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
+  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
+  {
+    // FIXME: what is the purpose of this EIGEN_ALIGN_MAX ??
+    EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
+    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
+
+    // this helps the compiler generating good binary code
+    const LhsScalars lhs0 = lhs.getVectorMapper(i+0, 0),    lhs1 = lhs.getVectorMapper(i+offset1, 0),
+                     lhs2 = lhs.getVectorMapper(i+2, 0),    lhs3 = lhs.getVectorMapper(i+offset3, 0);
+
+    if (Vectorizable)
     {
-      RhsPacket b0 = rhs.template load<RhsPacket,Unaligned>(j,0);
-      c0 = pcj.pmadd(lhs.template load<LhsPacket,LhsAlignment>(i,j),b0,c0);
-    }
-    ResScalar cc0 = predux(c0);
-    if (HasHalf) {
-      for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf)
+      /* explicit vectorization */
+      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
+                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
+
+      // process initial unaligned coeffs
+      // FIXME this loop get vectorized by the compiler !
+      for (Index j=0; j<alignedStart; ++j)
+      {
+        RhsScalar b = rhs(j, 0);
+        tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+        tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
+      }
+
+      if (alignedSize>alignedStart)
+      {
+        switch(alignmentPattern)
         {
-          RhsPacketHalf b0 = rhs.template load<RhsPacketHalf,Unaligned>(j,0);
-          c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf,LhsAlignment>(i,j),b0,c0_h);
+          case AllAligned:
+            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Aligned,Aligned);
+            break;
+          case EvenAligned:
+            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Aligned);
+            break;
+          case FirstAligned:
+          {
+            Index j = alignedStart;
+            if (peels>1)
+            {
+              /* Here we process 4 rows with two peeled iterations to hide
+               * the overhead of unaligned loads. Moreover unaligned loads are handled
+               * using special shift/move operations between the two aligned packets
+               * overlapping the desired unaligned packet. This is *much* more efficient
+               * than basic unaligned loads.
+               */
+              LhsPacket A01, A02, A03, A11, A12, A13;
+              A01 = lhs1.template load<LhsPacket, Aligned>(alignedStart-1);
+              A02 = lhs2.template load<LhsPacket, Aligned>(alignedStart-2);
+              A03 = lhs3.template load<LhsPacket, Aligned>(alignedStart-3);
+
+              for (; j<peeledSize; j+=peels*RhsPacketSize)
+              {
+                RhsPacket b = rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0);
+                A11 = lhs1.template load<LhsPacket, Aligned>(j-1+LhsPacketSize);  palign<1>(A01,A11);
+                A12 = lhs2.template load<LhsPacket, Aligned>(j-2+LhsPacketSize);  palign<2>(A02,A12);
+                A13 = lhs3.template load<LhsPacket, Aligned>(j-3+LhsPacketSize);  palign<3>(A03,A13);
+
+                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), b, ptmp0);
+                ptmp1 = pcj.pmadd(A01, b, ptmp1);
+                A01 = lhs1.template load<LhsPacket, Aligned>(j-1+2*LhsPacketSize);  palign<1>(A11,A01);
+                ptmp2 = pcj.pmadd(A02, b, ptmp2);
+                A02 = lhs2.template load<LhsPacket, Aligned>(j-2+2*LhsPacketSize);  palign<2>(A12,A02);
+                ptmp3 = pcj.pmadd(A03, b, ptmp3);
+                A03 = lhs3.template load<LhsPacket, Aligned>(j-3+2*LhsPacketSize);  palign<3>(A13,A03);
+
+                b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load<RhsPacket, Aligned>(0);
+                ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j+LhsPacketSize), b, ptmp0);
+                ptmp1 = pcj.pmadd(A11, b, ptmp1);
+                ptmp2 = pcj.pmadd(A12, b, ptmp2);
+                ptmp3 = pcj.pmadd(A13, b, ptmp3);
+              }
+            }
+            for (; j<alignedSize; j+=RhsPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(Aligned,Unaligned,Unaligned);
+            break;
+          }
+          default:
+            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
+              _EIGEN_ACCUMULATE_PACKETS(Unaligned,Unaligned,Unaligned);
+            break;
         }
-      cc0 += predux(c0_h);
-    }
-    if (HasQuarter) {
-      for(; j+LhsPacketSizeQuarter<=cols; j+=LhsPacketSizeQuarter)
-        {
-          RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter,Unaligned>(j,0);
-          c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter,LhsAlignment>(i,j),b0,c0_q);
-        }
-      cc0 += predux(c0_q);
-    }
-    for(; j<cols; ++j)
+        tmp0 += predux(ptmp0);
+        tmp1 += predux(ptmp1);
+        tmp2 += predux(ptmp2);
+        tmp3 += predux(ptmp3);
+      }
+    } // end explicit vectorization
+
+    // process remaining coeffs (or all if no explicit vectorization)
+    // FIXME this loop get vectorized by the compiler !
+    for (Index j=alignedSize; j<depth; ++j)
     {
-      cc0 += cj.pmul(lhs(i,j), rhs(j,0));
+      RhsScalar b = rhs(j, 0);
+      tmp0 += cj.pmul(lhs0(j),b); tmp1 += cj.pmul(lhs1(j),b);
+      tmp2 += cj.pmul(lhs2(j),b); tmp3 += cj.pmul(lhs3(j),b);
     }
-    res[i*resIncr] += alpha*cc0;
+    res[i*resIncr]            += alpha*tmp0;
+    res[(i+offset1)*resIncr]  += alpha*tmp1;
+    res[(i+2)*resIncr]        += alpha*tmp2;
+    res[(i+offset3)*resIncr]  += alpha*tmp3;
   }
+
+  // process remaining first and last rows (at most rowsAtOnce-1)
+  Index end = rows;
+  Index start = rowBound;
+  do
+  {
+    for (Index i=start; i<end; ++i)
+    {
+      EIGEN_ALIGN_MAX ResScalar tmp0 = ResScalar(0);
+      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
+      const LhsScalars lhs0 = lhs.getVectorMapper(i, 0);
+      // process first unaligned result's coeffs
+      // FIXME this loop get vectorized by the compiler !
+      for (Index j=0; j<alignedStart; ++j)
+        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
+
+      if (alignedSize>alignedStart)
+      {
+        // process aligned rhs coeffs
+        if (lhs0.template aligned<LhsPacket>(alignedStart))
+          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
+            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Aligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
+        else
+          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
+            ptmp0 = pcj.pmadd(lhs0.template load<LhsPacket, Unaligned>(j), rhs.getVectorMapper(j, 0).template load<RhsPacket, Aligned>(0), ptmp0);
+        tmp0 += predux(ptmp0);
+      }
+
+      // process remaining scalars
+      // FIXME this loop get vectorized by the compiler !
+      for (Index j=alignedSize; j<depth; ++j)
+        tmp0 += cj.pmul(lhs0(j), rhs(j, 0));
+      res[i*resIncr] += alpha*tmp0;
+    }
+    if (skipRows)
+    {
+      start = 0;
+      end = skipRows;
+      skipRows = 0;
+    }
+    else
+      break;
+  } while(Vectorizable);
+
+  #undef _EIGEN_ACCUMULATE_PACKETS
 }
 
 } // end namespace internal
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/Parallelizer.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/Parallelizer.h
index 3bdd30e5a..67b2442b5 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/Parallelizer.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/Parallelizer.h
@@ -10,10 +10,6 @@
 #ifndef EIGEN_PARALLELIZER_H
 #define EIGEN_PARALLELIZER_H
 
-#if EIGEN_HAS_CXX11_ATOMIC
-#include <atomic>
-#endif
-
 namespace Eigen {
 
 namespace internal {
@@ -80,17 +76,8 @@ template<typename Index> struct GemmParallelInfo
 {
   GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
 
-  // volatile is not enough on all architectures (see bug 1572)
-  // to guarantee that when thread A says to thread B that it is
-  // done with packing a block, then all writes have been really
-  // carried out... C++11 memory model+atomic guarantees this.
-#if EIGEN_HAS_CXX11_ATOMIC
-  std::atomic<Index> sync;
-  std::atomic<int> users;
-#else
   Index volatile sync;
   int volatile users;
-#endif
 
   Index lhs_start;
   Index lhs_length;
@@ -101,14 +88,11 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
 {
   // TODO when EIGEN_USE_BLAS is defined,
   // we should still enable OMP for other scalar types
-  // Without C++11, we have to disable GEMM's parallelization on
-  // non x86 architectures because there volatile is not enough for our purpose.
-  // See bug 1572.
-#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64))
+#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS)
   // FIXME the transpose variable is only needed to properly split
   // the matrix product when multithreading is enabled. This is a temporary
   // fix to support row-major destination matrices. This whole
-  // parallelizer mechanism has to be redesigned anyway.
+  // parallelizer mechanism has to be redesigned anyway.
   EIGEN_UNUSED_VARIABLE(depth);
   EIGEN_UNUSED_VARIABLE(transpose);
   func(0,rows, 0,cols);
@@ -129,12 +113,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
   double work = static_cast<double>(rows) * static_cast<double>(cols) *
       static_cast<double>(depth);
   double kMinTaskSize = 50000;  // FIXME improve this heuristic.
-  pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>( work / kMinTaskSize ) ));
+  pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, work / kMinTaskSize));
 
   // compute the number of threads we are going to use
   Index threads = std::min<Index>(nbThreads(), pb_max_threads);
 
-  // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session,
+  // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session,
   // then abort multi-threading
   // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
   if((!Condition) || (threads==1) || (omp_get_num_threads()>1))
@@ -148,7 +132,8 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
 
   ei_declare_aligned_stack_constructed_variable(GemmParallelInfo<Index>,info,threads,0);
 
-  #pragma omp parallel num_threads(threads)
+  int errorCount = 0;
+  #pragma omp parallel num_threads(threads) reduction(+: errorCount)
   {
     Index i = omp_get_thread_num();
     // Note that the actual number of threads might be lower than the number of request ones.
@@ -167,9 +152,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth,
     info[i].lhs_start = r0;
     info[i].lhs_length = actualBlockRows;
 
-    if(transpose) func(c0, actualBlockCols, 0, rows, info);
-    else          func(0, rows, c0, actualBlockCols, info);
+    EIGEN_TRY {
+      if(transpose) func(c0, actualBlockCols, 0, rows, info);
+      else          func(0, rows, c0, actualBlockCols, info);
+    } EIGEN_CATCH(...) {
+      ++errorCount;
+    }
   }
+  if (errorCount) EIGEN_THROW_X(Eigen::eigen_assert_exception());
 #endif
 }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 33ecf10f6..04c933480 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -45,23 +45,14 @@ struct symm_pack_lhs
   }
   void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
   {
-    typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
-    typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half QuarterPacket;
-    enum { PacketSize = packet_traits<Scalar>::size,
-           HalfPacketSize = unpacket_traits<HalfPacket>::size,
-           QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
-           HasHalf = (int)HalfPacketSize < (int)PacketSize,
-           HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize};
-
+    enum { PacketSize = packet_traits<Scalar>::size };
     const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
     Index count = 0;
     //Index peeled_mc3 = (rows/Pack1)*Pack1;
     
     const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0;
     const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0;
-    const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0;
-    const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0;
-    const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0;
+    const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0;
     
     if(Pack1>=3*PacketSize)
       for(Index i=0; i<peeled_mc3; i+=3*PacketSize)
@@ -75,16 +66,8 @@ struct symm_pack_lhs
       for(Index i=peeled_mc2; i<peeled_mc1; i+=1*PacketSize)
         pack<1*PacketSize>(blockA, lhs, cols, i, count);
 
-    if(HasHalf && Pack1>=HalfPacketSize)
-      for(Index i=peeled_mc1; i<peeled_mc_half; i+=HalfPacketSize)
-        pack<HalfPacketSize>(blockA, lhs, cols, i, count);
-
-    if(HasQuarter && Pack1>=QuarterPacketSize)
-      for(Index i=peeled_mc_half; i<peeled_mc_quarter; i+=QuarterPacketSize)
-        pack<QuarterPacketSize>(blockA, lhs, cols, i, count);
-
     // do the same with mr==1
-    for(Index i=peeled_mc_quarter; i<rows; i++)
+    for(Index i=peeled_mc1; i<rows; i++)
     {
       for(Index k=0; k<i; k++)
         blockA[count++] = lhs(i, k);                   // normal
@@ -372,7 +355,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
     symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
+    gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
 
     for(Index k2=0; k2<size; k2+=kc)
     {
@@ -407,7 +390,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,t
       for(Index i2=k2+kc; i2<size; i2+=mc)
       {
         const Index actual_mc = (std::min)(i2+mc,size)-i2;
-        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
+        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
           (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
         gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
@@ -459,7 +442,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,f
     ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
 
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
 
     for(Index k2=0; k2<size; k2+=kc)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h
index d38fd72b2..3fd180e6c 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -15,7 +15,7 @@ namespace Eigen {
 namespace internal {
 
 /* Optimized selfadjoint matrix * vector product:
- * This algorithm processes 2 columns at once that allows to both reduce
+ * This algorithm processes 2 columns at once that allows to both reduce
  * the number of load/stores of the result by a factor 2 and to reduce
  * the instruction dependency.
  */
@@ -27,8 +27,7 @@ template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool Conju
 struct selfadjoint_matrix_vector_product
 
 {
-static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
-void run(
+static EIGEN_DONT_INLINE void run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
   const Scalar*  rhs,
@@ -37,8 +36,7 @@ void run(
 };
 
 template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC
-void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
+EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
   Index size,
   const Scalar*  lhs, Index lhsStride,
   const Scalar*  rhs,
@@ -64,7 +62,8 @@ void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateL
 
   Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
 
-  Index bound = numext::maxi(Index(0), size-8) & 0xfffffffe;
+
+  Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
   if (FirstTriangular)
     bound = size - bound;
 
@@ -176,8 +175,7 @@ struct selfadjoint_product_impl<Lhs,LhsMode,false,Rhs,0,true>
   enum { LhsUpLo = LhsMode&(Upper|Lower) };
 
   template<typename Dest>
-  static EIGEN_DEVICE_FUNC
-  void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
+  static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha)
   {
     typedef typename Dest::Scalar ResScalar;
     typedef typename Rhs::Scalar RhsScalar;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointProduct.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointProduct.h
index 61e8894e7..ef12c98f6 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointProduct.h
@@ -120,7 +120,7 @@ struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename DerivedU>
-EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 ::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
 {
   selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointRank2Update.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointRank2Update.h
index 09209f733..2ae364111 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -24,8 +24,7 @@ struct selfadjoint_rank2_update_selector;
 template<typename Scalar, typename Index, typename UType, typename VType>
 struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
 {
-  static EIGEN_DEVICE_FUNC
-  void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
+  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
   {
     const Index size = u.size();
     for (Index i=0; i<size; ++i)
@@ -58,7 +57,7 @@ template<bool Cond, typename T> struct conj_expr_if
 
 template<typename MatrixType, unsigned int UpLo>
 template<typename DerivedU, typename DerivedV>
-EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
+SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
 ::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
 {
   typedef internal::blas_traits<DerivedU> UBlasTraits;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h
index f0c60507a..2fb408d1d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -155,7 +155,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
       triangularBuffer.diagonal().setOnes();
 
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
 
     for(Index k2=IsLower ? depth : 0;
@@ -226,7 +226,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
         for(Index i2=start; i2<end; i2+=mc)
         {
           const Index actual_mc = (std::min)(i2+mc,end)-i2;
-          gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder,false>()
+          gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
             (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
 
           gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc,
@@ -305,7 +305,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
       triangularBuffer.diagonal().setOnes();
 
     gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, LhsStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder> pack_rhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
index 0dcf3bb52..e3ed2cd19 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conju
 
     conj_if<Conjugate> conj;
     gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, TriStorageOrder> pack_lhs;
+    gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
     gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
 
     // the goal here is to subdivise the Rhs panels such that we keep some cache
@@ -229,7 +229,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conj
     gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
     gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder,false,true> pack_rhs_panel;
-    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor, false, true> pack_lhs_panel;
+    gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
 
     for(Index k2=IsLower ? size : 0;
         IsLower ? k2>0 : k2<size;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverVector.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverVector.h
index 647317016..b994759b2 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverVector.h
@@ -58,7 +58,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
       {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slightly faster at runtime
+        // 2 - it is slightly faster at runtime
         Index startRow = IsLower ? pi : pi-actualPanelWidth;
         Index startCol = IsLower ? 0 : pi;
 
@@ -77,7 +77,7 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
         if (k>0)
           rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
 
-        if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0)))
+        if(!(Mode & UnitDiag))
           rhs[i] /= cjLhs(i,i);
       }
     }
@@ -114,23 +114,20 @@ struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Con
       for(Index k=0; k<actualPanelWidth; ++k)
       {
         Index i = IsLower ? pi+k : pi-k-1;
-        if(numext::not_equal_strict(rhs[i],RhsScalar(0)))
-        {
-          if(!(Mode & UnitDiag))
-            rhs[i] /= cjLhs.coeff(i,i);
+        if(!(Mode & UnitDiag))
+          rhs[i] /= cjLhs.coeff(i,i);
 
-          Index r = actualPanelWidth - k - 1; // remaining size
-          Index s = IsLower ? i+1 : i-r;
-          if (r>0)
-            Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
-        }
+        Index r = actualPanelWidth - k - 1; // remaining size
+        Index s = IsLower ? i+1 : i-r;
+        if (r>0)
+          Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
       }
       Index r = IsLower ? size - endBlock : startBlock; // remaining size
       if (r > 0)
       {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slightly faster at runtime
+        // 2 - it is slightly faster at runtime
         general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,Conjugate,RhsScalar,RhsMapper,false>::run(
             r, actualPanelWidth,
             LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride),
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/BlasUtil.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/BlasUtil.h
index 643558cba..3dff9bc9b 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/BlasUtil.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/BlasUtil.h
@@ -24,7 +24,7 @@ struct gebp_kernel;
 template<typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
 struct gemm_pack_rhs;
 
-template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template<typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
 struct gemm_pack_lhs;
 
 template<
@@ -159,9 +159,11 @@ template<typename Scalar, typename Index, int AlignmentType, int Incr=1>
 class BlasLinearMapper;
 
 template<typename Scalar, typename Index, int AlignmentType>
-class BlasLinearMapper<Scalar,Index,AlignmentType>
-{
-public:
+class BlasLinearMapper<Scalar,Index,AlignmentType,1> {
+  public:
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1)
     : m_data(data)
   {
@@ -177,17 +179,19 @@ public:
     return m_data[i];
   }
 
-  template<typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
-    return ploadt<PacketType, AlignmentType>(m_data + i);
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+    return ploadt<Packet, AlignmentType>(m_data + i);
   }
 
-  template<typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const {
-    pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
+    return ploadt<HalfPacket, AlignmentType>(m_data + i);
   }
 
-protected:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const {
+    pstoret<Scalar, Packet, AlignmentType>(m_data + i, p);
+  }
+
+  protected:
   Scalar *m_data;
 };
 
@@ -199,6 +203,9 @@ template<typename Scalar, typename Index, int StorageOrder, int AlignmentType>
 class blas_data_mapper<Scalar,Index,StorageOrder,AlignmentType,1>
 {
 public:
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+
   typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
   typedef BlasVectorMapper<Scalar, Index> VectorMapper;
 
@@ -228,14 +235,12 @@ public:
     return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride];
   }
 
-  template<typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
-    return ploadt<PacketType, AlignmentType>(&operator()(i, j));
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+    return ploadt<Packet, AlignmentType>(&operator()(i, j));
   }
 
-  template <typename PacketT, int AlignmentT>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
-    return ploadt<PacketT, AlignmentT>(&operator()(i, j));
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
+    return ploadt<HalfPacket, AlignmentType>(&operator()(i, j));
   }
 
   template<typename SubPacket>
@@ -258,7 +263,7 @@ public:
     return internal::first_default_aligned(m_data, size);
   }
 
-protected:
+  protected:
   Scalar* EIGEN_RESTRICT m_data;
   const Index m_stride;
 };
@@ -270,6 +275,9 @@ template<typename Scalar, typename Index, int AlignmentType, int Incr>
 class BlasLinearMapper
 {
 public:
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {}
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const {
@@ -280,9 +288,8 @@ public:
     return m_data[i*m_incr.value()];
   }
 
-  template<typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
-    return pgather<Scalar,PacketType>(m_data + i*m_incr.value(), m_incr.value());
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
+    return pgather<Scalar,Packet>(m_data + i*m_incr.value(), m_incr.value());
   }
 
   template<typename PacketType>
@@ -299,6 +306,9 @@ template<typename Scalar, typename Index, int StorageOrder, int AlignmentType,in
 class blas_data_mapper
 {
 public:
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename packet_traits<Scalar>::half HalfPacket;
+
   typedef BlasLinearMapper<Scalar, Index, AlignmentType,Incr> LinearMapper;
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {}
@@ -317,9 +327,8 @@ public:
     return m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride];
   }
 
-  template<typename PacketType>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
-    return pgather<Scalar,PacketType>(&operator()(i, j),m_incr.value());
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+    return pgather<Scalar,Packet>(&operator()(i, j),m_incr.value());
   }
 
   template <typename PacketT, int AlignmentT>
@@ -370,15 +379,14 @@ template<typename XprType> struct blas_traits
     HasUsableDirectAccess = (    (int(XprType::Flags)&DirectAccessBit)
                               && (   bool(XprType::IsVectorAtCompileTime)
                                   || int(inner_stride_at_compile_time<XprType>::ret) == 1)
-                             ) ?  1 : 0,
-    HasScalarFactor = false
+                             ) ?  1 : 0
   };
   typedef typename conditional<bool(HasUsableDirectAccess),
     ExtractType,
     typename _ExtractType::PlainObject
     >::type DirectLinearAccessType;
-  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
-  static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+  static inline ExtractType extract(const XprType& x) { return x; }
+  static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
 };
 
 // pop conjugate
@@ -403,23 +411,17 @@ template<typename Scalar, typename NestedXpr, typename Plain>
 struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> >
  : blas_traits<NestedXpr>
 {
-  enum {
-    HasScalarFactor = true
-  };
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
-  static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
-  static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x)
+  static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); }
+  static inline Scalar extractScalarFactor(const XprType& x)
   { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); }
 };
 template<typename Scalar, typename NestedXpr, typename Plain>
 struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > >
  : blas_traits<NestedXpr>
 {
-  enum {
-    HasScalarFactor = true
-  };
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>,Plain> > XprType;
   typedef typename Base::ExtractType ExtractType;
@@ -438,9 +440,6 @@ template<typename Scalar, typename NestedXpr>
 struct blas_traits<CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> >
  : blas_traits<NestedXpr>
 {
-  enum {
-    HasScalarFactor = true
-  };
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ConfigureVectorization.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/ConfigureVectorization.h
deleted file mode 100644
index 952abc306..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ConfigureVectorization.h
+++ /dev/null
@@ -1,483 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CONFIGURE_VECTORIZATION_H
-#define EIGEN_CONFIGURE_VECTORIZATION_H
-
-//------------------------------------------------------------------------------------------
-// Static and dynamic alignment control
-//
-// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
-// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
-// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
-// a default value is automatically computed based on architecture, compiler, and OS.
-//
-// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
-// to be used to declare statically aligned buffers.
-//------------------------------------------------------------------------------------------
-
-
-/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
- * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
- * so that vectorization doesn't affect binary compatibility.
- *
- * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
- * vectorized and non-vectorized code.
- * 
- * FIXME: this code can be cleaned up once we switch to proper C++11 only.
- */
-#if (defined EIGEN_CUDACC)
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
-  #define EIGEN_ALIGNOF(x) __alignof(x)
-#elif EIGEN_HAS_ALIGNAS
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n)
-  #define EIGEN_ALIGNOF(x) alignof(x)
-#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-  #define EIGEN_ALIGNOF(x) __alignof(x)
-#elif EIGEN_COMP_MSVC
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
-  #define EIGEN_ALIGNOF(x) __alignof(x)
-#elif EIGEN_COMP_SUNCC
-  // FIXME not sure about this one:
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-  #define EIGEN_ALIGNOF(x) __alignof(x)
-#else
-  #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler
-#endif
-
-// If the user explicitly disable vectorization, then we also disable alignment
-#if defined(EIGEN_DONT_VECTORIZE)
-  #if defined(EIGEN_GPUCC)
-    // GPU code is always vectorized and requires memory alignment for
-    // statically allocated buffers.
-    #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
-  #else
-    #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
-  #endif
-#elif defined(__AVX512F__)
-  // 64 bytes static alignment is preferred only if really required
-  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
-#elif defined(__AVX__)
-  // 32 bytes static alignment is preferred only if really required
-  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
-#else
-  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
-#endif
-
-
-// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
-#define EIGEN_MIN_ALIGN_BYTES 16
-
-// Defined the boundary (in bytes) on which the data needs to be aligned. Note
-// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
-// aligned at all regardless of the value of this #define.
-
-#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN))  && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
-#endif
-
-// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
-// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
-#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
-  #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
-    #undef EIGEN_MAX_STATIC_ALIGN_BYTES
-  #endif
-  #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
-#endif
-
-#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
-
-  // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
-
-  // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
-  // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
-  // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
-  // certain common platform (compiler+architecture combinations) to avoid these problems.
-  // Only static alignment is really problematic (relies on nonstandard compiler extensions),
-  // try to keep heap alignment even when we have to disable static alignment.
-  #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS)
-  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
-  #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
-  // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
-  // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
-  // 4.8 and newer seem definitely unaffected.
-  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
-  #else
-  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
-  #endif
-
-  // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
-  #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
-  && !EIGEN_GCC3_OR_OLDER \
-  && !EIGEN_COMP_SUNCC \
-  && !EIGEN_OS_QNX
-    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
-  #else
-    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
-  #endif
-
-  #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
-    #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-  #else
-    #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
-  #endif
-
-#endif
-
-// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
-#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
-#undef EIGEN_MAX_STATIC_ALIGN_BYTES
-#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
-#endif
-
-#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
-  #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-#endif
-
-// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
-// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES)
-// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
-// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
-
-
-// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
-#define EIGEN_ALIGN8  EIGEN_ALIGN_TO_BOUNDARY(8)
-#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
-#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
-#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
-#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
-#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
-#else
-#define EIGEN_ALIGN_MAX
-#endif
-
-
-// Dynamic alignment control
-
-#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
-#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
-#endif
-
-#ifdef EIGEN_DONT_ALIGN
-  #ifdef EIGEN_MAX_ALIGN_BYTES
-    #undef EIGEN_MAX_ALIGN_BYTES
-  #endif
-  #define EIGEN_MAX_ALIGN_BYTES 0
-#elif !defined(EIGEN_MAX_ALIGN_BYTES)
-  #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-#endif
-
-#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
-#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
-#else
-#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
-#endif
-
-
-#ifndef EIGEN_UNALIGNED_VECTORIZE
-#define EIGEN_UNALIGNED_VECTORIZE 1
-#endif
-
-//----------------------------------------------------------------------
-
-// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
-// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
-#if EIGEN_MAX_ALIGN_BYTES==0
-  #ifndef EIGEN_DONT_VECTORIZE
-    #define EIGEN_DONT_VECTORIZE
-  #endif
-#endif
-
-
-// The following (except #include <malloc.h> and _M_IX86_FP ??) can likely be
-// removed as gcc 4.1 and msvc 2008 are not supported anyways.
-#if EIGEN_COMP_MSVC
-  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
-  #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later
-    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
-    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
-      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
-    #endif
-  #endif
-#else
-  #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) )
-    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
-  #endif
-#endif
-
-#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))
-
-  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
-
-    // Defines symbols for compile-time detection of which instructions are
-    // used.
-    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_SSE
-    #define EIGEN_VECTORIZE_SSE2
-
-    // Detect sse3/ssse3/sse4:
-    // gcc and icc defines __SSE3__, ...
-    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
-    // want to force the use of those instructions with msvc.
-    #ifdef __SSE3__
-      #define EIGEN_VECTORIZE_SSE3
-    #endif
-    #ifdef __SSSE3__
-      #define EIGEN_VECTORIZE_SSSE3
-    #endif
-    #ifdef __SSE4_1__
-      #define EIGEN_VECTORIZE_SSE4_1
-    #endif
-    #ifdef __SSE4_2__
-      #define EIGEN_VECTORIZE_SSE4_2
-    #endif
-    #ifdef __AVX__
-      #ifndef EIGEN_USE_SYCL 
-        #define EIGEN_VECTORIZE_AVX
-      #endif
-      #define EIGEN_VECTORIZE_SSE3
-      #define EIGEN_VECTORIZE_SSSE3
-      #define EIGEN_VECTORIZE_SSE4_1
-      #define EIGEN_VECTORIZE_SSE4_2
-    #endif
-    #ifdef __AVX2__
-      #ifndef EIGEN_USE_SYCL 
-        #define EIGEN_VECTORIZE_AVX2
-        #define EIGEN_VECTORIZE_AVX
-      #endif
-      #define EIGEN_VECTORIZE_SSE3
-      #define EIGEN_VECTORIZE_SSSE3
-      #define EIGEN_VECTORIZE_SSE4_1
-      #define EIGEN_VECTORIZE_SSE4_2
-    #endif
-    #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__))
-      // MSVC does not expose a switch dedicated for FMA
-      // For MSVC, AVX2 => FMA
-      #define EIGEN_VECTORIZE_FMA
-    #endif
-    #if defined(__AVX512F__)
-      #ifndef EIGEN_VECTORIZE_FMA
-      #if EIGEN_COMP_GNUC
-      #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638).
-      #else
-      #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638).
-      #endif
-      #endif
-      #ifndef EIGEN_USE_SYCL
-        #define EIGEN_VECTORIZE_AVX512
-        #define EIGEN_VECTORIZE_AVX2
-        #define EIGEN_VECTORIZE_AVX
-      #endif
-      #define EIGEN_VECTORIZE_FMA
-      #define EIGEN_VECTORIZE_SSE3
-      #define EIGEN_VECTORIZE_SSSE3
-      #define EIGEN_VECTORIZE_SSE4_1
-      #define EIGEN_VECTORIZE_SSE4_2
-      #ifndef EIGEN_USE_SYCL
-        #ifdef __AVX512DQ__
-          #define EIGEN_VECTORIZE_AVX512DQ
-        #endif
-        #ifdef __AVX512ER__
-          #define EIGEN_VECTORIZE_AVX512ER
-        #endif
-      #endif
-    #endif
-
-    // Disable AVX support on broken xcode versions
-    #if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 )
-      // A nasty bug in the clang compiler shipped with xcode in a common compilation situation
-      // when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1
-      #ifdef EIGEN_VECTORIZE_AVX
-        #undef EIGEN_VECTORIZE_AVX
-        #warning "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. "
-        #ifdef EIGEN_VECTORIZE_AVX2
-          #undef EIGEN_VECTORIZE_AVX2
-        #endif
-        #ifdef EIGEN_VECTORIZE_FMA
-          #undef EIGEN_VECTORIZE_FMA
-        #endif
-        #ifdef EIGEN_VECTORIZE_AVX512
-          #undef EIGEN_VECTORIZE_AVX512
-        #endif
-        #ifdef EIGEN_VECTORIZE_AVX512DQ
-          #undef EIGEN_VECTORIZE_AVX512DQ
-        #endif
-        #ifdef EIGEN_VECTORIZE_AVX512ER
-          #undef EIGEN_VECTORIZE_AVX512ER
-        #endif
-      #endif
-      // NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with  -macosx-version-min=10.15 and AVX
-      // NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2 produce core dumps in 3 tests
-      // NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all cases
-      // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)"  XCode 11.0 <- Produces many segfault and core dumping tests
-      //                                                                    with  -macosx-version-min=10.15 and AVX
-      // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with  
-      //                                                                    -macosx-version-min=10.15 and AVX
-    #endif
-
-    // include files
-
-    // This extern "C" works around a MINGW-w64 compilation issue
-    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
-    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
-    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
-    // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
-    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
-    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
-    extern "C" {
-      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
-      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-      #if EIGEN_COMP_ICC >= 1110
-        #include <immintrin.h>
-      #else
-        #include <mmintrin.h>
-        #include <emmintrin.h>
-        #include <xmmintrin.h>
-        #ifdef  EIGEN_VECTORIZE_SSE3
-        #include <pmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSSE3
-        #include <tmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSE4_1
-        #include <smmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSE4_2
-        #include <nmmintrin.h>
-        #endif
-        #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
-        #include <immintrin.h>
-        #endif
-      #endif
-    } // end extern "C"
-
-  #elif defined __VSX__
-
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_VSX
-    #include <altivec.h>
-    // We need to #undef all these ugly tokens defined in <altivec.h>
-    // => use __vector instead of vector
-    #undef bool
-    #undef vector
-    #undef pixel
-
-  #elif defined __ALTIVEC__
-
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_ALTIVEC
-    #include <altivec.h>
-    // We need to #undef all these ugly tokens defined in <altivec.h>
-    // => use __vector instead of vector
-    #undef bool
-    #undef vector
-    #undef pixel
-
-  #elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
-
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_NEON
-    #include <arm_neon.h>
-
-  #elif (defined __s390x__ && defined __VEC__)
-
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_ZVECTOR
-    #include <vecintrin.h>
-
-  #elif defined __mips_msa
-
-    // Limit MSA optimizations to little-endian CPUs for now.
-    // TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
-    #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-      #if defined(__LP64__)
-        #define EIGEN_MIPS_64
-      #else
-        #define EIGEN_MIPS_32
-      #endif
-      #define EIGEN_VECTORIZE
-      #define EIGEN_VECTORIZE_MSA
-      #include <msa.h>
-    #endif
-
-  #endif
-#endif
-
-#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380))
-  // We can use the optimized fp16 to float and float to fp16 conversion routines
-  #define EIGEN_HAS_FP16_C
-
-  #if defined(EIGEN_COMP_CLANG)
-    // Workaround for clang: The FP16C intrinsics for clang are included by
-    // immintrin.h, as opposed to emmintrin.h as suggested by Intel:
-    // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
-    #include <immintrin.h>
-  #endif
-#endif
-
-#if defined EIGEN_CUDACC
-  #define EIGEN_VECTORIZE_GPU
-  #include <vector_types.h>
-  #if EIGEN_CUDA_SDK_VER >= 70500
-    #define EIGEN_HAS_CUDA_FP16
-  #endif
-#endif
-
-#if defined(EIGEN_HAS_CUDA_FP16)
-  #include <cuda_runtime_api.h>
-  #include <cuda_fp16.h>
-#endif
-
-#if defined(EIGEN_HIPCC)
-  #define EIGEN_VECTORIZE_GPU
-  #include <hip/hip_vector_types.h>
-  #define EIGEN_HAS_HIP_FP16
-  #include <hip/hip_fp16.h>
-#endif
-
-
-/** \brief Namespace containing all symbols from the %Eigen library. */
-namespace Eigen {
-
-inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_AVX512)
-  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_AVX)
-  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_2)
-  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_1)
-  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
-#elif defined(EIGEN_VECTORIZE_SSSE3)
-  return "SSE, SSE2, SSE3, SSSE3";
-#elif defined(EIGEN_VECTORIZE_SSE3)
-  return "SSE, SSE2, SSE3";
-#elif defined(EIGEN_VECTORIZE_SSE2)
-  return "SSE, SSE2";
-#elif defined(EIGEN_VECTORIZE_ALTIVEC)
-  return "AltiVec";
-#elif defined(EIGEN_VECTORIZE_VSX)
-  return "VSX";
-#elif defined(EIGEN_VECTORIZE_NEON)
-  return "ARM NEON";
-#elif defined(EIGEN_VECTORIZE_ZVECTOR)
-  return "S390X ZVECTOR";
-#elif defined(EIGEN_VECTORIZE_MSA)
-  return "MIPS MSA";
-#else
-  return "None";
-#endif
-}
-
-} // end namespace Eigen
-
-
-#endif // EIGEN_CONFIGURE_VECTORIZATION_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Constants.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Constants.h
index 7ada82195..7587d6842 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Constants.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Constants.h
@@ -25,10 +25,6 @@ const int Dynamic = -1;
   */
 const int DynamicIndex = 0xffffff;
 
-/** This value means that the increment to go from one value to another in a sequence is not constant for each step.
-  */
-const int UndefinedIncr = 0xfffffe;
-
 /** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm<int>().
   * The value Infinity there means the L-infinity norm.
   */
@@ -254,6 +250,12 @@ enum AlignmentType {
 #endif
 };
 
+/** \ingroup enums
+ * Enum used by DenseBase::corner() in Eigen2 compatibility mode. */
+// FIXME after the corner() API change, this was not needed anymore, except by AlignedBox
+// TODO: find out what to do with that. Adapt the AlignedBox API ?
+enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
+
 /** \ingroup enums
   * Enum containing possible values for the \p Direction parameter of
   * Reverse, PartialReduxExpr and VectorwiseOp. */
@@ -333,8 +335,6 @@ enum SideType {
   OnTheRight = 2  
 };
 
-
-
 /* the following used to be written as:
  *
  *   struct NoChange_t {};
@@ -464,7 +464,6 @@ namespace Architecture
     AltiVec = 0x2,
     VSX = 0x3,
     NEON = 0x4,
-    MSA = 0x5,
 #if defined EIGEN_VECTORIZE_SSE
     Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -473,8 +472,6 @@ namespace Architecture
     Target = VSX
 #elif defined EIGEN_VECTORIZE_NEON
     Target = NEON
-#elif defined EIGEN_VECTORIZE_MSA
-    Target = MSA
 #else
     Target = Generic
 #endif
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/DisableStupidWarnings.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/DisableStupidWarnings.h
index 4501d3248..74f74cc42 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -4,6 +4,7 @@
 #ifdef _MSC_VER
   // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
   // 4101 - unreferenced local variable
+  // 4127 - conditional expression is constant
   // 4181 - qualifier applied to reference type ignored
   // 4211 - nonstandard extension used : redefined extern to static
   // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
@@ -19,7 +20,7 @@
   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
     #pragma warning( push )
   #endif
-  #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
 
 #elif defined __INTEL_COMPILER
   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
@@ -41,14 +42,6 @@
     #pragma clang diagnostic push
   #endif
   #pragma clang diagnostic ignored "-Wconstant-logical-operand"
-  #if __clang_major__ >= 3 && __clang_minor__ >= 5
-    #pragma clang diagnostic ignored "-Wabsolute-value"
-  #endif
-  #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L
-    // warning: generic selections are a C11-specific feature
-    // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
-    #pragma clang diagnostic ignored "-Wc11-extensions"
-  #endif
 
 #elif defined __GNUC__
 
@@ -71,7 +64,6 @@
 #endif
 
 #if defined __NVCC__
-  #pragma diag_suppress boolean_controlling_expr_is_constant
   // Disable the "statement is unreachable" message
   #pragma diag_suppress code_is_unreachable
   // Disable the "dynamic initialization in unreachable code" message
@@ -89,7 +81,6 @@
   #pragma diag_suppress 2671
   #pragma diag_suppress 2735
   #pragma diag_suppress 2737
-  #pragma diag_suppress 2739
 #endif
 
 #else
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ForwardDeclarations.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/ForwardDeclarations.h
index cd0bdb5a7..134544f96 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/ForwardDeclarations.h
@@ -79,8 +79,6 @@ template<typename ExpressionType> class ForceAlignedAccess;
 template<typename ExpressionType> class SwapWrapper;
 
 template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false> class Block;
-template<typename XprType, typename RowIndices, typename ColIndices> class IndexedView;
-template<typename XprType, int Rows=Dynamic, int Cols=Dynamic, int Order=0> class Reshaped;
 
 template<typename MatrixType, int Size=Dynamic> class VectorBlock;
 template<typename MatrixType> class Transpose;
@@ -110,7 +108,7 @@ template<typename _IndicesType> class TranspositionsWrapper;
 template<typename Derived,
          int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
 > class MapBase;
-template<int OuterStrideAtCompileTime, int InnerStrideAtCompileTime> class Stride;
+template<int InnerStrideAtCompileTime, int OuterStrideAtCompileTime> class Stride;
 template<int Value = Dynamic> class InnerStride;
 template<int Value = Dynamic> class OuterStride;
 template<typename MatrixType, int MapOptions=Unaligned, typename StrideType = Stride<0,0> > class Map;
@@ -131,9 +129,6 @@ template<typename Derived> class SolverBase;
 template<typename XprType> class InnerIterator;
 
 namespace internal {
-template<typename XprType> class generic_randaccess_stl_iterator;
-template<typename XprType> class pointer_based_stl_iterator;
-template<typename XprType, DirectionType Direction> class subvector_stl_iterator;
 template<typename DecompositionType> struct kernel_retval_base;
 template<typename DecompositionType> struct kernel_retval;
 template<typename DecompositionType> struct image_retval_base;
@@ -187,7 +182,6 @@ template<typename Scalar> struct scalar_real_op;
 template<typename Scalar> struct scalar_imag_op;
 template<typename Scalar> struct scalar_abs_op;
 template<typename Scalar> struct scalar_abs2_op;
-template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_absolute_difference_op;
 template<typename Scalar> struct scalar_sqrt_op;
 template<typename Scalar> struct scalar_rsqrt_op;
 template<typename Scalar> struct scalar_exp_op;
@@ -215,27 +209,11 @@ template<typename Scalar> struct scalar_lgamma_op;
 template<typename Scalar> struct scalar_digamma_op;
 template<typename Scalar> struct scalar_erf_op;
 template<typename Scalar> struct scalar_erfc_op;
-template<typename Scalar> struct scalar_ndtri_op;
 template<typename Scalar> struct scalar_igamma_op;
 template<typename Scalar> struct scalar_igammac_op;
 template<typename Scalar> struct scalar_zeta_op;
 template<typename Scalar> struct scalar_betainc_op;
 
-// Bessel functions in SpecialFunctions module
-template<typename Scalar> struct scalar_bessel_i0_op;
-template<typename Scalar> struct scalar_bessel_i0e_op;
-template<typename Scalar> struct scalar_bessel_i1_op;
-template<typename Scalar> struct scalar_bessel_i1e_op;
-template<typename Scalar> struct scalar_bessel_j0_op;
-template<typename Scalar> struct scalar_bessel_y0_op;
-template<typename Scalar> struct scalar_bessel_j1_op;
-template<typename Scalar> struct scalar_bessel_y1_op;
-template<typename Scalar> struct scalar_bessel_k0_op;
-template<typename Scalar> struct scalar_bessel_k0e_op;
-template<typename Scalar> struct scalar_bessel_k1_op;
-template<typename Scalar> struct scalar_bessel_k1e_op;
-
-
 } // end namespace internal
 
 struct IOFormat;
@@ -273,7 +251,6 @@ template<typename MatrixType> class HouseholderQR;
 template<typename MatrixType> class ColPivHouseholderQR;
 template<typename MatrixType> class FullPivHouseholderQR;
 template<typename MatrixType> class CompleteOrthogonalDecomposition;
-template<typename MatrixType> class SVDBase;
 template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD;
 template<typename MatrixType> class BDCSVD;
 template<typename MatrixType, int UpLo = Lower> class LLT;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/IndexedViewHelper.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/IndexedViewHelper.h
deleted file mode 100644
index 1cda85060..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/IndexedViewHelper.h
+++ /dev/null
@@ -1,186 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#ifndef EIGEN_INDEXED_VIEW_HELPER_H
-#define EIGEN_INDEXED_VIEW_HELPER_H
-
-namespace Eigen {
-
-namespace internal {
-struct symbolic_last_tag {};
-}
-
-/** \var last
-  * \ingroup Core_Module
-  *
-  * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last element/row/columns
-  * of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
-  *
-  * This symbolic placeholder supports standard arithmetic operations.
-  *
-  * A typical usage example would be:
-  * \code
-  * using namespace Eigen;
-  * using Eigen::last;
-  * VectorXd v(n);
-  * v(seq(2,last-2)).setOnes();
-  * \endcode
-  *
-  * \sa end
-  */
-static const symbolic::SymbolExpr<internal::symbolic_last_tag> last; // PLEASE use Eigen::last   instead of Eigen::placeholders::last
-
-/** \var lastp1
-  * \ingroup Core_Module
-  *
-  * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically
-  * reference the last+1 element/row/columns of the underlying vector or matrix once
-  * passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
-  *
-  * This symbolic placeholder supports standard arithmetic operations.
-  * It is essentially an alias to last+fix<1>.
-  *
-  * \sa last
-  */
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-static const auto lastp1 = last+fix<1>;
-#else
-// Using a FixedExpr<1> expression is important here to make sure the compiler
-// can fully optimize the computation starting indices with zero overhead.
-static const symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > lastp1(last+fix<1>());
-#endif
-
-namespace internal {
-
- // Replace symbolic last/end "keywords" by their true runtime value
-inline Index eval_expr_given_size(Index x, Index /* size */)   { return x; }
-
-template<int N>
-FixedInt<N> eval_expr_given_size(FixedInt<N> x, Index /*size*/)   { return x; }
-
-template<typename Derived>
-Index eval_expr_given_size(const symbolic::BaseExpr<Derived> &x, Index size)
-{
-  return x.derived().eval(last=size-1);
-}
-
-// Extract increment/step at compile time
-template<typename T, typename EnableIf = void> struct get_compile_time_incr {
-  enum { value = UndefinedIncr };
-};
-
-// Analogue of std::get<0>(x), but tailored for our needs.
-template<typename T>
-Index first(const T& x) { return x.first(); }
-
-// IndexedViewCompatibleType/makeIndexedViewCompatible turn an arbitrary object of type T into something usable by MatrixSlice
-// The generic implementation is a no-op
-template<typename T,int XprSize,typename EnableIf=void>
-struct IndexedViewCompatibleType {
-  typedef T type;
-};
-
-template<typename T,typename Q>
-const T& makeIndexedViewCompatible(const T& x, Index /*size*/, Q) { return x; }
-
-//--------------------------------------------------------------------------------
-// Handling of a single Index
-//--------------------------------------------------------------------------------
-
-struct SingleRange {
-  enum {
-    SizeAtCompileTime = 1
-  };
-  SingleRange(Index val) : m_value(val) {}
-  Index operator[](Index) const { return m_value; }
-  Index size() const { return 1; }
-  Index first() const { return m_value; }
-  Index m_value;
-};
-
-template<> struct get_compile_time_incr<SingleRange> {
-  enum { value = 1 }; // 1 or 0 ??
-};
-
-// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int) methods)
-template<typename T, int XprSize>
-struct IndexedViewCompatibleType<T,XprSize,typename internal::enable_if<internal::is_integral<T>::value>::type> {
-  // Here we could simply use Array, but maybe it's less work for the compiler to use
-  // a simpler wrapper as SingleRange
-  //typedef Eigen::Array<Index,1,1> type;
-  typedef SingleRange type;
-};
-
-template<typename T, int XprSize>
-struct IndexedViewCompatibleType<T, XprSize, typename enable_if<symbolic::is_symbolic<T>::value>::type> {
-  typedef SingleRange type;
-};
-
-
-template<typename T>
-typename enable_if<symbolic::is_symbolic<T>::value,SingleRange>::type
-makeIndexedViewCompatible(const T& id, Index size, SpecializedType) {
-  return eval_expr_given_size(id,size);
-}
-
-//--------------------------------------------------------------------------------
-// Handling of all
-//--------------------------------------------------------------------------------
-
-struct all_t { all_t() {} };
-
-// Convert a symbolic 'all' into a usable range type
-template<int XprSize>
-struct AllRange {
-  enum { SizeAtCompileTime = XprSize };
-  AllRange(Index size = XprSize) : m_size(size) {}
-  Index operator[](Index i) const { return i; }
-  Index size() const { return m_size.value(); }
-  Index first() const { return 0; }
-  variable_if_dynamic<Index,XprSize> m_size;
-};
-
-template<int XprSize>
-struct IndexedViewCompatibleType<all_t,XprSize> {
-  typedef AllRange<XprSize> type;
-};
-
-template<typename XprSizeType>
-inline AllRange<get_fixed_value<XprSizeType>::value> makeIndexedViewCompatible(all_t , XprSizeType size, SpecializedType) {
-  return AllRange<get_fixed_value<XprSizeType>::value>(size);
-}
-
-template<int Size> struct get_compile_time_incr<AllRange<Size> > {
-  enum { value = 1 };
-};
-
-} // end namespace internal
-
-
-/** \var all
-  * \ingroup Core_Module
-  * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns
-  */
-static const Eigen::internal::all_t all; // PLEASE use Eigen::all instead of Eigen::placeholders::all
-
-
-namespace placeholders {
-  typedef symbolic::SymbolExpr<internal::symbolic_last_tag> last_t;
-  typedef symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,symbolic::ValueExpr<Eigen::internal::FixedInt<1> > > end_t;
-  typedef Eigen::internal::all_t all_t;
-
-  EIGEN_DEPRECATED static const all_t  all  = Eigen::all;    // PLEASE use Eigen::all    instead of Eigen::placeholders::all
-  EIGEN_DEPRECATED static const last_t last = Eigen::last;   // PLEASE use Eigen::last   instead of Eigen::placeholders::last
-  EIGEN_DEPRECATED static const end_t  end  = Eigen::lastp1; // PLEASE use Eigen::lastp1 instead of Eigen::placeholders::end
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_INDEXED_VIEW_HELPER_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/IntegralConstant.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/IntegralConstant.h
deleted file mode 100644
index caeea232d..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/IntegralConstant.h
+++ /dev/null
@@ -1,272 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#ifndef EIGEN_INTEGRAL_CONSTANT_H
-#define EIGEN_INTEGRAL_CONSTANT_H
-
-namespace Eigen {
-
-namespace internal {
-
-template<int N> class FixedInt;
-template<int N> class VariableAndFixedInt;
-
-/** \internal
-  * \class FixedInt
-  *
-  * This class embeds a compile-time integer \c N.
-  *
-  * It is similar to c++11 std::integral_constant<int,N> but with some additional features
-  * such as:
-  *  - implicit conversion to int
-  *  - arithmetic and some bitwise operators: -, +, *, /, %, &, |
-  *  - c++98/14 compatibility with fix<N> and fix<N>() syntax to define integral constants.
-  *
-  * It is strongly discouraged to directly deal with this class FixedInt. Instances are expcected to
-  * be created by the user using Eigen::fix<N> or Eigen::fix<N>(). In C++98-11, the former syntax does
-  * not create a FixedInt<N> instance but rather a point to function that needs to be \em cleaned-up
-  * using the generic helper:
-  * \code
-  * internal::cleanup_index_type<T>::type
-  * internal::cleanup_index_type<T,DynamicKey>::type
-  * \endcode
-  * where T can a FixedInt<N>, a pointer to function FixedInt<N> (*)(), or numerous other integer-like representations.
-  * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values.
-  *
-  * For convenience, you can extract the compile-time value \c N in a generic way using the following helper:
-  * \code
-  * internal::get_fixed_value<T,DefaultVal>::value
-  * \endcode
-  * that will give you \c N if T equals FixedInt<N> or FixedInt<N> (*)(), and \c DefaultVal if T does not embed any compile-time value (e.g., T==int).
-  *
-  * \sa fix<N>, class VariableAndFixedInt
-  */
-template<int N> class FixedInt
-{
-public:
-  static const int value = N;
-  operator int() const { return value; }
-  FixedInt() {}
-  FixedInt( VariableAndFixedInt<N> other) {
-    #ifndef EIGEN_INTERNAL_DEBUGGING
-    EIGEN_UNUSED_VARIABLE(other);
-    #endif
-    eigen_internal_assert(int(other)==N);
-  }
-
-  FixedInt<-N> operator-() const { return FixedInt<-N>(); }
-  template<int M>
-  FixedInt<N+M> operator+( FixedInt<M>) const { return FixedInt<N+M>(); }
-  template<int M>
-  FixedInt<N-M> operator-( FixedInt<M>) const { return FixedInt<N-M>(); }
-  template<int M>
-  FixedInt<N*M> operator*( FixedInt<M>) const { return FixedInt<N*M>(); }
-  template<int M>
-  FixedInt<N/M> operator/( FixedInt<M>) const { return FixedInt<N/M>(); }
-  template<int M>
-  FixedInt<N%M> operator%( FixedInt<M>) const { return FixedInt<N%M>(); }
-  template<int M>
-  FixedInt<N|M> operator|( FixedInt<M>) const { return FixedInt<N|M>(); }
-  template<int M>
-  FixedInt<N&M> operator&( FixedInt<M>) const { return FixedInt<N&M>(); }
-
-#if EIGEN_HAS_CXX14
-  // Needed in C++14 to allow fix<N>():
-  FixedInt operator() () const { return *this; }
-
-  VariableAndFixedInt<N> operator() (int val) const { return VariableAndFixedInt<N>(val); }
-#else
-  FixedInt ( FixedInt<N> (*)() ) {}
-#endif
-
-#if EIGEN_HAS_CXX11
-  FixedInt(std::integral_constant<int,N>) {}
-#endif
-};
-
-/** \internal
-  * \class VariableAndFixedInt
-  *
-  * This class embeds both a compile-time integer \c N and a runtime integer.
-  * Both values are supposed to be equal unless the compile-time value \c N has a special
-  * value meaning that the runtime-value should be used. Depending on the context, this special
-  * value can be either Eigen::Dynamic (for positive quantities) or Eigen::DynamicIndex (for
-  * quantities that can be negative).
-  *
-  * It is the return-type of the function Eigen::fix<N>(int), and most of the time this is the only
-  * way it is used. It is strongly discouraged to directly deal with instances of VariableAndFixedInt.
-  * Indeed, in order to write generic code, it is the responsibility of the callee to properly convert
-  * it to either a true compile-time quantity (i.e. a FixedInt<N>), or to a runtime quantity (e.g., an Index)
-  * using the following generic helper:
-  * \code
-  * internal::cleanup_index_type<T>::type
-  * internal::cleanup_index_type<T,DynamicKey>::type
-  * \endcode
-  * where T can be a template instantiation of VariableAndFixedInt or numerous other integer-like representations.
-  * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values.
-  *
-  * For convenience, you can also extract the compile-time value \c N using the following helper:
-  * \code
-  * internal::get_fixed_value<T,DefaultVal>::value
-  * \endcode
-  * that will give you \c N if T equals VariableAndFixedInt<N>, and \c DefaultVal if T does not embed any compile-time value (e.g., T==int).
-  *
-  * \sa fix<N>(int), class FixedInt
-  */
-template<int N> class VariableAndFixedInt
-{
-public:
-  static const int value = N;
-  operator int() const { return m_value; }
-  VariableAndFixedInt(int val) { m_value = val; }
-protected:
-  int m_value;
-};
-
-template<typename T, int Default=Dynamic> struct get_fixed_value {
-  static const int value = Default;
-};
-
-template<int N,int Default> struct get_fixed_value<FixedInt<N>,Default> {
-  static const int value = N;
-};
-
-#if !EIGEN_HAS_CXX14
-template<int N,int Default> struct get_fixed_value<FixedInt<N> (*)(),Default> {
-  static const int value = N;
-};
-#endif
-
-template<int N,int Default> struct get_fixed_value<VariableAndFixedInt<N>,Default> {
-  static const int value = N ;
-};
-
-template<typename T, int N, int Default>
-struct get_fixed_value<variable_if_dynamic<T,N>,Default> {
-  static const int value = N;
-};
-
-template<typename T> EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; }
-#if !EIGEN_HAS_CXX14
-template<int N> EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt<N> (*)()) { return N; }
-#endif
-
-// Cleanup integer/FixedInt/VariableAndFixedInt/etc types:
-
-// By default, no cleanup:
-template<typename T, int DynamicKey=Dynamic, typename EnableIf=void> struct cleanup_index_type { typedef T type; };
-
-// Convert any integral type (e.g., short, int, unsigned int, etc.) to Eigen::Index
-template<typename T, int DynamicKey> struct cleanup_index_type<T,DynamicKey,typename internal::enable_if<internal::is_integral<T>::value>::type> { typedef Index type; };
-
-#if !EIGEN_HAS_CXX14
-// In c++98/c++11, fix<N> is a pointer to function that we better cleanup to a true FixedInt<N>:
-template<int N, int DynamicKey> struct cleanup_index_type<FixedInt<N> (*)(), DynamicKey> { typedef FixedInt<N> type; };
-#endif
-
-// If VariableAndFixedInt does not match DynamicKey, then we turn it to a pure compile-time value:
-template<int N, int DynamicKey> struct cleanup_index_type<VariableAndFixedInt<N>, DynamicKey> { typedef FixedInt<N> type; };
-// If VariableAndFixedInt matches DynamicKey, then we turn it to a pure runtime-value (aka Index):
-template<int DynamicKey> struct cleanup_index_type<VariableAndFixedInt<DynamicKey>, DynamicKey> { typedef Index type; };
-
-#if EIGEN_HAS_CXX11
-template<int N, int DynamicKey> struct cleanup_index_type<std::integral_constant<int,N>, DynamicKey> { typedef FixedInt<N> type; };
-#endif
-
-} // end namespace internal
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-
-#if EIGEN_HAS_CXX14
-template<int N>
-static const internal::FixedInt<N> fix{};
-#else
-template<int N>
-inline internal::FixedInt<N> fix() { return internal::FixedInt<N>(); }
-
-// The generic typename T is mandatory. Otherwise, a code like fix<N> could refer to either the function above or this next overload.
-// This way a code like fix<N> can only refer to the previous function.
-template<int N,typename T>
-inline internal::VariableAndFixedInt<N> fix(T val) { return internal::VariableAndFixedInt<N>(internal::convert_index<int>(val)); }
-#endif
-
-#else // EIGEN_PARSED_BY_DOXYGEN
-
-/** \var fix<N>()
-  * \ingroup Core_Module
-  *
-  * This \em identifier permits to construct an object embedding a compile-time integer \c N.
-  *
-  * \tparam N the compile-time integer value
-  *
-  * It is typically used in conjunction with the Eigen::seq and Eigen::seqN functions to pass compile-time values to them:
-  * \code
-  * seqN(10,fix<4>,fix<-3>)   // <=> [10 7 4 1]
-  * \endcode
-  *
-  * See also the function fix(int) to pass both a compile-time and runtime value.
-  *
-  * In c++14, it is implemented as:
-  * \code
-  * template<int N> static const internal::FixedInt<N> fix{};
-  * \endcode
-  * where internal::FixedInt<N> is an internal template class similar to
-  * <a href="http://en.cppreference.com/w/cpp/types/integral_constant">\c std::integral_constant </a><tt> <int,N> </tt>
-  * Here, \c fix<N> is thus an object of type \c internal::FixedInt<N>.
-  *
-  * In c++98/11, it is implemented as a function:
-  * \code
-  * template<int N> inline internal::FixedInt<N> fix();
-  * \endcode
-  * Here internal::FixedInt<N> is thus a pointer to function.
-  *
-  * If for some reason you want a true object in c++98 then you can write: \code fix<N>() \endcode which is also valid in c++14.
-  *
-  * \sa fix<N>(int), seq, seqN
-  */
-template<int N>
-static const auto fix();
-
-/** \fn fix<N>(int)
-  * \ingroup Core_Module
-  *
-  * This function returns an object embedding both a compile-time integer \c N, and a fallback runtime value \a val.
-  *
-  * \tparam N the compile-time integer value
-  * \param  val the fallback runtime integer value
-  *
-  * This function is a more general version of the \ref fix identifier/function that can be used in template code
-  * where the compile-time value could turn out to actually mean "undefined at compile-time". For positive integers
-  * such as a size or a dimension, this case is identified by Eigen::Dynamic, whereas runtime signed integers
-  * (e.g., an increment/stride) are identified as Eigen::DynamicIndex. In such a case, the runtime value \a val
-  * will be used as a fallback.
-  *
-  * A typical use case would be:
-  * \code
-  * template<typename Derived> void foo(const MatrixBase<Derived> &mat) {
-  *   const int N = Derived::RowsAtCompileTime==Dynamic ? Dynamic : Derived::RowsAtCompileTime/2;
-  *   const int n = mat.rows()/2;
-  *   ... mat( seqN(0,fix<N>(n) ) ...;
-  * }
-  * \endcode
-  * In this example, the function Eigen::seqN knows that the second argument is expected to be a size.
-  * If the passed compile-time value N equals Eigen::Dynamic, then the proxy object returned by fix will be dissmissed, and converted to an Eigen::Index of value \c n.
-  * Otherwise, the runtime-value \c n will be dissmissed, and the returned ArithmeticSequence will be of the exact same type as <tt> seqN(0,fix<N>) </tt>.
-  *
-  * \sa fix, seqN, class ArithmeticSequence
-  */
-template<int N>
-static const auto fix(int val);
-
-#endif // EIGEN_PARSED_BY_DOXYGEN
-
-} // end namespace Eigen
-
-#endif // EIGEN_INTEGRAL_CONSTANT_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/MKL_support.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/MKL_support.h
index 17963fad4..b7d6ecc76 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/MKL_support.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/MKL_support.h
@@ -55,11 +55,7 @@
 
 
 #if defined EIGEN_USE_MKL
-#   if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL)
-#       define MKL_DIRECT_CALL
-#       define MKL_DIRECT_CALL_JUST_SET
-#   endif
-#   include <mkl.h>
+#   include <mkl.h> 
 /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
 #   ifndef INTEL_MKL_VERSION
 #       undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
@@ -73,9 +69,6 @@
 #       undef   EIGEN_USE_MKL_VML
 #       undef   EIGEN_USE_LAPACKE_STRICT
 #       undef   EIGEN_USE_LAPACKE
-#       ifdef   MKL_DIRECT_CALL_JUST_SET
-#           undef MKL_DIRECT_CALL
-#       endif
 #   endif
 #endif
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Macros.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Macros.h
index d0499a1c9..87233eadf 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Macros.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Macros.h
@@ -11,56 +11,19 @@
 #ifndef EIGEN_MACROS_H
 #define EIGEN_MACROS_H
 
-//------------------------------------------------------------------------------------------
-// Eigen version and basic defaults
-//------------------------------------------------------------------------------------------
-
 #define EIGEN_WORLD_VERSION 3
 #define EIGEN_MAJOR_VERSION 3
-#define EIGEN_MINOR_VERSION 90
+#define EIGEN_MINOR_VERSION 8
 
 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
                                       (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
                                                                  EIGEN_MINOR_VERSION>=z))))
 
-#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
-#else
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
-#endif
-
-#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
-#endif
-
-// Upperbound on the C++ version to use.
-// Expected values are 03, 11, 14, 17, etc.
-// By default, let's use an arbitrarily large C++ version.
-#ifndef EIGEN_MAX_CPP_VER
-#define EIGEN_MAX_CPP_VER 99
-#endif
-
-/** Allows to disable some optimizations which might affect the accuracy of the result.
-  * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
-  * They currently include:
-  *   - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
-  */
-#ifndef EIGEN_FAST_MATH
-#define EIGEN_FAST_MATH 1
-#endif
-
-#ifndef EIGEN_STACK_ALLOCATION_LIMIT
-// 131072 == 128 KB
-#define EIGEN_STACK_ALLOCATION_LIMIT 131072
-#endif
-
-//------------------------------------------------------------------------------------------
 // Compiler identification, EIGEN_COMP_*
-//------------------------------------------------------------------------------------------
 
 /// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC
 #ifdef __GNUC__
-  #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__)
+  #define EIGEN_COMP_GNUC 1
 #else
   #define EIGEN_COMP_GNUC 0
 #endif
@@ -108,44 +71,14 @@
   #define EIGEN_COMP_MSVC 0
 #endif
 
-#if defined(__NVCC__)
-#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
-  #define EIGEN_COMP_NVCC  ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
-#elif defined(__CUDACC_VER__)
-  #define EIGEN_COMP_NVCC __CUDACC_VER__
-#else
-  #error "NVCC did not define compiler version."
-#endif
-#else
-  #define EIGEN_COMP_NVCC 0
-#endif
-
 // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
-//  name        ver   MSC_VER
-//  2008         9      1500
-//  2010        10      1600
-//  2012        11      1700
-//  2013        12      1800
-//  2015        14      1900
-//  "15"        15      1900
-//  2017-14.1   15.0    1910
-//  2017-14.11  15.3    1911
-//  2017-14.12  15.5    1912
-//  2017-14.13  15.6    1913
-//  2017-14.14  15.7    1914
-
-/// \internal EIGEN_COMP_MSVC_LANG set to _MSVC_LANG if the compiler is Microsoft Visual C++, 0 otherwise.
-#if defined(_MSVC_LANG)
-  #define EIGEN_COMP_MSVC_LANG _MSVC_LANG
-#else
-  #define EIGEN_COMP_MSVC_LANG 0
-#endif
-
-// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC_LANG:
-// MSVC option                          Standard  MSVC_LANG
-// /std:c++14 (default as of VS 2019)   C++14     201402L
-// /std:c++17                           C++17     201703L
-// /std:c++latest                       >C++17    >201703L
+//  name  ver   MSC_VER
+//  2008    9      1500
+//  2010   10      1600
+//  2012   11      1700
+//  2013   12      1800
+//  2015   14      1900
+//  "15"   15      1900
 
 /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl
 #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
@@ -154,21 +87,16 @@
   #define EIGEN_COMP_MSVC_STRICT 0
 #endif
 
-/// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++
-// XLC   version
-// 3.1   0x0301	
-// 4.5   0x0405	
-// 5.0   0x0500
-// 12.1  0x0C01
-#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__)
-  #define EIGEN_COMP_IBM __xlC__
+/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++
+#if defined(__IBMCPP__) || defined(__xlc__)
+  #define EIGEN_COMP_IBM 1
 #else
   #define EIGEN_COMP_IBM 0
 #endif
 
-/// \internal EIGEN_COMP_PGI set to PGI version if the compiler is Portland Group Compiler
+/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler
 #if defined(__PGI)
-  #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__)
+  #define EIGEN_COMP_PGI 1
 #else
   #define EIGEN_COMP_PGI 0
 #endif
@@ -180,7 +108,7 @@
   #define EIGEN_COMP_ARM 0
 #endif
 
-/// \internal EIGEN_COMP_EMSCRIPTEN set to 1 if the compiler is Emscripten Compiler
+/// \internal EIGEN_COMP_EMSCRIPTEN set to 1 if the compiler is Emscripten Compiler
 #if defined(__EMSCRIPTEN__)
   #define EIGEN_COMP_EMSCRIPTEN 1
 #else
@@ -214,11 +142,7 @@
 #endif
 
 
-
-//------------------------------------------------------------------------------------------
 // Architecture identification, EIGEN_ARCH_*
-//------------------------------------------------------------------------------------------
-
 
 #if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
   #define EIGEN_ARCH_x86_64 1
@@ -288,9 +212,7 @@
 
 
 
-//------------------------------------------------------------------------------------------
 // Operating system identification, EIGEN_OS_*
-//------------------------------------------------------------------------------------------
 
 /// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant
 #if defined(__unix__) || defined(__unix)
@@ -377,17 +299,9 @@
   #define EIGEN_OS_WIN_STRICT 0
 #endif
 
-/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN
-// compiler  solaris   __SUNPRO_C
-// version   studio
-// 5.7       10        0x570
-// 5.8       11        0x580
-// 5.9       12        0x590
-// 5.10	     12.1      0x5100
-// 5.11	     12.2      0x5110
-// 5.12	     12.3      0x5120
+/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN
 #if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
-  #define EIGEN_OS_SUN __SUNPRO_C
+  #define EIGEN_OS_SUN 1
 #else
   #define EIGEN_OS_SUN 0
 #endif
@@ -400,112 +314,6 @@
 #endif
 
 
-//------------------------------------------------------------------------------------------
-// Detect GPU compilers and architectures
-//------------------------------------------------------------------------------------------
-
-// NVCC is not supported as the target platform for HIPCC
-// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
-#if defined(__NVCC__) && defined(__HIPCC__)
-  #error "NVCC as the target platform for HIPCC is currently not supported."
-#endif
-
-#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
-  // Means the compiler is either nvcc or clang with CUDA enabled
-  #define EIGEN_CUDACC __CUDACC__
-#endif
-
-#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
-  // Means we are generating code for the device
-  #define EIGEN_CUDA_ARCH __CUDA_ARCH__
-#endif
-
-#if defined(EIGEN_CUDACC)
-#include <cuda.h>
-  #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
-#else
-  #define EIGEN_CUDA_SDK_VER 0
-#endif
-
-#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP)
-  // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
-  #define EIGEN_HIPCC __HIPCC__
-
-  // We need to include hip_runtime.h here because it pulls in
-  // ++ hip_common.h which contains the define for  __HIP_DEVICE_COMPILE__
-  // ++ host_defines.h which contains the defines for the __host__ and __device__ macros
-  #include <hip/hip_runtime.h>
-
-  #if defined(__HIP_DEVICE_COMPILE__)
-    // analogous to EIGEN_CUDA_ARCH, but for HIP
-    #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
-  #endif
-#endif
-
-// Unify CUDA/HIPCC
-
-#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
-//
-// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC
-//
-#define EIGEN_GPUCC
-//
-// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels
-// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels
-//
-// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels.
-// For those cases, the corresponding code should be guarded with
-//      #if defined(EIGEN_GPUCC)
-// instead of
-//      #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
-//
-// For cases where the tweak is specific to HIP, the code should be guarded with
-//      #if defined(EIGEN_HIPCC)
-//
-// For cases where the tweak is specific to CUDA, the code should be guarded with
-//      #if defined(EIGEN_CUDACC)
-//
-#endif
-
-#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
-//
-// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE
-//
-#define EIGEN_GPU_COMPILE_PHASE
-//
-// GPU compilers (HIPCC, NVCC) typically do two passes over the source code,
-//   + one to compile the source for the "host" (ie CPU)
-//   + another to compile the source for the "device" (ie. GPU)
-//
-// Code that needs to enabled only during the either the "host" or "device" compilation phase
-// needs to be guarded with a macro that indicates the current compilation phase
-//
-// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP
-// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA
-//
-// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA
-// For those cases, the code should be guarded with
-//       #if defined(EIGEN_GPU_COMPILE_PHASE)
-// instead of
-//       #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
-//
-// For cases where the tweak is specific to HIP, the code should be guarded with
-//      #if defined(EIGEN_HIP_DEVICE_COMPILE)
-//
-// For cases where the tweak is specific to CUDA, the code should be guarded with
-//      #if defined(EIGEN_CUDA_ARCH)
-//
-#endif
-
-#if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
-// EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro.
-// In most cases we want to check if both macros are defined which can be done using the define below.
-#define SYCL_DEVICE_ONLY
-#endif
-
-//------------------------------------------------------------------------------------------
-// Detect Compiler/Architecture/OS specific features
-//------------------------------------------------------------------------------------------
 
 #if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG
   // see bug 89
@@ -514,6 +322,20 @@
   #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
 #endif
 
+// This macro can be used to prevent the expansion of a function-like macro, e.g.:
+//   std::max EIGEN_NOT_A_MACRO(a,b)
+#define EIGEN_NOT_A_MACRO
+
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
+#else
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
+#endif
+
+#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#endif
+
 // Cross compiler wrapper around LLVM's __has_builtin
 #ifdef __has_builtin
 #  define EIGEN_HAS_BUILTIN(x) __has_builtin(x)
@@ -527,47 +349,19 @@
 # define __has_feature(x) 0
 #endif
 
-// Some old compilers do not support template specializations like:
-// template<typename T,int N> void foo(const T x[N]);
-#if !(   EIGEN_COMP_CLANG && (   (EIGEN_COMP_CLANG<309)                                                       \
-                              || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000)))  \
-      || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49)
-#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1
-#else
-#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0
+// Upper bound on the C++ version to use.
+// Expected values are 03, 11, 14, 17, etc.
+// By default, let's use an arbitrarily large C++ version.
+#ifndef EIGEN_MAX_CPP_VER
+#define EIGEN_MAX_CPP_VER 99
 #endif
 
-
-// The macro EIGEN_COMP_CXXVER defines the c++ verson expected by the compiler.
-// For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER
-// is defined to 17.
-#if   (defined(__cplusplus) && (__cplusplus >  201402L) || EIGEN_COMP_MSVC_LANG > 201402L)
-#define EIGEN_COMP_CXXVER 17
-#elif (defined(__cplusplus) && (__cplusplus >  201103L) || EIGEN_COMP_MSVC >= 1910)
-#define EIGEN_COMP_CXXVER 14
-#elif (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
-#define EIGEN_COMP_CXXVER 11
-#else
-#define EIGEN_COMP_CXXVER 03
-#endif
-
-
-// The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features
-// but in practice we should not rely on them but rather on the availabilty of
-// individual features as defined later.
-// This is why there is no EIGEN_HAS_CXX17.
-// FIXME: get rid of EIGEN_HAS_CXX14 and maybe even EIGEN_HAS_CXX11.
-#if EIGEN_MAX_CPP_VER>=11 && EIGEN_COMP_CXXVER>=11
+#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
 #define EIGEN_HAS_CXX11 1
 #else
 #define EIGEN_HAS_CXX11 0
 #endif
 
-#if EIGEN_MAX_CPP_VER>=14 && EIGEN_COMP_CXXVER>=14
-#define EIGEN_HAS_CXX14 1
-#else
-#define EIGEN_HAS_CXX14 0
-#endif
 
 // Do we support r-value references?
 #ifndef EIGEN_HAS_RVALUE_REFERENCES
@@ -582,14 +376,12 @@
 #endif
 
 // Does the compiler support C99?
-// Need to include <cmath> to make sure _GLIBCXX_USE_C99 gets defined
-#include <cmath>
 #ifndef EIGEN_HAS_C99_MATH
 #if EIGEN_MAX_CPP_VER>=11 && \
     ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901))       \
   || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \
   || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \
-  || (EIGEN_COMP_MSVC >= 1900) || defined(SYCL_DEVICE_ONLY))
+  || (EIGEN_COMP_MSVC >= 1900) )
   #define EIGEN_HAS_C99_MATH 1
 #else
   #define EIGEN_HAS_C99_MATH 0
@@ -597,33 +389,14 @@
 #endif
 
 // Does the compiler support result_of?
-// It's likely that MSVC 2013 supports result_of but I couldn't not find a good source for that,
-// so let's be conservative.
 #ifndef EIGEN_HAS_STD_RESULT_OF
-#if EIGEN_MAX_CPP_VER>=11 && \
-    (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900)
+#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L)))
 #define EIGEN_HAS_STD_RESULT_OF 1
 #else
 #define EIGEN_HAS_STD_RESULT_OF 0
 #endif
 #endif
 
-#ifndef EIGEN_HAS_ALIGNAS
-#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 &&   \
-      (     __has_feature(cxx_alignas)            \
-        ||  EIGEN_HAS_CXX14                       \
-        || (EIGEN_COMP_MSVC >= 1800)              \
-        || (EIGEN_GNUC_AT_LEAST(4,8))             \
-        || (EIGEN_COMP_CLANG>=305)                \
-        || (EIGEN_COMP_ICC>=1500)                 \
-        || (EIGEN_COMP_PGI>=1500)                 \
-        || (EIGEN_COMP_SUNCC>=0x5130))
-#define EIGEN_HAS_ALIGNAS 1
-#else
-#define EIGEN_HAS_ALIGNAS 0
-#endif
-#endif
-
 // Does the compiler support type_traits?
 // - full support of type traits was added only to GCC 5.1.0.
 // - 20150626 corresponds to the last release of 4.x libstdc++
@@ -641,12 +414,10 @@
 // Does the compiler support variadic templates?
 #ifndef EIGEN_HAS_VARIADIC_TEMPLATES
 #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \
-  && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) )
+  && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) )
     // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices:
     //    this prevents nvcc from crashing when compiling Eigen on Tegra X1
 #define EIGEN_HAS_VARIADIC_TEMPLATES 1
-#elif  EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) && defined(SYCL_DEVICE_ONLY)
-#define EIGEN_HAS_VARIADIC_TEMPLATES 1
 #else
 #define EIGEN_HAS_VARIADIC_TEMPLATES 0
 #endif
@@ -654,22 +425,22 @@
 
 // Does the compiler fully support const expressions? (as in c++14)
 #ifndef EIGEN_HAS_CONSTEXPR
-  #if defined(EIGEN_CUDACC)
-  // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
-    #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500))
-      #define EIGEN_HAS_CONSTEXPR 1
-    #endif
-  #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
-    (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)) || \
-    (EIGEN_COMP_CLANG >= 306 && (__cplusplus > 199711L)))
-    #define EIGEN_HAS_CONSTEXPR 1
-  #endif
 
-  #ifndef EIGEN_HAS_CONSTEXPR
-    #define EIGEN_HAS_CONSTEXPR 0
-  #endif
+#ifdef __CUDACC__
+// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above
+#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500))
+  #define EIGEN_HAS_CONSTEXPR 1
+#endif
+#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \
+  (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)))
+#define EIGEN_HAS_CONSTEXPR 1
+#endif
 
-#endif // EIGEN_HAS_CONSTEXPR
+#ifndef EIGEN_HAS_CONSTEXPR
+#define EIGEN_HAS_CONSTEXPR 0
+#endif
+
+#endif
 
 // Does the compiler support C++11 math?
 // Let's be conservative and enable the default C++11 implementation only if we are sure it exists
@@ -707,80 +478,15 @@
   #endif
 #endif
 
-#ifndef EIGEN_HAS_CXX11_ATOMIC
-  #if    EIGEN_MAX_CPP_VER>=11 && \
-         (__has_feature(cxx_atomic) \
-      || (__cplusplus > 201103L) \
-      || ((__cplusplus >= 201103L) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700)))
-    #define EIGEN_HAS_CXX11_ATOMIC 1
-  #else
-    #define EIGEN_HAS_CXX11_ATOMIC 0
-  #endif
+/** Allows to disable some optimizations which might affect the accuracy of the result.
+  * Such optimizations are enabled by default; set EIGEN_FAST_MATH to 0 to disable them.
+  * They currently include:
+  *   - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
+  */
+#ifndef EIGEN_FAST_MATH
+#define EIGEN_FAST_MATH 1
 #endif
 
-#ifndef EIGEN_HAS_CXX11_OVERRIDE_FINAL
-  #if    EIGEN_MAX_CPP_VER>=11 && \
-       (__cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1700)
-    #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 1
-  #else
-    #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 0
-  #endif
-#endif
-
-// NOTE: the required Apple's clang version is very conservative 
-//       and it could be that XCode 9 works just fine.
-// NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support
-//       and not tested.
-#ifndef EIGEN_HAS_CXX17_OVERALIGN
-#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && (                                 \
-           (EIGEN_COMP_MSVC >= 1912)                                                    \
-        || (EIGEN_GNUC_AT_LEAST(7,0))                                                   \
-        || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500))             \
-        || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \
-      )
-#define EIGEN_HAS_CXX17_OVERALIGN 1
-#else
-#define EIGEN_HAS_CXX17_OVERALIGN 0
-#endif
-#endif
-
-#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR
-  // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
-  #if defined(__NVCC__)
-    // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
-    #ifdef __CUDACC_RELAXED_CONSTEXPR__
-      #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
-    #endif
-  #elif defined(__clang__) && defined(__CUDA__) && __has_feature(cxx_relaxed_constexpr)
-    // clang++ always considers constexpr functions as implicitly __host__ __device__
-    #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
-  #endif
-#endif
-
-// Does the compiler support the __int128 and __uint128_t extensions for 128-bit
-// integer arithmetic?
-//
-// Clang and GCC define __SIZEOF_INT128__ when these extensions are supported,
-// but we avoid using them in certain cases:
-//
-// * Building using Clang for Windows, where the Clang runtime library has
-//   128-bit support only on LP64 architectures, but Windows is LLP64.
-#ifndef EIGEN_HAS_BUILTIN_INT128
-#if defined(__SIZEOF_INT128__) && !(EIGEN_OS_WIN && EIGEN_COMP_CLANG)
-#define EIGEN_HAS_BUILTIN_INT128 1
-#else
-#define EIGEN_HAS_BUILTIN_INT128 0
-#endif
-#endif
-
-//------------------------------------------------------------------------------------------
-// Preprocessor programming helpers
-//------------------------------------------------------------------------------------------
-
-// This macro can be used to prevent from macro expansion, e.g.:
-//   std::max EIGEN_NOT_A_MACRO(a,b)
-#define EIGEN_NOT_A_MACRO
-
 #define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl;
 
 // concatenate two tokens
@@ -812,7 +518,7 @@
 //   Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval<Derived> Eigen::MatrixBase<Scalar, Derived>::eval() const'
 //    : function body not available
 //   See also bug 1367
-#if EIGEN_GNUC_AT_LEAST(4,2) && !defined(SYCL_DEVICE_ONLY)
+#if EIGEN_GNUC_AT_LEAST(4,2)
 #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline
 #else
 #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE
@@ -832,43 +538,12 @@
 #define EIGEN_PERMISSIVE_EXPR
 #endif
 
-// GPU stuff
-
-// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC)
-#if defined(EIGEN_CUDACC) || defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIPCC)
-  // Do not try asserts on device code
-  #ifndef EIGEN_NO_DEBUG
-  #define EIGEN_NO_DEBUG
-  #endif
-
-  #ifdef EIGEN_INTERNAL_DEBUGGING
-  #undef EIGEN_INTERNAL_DEBUGGING
-  #endif
-
-  #ifdef EIGEN_EXCEPTIONS
-  #undef EIGEN_EXCEPTIONS
-  #endif
-#endif
-
-#if defined(SYCL_DEVICE_ONLY)
-  #ifndef EIGEN_DONT_VECTORIZE
-    #define EIGEN_DONT_VECTORIZE
-  #endif
-  #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline))
-// All functions callable from CUDA/HIP code must be qualified with __device__
-#elif defined(EIGEN_GPUCC) 
-    #define EIGEN_DEVICE_FUNC __host__ __device__
-#else
-  #define EIGEN_DEVICE_FUNC
-#endif
-
-
 // this macro allows to get rid of linking errors about multiply defined functions.
 //  - static is not very good because it prevents definitions from different object files to be merged.
 //           So static causes the resulting linked executable to be bloated with multiple copies of the same function.
 //  - inline is not perfect either as it unwantedly hints the compiler toward inlining the function.
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
-#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline
+#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline
 
 #ifdef NDEBUG
 # ifndef EIGEN_NO_DEBUG
@@ -878,12 +553,8 @@
 
 // eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89
 #ifdef EIGEN_NO_DEBUG
-  #ifdef SYCL_DEVICE_ONLY // used to silence the warning on SYCL device
-    #define eigen_plain_assert(x) EIGEN_UNUSED_VARIABLE(x)
-  #else
-    #define eigen_plain_assert(x)
-  #endif
-#else 
+  #define eigen_plain_assert(x)
+#else
   #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO
     namespace Eigen {
     namespace internal {
@@ -956,7 +627,7 @@
 // Suppresses 'unused variable' warnings.
 namespace Eigen {
   namespace internal {
-    template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ignore_unused_variable(const T&) {}
+    template<typename T> EIGEN_DEVICE_FUNC void ignore_unused_variable(const T&) {}
   }
 }
 #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
@@ -970,14 +641,169 @@ namespace Eigen {
 #endif
 
 
-#if EIGEN_COMP_MSVC
-  // NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362.
-  // This workaround is ugly, but it does the job.
-#  define EIGEN_CONST_CONDITIONAL(cond)  (void)0, cond
+//------------------------------------------------------------------------------------------
+// Static and dynamic alignment control
+//
+// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
+// as the maximal boundary in bytes on which dynamically and statically allocated data may be aligned, respectively.
+// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
+// a default value is automatically computed based on architecture, compiler, and OS.
+//
+// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
+// to be used to declare statically aligned buffers.
+//------------------------------------------------------------------------------------------
+
+
+/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
+ * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
+ * so that vectorization doesn't affect binary compatibility.
+ *
+ * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
+ * vectorized and non-vectorized code.
+ */
+#if (defined __CUDACC__)
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
+#elif EIGEN_COMP_MSVC
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
+#elif EIGEN_COMP_SUNCC
+  // FIXME not sure about this one:
+  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
 #else
-#  define EIGEN_CONST_CONDITIONAL(cond)  cond
+  #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
 #endif
 
+// If the user explicitly disables vectorization, then we also disable alignment
+#if defined(EIGEN_DONT_VECTORIZE)
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+#elif defined(EIGEN_VECTORIZE_AVX512)
+  // 64 bytes static alignment is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
+#elif defined(__AVX__)
+  // 32 bytes static alignment is preferred only if really required
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
+#else
+  #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+#endif
+
+
+// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
+#define EIGEN_MIN_ALIGN_BYTES 16
+
+// Defines the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+
+#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN))  && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
+#endif
+
+// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
+// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
+#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
+  #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
+    #undef EIGEN_MAX_STATIC_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+#endif
+
+#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
+
+  // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+  // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+  // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+  // certain common platform (compiler+architecture combinations) to avoid these problems.
+  // Only static alignment is really problematic (relies on nonstandard compiler extensions),
+  // try to keep heap alignment even when we have to disable static alignment.
+  #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64)
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6)
+  // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support.
+  // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use.
+  // 4.8 and newer seem definitely unaffected.
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+  #else
+  #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+  #endif
+
+  // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+  #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
+  && !EIGEN_GCC3_OR_OLDER \
+  && !EIGEN_COMP_SUNCC \
+  && !EIGEN_OS_QNX
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+  #else
+    #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+  #endif
+
+  #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+  #else
+    #define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+  #endif
+
+#endif
+
+// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
+#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES<EIGEN_MAX_STATIC_ALIGN_BYTES
+#undef EIGEN_MAX_STATIC_ALIGN_BYTES
+#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES==0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+  #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+#endif
+
+// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
+// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES)
+// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT).
+// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
+
+
+// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
+#define EIGEN_ALIGN8  EIGEN_ALIGN_TO_BOUNDARY(8)
+#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
+#if EIGEN_MAX_STATIC_ALIGN_BYTES>0
+#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
+#else
+#define EIGEN_ALIGN_MAX
+#endif
+
+
+// Dynamic alignment control
+
+#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0
+#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
+#endif
+
+#ifdef EIGEN_DONT_ALIGN
+  #ifdef EIGEN_MAX_ALIGN_BYTES
+    #undef EIGEN_MAX_ALIGN_BYTES
+  #endif
+  #define EIGEN_MAX_ALIGN_BYTES 0
+#elif !defined(EIGEN_MAX_ALIGN_BYTES)
+  #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#else
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+
+#ifndef EIGEN_UNALIGNED_VECTORIZE
+#define EIGEN_UNALIGNED_VECTORIZE 1
+#endif
+
+//----------------------------------------------------------------------
+
+
 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
   #define EIGEN_RESTRICT
 #endif
@@ -985,6 +811,10 @@ namespace Eigen {
   #define EIGEN_RESTRICT __restrict
 #endif
 
+#ifndef EIGEN_STACK_ALLOCATION_LIMIT
+// 131072 == 128 KB
+#define EIGEN_STACK_ALLOCATION_LIMIT 131072
+#endif
 
 #ifndef EIGEN_DEFAULT_IO_FORMAT
 #ifdef EIGEN_MAKING_DOCS
@@ -999,32 +829,7 @@ namespace Eigen {
 // just an empty macro !
 #define EIGEN_EMPTY
 
-
-// When compiling CUDA/HIP device code with NVCC or HIPCC
-// pull in math functions from the global namespace.
-// In host mode, and when device code is compiled with clang,
-// use the std versions.
-#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE)
-  #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC;
-#else
-  #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC;
-#endif
-
-
-// When compiling HIP device code with HIPCC, certain functions
-// from the stdlib need to be pulled in from the global namespace
-// (as opposed to from the std:: namespace). This is because HIPCC
-// does not natively support all the std:: routines in device code.
-// Instead it contains header files that declare the corresponding
-// routines in the global namespace such they can be used in device code.
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-  #define EIGEN_USING_STD(FUNC) using ::FUNC;
-#else
-  #define EIGEN_USING_STD(FUNC) using std::FUNC;
-#endif
-
-
-#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_COMP_NVCC)
+#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0)
   // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324)
   #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
     using Base::operator =;
@@ -1103,8 +908,7 @@ namespace Eigen {
   typedef typename Eigen::internal::ref_selector<Derived>::type Nested; \
   typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
   typedef typename Eigen::internal::traits<Derived>::StorageIndex StorageIndex; \
-  enum CompileTimeTraits \
-      { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
+  enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
         ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
         Flags = Eigen::internal::traits<Derived>::Flags, \
         SizeAtCompileTime = Base::SizeAtCompileTime, \
@@ -1149,14 +953,6 @@ namespace Eigen {
 
 #define EIGEN_IMPLIES(a,b) (!(a) || (b))
 
-#if EIGEN_HAS_BUILTIN(__builtin_expect) || EIGEN_COMP_GNUC
-#define EIGEN_PREDICT_FALSE(x) (__builtin_expect(x, false))
-#define EIGEN_PREDICT_TRUE(x) (__builtin_expect(false || (x), true))
-#else
-#define EIGEN_PREDICT_FALSE(x) (x)
-#define EIGEN_PREDICT_TRUE(x) (x)
-#endif
-
 // the expression type of a standard coefficient wise binary operation
 #define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \
     CwiseBinaryOp< \
@@ -1188,14 +984,14 @@ namespace Eigen {
                 const typename internal::plain_constant_type<EXPR,SCALAR>::type, const EXPR>
 
 // Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010")
-#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600)
+#if EIGEN_COMP_MSVC_STRICT<=1600
 #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if<true,X>::type
 #else
 #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X
 #endif
 
 #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \
-  template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \
+  template <typename T> EIGEN_DEVICE_FUNC inline \
   EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type,OPNAME))\
   (METHOD)(const T& scalar) const { \
     typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,Scalar,T)>::type PromotedT; \
@@ -1204,7 +1000,7 @@ namespace Eigen {
   }
 
 #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \
-  template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend \
+  template <typename T> EIGEN_DEVICE_FUNC inline friend \
   EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type,Derived,OPNAME)) \
   (METHOD)(const T& scalar, const StorageBaseType& matrix) { \
     typedef typename internal::promote_scalar_arg<Scalar,T,EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME,T,Scalar)>::type PromotedT; \
@@ -1217,23 +1013,15 @@ namespace Eigen {
   EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME)
 
 
-#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
-  #define EIGEN_EXCEPTIONS
-#endif
-
-
 #ifdef EIGEN_EXCEPTIONS
 #  define EIGEN_THROW_X(X) throw X
 #  define EIGEN_THROW throw
 #  define EIGEN_TRY try
 #  define EIGEN_CATCH(X) catch (X)
 #else
-#  if defined(EIGEN_CUDA_ARCH)
+#  ifdef __CUDA_ARCH__
 #    define EIGEN_THROW_X(X) asm("trap;")
 #    define EIGEN_THROW asm("trap;")
-#  elif defined(EIGEN_HIP_DEVICE_COMPILE)
-#    define EIGEN_THROW_X(X) asm("s_trap 0")
-#    define EIGEN_THROW asm("s_trap 0")
 #  else
 #    define EIGEN_THROW_X(X) std::abort()
 #    define EIGEN_THROW std::abort()
@@ -1253,47 +1041,13 @@ namespace Eigen {
 #   define EIGEN_NOEXCEPT
 #   define EIGEN_NOEXCEPT_IF(x)
 #   define EIGEN_NO_THROW throw()
-#   if EIGEN_COMP_MSVC || EIGEN_COMP_CXXVER>=17
+#   if EIGEN_COMP_MSVC
       // MSVC does not support exception specifications (warning C4290),
-      // and they are deprecated in c++11 anyway. This is even an error in c++17.
+      // and they are deprecated in c++11 anyway.
 #     define EIGEN_EXCEPTION_SPEC(X) throw()
 #   else
 #     define EIGEN_EXCEPTION_SPEC(X) throw(X)
 #   endif
 #endif
 
-#if EIGEN_HAS_VARIADIC_TEMPLATES
-// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input.
-namespace Eigen {
-namespace internal {
-
-inline bool all(){ return true; }
-
-template<typename T, typename ...Ts>
-bool all(T t, Ts ... ts){ return t && all(ts...); }
-
-}
-}
-#endif
-
-#if EIGEN_HAS_CXX11_OVERRIDE_FINAL
-// provide override and final specifiers if they are available:
-#   define EIGEN_OVERRIDE override
-#   define EIGEN_FINAL final
-#else
-#   define EIGEN_OVERRIDE
-#   define EIGEN_FINAL
-#endif
-
-// Wrapping #pragma unroll in a macro since it is required for SYCL
-#if defined(SYCL_DEVICE_ONLY)
-  #if defined(_MSC_VER)
-    #define EIGEN_UNROLL_LOOP __pragma(unroll)
-  #else
-    #define EIGEN_UNROLL_LOOP _Pragma("unroll")
-  #endif
-#else
-  #define EIGEN_UNROLL_LOOP
-#endif
-
 #endif // EIGEN_MACROS_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Memory.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Memory.h
index 1b12544d2..291383c58 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Memory.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Memory.h
@@ -63,27 +63,14 @@ namespace Eigen {
 
 namespace internal {
 
-EIGEN_DEVICE_FUNC
+EIGEN_DEVICE_FUNC 
 inline void throw_std_bad_alloc()
 {
   #ifdef EIGEN_EXCEPTIONS
     throw std::bad_alloc();
   #else
     std::size_t huge = static_cast<std::size_t>(-1);
-    #if defined(EIGEN_HIPCC)
-    //
-    // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining),
-    // and as a consequence the code in the #else block triggers the hipcc warning :
-    // "no overloaded function has restriction specifiers that are compatible with the ambient context"
-    //
-    // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
-    // the same on "operator new"
-    // Reverting code back to the old version in this #if block for the hipcc compiler
-    //
-    new int[huge];
-    #else
     ::operator new(huge);
-    #endif
   #endif
 }
 
@@ -96,26 +83,19 @@ inline void throw_std_bad_alloc()
 /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
   * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
   */
-EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES)
+inline void* handmade_aligned_malloc(std::size_t size)
 {
-  eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2");
-
-  EIGEN_USING_STD(malloc)
-  void *original = malloc(size+alignment);
-  
+  void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES);
   if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(alignment-1))) + alignment);
+  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES);
   *(reinterpret_cast<void**>(aligned) - 1) = original;
   return aligned;
 }
 
 /** \internal Frees memory allocated with handmade_aligned_malloc */
-EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr)
+inline void handmade_aligned_free(void *ptr)
 {
-  if (ptr) {
-    EIGEN_USING_STD(free)
-    free(*(reinterpret_cast<void**>(ptr) - 1));
-  }
+  if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
 }
 
 /** \internal
@@ -134,7 +114,7 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t =
   void *previous_aligned = static_cast<char *>(original)+previous_offset;
   if(aligned!=previous_aligned)
     std::memmove(aligned, previous_aligned, size);
-
+  
   *(reinterpret_cast<void**>(aligned) - 1) = original;
   return aligned;
 }
@@ -162,7 +142,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {
   eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
 }
-#else
+#else 
 EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed()
 {}
 #endif
@@ -176,12 +156,9 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
 
   void *result;
   #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
-
-    EIGEN_USING_STD(malloc)
-    result = malloc(size);
-
+    result = std::malloc(size);
     #if EIGEN_DEFAULT_ALIGN_BYTES==16
-    eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator.");
+    eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator.");
     #endif
   #else
     result = handmade_aligned_malloc(size);
@@ -197,10 +174,7 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size)
 EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr)
 {
   #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED
-
-    EIGEN_USING_STD(free)
-    free(ptr);
-
+    std::free(ptr);
   #else
     handmade_aligned_free(ptr);
   #endif
@@ -244,9 +218,7 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std:
 {
   check_that_malloc_is_allowed();
 
-  EIGEN_USING_STD(malloc)
-  void *result = malloc(size);
-
+  void *result = std::malloc(size);
   if(!result && size)
     throw_std_bad_alloc();
   return result;
@@ -260,8 +232,7 @@ template<bool Align> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void
 
 template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void *ptr)
 {
-  EIGEN_USING_STD(free)
-  free(ptr);
+  std::free(ptr);
 }
 
 template<bool Align> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size)
@@ -360,7 +331,7 @@ template<typename T, bool Align> EIGEN_DEVICE_FUNC inline T* conditional_aligned
 template<typename T> EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size)
 {
   destruct_elements_of_array<T>(ptr, size);
-  Eigen::internal::aligned_free(ptr);
+  aligned_free(ptr);
 }
 
 /** \internal Deletes objects constructed with conditional_aligned_new
@@ -500,8 +471,8 @@ EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index
 }
 
 /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
-  */
-template<typename Index>
+  */ 
+template<typename Index> 
 inline Index first_multiple(Index size, Index base)
 {
   return ((size+base-1)/base)*base;
@@ -522,8 +493,7 @@ template<typename T> struct smart_copy_helper<T,true> {
     IntPtr size = IntPtr(end)-IntPtr(start);
     if(size==0) return;
     eigen_internal_assert(start!=0 && end!=0 && target!=0);
-    EIGEN_USING_STD(memcpy)
-    memcpy(target, start, size);
+    std::memcpy(target, start, size);
   }
 };
 
@@ -532,7 +502,7 @@ template<typename T> struct smart_copy_helper<T,false> {
   { std::copy(start, end, target); }
 };
 
-// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise.
+// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. 
 template<typename T, bool UseMemmove> struct smart_memmove_helper;
 
 template<typename T> void smart_memmove(const T* start, const T* end, T* target)
@@ -552,15 +522,15 @@ template<typename T> struct smart_memmove_helper<T,true> {
 
 template<typename T> struct smart_memmove_helper<T,false> {
   static inline void run(const T* start, const T* end, T* target)
-  {
+  { 
     if (UIntPtr(target) < UIntPtr(start))
     {
       std::copy(start, end, target);
     }
-    else
+    else                                 
     {
       std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T);
-      std::copy_backward(start, end, target + count);
+      std::copy_backward(start, end, target + count); 
     }
   }
 };
@@ -572,7 +542,7 @@ template<typename T> struct smart_memmove_helper<T,false> {
 
 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
 // to the appropriate stack allocation function
-#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE
+#ifndef EIGEN_ALLOCA
   #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
     #define EIGEN_ALLOCA alloca
   #elif EIGEN_COMP_MSVC
@@ -580,15 +550,6 @@ template<typename T> struct smart_memmove_helper<T,false> {
   #endif
 #endif
 
-// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is
-// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because
-// the compiler still emits bad code because stack allocation checks use "<=".
-// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772
-// is fixed.
-#if defined(__clang__) && defined(__thumb__)
-  #undef EIGEN_ALLOCA
-#endif
-
 // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
 // at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
 template<typename T> class aligned_stack_memory_handler : noncopyable
@@ -600,14 +561,12 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
      * In this case, the buffer elements will also be destructed when this handler will be destructed.
      * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
      **/
-    EIGEN_DEVICE_FUNC
     aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
       : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
     {
       if(NumTraits<T>::RequireInitialization && m_ptr)
         Eigen::internal::construct_elements_of_array(m_ptr, size);
     }
-    EIGEN_DEVICE_FUNC
     ~aligned_stack_memory_handler()
     {
       if(NumTraits<T>::RequireInitialization && m_ptr)
@@ -621,60 +580,6 @@ template<typename T> class aligned_stack_memory_handler : noncopyable
     bool m_deallocate;
 };
 
-#ifdef EIGEN_ALLOCA
-
-template<typename Xpr, int NbEvaluations,
-         bool MapExternalBuffer = nested_eval<Xpr,NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic
-         >
-struct local_nested_eval_wrapper
-{
-  static const bool NeedExternalBuffer = false;
-  typedef typename Xpr::Scalar Scalar;
-  typedef typename nested_eval<Xpr,NbEvaluations>::type ObjectType;
-  ObjectType object;
-
-  EIGEN_DEVICE_FUNC
-  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr)
-  {
-    EIGEN_UNUSED_VARIABLE(ptr);
-    eigen_internal_assert(ptr==0);
-  }
-};
-
-template<typename Xpr, int NbEvaluations>
-struct local_nested_eval_wrapper<Xpr,NbEvaluations,true>
-{
-  static const bool NeedExternalBuffer = true;
-  typedef typename Xpr::Scalar Scalar;
-  typedef typename plain_object_eval<Xpr>::type PlainObject;
-  typedef Map<PlainObject,EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
-  ObjectType object;
-
-  EIGEN_DEVICE_FUNC
-  local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
-    : object(ptr==0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()),
-      m_deallocate(ptr==0)
-  {
-    if(NumTraits<Scalar>::RequireInitialization && object.data())
-      Eigen::internal::construct_elements_of_array(object.data(), object.size());
-    object = xpr;
-  }
-
-  EIGEN_DEVICE_FUNC
-  ~local_nested_eval_wrapper()
-  {
-    if(NumTraits<Scalar>::RequireInitialization && object.data())
-      Eigen::internal::destruct_elements_of_array(object.data(), object.size());
-    if(m_deallocate)
-      Eigen::internal::aligned_free(object.data());
-  }
-
-private:
-  bool m_deallocate;
-};
-
-#endif // EIGEN_ALLOCA
-
 template<typename T> class scoped_array : noncopyable
 {
   T* m_ptr;
@@ -698,15 +603,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 {
   std::swap(a.ptr(),b.ptr());
 }
-
+    
 } // end namespace internal
 
 /** \internal
-  *
-  * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
-  * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
-  * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
-  * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap.
+  * Declares, allocates and constructs an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+  * if its size in bytes (sizeof(TYPE)*SIZE) does not exceed EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
+  * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
   * The allocated buffer is automatically deleted when exiting the scope of this declaration.
   * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
   * Here is an example:
@@ -717,17 +620,9 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
   * }
   * \endcode
   * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
-  *
-  * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to
-  * \code
-  *   typename internal::nested_eval<XPRT_T,N>::type NAME(XPR);
-  * \endcode
-  * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
-  * This is accomplished through alloca if this later is supported and if the required number of bytes
-  * is below EIGEN_STACK_ALLOCATION_LIMIT.
   */
 #ifdef EIGEN_ALLOCA
-
+  
   #if EIGEN_DEFAULT_ALIGN_BYTES>0
     // We always manually re-align the result of EIGEN_ALLOCA.
     // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
@@ -744,23 +639,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
                     : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) );  \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
 
-
-  #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \
-    Eigen::internal::local_nested_eval_wrapper<XPR_T,N> EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast<typename XPR_T::Scalar*>( \
-      ( (Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \
-        ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \
-    typename Eigen::internal::local_nested_eval_wrapper<XPR_T,N>::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object)
-
 #else
 
   #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
     Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
     TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE));    \
     Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
-
-
-#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval<XPR_T,N>::type NAME(XPR)
-
+    
 #endif
 
 
@@ -768,17 +653,6 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF]                ***
 *****************************************************************************/
 
-#if EIGEN_HAS_CXX17_OVERALIGN
-
-// C++17 -> no need to bother about alignment anymore :)
-
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size)
-
-#else
-
 #if EIGEN_MAX_ALIGN_BYTES!=0
   #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
       void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \
@@ -814,14 +688,8 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 #endif
 
 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size)                        \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(                                                             \
-        ((Size)!=Eigen::Dynamic) &&                                                                    \
-        (((EIGEN_MAX_ALIGN_BYTES>=16) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES  )==0)) ||    \
-         ((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) ||    \
-         ((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0))   )))
-
-#endif
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0)))
 
 /****************************************************************************/
 
@@ -835,13 +703,13 @@ template<typename T> void swap(scoped_array<T> &a,scoped_array<T> &b)
 *  - 32 bytes alignment if AVX is enabled.
 *  - 64 bytes alignment if AVX512 is enabled.
 *
-* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
+* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
 * \link TopicPreprocessorDirectivesPerformance there \endlink.
 *
 * Example:
 * \code
 * // Matrix4f requires 16 bytes alignment:
-* std::map< int, Matrix4f, std::less<int>,
+* std::map< int, Matrix4f, std::less<int>, 
 *           aligned_allocator<std::pair<const int, Matrix4f> > > my_map_mat4;
 * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator:
 * std::map< int, Vector3f > my_map_vec3;
@@ -876,19 +744,18 @@ public:
 
   ~aligned_allocator() {}
 
-  #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0)
-  // In gcc std::allocator::max_size() is bugged making gcc triggers a warning:
-  // eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807
-  // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
-  size_type max_size() const {
-    return (std::numeric_limits<std::ptrdiff_t>::max)()/sizeof(T);
-  }
-  #endif
-
   pointer allocate(size_type num, const void* /*hint*/ = 0)
   {
     internal::check_size_for_overflow<T>(num);
-    return static_cast<pointer>( internal::aligned_malloc(num * sizeof(T)) );
+    size_type size = num * sizeof(T);
+#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0)
+    // workaround gcc bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
+    // It triggered eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807
+    if(size>=std::size_t((std::numeric_limits<std::ptrdiff_t>::max)()))
+      return 0;
+    else
+#endif
+      return static_cast<pointer>( internal::aligned_malloc(size) );
   }
 
   void deallocate(pointer p, size_type /*num*/)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Meta.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Meta.h
index e9e2f1873..9b61ff037 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Meta.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Meta.h
@@ -11,18 +11,9 @@
 #ifndef EIGEN_META_H
 #define EIGEN_META_H
 
-#if defined(EIGEN_GPU_COMPILE_PHASE)
-
- #include <cfloat>
-
- #if defined(EIGEN_CUDA_ARCH)
-  #include <math_constants.h>
- #endif
-
- #if defined(EIGEN_HIP_DEVICE_COMPILE)
-  #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
-  #endif
-
+#if defined(__CUDA_ARCH__)
+#include <cfloat>
+#include <math_constants.h>
 #endif
 
 #if EIGEN_COMP_ICC>=1600 &&  __cplusplus >= 201103L
@@ -63,21 +54,15 @@ typedef std::size_t UIntPtr;
 struct true_type {  enum { value = 1 }; };
 struct false_type { enum { value = 0 }; };
 
-template<bool Condition>
-struct bool_constant;
-
-template<>
-struct bool_constant<true> : true_type {};
-
-template<>
-struct bool_constant<false> : false_type {};
-
 template<bool Condition, typename Then, typename Else>
 struct conditional { typedef Then type; };
 
 template<typename Then, typename Else>
 struct conditional <false, Then, Else> { typedef Else type; };
 
+template<typename T, typename U> struct is_same { enum { value = 0 }; };
+template<typename T> struct is_same<T,T> { enum { value = 1 }; };
+
 template<typename T> struct remove_reference { typedef T type; };
 template<typename T> struct remove_reference<T&> { typedef T type; };
 
@@ -112,31 +97,23 @@ template<> struct is_arithmetic<unsigned int>  { enum { value = true }; };
 template<> struct is_arithmetic<signed long>   { enum { value = true }; };
 template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
 
-template<typename T, typename U> struct is_same { enum { value = 0 }; };
-template<typename T> struct is_same<T,T> { enum { value = 1 }; };
-
-template< class T >
-struct is_void : is_same<void, typename remove_const<T>::type> {};
-
 #if EIGEN_HAS_CXX11
-template<> struct is_arithmetic<signed long long>   { enum { value = true }; };
-template<> struct is_arithmetic<unsigned long long> { enum { value = true }; };
 using std::is_integral;
 #else
-template<typename T> struct is_integral               { enum { value = false }; };
-template<> struct is_integral<bool>                   { enum { value = true }; };
-template<> struct is_integral<char>                   { enum { value = true }; };
-template<> struct is_integral<signed char>            { enum { value = true }; };
-template<> struct is_integral<unsigned char>          { enum { value = true }; };
-template<> struct is_integral<signed short>           { enum { value = true }; };
-template<> struct is_integral<unsigned short>         { enum { value = true }; };
-template<> struct is_integral<signed int>             { enum { value = true }; };
-template<> struct is_integral<unsigned int>           { enum { value = true }; };
-template<> struct is_integral<signed long>            { enum { value = true }; };
-template<> struct is_integral<unsigned long>          { enum { value = true }; };
+template<typename T> struct is_integral        { enum { value = false }; };
+template<> struct is_integral<bool>            { enum { value = true }; };
+template<> struct is_integral<char>            { enum { value = true }; };
+template<> struct is_integral<signed char>     { enum { value = true }; };
+template<> struct is_integral<unsigned char>   { enum { value = true }; };
+template<> struct is_integral<signed short>    { enum { value = true }; };
+template<> struct is_integral<unsigned short>  { enum { value = true }; };
+template<> struct is_integral<signed int>      { enum { value = true }; };
+template<> struct is_integral<unsigned int>    { enum { value = true }; };
+template<> struct is_integral<signed long>     { enum { value = true }; };
+template<> struct is_integral<unsigned long>   { enum { value = true }; };
 #if EIGEN_COMP_MSVC
-template<> struct is_integral<signed __int64>         { enum { value = true }; };
-template<> struct is_integral<unsigned __int64>       { enum { value = true }; };
+template<> struct is_integral<signed __int64>  { enum { value = true }; };
+template<> struct is_integral<unsigned __int64>{ enum { value = true }; };
 #endif
 #endif
 
@@ -174,11 +151,6 @@ template<typename T> struct add_const_on_value_type<T*>        { typedef T const
 template<typename T> struct add_const_on_value_type<T* const>  { typedef T const* const type; };
 template<typename T> struct add_const_on_value_type<T const* const>  { typedef T const* const type; };
 
-#if EIGEN_HAS_CXX11
-
-using std::is_convertible;
-
-#else
 
 template<typename From, typename To>
 struct is_convertible_impl
@@ -192,19 +164,16 @@ private:
   struct yes {int a[1];};
   struct no  {int a[2];};
 
-  template<typename T>
-  static yes test(T, int);
-
-  template<typename T>
+  static yes test(const To&, int);
   static no  test(any_conversion, ...);
 
 public:
-  static typename internal::remove_reference<From>::type* ms_from;
+  static From ms_from;
 #ifdef __INTEL_COMPILER
   #pragma warning push
   #pragma warning ( disable : 2259 )
 #endif
-  enum { value = sizeof(test<To>(*ms_from, 0))==sizeof(yes) };
+  enum { value = sizeof(test(ms_from, 0))==sizeof(yes) };
 #ifdef __INTEL_COMPILER
   #pragma warning pop
 #endif
@@ -213,17 +182,10 @@ public:
 template<typename From, typename To>
 struct is_convertible
 {
-  enum { value = is_convertible_impl<From,To>::value };
+  enum { value = is_convertible_impl<typename remove_all<From>::type,
+                                     typename remove_all<To  >::type>::value };
 };
 
-template<typename T>
-struct is_convertible<T,T&> { enum { value = false }; };
-
-template<typename T>
-struct is_convertible<const T,const T&> { enum { value = true }; };
-
-#endif
-
 /** \internal Allows to enable/disable an overload
   * according to a compile time condition.
   */
@@ -232,7 +194,7 @@ template<bool Condition, typename T=void> struct enable_if;
 template<typename T> struct enable_if<true,T>
 { typedef T type; };
 
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(__CUDA_ARCH__)
 #if !defined(__FLT_EPSILON__)
 #define __FLT_EPSILON__ FLT_EPSILON
 #define __DBL_EPSILON__ DBL_EPSILON
@@ -254,31 +216,13 @@ template<> struct numeric_limits<float>
   EIGEN_DEVICE_FUNC
   static float epsilon() { return __FLT_EPSILON__; }
   EIGEN_DEVICE_FUNC
-  static float (max)() {
-  #if defined(EIGEN_CUDA_ARCH)
-    return CUDART_MAX_NORMAL_F;
-  #else
-    return HIPRT_MAX_NORMAL_F;
-  #endif
-  }
+  static float (max)() { return CUDART_MAX_NORMAL_F; }
   EIGEN_DEVICE_FUNC
   static float (min)() { return FLT_MIN; }
   EIGEN_DEVICE_FUNC
-  static float infinity() {
-  #if defined(EIGEN_CUDA_ARCH)
-    return CUDART_INF_F;
-  #else
-    return HIPRT_INF_F;
-  #endif
-  }
+  static float infinity() { return CUDART_INF_F; }
   EIGEN_DEVICE_FUNC
-  static float quiet_NaN() {
-  #if defined(EIGEN_CUDA_ARCH)
-    return CUDART_NAN_F;
-  #else
-    return HIPRT_NAN_F;
-  #endif
-  }
+  static float quiet_NaN() { return CUDART_NAN_F; }
 };
 template<> struct numeric_limits<double>
 {
@@ -289,21 +233,9 @@ template<> struct numeric_limits<double>
   EIGEN_DEVICE_FUNC
   static double (min)() { return DBL_MIN; }
   EIGEN_DEVICE_FUNC
-  static double infinity() {
-  #if defined(EIGEN_CUDA_ARCH)
-    return CUDART_INF;
-  #else
-    return HIPRT_INF;
-  #endif
-  }
+  static double infinity() { return CUDART_INF; }
   EIGEN_DEVICE_FUNC
-  static double quiet_NaN() {
-  #if defined(EIGEN_CUDA_ARCH)
-    return CUDART_NAN;
-  #else
-    return HIPRT_NAN;
-  #endif
-  }
+  static double quiet_NaN() { return CUDART_NAN; }
 };
 template<> struct numeric_limits<int>
 {
@@ -359,22 +291,13 @@ template<> struct numeric_limits<unsigned long long>
   EIGEN_DEVICE_FUNC
   static unsigned long long (min)() { return 0; }
 };
-template<> struct numeric_limits<bool>
-{
-  EIGEN_DEVICE_FUNC
-  static bool epsilon() { return false; }
-  EIGEN_DEVICE_FUNC
-  static bool (max)() { return true; }
-  EIGEN_DEVICE_FUNC
-  static bool (min)() { return false; }
-};
 
 }
 
 #endif
 
 /** \internal
-  * A base class do disable default copy ctor and copy assignment operator.
+  * A base class to disable the default copy ctor and copy assignment operator.
   */
 class noncopyable
 {
@@ -385,59 +308,6 @@ protected:
   EIGEN_DEVICE_FUNC ~noncopyable() {}
 };
 
-/** \internal
-  * Provides access to the number of elements in the object of as a compile-time constant expression.
-  * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default).
-  *
-  * Similar to std::tuple_size, but more general.
-  *
-  * It currently supports:
-  *  - any types T defining T::SizeAtCompileTime
-  *  - plain C arrays as T[N]
-  *  - std::array (c++11)
-  *  - some internal types such as SingleRange and AllRange
-  *
-  * The second template parameter eases SFINAE-based specializations.
-  */
-template<typename T, typename EnableIf = void> struct array_size {
-  enum { value = Dynamic };
-};
-
-template<typename T> struct array_size<T,typename internal::enable_if<((T::SizeAtCompileTime&0)==0)>::type> {
-  enum { value = T::SizeAtCompileTime };
-};
-
-template<typename T, int N> struct array_size<const T (&)[N]> {
-  enum { value = N };
-};
-template<typename T, int N> struct array_size<T (&)[N]> {
-  enum { value = N };
-};
-
-#if EIGEN_HAS_CXX11
-template<typename T, std::size_t N> struct array_size<const std::array<T,N> > {
-  enum { value = N };
-};
-template<typename T, std::size_t N> struct array_size<std::array<T,N> > {
-  enum { value = N };
-};
-#endif
-
-/** \internal
-  * Analogue of the std::size free function.
-  * It returns the size of the container or view \a x of type \c T
-  *
-  * It currently supports:
-  *  - any types T defining a member T::size() const
-  *  - plain C arrays as T[N]
-  *
-  */
-template<typename T>
-Index size(const T& x) { return x.size(); }
-
-template<typename T,std::size_t N>
-Index size(const T (&) [N]) { return N; }
-
 /** \internal
   * Convenient struct to get the result type of a unary or binary functor.
   *
@@ -535,10 +405,10 @@ struct meta_no  { char a[2]; };
 template <typename T>
 struct has_ReturnType
 {
-  template <typename C> static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0);
-  template <typename C> static meta_no  testFunctor(...);
+  template <typename C> static meta_yes testFunctor(typename C::ReturnType const *);
+  template <typename C> static meta_no testFunctor(...);
 
-  enum { value = sizeof(testFunctor<T>(static_cast<T*>(0))) == sizeof(meta_yes) };
+  enum { value = sizeof(testFunctor<T>(0)) == sizeof(meta_yes) };
 };
 
 template<typename T> const T* return_ptr();
@@ -621,27 +491,17 @@ template<typename T, typename U> struct scalar_product_traits
 // typedef typename scalar_product_traits<typename remove_all<ArgType0>::type, typename remove_all<ArgType1>::type>::ReturnType type;
 // };
 
-/** \internal Obtains a POD type suitable to use as storage for an object of a size
-  * of at most Len bytes, aligned as specified by \c Align.
-  */
-template<unsigned Len, unsigned Align>
-struct aligned_storage {
-  struct type {
-    EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len];
-  };
-};
-
 } // end namespace internal
 
 namespace numext {
-
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+  
+#if defined(__CUDA_ARCH__)
 template<typename T> EIGEN_DEVICE_FUNC   void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; }
 #else
 template<typename T> EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); }
 #endif
 
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(__CUDA_ARCH__)
 using internal::device::numeric_limits;
 #else
 using std::numeric_limits;
@@ -650,7 +510,6 @@ using std::numeric_limits;
 // Integer division with rounding up.
 // T is assumed to be an integer type with a>=0, and b>0
 template<typename T>
-EIGEN_DEVICE_FUNC
 T div_ceil(const T &a, const T &b)
 {
   return (a+b-1) / b;
@@ -658,35 +517,23 @@ T div_ceil(const T &a, const T &b)
 
 // The aim of the following functions is to bypass -Wfloat-equal warnings
 // when we really want a strict equality comparison on floating points.
-template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+template<typename X, typename Y> EIGEN_STRONG_INLINE
 bool equal_strict(const X& x,const Y& y) { return x == y; }
 
-#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+template<> EIGEN_STRONG_INLINE
 bool equal_strict(const float& x,const float& y) { return std::equal_to<float>()(x,y); }
 
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+template<> EIGEN_STRONG_INLINE
 bool equal_strict(const double& x,const double& y) { return std::equal_to<double>()(x,y); }
-#endif
 
-template<typename X, typename Y> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+template<typename X, typename Y> EIGEN_STRONG_INLINE
 bool not_equal_strict(const X& x,const Y& y) { return x != y; }
 
-#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+template<> EIGEN_STRONG_INLINE
 bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to<float>()(x,y); }
 
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+template<> EIGEN_STRONG_INLINE
 bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to<double>()(x,y); }
-#endif
-
-/** \internal extract the bits of the float \a x */
-inline unsigned int as_uint(float x)
-{
-  unsigned int ret;
-  std::memcpy(&ret, &x, sizeof(float));
-  return ret;
-}
 
 } // end namespace numext
 
@@ -697,10 +544,6 @@ inline unsigned int as_uint(float x)
 #include <cstdint>
 namespace Eigen {
 namespace numext {
-typedef std::uint8_t  uint8_t;
-typedef std::int8_t   int8_t;
-typedef std::uint16_t uint16_t;
-typedef std::int16_t  int16_t;
 typedef std::uint32_t uint32_t;
 typedef std::int32_t  int32_t;
 typedef std::uint64_t uint64_t;
@@ -713,10 +556,6 @@ typedef std::int64_t  int64_t;
 #include <stdint.h>
 namespace Eigen {
 namespace numext {
-typedef ::uint8_t  uint8_t;
-typedef ::int8_t   int8_t;
-typedef ::uint16_t uint16_t;
-typedef ::int16_t  int16_t;
 typedef ::uint32_t uint32_t;
 typedef ::int32_t  int32_t;
 typedef ::uint64_t uint64_t;
@@ -725,4 +564,5 @@ typedef ::int64_t  int64_t;
 }
 #endif
 
+
 #endif // EIGEN_META_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ReshapedHelper.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/ReshapedHelper.h
deleted file mode 100644
index 412432132..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ReshapedHelper.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#ifndef EIGEN_RESHAPED_HELPER_H
-#define EIGEN_RESHAPED_HELPER_H
-
-namespace Eigen {
-
-enum AutoSize_t   { AutoSize };
-const int AutoOrder = 2;
-
-namespace internal {
-
-template<typename SizeType,typename OtherSize, int TotalSize>
-struct get_compiletime_reshape_size {
-  enum { value = get_fixed_value<SizeType>::value };
-};
-
-template<typename SizeType>
-Index get_runtime_reshape_size(SizeType size, Index /*other*/, Index /*total*/) {
-  return internal::get_runtime_value(size);
-}
-
-template<typename OtherSize, int TotalSize>
-struct get_compiletime_reshape_size<AutoSize_t,OtherSize,TotalSize> {
-  enum {
-    other_size = get_fixed_value<OtherSize>::value,
-    value = (TotalSize==Dynamic || other_size==Dynamic) ? Dynamic : TotalSize / other_size };
-};
-
-inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) {
-  return total/other;
-}
-
-template<int Flags, int Order>
-struct get_compiletime_reshape_order {
-  enum { value = Order == AutoOrder ? Flags & RowMajorBit : Order };
-};
-
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_RESHAPED_HELPER_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/StaticAssert.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/StaticAssert.h
index 95107ff36..500e47792 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/StaticAssert.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/StaticAssert.h
@@ -103,9 +103,7 @@
         STORAGE_KIND_MUST_MATCH=1,
         STORAGE_INDEX_MUST_MATCH=1,
         CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1,
-        SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1,
-        INVALID_TEMPLATE_PARAMETER=1,
-        GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1
+        SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1
       };
     };
 
@@ -184,7 +182,7 @@
      )
 
 #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
-    EIGEN_STATIC_ASSERT(!Eigen::NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
+    EIGEN_STATIC_ASSERT(!NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
 
 
 // static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes
@@ -194,8 +192,8 @@
     YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES)
 
 #define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) \
-      EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Eigen::Dynamic) && \
-                          (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Eigen::Dynamic), \
+      EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Dynamic) && \
+                          (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Dynamic), \
                           THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS)
 
 #define EIGEN_STATIC_ASSERT_LVALUE(Derived) \
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/SymbolicIndex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/SymbolicIndex.h
deleted file mode 100644
index 17cf46f05..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/SymbolicIndex.h
+++ /dev/null
@@ -1,293 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SYMBOLIC_INDEX_H
-#define EIGEN_SYMBOLIC_INDEX_H
-
-namespace Eigen {
-
-/** \namespace Eigen::symbolic
-  * \ingroup Core_Module
-  *
-  * This namespace defines a set of classes and functions to build and evaluate symbolic expressions of scalar type Index.
-  * Here is a simple example:
-  *
-  * \code
-  * // First step, defines symbols:
-  * struct x_tag {};  static const symbolic::SymbolExpr<x_tag> x;
-  * struct y_tag {};  static const symbolic::SymbolExpr<y_tag> y;
-  * struct z_tag {};  static const symbolic::SymbolExpr<z_tag> z;
-  *
-  * // Defines an expression:
-  * auto expr = (x+3)/y+z;
-  *
-  * // And evaluate it: (c++14)
-  * std::cout << expr.eval(x=6,y=3,z=-13) << "\n";
-  *
-  * // In c++98/11, only one symbol per expression is supported for now:
-  * auto expr98 = (3-x)/2;
-  * std::cout << expr98.eval(x=6) << "\n";
-  * \endcode
-  *
-  * It is currently only used internally to define and manipulate the Eigen::last and Eigen::lastp1 symbols in Eigen::seq and Eigen::seqN.
-  *
-  */
-namespace symbolic {
-
-template<typename Tag> class Symbol;
-template<typename Arg0> class NegateExpr;
-template<typename Arg1,typename Arg2> class AddExpr;
-template<typename Arg1,typename Arg2> class ProductExpr;
-template<typename Arg1,typename Arg2> class QuotientExpr;
-
-// A simple wrapper around an integral value to provide the eval method.
-// We could also use a free-function symbolic_eval...
-template<typename IndexType=Index>
-class ValueExpr {
-public:
-  ValueExpr(IndexType val) : m_value(val) {}
-  template<typename T>
-  IndexType eval_impl(const T&) const { return m_value; }
-protected:
-  IndexType m_value;
-};
-
-// Specialization for compile-time value,
-// It is similar to ValueExpr(N) but this version helps the compiler to generate better code.
-template<int N>
-class ValueExpr<internal::FixedInt<N> > {
-public:
-  ValueExpr() {}
-  template<typename T>
-  Index eval_impl(const T&) const { return N; }
-};
-
-
-/** \class BaseExpr
-  * \ingroup Core_Module
-  * Common base class of any symbolic expressions
-  */
-template<typename Derived>
-class BaseExpr
-{
-public:
-  const Derived& derived() const { return *static_cast<const Derived*>(this); }
-
-  /** Evaluate the expression given the \a values of the symbols.
-    *
-    * \param values defines the values of the symbols, it can either be a SymbolValue or a std::tuple of SymbolValue
-    *               as constructed by SymbolExpr::operator= operator.
-    *
-    */
-  template<typename T>
-  Index eval(const T& values) const { return derived().eval_impl(values); }
-
-#if EIGEN_HAS_CXX14
-  template<typename... Types>
-  Index eval(Types&&... values) const { return derived().eval_impl(std::make_tuple(values...)); }
-#endif
-
-  NegateExpr<Derived> operator-() const { return NegateExpr<Derived>(derived()); }
-
-  AddExpr<Derived,ValueExpr<> > operator+(Index b) const
-  { return AddExpr<Derived,ValueExpr<> >(derived(),  b); }
-  AddExpr<Derived,ValueExpr<> > operator-(Index a) const
-  { return AddExpr<Derived,ValueExpr<> >(derived(), -a); }
-  ProductExpr<Derived,ValueExpr<> > operator*(Index a) const
-  { return ProductExpr<Derived,ValueExpr<> >(derived(),a); }
-  QuotientExpr<Derived,ValueExpr<> > operator/(Index a) const
-  { return QuotientExpr<Derived,ValueExpr<> >(derived(),a); }
-
-  friend AddExpr<Derived,ValueExpr<> > operator+(Index a, const BaseExpr& b)
-  { return AddExpr<Derived,ValueExpr<> >(b.derived(), a); }
-  friend AddExpr<NegateExpr<Derived>,ValueExpr<> > operator-(Index a, const BaseExpr& b)
-  { return AddExpr<NegateExpr<Derived>,ValueExpr<> >(-b.derived(), a); }
-  friend ProductExpr<ValueExpr<>,Derived> operator*(Index a, const BaseExpr& b)
-  { return ProductExpr<ValueExpr<>,Derived>(a,b.derived()); }
-  friend QuotientExpr<ValueExpr<>,Derived> operator/(Index a, const BaseExpr& b)
-  { return QuotientExpr<ValueExpr<>,Derived>(a,b.derived()); }
-
-  template<int N>
-  AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N>) const
-  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(), ValueExpr<internal::FixedInt<N> >()); }
-  template<int N>
-  AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > > operator-(internal::FixedInt<N>) const
-  { return AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > >(derived(), ValueExpr<internal::FixedInt<-N> >()); }
-  template<int N>
-  ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator*(internal::FixedInt<N>) const
-  { return ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
-  template<int N>
-  QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator/(internal::FixedInt<N>) const
-  { return QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
-
-  template<int N>
-  friend AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N>, const BaseExpr& b)
-  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(b.derived(), ValueExpr<internal::FixedInt<N> >()); }
-  template<int N>
-  friend AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > > operator-(internal::FixedInt<N>, const BaseExpr& b)
-  { return AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > >(-b.derived(), ValueExpr<internal::FixedInt<N> >()); }
-  template<int N>
-  friend ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator*(internal::FixedInt<N>, const BaseExpr& b)
-  { return ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
-  template<int N>
-  friend QuotientExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator/(internal::FixedInt<N>, const BaseExpr& b)
-  { return QuotientExpr<ValueExpr<internal::FixedInt<N> > ,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
-
-#if (!EIGEN_HAS_CXX14)
-  template<int N>
-  AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N> (*)()) const
-  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(), ValueExpr<internal::FixedInt<N> >()); }
-  template<int N>
-  AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > > operator-(internal::FixedInt<N> (*)()) const
-  { return AddExpr<Derived,ValueExpr<internal::FixedInt<-N> > >(derived(), ValueExpr<internal::FixedInt<-N> >()); }
-  template<int N>
-  ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator*(internal::FixedInt<N> (*)()) const
-  { return ProductExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
-  template<int N>
-  QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator/(internal::FixedInt<N> (*)()) const
-  { return QuotientExpr<Derived,ValueExpr<internal::FixedInt<N> > >(derived(),ValueExpr<internal::FixedInt<N> >()); }
-
-  template<int N>
-  friend AddExpr<Derived,ValueExpr<internal::FixedInt<N> > > operator+(internal::FixedInt<N> (*)(), const BaseExpr& b)
-  { return AddExpr<Derived,ValueExpr<internal::FixedInt<N> > >(b.derived(), ValueExpr<internal::FixedInt<N> >()); }
-  template<int N>
-  friend AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > > operator-(internal::FixedInt<N> (*)(), const BaseExpr& b)
-  { return AddExpr<NegateExpr<Derived>,ValueExpr<internal::FixedInt<N> > >(-b.derived(), ValueExpr<internal::FixedInt<N> >()); }
-  template<int N>
-  friend ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator*(internal::FixedInt<N> (*)(), const BaseExpr& b)
-  { return ProductExpr<ValueExpr<internal::FixedInt<N> >,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
-  template<int N>
-  friend QuotientExpr<ValueExpr<internal::FixedInt<N> >,Derived> operator/(internal::FixedInt<N> (*)(), const BaseExpr& b)
-  { return QuotientExpr<ValueExpr<internal::FixedInt<N> > ,Derived>(ValueExpr<internal::FixedInt<N> >(),b.derived()); }
-#endif
-
-
-  template<typename OtherDerived>
-  AddExpr<Derived,OtherDerived> operator+(const BaseExpr<OtherDerived> &b) const
-  { return AddExpr<Derived,OtherDerived>(derived(),  b.derived()); }
-
-  template<typename OtherDerived>
-  AddExpr<Derived,NegateExpr<OtherDerived> > operator-(const BaseExpr<OtherDerived> &b) const
-  { return AddExpr<Derived,NegateExpr<OtherDerived> >(derived(), -b.derived()); }
-
-  template<typename OtherDerived>
-  ProductExpr<Derived,OtherDerived> operator*(const BaseExpr<OtherDerived> &b) const
-  { return ProductExpr<Derived,OtherDerived>(derived(), b.derived()); }
-
-  template<typename OtherDerived>
-  QuotientExpr<Derived,OtherDerived> operator/(const BaseExpr<OtherDerived> &b) const
-  { return QuotientExpr<Derived,OtherDerived>(derived(), b.derived()); }
-};
-
-template<typename T>
-struct is_symbolic {
-  // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class BaseExpr<T>.
-  enum { value = internal::is_convertible<T,BaseExpr<T> >::value };
-};
-
-/** Represents the actual value of a symbol identified by its tag
-  *
-  * It is the return type of SymbolValue::operator=, and most of the time this is only way it is used.
-  */
-template<typename Tag>
-class SymbolValue
-{
-public:
-  /** Default constructor from the value \a val */
-  SymbolValue(Index val) : m_value(val) {}
-
-  /** \returns the stored value of the symbol */
-  Index value() const { return m_value; }
-protected:
-  Index m_value;
-};
-
-/** Expression of a symbol uniquely identified by the template parameter type \c tag */
-template<typename tag>
-class SymbolExpr : public BaseExpr<SymbolExpr<tag> >
-{
-public:
-  /** Alias to the template parameter \c tag */
-  typedef tag Tag;
-
-  SymbolExpr() {}
-
-  /** Associate the value \a val to the given symbol \c *this, uniquely identified by its \c Tag.
-    *
-    * The returned object should be passed to ExprBase::eval() to evaluate a given expression with this specified runtime-time value.
-    */
-  SymbolValue<Tag> operator=(Index val) const {
-    return SymbolValue<Tag>(val);
-  }
-
-  Index eval_impl(const SymbolValue<Tag> &values) const { return values.value(); }
-
-#if EIGEN_HAS_CXX14
-  // C++14 versions suitable for multiple symbols
-  template<typename... Types>
-  Index eval_impl(const std::tuple<Types...>& values) const { return std::get<SymbolValue<Tag> >(values).value(); }
-#endif
-};
-
-template<typename Arg0>
-class NegateExpr : public BaseExpr<NegateExpr<Arg0> >
-{
-public:
-  NegateExpr(const Arg0& arg0) : m_arg0(arg0) {}
-
-  template<typename T>
-  Index eval_impl(const T& values) const { return -m_arg0.eval_impl(values); }
-protected:
-  Arg0 m_arg0;
-};
-
-template<typename Arg0, typename Arg1>
-class AddExpr : public BaseExpr<AddExpr<Arg0,Arg1> >
-{
-public:
-  AddExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
-
-  template<typename T>
-  Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) + m_arg1.eval_impl(values); }
-protected:
-  Arg0 m_arg0;
-  Arg1 m_arg1;
-};
-
-template<typename Arg0, typename Arg1>
-class ProductExpr : public BaseExpr<ProductExpr<Arg0,Arg1> >
-{
-public:
-  ProductExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
-
-  template<typename T>
-  Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) * m_arg1.eval_impl(values); }
-protected:
-  Arg0 m_arg0;
-  Arg1 m_arg1;
-};
-
-template<typename Arg0, typename Arg1>
-class QuotientExpr : public BaseExpr<QuotientExpr<Arg0,Arg1> >
-{
-public:
-  QuotientExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
-
-  template<typename T>
-  Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) / m_arg1.eval_impl(values); }
-protected:
-  Arg0 m_arg0;
-  Arg1 m_arg1;
-};
-
-} // end namespace symbolic
-
-} // end namespace Eigen
-
-#endif // EIGEN_SYMBOLIC_INDEX_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/XprHelper.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/XprHelper.h
index fd2db56a4..6bb497082 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/XprHelper.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/XprHelper.h
@@ -49,12 +49,6 @@ template<typename T> struct is_valid_index_type
   };
 };
 
-// true if both types are not valid index types
-template<typename RowIndices, typename ColIndices>
-struct valid_indexed_view_overload {
-  enum { value = !(internal::is_valid_index_type<RowIndices>::value && internal::is_valid_index_type<ColIndices>::value) };
-};
-
 // promote_scalar_arg is an helper used in operation between an expression and a scalar, like:
 //    expression * scalar
 // Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression.
@@ -132,7 +126,6 @@ template<typename T, int Value> class variable_if_dynamic
     EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic)
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); }
     EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return T(Value); }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
 };
 
@@ -143,7 +136,6 @@ template<typename T> class variable_if_dynamic<T, Dynamic>
   public:
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value) : m_value(value) {}
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; }
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
@@ -187,10 +179,7 @@ template<typename T> struct unpacket_traits
   enum
   {
     size = 1,
-    alignment = 1,
-    vectorizable = false,
-    masked_load_available=false,
-    masked_store_available=false
+    alignment = 1
   };
 };
 
@@ -411,7 +400,7 @@ template<typename T> struct plain_matrix_type_row_major
   typedef Matrix<typename traits<T>::Scalar,
                 Rows,
                 Cols,
-                (MaxCols==1&&MaxRows!=1) ? ColMajor : RowMajor,
+                (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor,
                 MaxRows,
                 MaxCols
           > type;
@@ -466,7 +455,7 @@ template<typename T, int n, typename PlainObject = typename plain_object_eval<T>
 {
   enum {
     ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
-    CoeffReadCost = evaluator<T>::CoeffReadCost,  // NOTE What if an evaluator evaluate itself into a temporary?
+    CoeffReadCost = evaluator<T>::CoeffReadCost,  // NOTE What if an evaluator evaluate itself into a temporary?
                                                   //      Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1.
                                                   //      This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON
                                                   //      for all evaluator creating a temporary. This flag is then propagated by the parent evaluators.
@@ -682,39 +671,24 @@ template<typename T> struct is_diagonal<DiagonalWrapper<T> >
 template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
 { enum { ret = true }; };
 
-
-template<typename T> struct is_identity
-{ enum { value = false }; };
-
-template<typename T> struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T> >
-{ enum { value = true }; };
-
-
 template<typename S1, typename S2> struct glue_shapes;
 template<> struct glue_shapes<DenseShape,TriangularShape> { typedef TriangularShape type;  };
 
 template<typename T1, typename T2>
-struct possibly_same_dense {
-  enum { value = has_direct_access<T1>::ret && has_direct_access<T2>::ret && is_same<typename T1::Scalar,typename T2::Scalar>::value };
-};
-
-template<typename T1, typename T2>
-EIGEN_DEVICE_FUNC
-bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<possibly_same_dense<T1,T2>::value>::type * = 0)
+bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if<has_direct_access<T1>::ret&&has_direct_access<T2>::ret, T1>::type * = 0)
 {
   return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride());
 }
 
 template<typename T1, typename T2>
-EIGEN_DEVICE_FUNC
-bool is_same_dense(const T1 &, const T2 &, typename enable_if<!possibly_same_dense<T1,T2>::value>::type * = 0)
+bool is_same_dense(const T1 &, const T2 &, typename enable_if<!(has_direct_access<T1>::ret&&has_direct_access<T2>::ret), T1>::type * = 0)
 {
   return false;
 }
 
 // Internal helper defining the cost of a scalar division for the type T.
 // The default heuristic can be specialized for each scalar type and architecture.
-template<typename T,bool Vectorized=false,typename EnableIf = void>
+template<typename T,bool Vectorized=false,typename EnaleIf = void>
 struct scalar_div_cost {
   enum { value = 8*NumTraits<T>::MulCost };
 };
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h
index 081e918f1..dc5fae06a 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -214,7 +214,7 @@ template<typename _MatrixType> class ComplexEigenSolver
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     ComputationInfo info() const
     {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexSchur.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexSchur.h
index fc71468f8..4354e4018 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -212,7 +212,7 @@ template<typename _MatrixType> class ComplexSchur
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     ComputationInfo info() const
     {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/EigenSolver.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/EigenSolver.h
index 572b29e4e..f205b185d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/EigenSolver.h
@@ -110,7 +110,7 @@ template<typename _MatrixType> class EigenSolver
       *
       * \sa compute() for an example.
       */
-    EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_eigenvectorsOk(false), m_realSchur(), m_matT(), m_tmp() {}
+    EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {}
 
     /** \brief Default constructor with memory preallocation
       *
@@ -277,7 +277,7 @@ template<typename _MatrixType> class EigenSolver
     template<typename InputType>
     EigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
 
-    /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */
+    /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */
     ComputationInfo info() const
     {
       eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
index d0f9091be..5f6bb8289 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
@@ -121,7 +121,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT
       *
       * \returns    Reference to \c *this
       *
-      * According to \p options, this function computes eigenvalues and (if requested)
+      * According to \p options, this function computes eigenvalues and (if requested)
       * the eigenvectors of one of the following three generalized eigenproblems:
       * - \c Ax_lBx: \f$ Ax = \lambda B x \f$
       * - \c ABx_lx: \f$ ABx = \lambda x \f$
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h
index d947dac4e..f647f69b0 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h
@@ -315,7 +315,7 @@ void HessenbergDecomposition<MatrixType>::_compute(MatrixType& matA, CoeffVector
 
     // A = A H'
     matA.rightCols(remainingSize)
-        .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1), numext::conj(h), &temp.coeffRef(0));
+        .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0));
   }
 }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
index 66e5a3dbb..e4e426071 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
@@ -84,7 +84,7 @@ MatrixBase<Derived>::eigenvalues() const
   * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues()
   */
 template<typename MatrixType, unsigned int UpLo> 
-EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
+inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
 SelfAdjointView<MatrixType, UpLo>::eigenvalues() const
 {
   PlainObject thisAsMatrix(*this);
@@ -147,7 +147,7 @@ MatrixBase<Derived>::operatorNorm() const
   * \sa eigenvalues(), MatrixBase::operatorNorm()
   */
 template<typename MatrixType, unsigned int UpLo>
-EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
+inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
 SelfAdjointView<MatrixType, UpLo>::operatorNorm() const
 {
   return eigenvalues().cwiseAbs().maxCoeff();
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealQZ.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealQZ.h
index 509130184..b3a910dd9 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealQZ.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealQZ.h
@@ -90,9 +90,8 @@ namespace Eigen {
         m_Z(size, size),
         m_workspace(size*2),
         m_maxIters(400),
-        m_isInitialized(false),
-        m_computeQZ(true)
-      {}
+        m_isInitialized(false)
+        { }
 
       /** \brief Constructor; computes real QZ decomposition of given matrices
        * 
@@ -109,11 +108,9 @@ namespace Eigen {
         m_Z(A.rows(),A.cols()),
         m_workspace(A.rows()*2),
         m_maxIters(400),
-        m_isInitialized(false),
-        m_computeQZ(true)
-      {
-        compute(A, B, computeQZ);
-      }
+        m_isInitialized(false) {
+          compute(A, B, computeQZ);
+        }
 
       /** \brief Returns matrix Q in the QZ decomposition. 
        *
@@ -164,7 +161,7 @@ namespace Eigen {
 
       /** \brief Reports whether previous computation was successful.
        *
-       * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+       * \returns \c Success if computation was successful, \c NoConvergence otherwise.
        */
       ComputationInfo info() const
       {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealSchur.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealSchur.h
index 7304ef344..9191519ab 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealSchur.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealSchur.h
@@ -190,7 +190,7 @@ template<typename _MatrixType> class RealSchur
     RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU);
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     ComputationInfo info() const
     {
@@ -270,13 +270,8 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>
   // Step 1. Reduce to Hessenberg form
   m_hess.compute(matrix.derived()/scale);
 
-  // Step 2. Reduce to real Schur form
-  // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg
-  //       to be able to pass our working-space buffer for the Householder to Dense evaluation.
-  m_workspaceVector.resize(matrix.cols());
-  if(computeU)
-    m_hess.matrixQ().evalTo(m_matU, m_workspaceVector);
-  computeFromHessenberg(m_hess.matrixH(), m_matU, computeU);
+  // Step 2. Reduce to real Schur form  
+  computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
 
   m_matT *= scale;
   
@@ -289,13 +284,13 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
   using std::abs;
 
   m_matT = matrixH;
-  m_workspaceVector.resize(m_matT.cols());
-  if(computeU && !internal::is_same_dense(m_matU,matrixQ))
+  if(computeU)
     m_matU = matrixQ;
   
   Index maxIters = m_maxIters;
   if (maxIters == -1)
     maxIters = m_maxIterationsPerRow * matrixH.rows();
+  m_workspaceVector.resize(m_matT.cols());
   Scalar* workspace = &m_workspaceVector.coeffRef(0);
 
   // The matrix m_matT is divided in three parts. 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index 9bbce652f..d37656fa2 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -20,9 +20,7 @@ class GeneralizedSelfAdjointEigenSolver;
 
 namespace internal {
 template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues;
-
 template<typename MatrixType, typename DiagType, typename SubDiagType>
-EIGEN_DEVICE_FUNC
 ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec);
 }
 
@@ -121,9 +119,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
         : m_eivec(),
           m_eivalues(),
           m_subdiag(),
-          m_info(InvalidInput),
-          m_isInitialized(false),
-          m_eigenvectorsOk(false)
+          m_isInitialized(false)
     { }
 
     /** \brief Constructor, pre-allocates memory for dynamic-size matrices.
@@ -143,8 +139,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
         : m_eivec(size, size),
           m_eivalues(size),
           m_subdiag(size > 1 ? size - 1 : 1),
-          m_isInitialized(false),
-          m_eigenvectorsOk(false)
+          m_isInitialized(false)
     {}
 
     /** \brief Constructor; computes eigendecomposition of given matrix.
@@ -168,8 +163,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
       : m_eivec(matrix.rows(), matrix.cols()),
         m_eivalues(matrix.cols()),
         m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
-        m_isInitialized(false),
-        m_eigenvectorsOk(false)
+        m_isInitialized(false)
     {
       compute(matrix.derived(), options);
     }
@@ -343,7 +337,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+      * \returns \c Success if computation was successful, \c NoConvergence otherwise.
       */
     EIGEN_DEVICE_FUNC
     ComputationInfo info() const
@@ -360,8 +354,7 @@ template<typename _MatrixType> class SelfAdjointEigenSolver
     static const int m_maxIterations = 30;
 
   protected:
-    static EIGEN_DEVICE_FUNC
-    void check_template_parameters()
+    static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
@@ -410,7 +403,7 @@ SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
   
   const InputType &matrix(a_matrix.derived());
   
-  EIGEN_USING_STD_MATH(abs);
+  using std::abs;
   eigen_assert(matrix.cols() == matrix.rows());
   eigen_assert((options&~(EigVecMask|GenEigMask))==0
           && (options&EigVecMask)!=EigVecMask
@@ -486,10 +479,9 @@ namespace internal {
   * \returns \c Success or \c NoConvergence
   */
 template<typename MatrixType, typename DiagType, typename SubDiagType>
-EIGEN_DEVICE_FUNC
 ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec)
 {
-  EIGEN_USING_STD_MATH(abs);
+  using std::abs;
 
   ComputationInfo info;
   typedef typename MatrixType::Scalar Scalar;
@@ -543,7 +535,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag
       diag.segment(i,n-i).minCoeff(&k);
       if (k > 0)
       {
-        numext::swap(diag[i], diag[k+i]);
+        std::swap(diag[i], diag[k+i]);
         if(computeEigenvectors)
           eivec.col(i).swap(eivec.col(k+i));
       }
@@ -613,8 +605,8 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
   EIGEN_DEVICE_FUNC
   static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
   {
-    EIGEN_USING_STD_MATH(abs);
-    EIGEN_USING_STD_MATH(sqrt);
+    EIGEN_USING_STD_MATH(sqrt)
+    EIGEN_USING_STD_MATH(abs)
     Index i0;
     // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
     mat.diagonal().cwiseAbs().maxCoeff(&i0);
@@ -728,7 +720,7 @@ struct direct_selfadjoint_eigenvalues<SolverType,2,false>
   EIGEN_DEVICE_FUNC
   static inline void computeRoots(const MatrixType& m, VectorType& roots)
   {
-    EIGEN_USING_STD_MATH(sqrt);
+    using std::sqrt;
     const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0)));
     const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1));
     roots(0) = t1 - t0;
@@ -816,7 +808,7 @@ template<int StorageOrder,typename RealScalar, typename Scalar, typename Index>
 EIGEN_DEVICE_FUNC
 static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
 {
-  EIGEN_USING_STD_MATH(abs);
+  using std::abs;
   RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5);
   RealScalar e = subdiag[end-1];
   // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/Tridiagonalization.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/Tridiagonalization.h
index c5c1acf46..1d102c17b 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -25,7 +25,6 @@ struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
 };
 
 template<typename MatrixType, typename CoeffVectorType>
-EIGEN_DEVICE_FUNC
 void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs);
 }
 
@@ -345,7 +344,6 @@ namespace internal {
   * \sa Tridiagonalization::packedMatrix()
   */
 template<typename MatrixType, typename CoeffVectorType>
-EIGEN_DEVICE_FUNC
 void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
 {
   using numext::conj;
@@ -426,7 +424,6 @@ struct tridiagonalization_inplace_selector;
   * \sa class Tridiagonalization
   */
 template<typename MatrixType, typename DiagonalType, typename SubDiagonalType>
-EIGEN_DEVICE_FUNC
 void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
 {
   eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1);
@@ -442,8 +439,7 @@ struct tridiagonalization_inplace_selector
   typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
   typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
   template<typename DiagonalType, typename SubDiagonalType>
-  static EIGEN_DEVICE_FUNC
-  void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
+  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
   {
     CoeffVectorType hCoeffs(mat.cols()-1);
     tridiagonalization_inplace(mat,hCoeffs);
@@ -512,8 +508,7 @@ struct tridiagonalization_inplace_selector<MatrixType,1,IsComplex>
   typedef typename MatrixType::Scalar Scalar;
 
   template<typename DiagonalType, typename SubDiagonalType>
-  static EIGEN_DEVICE_FUNC
-  void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
+  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
   {
     diag(0,0) = numext::real(mat(0,0));
     if(extractQ)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/AlignedBox.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/AlignedBox.h
index c902d8f0a..066eae4f9 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/AlignedBox.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/AlignedBox.h
@@ -63,7 +63,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
   /** Default constructor initializing a null box. */
   EIGEN_DEVICE_FUNC inline AlignedBox()
-  { if (EIGEN_CONST_CONDITIONAL(AmbientDimAtCompileTime!=Dynamic)) setEmpty(); }
+  { if (AmbientDimAtCompileTime!=Dynamic) setEmpty(); }
 
   /** Constructs a null box with \a _dim the dimension of the ambient space. */
   EIGEN_DEVICE_FUNC inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Hyperplane.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Hyperplane.h
index cebe03557..05929b299 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Hyperplane.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Hyperplane.h
@@ -119,7 +119,7 @@ public:
     * If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
     * so an arbitrary choice is made.
     */
-  // FIXME to be consistent with the rest this could be implemented as a static Through function ??
+  // FIXME to be consistent with the rest this could be implemented as a static Through function ??
   EIGEN_DEVICE_FUNC explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
   {
     normal() = parametrized.direction().unitOrthogonal();
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/OrthoMethods.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/OrthoMethods.h
index 524aebe1b..a035e6310 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/OrthoMethods.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/OrthoMethods.h
@@ -27,10 +27,9 @@ namespace Eigen {
 template<typename Derived>
 template<typename OtherDerived>
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename MatrixBase<Derived>::template cross_product_return_type<OtherDerived>::type
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::template cross_product_return_type<OtherDerived>::type
 #else
-typename MatrixBase<Derived>::PlainObject
+inline typename MatrixBase<Derived>::PlainObject
 #endif
 MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
 {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/ParametrizedLine.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/ParametrizedLine.h
index 3929ca87f..1e985d8cd 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/ParametrizedLine.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/ParametrizedLine.h
@@ -104,44 +104,7 @@ public:
   template <int OtherOptions>
   EIGEN_DEVICE_FUNC VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
 
-  /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this.
-    *
-    * \param mat the Dim x Dim transformation matrix
-    * \param traits specifies whether the matrix \a mat represents an #Isometry
-    *               or a more generic #Affine transformation. The default is #Affine.
-    */
-  template<typename XprType>
-  EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
-  {
-    if (traits==Affine)
-      direction() = (mat * direction()).normalized();
-    else if (traits==Isometry)
-      direction() = mat * direction();
-    else
-    {
-      eigen_assert(0 && "invalid traits value in ParametrizedLine::transform()");
-    }
-    origin() = mat * origin();
-    return *this;
-  }
-
-  /** Applies the transformation \a t to \c *this and returns a reference to \c *this.
-    *
-    * \param t the transformation of dimension Dim
-    * \param traits specifies whether the transformation \a t represents an #Isometry
-    *               or a more generic #Affine transformation. The default is #Affine.
-    *               Other kind of transformations are not supported.
-    */
-  template<int TrOptions>
-  EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const Transform<Scalar,AmbientDimAtCompileTime,Affine,TrOptions>& t,
-                                                       TransformTraits traits = Affine)
-  {
-    transform(t.linear(), traits);
-    origin() += t.translation();
-    return *this;
-  }
-
-/** \returns \c *this with scalar type casted to \a NewScalarType
+  /** \returns \c *this with scalar type casted to \a NewScalarType
     *
     * Note that if \a NewScalarType is equal to the current scalar type of \c *this
     * then this function smartly returns a const reference to \c *this.
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Quaternion.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Quaternion.h
index 7b2c4d89d..b81820656 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Quaternion.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Quaternion.h
@@ -294,21 +294,6 @@ public:
   EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other)
   { m_coeffs = other.coeffs().template cast<Scalar>(); }
 
-#if EIGEN_HAS_RVALUE_REFERENCES
-  // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator.
-  /** Default move constructor */
-  EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible<Scalar>::value)
-    : m_coeffs(std::move(other.coeffs()))
-  {}
-
-  /** Default move assignment operator */
-  EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable<Scalar>::value)
-  {
-    m_coeffs = std::move(other.coeffs());
-    return *this;
-  }
-#endif
-
   EIGEN_DEVICE_FUNC static Quaternion UnitRandom();
 
   template<typename Derived1, typename Derived2>
@@ -661,7 +646,7 @@ EIGEN_DEVICE_FUNC Quaternion<Scalar,Options> Quaternion<Scalar,Options>::UnitRan
   const Scalar u1 = internal::random<Scalar>(0, 1),
                u2 = internal::random<Scalar>(0, 2*EIGEN_PI),
                u3 = internal::random<Scalar>(0, 2*EIGEN_PI);
-  const Scalar a = sqrt(Scalar(1) - u1),
+  const Scalar a = sqrt(1 - u1),
                b = sqrt(u1);
   return Quaternion (a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3));
 }
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Scaling.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Scaling.h
index df650fda6..f58ca03d9 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Scaling.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Scaling.h
@@ -29,22 +29,6 @@ namespace Eigen {
   *
   * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform
   */
-
-namespace internal
-{
-  // This helper helps nvcc+MSVC to properly parse this file.
-  // See bug 1412.
-  template <typename Scalar, int Dim, int Mode>
-  struct uniformscaling_times_affine_returntype
-  {
-    enum
-    {
-      NewMode = int(Mode) == int(Isometry) ? Affine : Mode
-    };
-    typedef Transform <Scalar, Dim, NewMode> type;
-  };
-}
-
 template<typename _Scalar>
 class UniformScaling
 {
@@ -76,11 +60,9 @@ public:
 
   /** Concatenates a uniform scaling and an affine transformation */
   template<int Dim, int Mode, int Options>
-  inline typename
-	internal::uniformscaling_times_affine_returntype<Scalar,Dim,Mode>::type
-	operator* (const Transform<Scalar, Dim, Mode, Options>& t) const
+  inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> operator* (const Transform<Scalar,Dim, Mode, Options>& t) const
   {
-    typename internal::uniformscaling_times_affine_returntype<Scalar,Dim,Mode>::type res = t;
+    Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> res = t;
     res.prescale(factor());
     return res;
   }
@@ -88,7 +70,7 @@ public:
   /** Concatenates a uniform scaling and a linear transformation matrix */
   // TODO returns an expression
   template<typename Derived>
-  inline typename Eigen::internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const
+  inline typename internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const
   { return other * m_factor; }
 
   template<typename Derived,int Dim>
@@ -128,7 +110,7 @@ public:
 /** Concatenates a linear transformation matrix and a uniform scaling
   * \relates UniformScaling
   */
-// NOTE this operator is defined in MatrixBase and not as a friend function
+// NOTE this operator is defined in MatrixBase and not as a friend function
 // of UniformScaling to fix an internal crash of Intel's ICC
 template<typename Derived,typename Scalar>
 EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Transform.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Transform.h
index c87b5fedf..c21d9e550 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Transform.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Transform.h
@@ -97,9 +97,6 @@ template<int Mode> struct transform_make_affine;
   *              - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix.
   *              - #Projective: the transformation is stored as a (Dim+1)^2 matrix
   *                             without any assumption.
-  *              - #Isometry: same as #Affine with the additional assumption that
-  *                           the linear part represents a rotation. This assumption is exploited
-  *                           to speed up some functions such as inverse() and rotation().
   * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor.
   *                  These Options are passed directly to the underlying matrix type.
   *
@@ -118,7 +115,7 @@ template<int Mode> struct transform_make_affine;
   * \end{array} \right) \f$
   *
   * Note that for a projective transformation the last row can be anything,
-  * and then the interpretation of different parts might be slightly different.
+  * and then the interpretation of different parts might be slightly different.
   *
   * However, unlike a plain matrix, the Transform class provides many features
   * simplifying both its assembly and usage. In particular, it can be composed
@@ -338,7 +335,7 @@ public:
            OtherModeIsAffineCompact = OtherMode == int(AffineCompact)
     };
 
-    if(EIGEN_CONST_CONDITIONAL(ModeIsAffineCompact == OtherModeIsAffineCompact))
+    if(ModeIsAffineCompact == OtherModeIsAffineCompact)
     {
       // We need the block expression because the code is compiled for all
       // combinations of transformations and will trigger a compile time error
@@ -346,7 +343,7 @@ public:
       m_matrix.template block<Dim,Dim+1>(0,0) = other.matrix().template block<Dim,Dim+1>(0,0);
       makeAffine();
     }
-    else if(EIGEN_CONST_CONDITIONAL(OtherModeIsAffineCompact))
+    else if(OtherModeIsAffineCompact)
     {
       typedef typename Transform<Scalar,Dim,OtherMode,OtherOptions>::MatrixType OtherMatrixType;
       internal::transform_construct_from_matrix<OtherMatrixType,Mode,Options,Dim,HDim>::run(this, other.matrix());
@@ -484,7 +481,7 @@ public:
     TransformTimeDiagonalReturnType res;
     res.linear().noalias() = a*b.linear();
     res.translation().noalias() = a*b.translation();
-    if (EIGEN_CONST_CONDITIONAL(Mode!=int(AffineCompact)))
+    if (Mode!=int(AffineCompact))
       res.matrix().row(Dim) = b.matrix().row(Dim);
     return res;
   }
@@ -605,9 +602,7 @@ public:
   template<typename Derived>
   EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
 
-  typedef typename internal::conditional<int(Mode)==Isometry,ConstLinearPart,const LinearMatrixType>::type RotationReturnType;
-  EIGEN_DEVICE_FUNC RotationReturnType rotation() const;
-
+  EIGEN_DEVICE_FUNC const LinearMatrixType rotation() const;
   template<typename RotationMatrixType, typename ScalingMatrixType>
   EIGEN_DEVICE_FUNC
   void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
@@ -760,7 +755,7 @@ template<typename Scalar, int Dim, int Mode,int Options>
 Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const QMatrix& other)
 {
   EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
+  if (Mode == int(AffineCompact))
     m_matrix << other.m11(), other.m21(), other.dx(),
                 other.m12(), other.m22(), other.dy();
   else
@@ -806,7 +801,7 @@ Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator
 {
   check_template_params();
   EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
+  if (Mode == int(AffineCompact))
     m_matrix << other.m11(), other.m21(), other.dx(),
                 other.m12(), other.m22(), other.dy();
   else
@@ -824,7 +819,7 @@ template<typename Scalar, int Dim, int Mode, int Options>
 QTransform Transform<Scalar,Dim,Mode,Options>::toQTransform(void) const
 {
   EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
+  if (Mode == int(AffineCompact))
     return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0),
                       m_matrix.coeff(0,1), m_matrix.coeff(1,1),
                       m_matrix.coeff(0,2), m_matrix.coeff(1,2));
@@ -917,7 +912,7 @@ EIGEN_DEVICE_FUNC Transform<Scalar,Dim,Mode,Options>&
 Transform<Scalar,Dim,Mode,Options>::pretranslate(const MatrixBase<OtherDerived> &other)
 {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  if(EIGEN_CONST_CONDITIONAL(int(Mode)==int(Projective)))
+  if(int(Mode)==int(Projective))
     affine() += other * m_matrix.row(Dim);
   else
     translation() += other;
@@ -1051,43 +1046,20 @@ EIGEN_DEVICE_FUNC inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim
 *** Special functions ***
 ************************/
 
-namespace internal {
-template<int Mode> struct transform_rotation_impl {
-  template<typename TransformType>
-  EIGEN_DEVICE_FUNC static inline
-  const typename TransformType::LinearMatrixType run(const TransformType& t)
-  {
-    typedef typename TransformType::LinearMatrixType LinearMatrixType; 
-    LinearMatrixType result;
-    t.computeRotationScaling(&result, (LinearMatrixType*)0);
-    return result;
-  }
-};
-template<> struct transform_rotation_impl<Isometry> {
-  template<typename TransformType>
-  EIGEN_DEVICE_FUNC static inline
-  typename TransformType::ConstLinearPart run(const TransformType& t)
-  {
-    return t.linear();
-  }
-};
-}
 /** \returns the rotation part of the transformation
   *
-  * If Mode==Isometry, then this method is an alias for linear(),
-  * otherwise it calls computeRotationScaling() to extract the rotation
-  * through a SVD decomposition.
   *
   * \svd_module
   *
   * \sa computeRotationScaling(), computeScalingRotation(), class SVD
   */
 template<typename Scalar, int Dim, int Mode, int Options>
-EIGEN_DEVICE_FUNC
-typename Transform<Scalar,Dim,Mode,Options>::RotationReturnType
+EIGEN_DEVICE_FUNC const typename Transform<Scalar,Dim,Mode,Options>::LinearMatrixType
 Transform<Scalar,Dim,Mode,Options>::rotation() const
 {
-  return internal::transform_rotation_impl<Mode>::run(*this);
+  LinearMatrixType result;
+  computeRotationScaling(&result, (LinearMatrixType*)0);
+  return result;
 }
 
 
@@ -1111,12 +1083,12 @@ EIGEN_DEVICE_FUNC void Transform<Scalar,Dim,Mode,Options>::computeRotationScalin
   Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
   VectorType sv(svd.singularValues());
   sv.coeffRef(0) *= x;
-  if(scaling) *scaling = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
+  if(scaling) scaling->lazyAssign(svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint());
   if(rotation)
   {
     LinearMatrixType m(svd.matrixU());
     m.col(0) /= x;
-    *rotation = m * svd.matrixV().adjoint();
+    rotation->lazyAssign(m * svd.matrixV().adjoint());
   }
 }
 
@@ -1140,12 +1112,12 @@ EIGEN_DEVICE_FUNC void Transform<Scalar,Dim,Mode,Options>::computeScalingRotatio
   Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
   VectorType sv(svd.singularValues());
   sv.coeffRef(0) *= x;
-  if(scaling) *scaling = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint();
+  if(scaling) scaling->lazyAssign(svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint());
   if(rotation)
   {
     LinearMatrixType m(svd.matrixU());
     m.col(0) /= x;
-    *rotation = m * svd.matrixV().adjoint();
+    rotation->lazyAssign(m * svd.matrixV().adjoint());
   }
 }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Translation.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Translation.h
index 8c2290121..0e99ce68e 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Translation.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Translation.h
@@ -70,18 +70,18 @@ public:
   /** Constructs and initialize the translation transformation from a vector of translation coefficients */
   EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
 
-  /** \brief Returns the x-translation by value. **/
+  /** \brief Returns the x-translation by value. **/
   EIGEN_DEVICE_FUNC inline Scalar x() const { return m_coeffs.x(); }
-  /** \brief Returns the y-translation by value. **/
+  /** \brief Returns the y-translation by value. **/
   EIGEN_DEVICE_FUNC inline Scalar y() const { return m_coeffs.y(); }
-  /** \brief Returns the z-translation by value. **/
+  /** \brief Returns the z-translation by value. **/
   EIGEN_DEVICE_FUNC inline Scalar z() const { return m_coeffs.z(); }
 
-  /** \brief Returns the x-translation as a reference. **/
+  /** \brief Returns the x-translation as a reference. **/
   EIGEN_DEVICE_FUNC inline Scalar& x() { return m_coeffs.x(); }
-  /** \brief Returns the y-translation as a reference. **/
+  /** \brief Returns the y-translation as a reference. **/
   EIGEN_DEVICE_FUNC inline Scalar& y() { return m_coeffs.y(); }
-  /** \brief Returns the z-translation as a reference. **/
+  /** \brief Returns the z-translation as a reference. **/
   EIGEN_DEVICE_FUNC inline Scalar& z() { return m_coeffs.z(); }
 
   EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; }
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/arch/Geometry_SSE.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/arch/Geometry_SSE.h
index 108cc9f8e..f68cab583 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/arch/Geometry_SSE.h
@@ -25,20 +25,18 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, float>
   };
   static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
   {
-    evaluator<typename Derived::Coefficients> ae(_a.coeffs());
-    evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
     Quaternion<float> res;
-    const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
-    Packet4f a = ae.template packet<AAlignment,Packet4f>(0);
-    Packet4f b = be.template packet<BAlignment,Packet4f>(0);
-    Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
-    Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
+    const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f);
+    __m128 a = _a.coeffs().template packet<AAlignment>(0);
+    __m128 b = _b.coeffs().template packet<BAlignment>(0);
+    __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2));
+    __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1));
     pstoret<float,Packet4f,ResAlignment>(
               &res.x(),
-              padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)),
-                                    pmul(vec4f_swizzle1(a,2,0,1,0),
+              _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
+                                    _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
                                                vec4f_swizzle1(b,1,2,0,0))),
-                         pxor(mask,padd(s1,s2))));
+                         _mm_xor_ps(mask,_mm_add_ps(s1,s2))));
     
     return res;
   }
@@ -52,10 +50,9 @@ struct quat_conj<Architecture::SSE, Derived, float>
   };
   static inline Quaternion<float> run(const QuaternionBase<Derived>& q)
   {
-    evaluator<typename Derived::Coefficients> qe(q.coeffs());
     Quaternion<float> res;
-    const Packet4f mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f);
-    pstoret<float,Packet4f,ResAlignment>(&res.x(), pxor(mask, qe.template packet<traits<Derived>::Alignment,Packet4f>(0)));
+    const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f);
+    pstoret<float,Packet4f,ResAlignment>(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet<traits<Derived>::Alignment>(0)));
     return res;
   }
 };
@@ -70,14 +67,12 @@ struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
   static inline typename plain_matrix_type<VectorLhs>::type
   run(const VectorLhs& lhs, const VectorRhs& rhs)
   {
-    evaluator<VectorLhs> lhs_eval(lhs);
-    evaluator<VectorRhs> rhs_eval(rhs);
-    Packet4f a = lhs_eval.template packet<traits<VectorLhs>::Alignment,Packet4f>(0);
-    Packet4f b = rhs_eval.template packet<traits<VectorRhs>::Alignment,Packet4f>(0);
-    Packet4f mul1 = pmul(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
-    Packet4f mul2 = pmul(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
+    __m128 a = lhs.template packet<traits<VectorLhs>::Alignment>(0);
+    __m128 b = rhs.template packet<traits<VectorRhs>::Alignment>(0);
+    __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
+    __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
     typename plain_matrix_type<VectorLhs>::type res;
-    pstoret<float,Packet4f,ResAlignment>(&res.x(),psub(mul1,mul2));
+    pstoret<float,Packet4f,ResAlignment>(&res.x(),_mm_sub_ps(mul1,mul2));
     return res;
   }
 };
@@ -99,12 +94,9 @@ struct quat_product<Architecture::SSE, Derived, OtherDerived, double>
 
   Quaternion<double> res;
 
-  evaluator<typename Derived::Coefficients> ae(_a.coeffs());
-  evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
-
   const double* a = _a.coeffs().data();
-  Packet2d b_xy = be.template packet<BAlignment,Packet2d>(0);
-  Packet2d b_zw = be.template packet<BAlignment,Packet2d>(2);
+  Packet2d b_xy = _b.coeffs().template packet<BAlignment>(0);
+  Packet2d b_zw = _b.coeffs().template packet<BAlignment>(2);
   Packet2d a_xx = pset1<Packet2d>(a[0]);
   Packet2d a_yy = pset1<Packet2d>(a[1]);
   Packet2d a_zz = pset1<Packet2d>(a[2]);
@@ -153,12 +145,11 @@ struct quat_conj<Architecture::SSE, Derived, double>
   };
   static inline Quaternion<double> run(const QuaternionBase<Derived>& q)
   {
-    evaluator<typename Derived::Coefficients> qe(q.coeffs());
     Quaternion<double> res;
-    const Packet2d mask0 = _mm_setr_pd(-0.,-0.);
-    const Packet2d mask2 = _mm_setr_pd(-0.,0.);
-    pstoret<double,Packet2d,ResAlignment>(&res.x(), pxor(mask0, qe.template packet<traits<Derived>::Alignment,Packet2d>(0)));
-    pstoret<double,Packet2d,ResAlignment>(&res.z(), pxor(mask2, qe.template packet<traits<Derived>::Alignment,Packet2d>(2)));
+    const __m128d mask0 = _mm_setr_pd(-0.,-0.);
+    const __m128d mask2 = _mm_setr_pd(-0.,0.);
+    pstoret<double,Packet2d,ResAlignment>(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet<traits<Derived>::Alignment>(0)));
+    pstoret<double,Packet2d,ResAlignment>(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet<traits<Derived>::Alignment>(2)));
     return res;
   }
 };
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Householder/BlockHouseholder.h b/uppsrc/plugin/Eigen/Eigen/src/Householder/BlockHouseholder.h
index 39ce1c2a0..01a7ed188 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Householder/BlockHouseholder.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Householder/BlockHouseholder.h
@@ -63,15 +63,8 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c
       triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint()
                                                         * vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();
             
-      // FIXME use the following line with .noalias() once the triangular product can work inplace
-      // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
-      for(Index j=nbVecs-1; j>i; --j)
-      {
-        typename TriangularFactorType::Scalar z = triFactor(i,j);
-        triFactor(i,j) = z * triFactor(j,j);
-        if(nbVecs-j-1>0)
-          triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1);
-      }
+      // FIXME add .noalias() once the triangular product can work inplace
+      triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView<Upper>();
       
     }
     triFactor(i,i) = hCoeffs(i);
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Householder/Householder.h b/uppsrc/plugin/Eigen/Eigen/src/Householder/Householder.h
index 5bc037f00..80de2c305 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Householder/Householder.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Householder/Householder.h
@@ -39,7 +39,6 @@ template<int n> struct decrement_size
   *     MatrixBase::applyHouseholderOnTheRight()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
 {
   VectorBlock<Derived, internal::decrement_size<Base::SizeAtCompileTime>::ret> essentialPart(derived(), 1, size()-1);
@@ -63,7 +62,6 @@ void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
   */
 template<typename Derived>
 template<typename EssentialPart>
-EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::makeHouseholder(
   EssentialPart& essential,
   Scalar& tau,
@@ -105,14 +103,13 @@ void MatrixBase<Derived>::makeHouseholder(
   * \param essential the essential part of the vector \c v
   * \param tau the scaling factor of the Householder transformation
   * \param workspace a pointer to working space with at least
-  *                  this->cols() entries
+  *                  this->cols() * essential.size() entries
   *
   * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), 
   *     MatrixBase::applyHouseholderOnTheRight()
   */
 template<typename Derived>
 template<typename EssentialPart>
-EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   const EssentialPart& essential,
   const Scalar& tau,
@@ -143,14 +140,13 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
   * \param essential the essential part of the vector \c v
   * \param tau the scaling factor of the Householder transformation
   * \param workspace a pointer to working space with at least
-  *                  this->rows() entries
+  *                  this->rows() entries
   *
   * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), 
   *     MatrixBase::applyHouseholderOnTheLeft()
   */
 template<typename Derived>
 template<typename EssentialPart>
-EIGEN_DEVICE_FUNC
 void MatrixBase<Derived>::applyHouseholderOnTheRight(
   const EssentialPart& essential,
   const Scalar& tau,
@@ -164,10 +160,10 @@ void MatrixBase<Derived>::applyHouseholderOnTheRight(
   {
     Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
     Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
-    tmp.noalias() = right * essential;
+    tmp.noalias() = right * essential.conjugate();
     tmp += this->col(0);
     this->col(0) -= tau * tmp;
-    right.noalias() -= tau * tmp * essential.adjoint();
+    right.noalias() -= tau * tmp * essential.transpose();
   }
 }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Householder/HouseholderSequence.h b/uppsrc/plugin/Eigen/Eigen/src/Householder/HouseholderSequence.h
index 9318c281f..3ce0a693d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Householder/HouseholderSequence.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Householder/HouseholderSequence.h
@@ -87,7 +87,7 @@ struct hseq_side_dependent_impl
 {
   typedef Block<const VectorsType, Dynamic, 1> EssentialVectorType;
   typedef HouseholderSequence<VectorsType, CoeffsType, OnTheLeft> HouseholderSequenceType;
-  static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
+  static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
   {
     Index start = k+1+h.m_shift;
     return Block<const VectorsType,Dynamic,1>(h.m_vectors, start, k, h.rows()-start, 1);
@@ -140,28 +140,6 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       Side
     > ConjugateReturnType;
 
-    typedef HouseholderSequence<
-      VectorsType,
-      typename internal::conditional<NumTraits<Scalar>::IsComplex,
-        typename internal::remove_all<typename CoeffsType::ConjugateReturnType>::type,
-        CoeffsType>::type,
-      Side
-    > AdjointReturnType;
-
-    typedef HouseholderSequence<
-      typename internal::conditional<NumTraits<Scalar>::IsComplex,
-        typename internal::remove_all<typename VectorsType::ConjugateReturnType>::type,
-        VectorsType>::type,
-      CoeffsType,
-      Side
-    > TransposeReturnType;
-
-    typedef HouseholderSequence<
-      typename internal::add_const<VectorsType>::type,
-      typename internal::add_const<CoeffsType>::type,
-      Side
-    > ConstHouseholderSequence;
-
     /** \brief Constructor.
       * \param[in]  v      %Matrix containing the essential parts of the Householder vectors
       * \param[in]  h      Vector containing the Householder coefficients
@@ -179,19 +157,17 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       *
       * \sa setLength(), setShift()
       */
-    EIGEN_DEVICE_FUNC
     HouseholderSequence(const VectorsType& v, const CoeffsType& h)
-      : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()),
+      : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()),
         m_shift(0)
     {
     }
 
     /** \brief Copy constructor. */
-    EIGEN_DEVICE_FUNC
     HouseholderSequence(const HouseholderSequence& other)
       : m_vectors(other.m_vectors),
         m_coeffs(other.m_coeffs),
-        m_reverse(other.m_reverse),
+        m_trans(other.m_trans),
         m_length(other.m_length),
         m_shift(other.m_shift)
     {
@@ -201,14 +177,12 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       * \returns Number of rows 
       * \details This equals the dimension of the space that the transformation acts on.
       */
-    EIGEN_DEVICE_FUNC
     Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
 
     /** \brief Number of columns of transformation viewed as a matrix.
       * \returns Number of columns
       * \details This equals the dimension of the space that the transformation acts on.
       */
-    EIGEN_DEVICE_FUNC
     Index cols() const { return rows(); }
 
     /** \brief Essential part of a Householder vector.
@@ -225,7 +199,6 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       *
       * \sa setShift(), shift()
       */
-    EIGEN_DEVICE_FUNC
     const EssentialVectorType essentialVector(Index k) const
     {
       eigen_assert(k >= 0 && k < m_length);
@@ -233,51 +206,31 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
     }
 
     /** \brief %Transpose of the Householder sequence. */
-    TransposeReturnType transpose() const
+    HouseholderSequence transpose() const
     {
-      return TransposeReturnType(m_vectors.conjugate(), m_coeffs)
-              .setReverseFlag(!m_reverse)
-              .setLength(m_length)
-              .setShift(m_shift);
+      return HouseholderSequence(*this).setTrans(!m_trans);
     }
 
     /** \brief Complex conjugate of the Householder sequence. */
     ConjugateReturnType conjugate() const
     {
       return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate())
-             .setReverseFlag(m_reverse)
+             .setTrans(m_trans)
              .setLength(m_length)
              .setShift(m_shift);
     }
 
-    /** \returns an expression of the complex conjugate of \c *this if Cond==true,
-     *           returns \c *this otherwise.
-     */
-    template<bool Cond>
-    EIGEN_DEVICE_FUNC
-    inline typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type
-    conjugateIf() const
-    {
-      typedef typename internal::conditional<Cond,ConjugateReturnType,ConstHouseholderSequence>::type ReturnType;
-      return ReturnType(m_vectors.template conjugateIf<Cond>(), m_coeffs.template conjugateIf<Cond>());
-    }
-
     /** \brief Adjoint (conjugate transpose) of the Householder sequence. */
-    AdjointReturnType adjoint() const
+    ConjugateReturnType adjoint() const
     {
-      return AdjointReturnType(m_vectors, m_coeffs.conjugate())
-              .setReverseFlag(!m_reverse)
-              .setLength(m_length)
-              .setShift(m_shift);
+      return conjugate().setTrans(!m_trans);
     }
 
     /** \brief Inverse of the Householder sequence (equals the adjoint). */
-    AdjointReturnType inverse() const { return adjoint(); }
+    ConjugateReturnType inverse() const { return adjoint(); }
 
     /** \internal */
-    template<typename DestType>
-    inline EIGEN_DEVICE_FUNC
-    void evalTo(DestType& dst) const
+    template<typename DestType> inline void evalTo(DestType& dst) const
     {
       Matrix<Scalar, DestType::RowsAtCompileTime, 1,
              AutoAlign|ColMajor, DestType::MaxRowsAtCompileTime, 1> workspace(rows());
@@ -286,7 +239,6 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
 
     /** \internal */
     template<typename Dest, typename Workspace>
-    EIGEN_DEVICE_FUNC
     void evalTo(Dest& dst, Workspace& workspace) const
     {
       workspace.resize(rows());
@@ -299,7 +251,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
         for(Index k = vecs-1; k >= 0; --k)
         {
           Index cornerSize = rows() - k - m_shift;
-          if(m_reverse)
+          if(m_trans)
             dst.bottomRightCorner(cornerSize, cornerSize)
                .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
           else
@@ -313,26 +265,18 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
         for(Index k = 0; k<cols()-vecs ; ++k)
           dst.col(k).tail(rows()-k-1).setZero();
       }
-      else if(m_length>BlockSize)
-      {
-        dst.setIdentity(rows(), rows());
-        if(m_reverse)
-          applyThisOnTheLeft(dst,workspace,true);
-        else
-          applyThisOnTheLeft(dst,workspace,true);
-      }
       else
       {
         dst.setIdentity(rows(), rows());
         for(Index k = vecs-1; k >= 0; --k)
         {
           Index cornerSize = rows() - k - m_shift;
-          if(m_reverse)
+          if(m_trans)
             dst.bottomRightCorner(cornerSize, cornerSize)
-               .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
+               .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
           else
             dst.bottomRightCorner(cornerSize, cornerSize)
-               .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data());
+               .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
         }
       }
     }
@@ -351,34 +295,31 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       workspace.resize(dst.rows());
       for(Index k = 0; k < m_length; ++k)
       {
-        Index actual_k = m_reverse ? m_length-k-1 : k;
+        Index actual_k = m_trans ? m_length-k-1 : k;
         dst.rightCols(rows()-m_shift-actual_k)
            .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
       }
     }
 
     /** \internal */
-    template<typename Dest> inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const
+    template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
     {
       Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace;
-      applyThisOnTheLeft(dst, workspace, inputIsIdentity);
+      applyThisOnTheLeft(dst, workspace);
     }
 
     /** \internal */
     template<typename Dest, typename Workspace>
-    inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace, bool inputIsIdentity = false) const
+    inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const
     {
-      if(inputIsIdentity && m_reverse)
-        inputIsIdentity = false;
+      const Index BlockSize = 48;
       // if the entries are large enough, then apply the reflectors by block
       if(m_length>=BlockSize && dst.cols()>1)
       {
-        // Make sure we have at least 2 useful blocks, otherwise it is point-less:
-        Index blockSize = m_length<Index(2*BlockSize) ? (m_length+1)/2 : Index(BlockSize);
-        for(Index i = 0; i < m_length; i+=blockSize)
+        for(Index i = 0; i < m_length; i+=BlockSize)
         {
-          Index end = m_reverse ? (std::min)(m_length,i+blockSize) : m_length-i;
-          Index k = m_reverse ? i : (std::max)(Index(0),end-blockSize);
+          Index end = m_trans ? (std::min)(m_length,i+BlockSize) : m_length-i;
+          Index k = m_trans ? i : (std::max)(Index(0),end-BlockSize);
           Index bs = end-k;
           Index start = k + m_shift;
           
@@ -388,15 +329,8 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
                                                                    Side==OnTheRight ? bs : m_vectors.rows()-start,
                                                                    Side==OnTheRight ? m_vectors.cols()-start : bs);
           typename internal::conditional<Side==OnTheRight, Transpose<SubVectorsType>, SubVectorsType&>::type sub_vecs(sub_vecs1);
-
-          Index dstStart = dst.rows()-rows()+m_shift+k;
-          Index dstRows  = rows()-m_shift-k;
-          Block<Dest,Dynamic,Dynamic> sub_dst(dst,
-                                              dstStart,
-                                              inputIsIdentity ? dstStart : 0,
-                                              dstRows,
-                                              inputIsIdentity ? dstRows : dst.cols());
-          apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse);
+          Block<Dest,Dynamic,Dynamic> sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols());
+          apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans);
         }
       }
       else
@@ -404,9 +338,8 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
         workspace.resize(dst.cols());
         for(Index k = 0; k < m_length; ++k)
         {
-          Index actual_k = m_reverse ? k : m_length-k-1;
-          Index dstStart = rows()-m_shift-actual_k;
-          dst.bottomRightCorner(dstStart, inputIsIdentity ? dstStart : dst.cols())
+          Index actual_k = m_trans ? k : m_length-k-1;
+          dst.bottomRows(rows()-m_shift-actual_k)
             .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
         }
       }
@@ -424,7 +357,7 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
     {
       typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::Type
         res(other.template cast<typename internal::matrix_type_times_scalar_type<Scalar,OtherDerived>::ResultScalar>());
-      applyThisOnTheLeft(res, internal::is_identity<OtherDerived>::value && res.rows()==res.cols());
+      applyThisOnTheLeft(res);
       return res;
     }
 
@@ -439,7 +372,6 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       *
       * \sa length()
       */
-    EIGEN_DEVICE_FUNC
     HouseholderSequence& setLength(Index length)
     {
       m_length = length;
@@ -457,17 +389,13 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
       *
       * \sa shift()
       */
-    EIGEN_DEVICE_FUNC
     HouseholderSequence& setShift(Index shift)
     {
       m_shift = shift;
       return *this;
     }
 
-    EIGEN_DEVICE_FUNC
     Index length() const { return m_length; }  /**< \brief Returns the length of the Householder sequence. */
-
-    EIGEN_DEVICE_FUNC
     Index shift() const { return m_shift; }    /**< \brief Returns the shift of the Householder sequence. */
 
     /* Necessary for .adjoint() and .conjugate() */
@@ -475,30 +403,27 @@ template<typename VectorsType, typename CoeffsType, int Side> class HouseholderS
 
   protected:
 
-    /** \internal
-      * \brief Sets the reverse flag.
-      * \param [in]  reverse  New value of the reverse flag.
+    /** \brief Sets the transpose flag.
+      * \param [in]  trans  New value of the transpose flag.
       *
-      * By default, the reverse flag is not set. If the reverse flag is set, then this object represents
-      * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
-      * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$.
+      * By default, the transpose flag is not set. If the transpose flag is set, then this object represents 
+      * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
       *
-      * \sa reverseFlag(), transpose(), adjoint()
+      * \sa trans()
       */
-    HouseholderSequence& setReverseFlag(bool reverse)
+    HouseholderSequence& setTrans(bool trans)
     {
-      m_reverse = reverse;
+      m_trans = trans;
       return *this;
     }
 
-    bool reverseFlag() const { return m_reverse; }     /**< \internal \brief Returns the reverse flag. */
+    bool trans() const { return m_trans; }     /**< \brief Returns the transpose flag. */
 
     typename VectorsType::Nested m_vectors;
     typename CoeffsType::Nested m_coeffs;
-    bool m_reverse;
+    bool m_trans;
     Index m_length;
     Index m_shift;
-    enum { BlockSize = 48 };
 };
 
 /** \brief Computes the product of a matrix with a Householder sequence.
diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
index 153acef65..454f46814 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
@@ -191,16 +191,32 @@ public:
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
   {    
-    m_iterations = Base::maxIterations();
-    m_error = Base::m_tolerance;
-    
-    bool ret = internal::bicgstab(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error);
-
-    m_info = (!ret) ? NumericalIssue
+    bool failed = false;
+    for(Index j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations();
+      m_error = Base::m_tolerance;
+      
+      typename Dest::ColXpr xj(x,j);
+      if(!internal::bicgstab(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
+        failed = true;
+    }
+    m_info = failed ? NumericalIssue
            : m_error <= Base::m_tolerance ? Success
            : NoConvergence;
+    m_isInitialized = true;
+  }
+
+  /** \internal */
+  using Base::_solve_impl;
+  template<typename Rhs,typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
+  {
+    x.resize(this->rows(),b.cols());
+    x.setZero();
+    _solve_with_guess_impl(b,x);
   }
 
 protected:
diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
index 5d8c6b433..f7ce47134 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
@@ -51,7 +51,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
     return;
   }
   const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-  RealScalar threshold = numext::maxi(RealScalar(tol*tol*rhsNorm2),considerAsZero);
+  RealScalar threshold = numext::maxi(tol*tol*rhsNorm2,considerAsZero);
   RealScalar residualNorm2 = residual.squaredNorm();
   if (residualNorm2 < threshold)
   {
@@ -195,7 +195,7 @@ public:
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
   {
     typedef typename Base::MatrixWrapper MatrixWrapper;
     typedef typename Base::ActualMatrixType ActualMatrixType;
@@ -211,14 +211,31 @@ public:
                                            RowMajorWrapper,
                                            typename MatrixWrapper::template ConstSelfAdjointViewReturnType<UpLo>::Type
                                           >::type SelfAdjointWrapper;
-
     m_iterations = Base::maxIterations();
     m_error = Base::m_tolerance;
 
-    RowMajorWrapper row_mat(matrix());
-    internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b, x, Base::m_preconditioner, m_iterations, m_error);
+    for(Index j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations();
+      m_error = Base::m_tolerance;
+
+      typename Dest::ColXpr xj(x,j);
+      RowMajorWrapper row_mat(matrix());
+      internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
+    }
+
+    m_isInitialized = true;
     m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
   }
+  
+  /** \internal */
+  using Base::_solve_impl;
+  template<typename Rhs,typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
+  {
+    x.setZero();
+    _solve_with_guess_impl(b.derived(),x);
+  }
 
 protected:
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
index e5d0308ec..e45c272b4 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -41,7 +41,13 @@ namespace Eigen {
   * the info() method, then you can either increase the initial shift, or better use another preconditioning technique.
   *
   */
-template <typename Scalar, int _UpLo = Lower, typename _OrderingType = AMDOrdering<int> >
+template <typename Scalar, int _UpLo = Lower, typename _OrderingType =
+#ifndef EIGEN_MPL2_ONLY
+AMDOrdering<int>
+#else
+NaturalOrdering<int>
+#endif
+>
 class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_UpLo,_OrderingType> >
 {
   protected:
@@ -70,12 +76,12 @@ class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar,_Up
       *
       * \sa IncompleteCholesky(const MatrixType&)
       */
-    IncompleteCholesky() : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false) {}
+    IncompleteCholesky() : m_initialShift(1e-3),m_factorizationIsOk(false) {}
     
     /** Constructor computing the incomplete factorization for the given matrix \a matrix.
       */
     template<typename MatrixType>
-    IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false)
+    IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_factorizationIsOk(false)
     {
       compute(matrix);
     }
diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index 09436cb67..338e6f10a 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -136,7 +136,7 @@ class IncompleteLUT : public SparseSolverBase<IncompleteLUT<_Scalar, _StorageInd
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -225,15 +225,24 @@ void IncompleteLUT<Scalar,StorageIndex>::analyzePattern(const _MatrixType& amat)
   // Compute the Fill-reducing permutation
   // Since ILUT does not perform any numerical pivoting,
   // it is highly preferable to keep the diagonal through symmetric permutations.
+#ifndef EIGEN_MPL2_ONLY
   // To this end, let's symmetrize the pattern and perform AMD on it.
   SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
   SparseMatrix<Scalar,ColMajor, StorageIndex> mat2 = amat.transpose();
   // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice.
-  //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred...
+  //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred...
   SparseMatrix<Scalar,ColMajor, StorageIndex> AtA = mat2 + mat1;
   AMDOrdering<StorageIndex> ordering;
   ordering(AtA,m_P);
   m_Pinv  = m_P.inverse(); // cache the inverse permutation
+#else
+  // If AMD is not available, (MPL2-only), then let's use the slower COLAMD routine.
+  SparseMatrix<Scalar,ColMajor, StorageIndex> mat1 = amat;
+  COLAMDOrdering<StorageIndex> ordering;
+  ordering(mat1,m_Pinv);
+  m_P = m_Pinv.inverse();
+#endif
+
   m_analysisIsOk = true;
   m_factorizationIsOk = false;
   m_isInitialized = true;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index 13ba9a55b..7c2326eb7 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -275,7 +275,7 @@ public:
   const Preconditioner& preconditioner() const { return m_preconditioner; }
 
   /** \returns the max number of iterations.
-    * It is either the value set by setMaxIterations or, by default,
+    * It is either the value set by setMaxIterations or, by default,
     * twice the number of columns of the matrix.
     */
   Index maxIterations() const
@@ -331,7 +331,7 @@ public:
   
   /** \internal */
   template<typename Rhs, typename DestDerived>
-  void _solve_with_guess_impl(const Rhs& b, SparseMatrixBase<DestDerived> &aDest) const
+  void _solve_impl(const Rhs& b, SparseMatrixBase<DestDerived> &aDest) const
   {
     eigen_assert(rows()==b.rows());
     
@@ -344,65 +344,15 @@ public:
     // We do not directly fill dest because sparse expressions have to be free of aliasing issue.
     // For non square least-square problems, b and dest might not have the same size whereas they might alias each-other.
     typename DestDerived::PlainObject tmp(cols(),rhsCols);
-    ComputationInfo global_info = Success;
     for(Index k=0; k<rhsCols; ++k)
     {
       tb = b.col(k);
-      tx = dest.col(k);
-      derived()._solve_vector_with_guess_impl(tb,tx);
+      tx = derived().solve(tb);
       tmp.col(k) = tx.sparseView(0);
-
-      // The call to _solve_vector_with_guess_impl updates m_info, so if it failed for a previous column
-      // we need to restore it to the worst value.
-      if(m_info==NumericalIssue)
-        global_info = NumericalIssue;
-      else if(m_info==NoConvergence)
-        global_info = NoConvergence;
     }
-    m_info = global_info;
     dest.swap(tmp);
   }
 
-  template<typename Rhs, typename DestDerived>
-  typename internal::enable_if<Rhs::ColsAtCompileTime!=1 && DestDerived::ColsAtCompileTime!=1>::type
-  _solve_with_guess_impl(const Rhs& b, MatrixBase<DestDerived> &aDest) const
-  {
-    eigen_assert(rows()==b.rows());
-    
-    Index rhsCols = b.cols();
-    DestDerived& dest(aDest.derived());
-    ComputationInfo global_info = Success;
-    for(Index k=0; k<rhsCols; ++k)
-    {
-      typename DestDerived::ColXpr xk(dest,k);
-      typename Rhs::ConstColXpr bk(b,k);
-      derived()._solve_vector_with_guess_impl(bk,xk);
-
-      // The call to _solve_vector_with_guess updates m_info, so if it failed for a previous column
-      // we need to restore it to the worst value.
-      if(m_info==NumericalIssue)
-        global_info = NumericalIssue;
-      else if(m_info==NoConvergence)
-        global_info = NoConvergence;
-    }
-    m_info = global_info;
-  }
-
-  template<typename Rhs, typename DestDerived>
-  typename internal::enable_if<Rhs::ColsAtCompileTime==1 || DestDerived::ColsAtCompileTime==1>::type
-  _solve_with_guess_impl(const Rhs& b, MatrixBase<DestDerived> &dest) const
-  {
-    derived()._solve_vector_with_guess_impl(b,dest.derived());
-  }
-
-  /** \internal default initial guess = 0 */
-  template<typename Rhs,typename Dest>
-  void _solve_impl(const Rhs& b, Dest& x) const
-  {
-    x.setZero();
-    derived()._solve_with_guess_impl(b,x);
-  }
-
 protected:
   void init()
   {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
index 203fd0ec6..0aea0e099 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
@@ -182,14 +182,32 @@ public:
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
   {
     m_iterations = Base::maxIterations();
     m_error = Base::m_tolerance;
 
-    internal::least_square_conjugate_gradient(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error);
+    for(Index j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations();
+      m_error = Base::m_tolerance;
+
+      typename Dest::ColXpr xj(x,j);
+      internal::least_square_conjugate_gradient(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
+    }
+
+    m_isInitialized = true;
     m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
   }
+  
+  /** \internal */
+  using Base::_solve_impl;
+  template<typename Rhs,typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, Dest& x) const
+  {
+    x.setZero();
+    _solve_with_guess_impl(b.derived(),x);
+  }
 
 };
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
index 79e1e4819..0ace45177 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -108,7 +108,7 @@ struct Assignment<DstXprType, SolveWithGuess<DecType,RhsType,GuessType>, interna
   }
 };
 
-} // end namespace internal
+} // end namespace internal
 
 } // end namespace Eigen
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/Jacobi/Jacobi.h b/uppsrc/plugin/Eigen/Eigen/src/Jacobi/Jacobi.h
index bfb9dcb08..1998c6322 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/Jacobi/Jacobi.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/Jacobi/Jacobi.h
@@ -11,7 +11,7 @@
 #ifndef EIGEN_JACOBI_H
 #define EIGEN_JACOBI_H
 
-namespace Eigen {
+namespace Eigen { 
 
 /** \ingroup Jacobi_Module
   * \jacobi_module
@@ -37,20 +37,17 @@ template<typename Scalar> class JacobiRotation
     typedef typename NumTraits<Scalar>::Real RealScalar;
 
     /** Default constructor without any initialization. */
-    EIGEN_DEVICE_FUNC
     JacobiRotation() {}
 
     /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */
-    EIGEN_DEVICE_FUNC
     JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
 
-    EIGEN_DEVICE_FUNC Scalar& c() { return m_c; }
-    EIGEN_DEVICE_FUNC Scalar c() const { return m_c; }
-    EIGEN_DEVICE_FUNC Scalar& s() { return m_s; }
-    EIGEN_DEVICE_FUNC Scalar s() const { return m_s; }
+    Scalar& c() { return m_c; }
+    Scalar c() const { return m_c; }
+    Scalar& s() { return m_s; }
+    Scalar s() const { return m_s; }
 
     /** Concatenates two planar rotation */
-    EIGEN_DEVICE_FUNC
     JacobiRotation operator*(const JacobiRotation& other)
     {
       using numext::conj;
@@ -59,26 +56,19 @@ template<typename Scalar> class JacobiRotation
     }
 
     /** Returns the transposed transformation */
-    EIGEN_DEVICE_FUNC
     JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); }
 
     /** Returns the adjoint transformation */
-    EIGEN_DEVICE_FUNC
     JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
 
     template<typename Derived>
-    EIGEN_DEVICE_FUNC
     bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
-    EIGEN_DEVICE_FUNC
     bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
 
-    EIGEN_DEVICE_FUNC
     void makeGivens(const Scalar& p, const Scalar& q, Scalar* r=0);
 
   protected:
-    EIGEN_DEVICE_FUNC
     void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type);
-    EIGEN_DEVICE_FUNC
     void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type);
 
     Scalar m_c, m_s;
@@ -90,12 +80,10 @@ template<typename Scalar> class JacobiRotation
   * \sa MatrixBase::makeJacobi(const MatrixBase<Derived>&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
   */
 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z)
 {
   using std::sqrt;
   using std::abs;
-
   RealScalar deno = RealScalar(2)*abs(y);
   if(deno < (std::numeric_limits<RealScalar>::min)())
   {
@@ -135,7 +123,6 @@ bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, co
   */
 template<typename Scalar>
 template<typename Derived>
-EIGEN_DEVICE_FUNC
 inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Index p, Index q)
 {
   return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q)));
@@ -158,7 +145,6 @@ inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Ind
   * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
   */
 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r)
 {
   makeGivens(p, q, r, typename internal::conditional<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>::type());
@@ -167,13 +153,12 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
 
 // specialization for complexes
 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type)
 {
   using std::sqrt;
   using std::abs;
   using numext::conj;
-
+  
   if(q==Scalar(0))
   {
     m_c = numext::real(p)<0 ? Scalar(-1) : Scalar(1);
@@ -227,7 +212,6 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
 
 // specialization for reals
 template<typename Scalar>
-EIGEN_DEVICE_FUNC
 void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type)
 {
   using std::sqrt;
@@ -273,13 +257,12 @@ void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar
 
 namespace internal {
 /** \jacobi_module
-  * Applies the clock wise 2D rotation \a j to the set of 2D vectors of coordinates \a x and \a y:
+  * Applies the clock wise 2D rotation \a j to the set of 2D vectors of coordinates \a x and \a y:
   * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right )  =  J \left ( \begin{array}{cc} x \\ y \end{array} \right ) \f$
   *
   * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
   */
 template<typename VectorX, typename VectorY, typename OtherScalar>
-EIGEN_DEVICE_FUNC
 void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j);
 }
 
@@ -291,7 +274,6 @@ void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>&
   */
 template<typename Derived>
 template<typename OtherScalar>
-EIGEN_DEVICE_FUNC
 inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j)
 {
   RowXpr x(this->row(p));
@@ -307,7 +289,6 @@ inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRo
   */
 template<typename Derived>
 template<typename OtherScalar>
-EIGEN_DEVICE_FUNC
 inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j)
 {
   ColXpr x(this->col(p));
@@ -321,8 +302,7 @@ template<typename Scalar, typename OtherScalar,
          int SizeAtCompileTime, int MinAlignment, bool Vectorizable>
 struct apply_rotation_in_the_plane_selector
 {
-  static EIGEN_DEVICE_FUNC
-  inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
+  static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s)
   {
     for(Index i=0; i<size; ++i)
     {
@@ -449,7 +429,6 @@ struct apply_rotation_in_the_plane_selector<Scalar,OtherScalar,SizeAtCompileTime
 };
 
 template<typename VectorX, typename VectorY, typename OtherScalar>
-EIGEN_DEVICE_FUNC
 void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y, const JacobiRotation<OtherScalar>& j)
 {
   typedef typename VectorX::Scalar Scalar;
@@ -463,7 +442,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x
 
   Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
   Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
-
+  
   OtherScalar c = j.c();
   OtherScalar s = j.s();
   if (c==OtherScalar(1) && s==OtherScalar(0))
diff --git a/uppsrc/plugin/Eigen/Eigen/src/KLUSupport/KLUSupport.h b/uppsrc/plugin/Eigen/Eigen/src/KLUSupport/KLUSupport.h
deleted file mode 100644
index d2633a935..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/KLUSupport/KLUSupport.h
+++ /dev/null
@@ -1,358 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2017 Kyle Macfarlan <kyle.macfarlan@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_KLUSUPPORT_H
-#define EIGEN_KLUSUPPORT_H
-
-namespace Eigen {
-
-/* TODO extract L, extract U, compute det, etc... */
-
-/** \ingroup KLUSupport_Module
-  * \brief A sparse LU factorization and solver based on KLU
-  *
-  * This class allows to solve for A.X = B sparse linear problems via a LU factorization
-  * using the KLU library. The sparse matrix A must be squared and full rank.
-  * The vectors or matrices X and B can be either dense or sparse.
-  *
-  * \warning The input matrix A should be in a \b compressed and \b column-major form.
-  * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  *
-  * \implsparsesolverconcept
-  *
-  * \sa \ref TutorialSparseSolverConcept, class UmfPackLU, class SparseLU
-  */
-
-
-inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B [ ], klu_common *Common, double) {
-   return klu_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B, Common);
-}
-
-inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double>B[], klu_common *Common, std::complex<double>) {
-   return klu_z_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), &numext::real_ref(B[0]), Common);
-}
-
-inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B[], klu_common *Common, double) {
-   return klu_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B, Common);
-}
-
-inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double>B[], klu_common *Common, std::complex<double>) {
-   return klu_z_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), &numext::real_ref(B[0]), 0, Common);
-}
-
-inline klu_numeric* klu_factor(int Ap [ ], int Ai [ ], double Ax [ ], klu_symbolic *Symbolic, klu_common *Common, double) {
-   return klu_factor(Ap, Ai, Ax, Symbolic, Common);
-}
-
-inline klu_numeric* klu_factor(int Ap[], int Ai[], std::complex<double> Ax[], klu_symbolic *Symbolic, klu_common *Common, std::complex<double>) {
-   return klu_z_factor(Ap, Ai, &numext::real_ref(Ax[0]), Symbolic, Common);
-}
-
-
-template<typename _MatrixType>
-class KLU : public SparseSolverBase<KLU<_MatrixType> >
-{
-  protected:
-    typedef SparseSolverBase<KLU<_MatrixType> > Base;
-    using Base::m_isInitialized;
-  public:
-    using Base::_solve_impl;
-    typedef _MatrixType MatrixType;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::StorageIndex StorageIndex;
-    typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
-    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
-    typedef SparseMatrix<Scalar> LUMatrixType;
-    typedef SparseMatrix<Scalar,ColMajor,int> KLUMatrixType;
-    typedef Ref<const KLUMatrixType, StandardCompressedFormat> KLUMatrixRef;
-    enum {
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-
-  public:
-
-    KLU()
-      : m_dummy(0,0), mp_matrix(m_dummy)
-    {
-      init();
-    }
-
-    template<typename InputMatrixType>
-    explicit KLU(const InputMatrixType& matrix)
-      : mp_matrix(matrix)
-    {
-      init();
-      compute(matrix);
-    }
-
-    ~KLU()
-    {
-      if(m_symbolic) klu_free_symbolic(&m_symbolic,&m_common);
-      if(m_numeric)  klu_free_numeric(&m_numeric,&m_common);
-    }
-
-    inline Index rows() const { return mp_matrix.rows(); }
-    inline Index cols() const { return mp_matrix.cols(); }
-
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was successful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return m_info;
-    }
-#if 0 // not implemented yet
-    inline const LUMatrixType& matrixL() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_l;
-    }
-
-    inline const LUMatrixType& matrixU() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_u;
-    }
-
-    inline const IntColVectorType& permutationP() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_p;
-    }
-
-    inline const IntRowVectorType& permutationQ() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_q;
-    }
-#endif
-    /** Computes the sparse Cholesky decomposition of \a matrix
-     *  Note that the matrix should be column-major, and in compressed format for best performance.
-     *  \sa SparseMatrix::makeCompressed().
-     */
-    template<typename InputMatrixType>
-    void compute(const InputMatrixType& matrix)
-    {
-      if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
-      if(m_numeric)  klu_free_numeric(&m_numeric, &m_common);
-      grab(matrix.derived());
-      analyzePattern_impl();
-      factorize_impl();
-    }
-
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
-      *
-      * This function is particularly useful when solving for several problems having the same structure.
-      *
-      * \sa factorize(), compute()
-      */
-    template<typename InputMatrixType>
-    void analyzePattern(const InputMatrixType& matrix)
-    {
-      if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
-      if(m_numeric)  klu_free_numeric(&m_numeric, &m_common);
-
-      grab(matrix.derived());
-
-      analyzePattern_impl();
-    }
-
-
-    /** Provides access to the control settings array used by KLU.
-      *
-      * See KLU documentation for details.
-      */
-    inline const klu_common& kluCommon() const
-    {
-      return m_common;
-    }
-
-    /** Provides access to the control settings array used by UmfPack.
-      *
-      * If this array contains NaN's, the default values are used.
-      *
-      * See KLU documentation for details.
-      */
-    inline klu_common& kluCommon()
-    {
-      return m_common;
-    }
-
-    /** Performs a numeric decomposition of \a matrix
-      *
-      * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed.
-      *
-      * \sa analyzePattern(), compute()
-      */
-    template<typename InputMatrixType>
-    void factorize(const InputMatrixType& matrix)
-    {
-      eigen_assert(m_analysisIsOk && "KLU: you must first call analyzePattern()");
-      if(m_numeric)
-        klu_free_numeric(&m_numeric,&m_common);
-
-      grab(matrix.derived());
-
-      factorize_impl();
-    }
-
-    /** \internal */
-    template<typename BDerived,typename XDerived>
-    bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
-
-#if 0 // not implemented yet
-    Scalar determinant() const;
-
-    void extractData() const;
-#endif
-
-  protected:
-
-    void init()
-    {
-      m_info                  = InvalidInput;
-      m_isInitialized         = false;
-      m_numeric               = 0;
-      m_symbolic              = 0;
-      m_extractedDataAreDirty = true;
-
-      klu_defaults(&m_common);
-    }
-
-    void analyzePattern_impl()
-    {
-      m_info = InvalidInput;
-      m_analysisIsOk = false;
-      m_factorizationIsOk = false;
-      m_symbolic = klu_analyze(internal::convert_index<int>(mp_matrix.rows()),
-                                     const_cast<StorageIndex*>(mp_matrix.outerIndexPtr()), const_cast<StorageIndex*>(mp_matrix.innerIndexPtr()),
-                                     &m_common);
-      if (m_symbolic) {
-         m_isInitialized = true;
-         m_info = Success;
-         m_analysisIsOk = true;
-         m_extractedDataAreDirty = true;
-      }
-    }
-
-    void factorize_impl()
-    {
-
-      m_numeric = klu_factor(const_cast<StorageIndex*>(mp_matrix.outerIndexPtr()), const_cast<StorageIndex*>(mp_matrix.innerIndexPtr()), const_cast<Scalar*>(mp_matrix.valuePtr()),
-                                    m_symbolic, &m_common, Scalar());
-                                         
-
-      m_info = m_numeric ? Success : NumericalIssue;
-      m_factorizationIsOk = m_numeric ? 1 : 0;
-      m_extractedDataAreDirty = true;
-    }
-
-    template<typename MatrixDerived>
-    void grab(const EigenBase<MatrixDerived> &A)
-    {
-      mp_matrix.~KLUMatrixRef();
-      ::new (&mp_matrix) KLUMatrixRef(A.derived());
-    }
-
-    void grab(const KLUMatrixRef &A)
-    {
-      if(&(A.derived()) != &mp_matrix)
-      {
-        mp_matrix.~KLUMatrixRef();
-        ::new (&mp_matrix) KLUMatrixRef(A);
-      }
-    }
-
-    // cached data to reduce reallocation, etc.
-#if 0 // not implemented yet
-    mutable LUMatrixType m_l;
-    mutable LUMatrixType m_u;
-    mutable IntColVectorType m_p;
-    mutable IntRowVectorType m_q;
-#endif
-
-    KLUMatrixType m_dummy;
-    KLUMatrixRef mp_matrix;
-
-    klu_numeric* m_numeric;
-    klu_symbolic* m_symbolic;
-    klu_common m_common;
-    mutable ComputationInfo m_info;
-    int m_factorizationIsOk;
-    int m_analysisIsOk;
-    mutable bool m_extractedDataAreDirty;
-
-  private:
-    KLU(const KLU& ) { }
-};
-
-#if 0 // not implemented yet
-template<typename MatrixType>
-void KLU<MatrixType>::extractData() const
-{
-  if (m_extractedDataAreDirty)
-  {
-     eigen_assert(false && "KLU: extractData Not Yet Implemented");
-
-    // get size of the data
-    int lnz, unz, rows, cols, nz_udiag;
-    umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
-
-    // allocate data
-    m_l.resize(rows,(std::min)(rows,cols));
-    m_l.resizeNonZeros(lnz);
-
-    m_u.resize((std::min)(rows,cols),cols);
-    m_u.resizeNonZeros(unz);
-
-    m_p.resize(rows);
-    m_q.resize(cols);
-
-    // extract
-    umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(),
-                        m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(),
-                        m_p.data(), m_q.data(), 0, 0, 0, m_numeric);
-
-    m_extractedDataAreDirty = false;
-  }
-}
-
-template<typename MatrixType>
-typename KLU<MatrixType>::Scalar KLU<MatrixType>::determinant() const
-{
-  eigen_assert(false && "KLU: extractData Not Yet Implemented");
-  return Scalar();
-}
-#endif
-
-template<typename MatrixType>
-template<typename BDerived,typename XDerived>
-bool KLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
-{
-  Index rhsCols = b.cols();
-  EIGEN_STATIC_ASSERT((XDerived::Flags&RowMajorBit)==0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
-  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
-
-  x = b;
-  int info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast<klu_common*>(&m_common), Scalar());
-
-  m_info = info!=0 ? Success : NumericalIssue;
-  return true;
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_KLUSUPPORT_H
diff --git a/uppsrc/plugin/Eigen/Eigen/src/LU/Determinant.h b/uppsrc/plugin/Eigen/Eigen/src/LU/Determinant.h
index 3a41e6fcb..d6a3c1e5a 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/LU/Determinant.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/LU/Determinant.h
@@ -15,7 +15,6 @@ namespace Eigen {
 namespace internal {
 
 template<typename Derived>
-EIGEN_DEVICE_FUNC
 inline const typename Derived::Scalar bruteforce_det3_helper
 (const MatrixBase<Derived>& matrix, int a, int b, int c)
 {
@@ -23,6 +22,14 @@ inline const typename Derived::Scalar bruteforce_det3_helper
          * (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b));
 }
 
+template<typename Derived> // helper for the 4x4 determinant: product of two complementary 2x2 minors
+const typename Derived::Scalar bruteforce_det4_helper
+(const MatrixBase<Derived>& matrix, int j, int k, int m, int n) // j,k and m,n are row indices
+{
+  return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1)) // 2x2 minor: rows (j,k), columns (0,1)
+       * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3)); // 2x2 minor: rows (m,n), columns (2,3)
+}
+
 template<typename Derived,
          int DeterminantType = Derived::RowsAtCompileTime
 > struct determinant_impl
@@ -37,8 +44,7 @@ template<typename Derived,
 
 template<typename Derived> struct determinant_impl<Derived, 1>
 {
-  static inline EIGEN_DEVICE_FUNC
-  typename traits<Derived>::Scalar run(const Derived& m)
+  static inline typename traits<Derived>::Scalar run(const Derived& m)
   {
     return m.coeff(0,0);
   }
@@ -46,8 +52,7 @@ template<typename Derived> struct determinant_impl<Derived, 1>
 
 template<typename Derived> struct determinant_impl<Derived, 2>
 {
-  static inline EIGEN_DEVICE_FUNC
-  typename traits<Derived>::Scalar run(const Derived& m)
+  static inline typename traits<Derived>::Scalar run(const Derived& m)
   {
     return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1);
   }
@@ -55,8 +60,7 @@ template<typename Derived> struct determinant_impl<Derived, 2>
 
 template<typename Derived> struct determinant_impl<Derived, 3>
 {
-  static inline EIGEN_DEVICE_FUNC
-  typename traits<Derived>::Scalar run(const Derived& m)
+  static inline typename traits<Derived>::Scalar run(const Derived& m)
   {
     return bruteforce_det3_helper(m,0,1,2)
           - bruteforce_det3_helper(m,1,0,2)
@@ -66,34 +70,15 @@ template<typename Derived> struct determinant_impl<Derived, 3>
 
 template<typename Derived> struct determinant_impl<Derived, 4>
 {
-  typedef typename traits<Derived>::Scalar Scalar;
-  static EIGEN_DEVICE_FUNC
-  Scalar run(const Derived& m)
+  static typename traits<Derived>::Scalar run(const Derived& m)
   {
-    Scalar d2_01 = det2(m, 0, 1);
-    Scalar d2_02 = det2(m, 0, 2);
-    Scalar d2_03 = det2(m, 0, 3);
-    Scalar d2_12 = det2(m, 1, 2);
-    Scalar d2_13 = det2(m, 1, 3);
-    Scalar d2_23 = det2(m, 2, 3);
-    Scalar d3_0 = det3(m, 1,d2_23, 2,d2_13, 3,d2_12);
-    Scalar d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02);
-    Scalar d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01);
-    Scalar d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01);
-    return internal::pmadd(-m(0,3),d3_0, m(1,3)*d3_1) +
-           internal::pmadd(-m(2,3),d3_2, m(3,3)*d3_3);
-  }
-protected:
-  static EIGEN_DEVICE_FUNC
-  Scalar det2(const Derived& m, Index i0, Index i1)
-  {
-    return m(i0,0) * m(i1,1) - m(i1,0) * m(i0,1);
-  }
-
-  static EIGEN_DEVICE_FUNC
-  Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1, Index i2, const Scalar& d2)
-  {
-    return internal::pmadd(m(i0,2), d0, internal::pmadd(-m(i1,2), d1, m(i2,2)*d2));
+    // trick by Martin Costabel to compute 4x4 det with only 30 muls
+    return bruteforce_det4_helper(m,0,1,2,3)
+          - bruteforce_det4_helper(m,0,2,1,3)
+          + bruteforce_det4_helper(m,0,3,1,2)
+          + bruteforce_det4_helper(m,1,2,0,3)
+          - bruteforce_det4_helper(m,1,3,0,2)
+          + bruteforce_det4_helper(m,2,3,0,1);
   }
 };
 
@@ -104,7 +89,6 @@ protected:
   * \returns the determinant of this matrix
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
 inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const
 {
   eigen_assert(rows() == cols());
diff --git a/uppsrc/plugin/Eigen/Eigen/src/LU/FullPivLU.h b/uppsrc/plugin/Eigen/Eigen/src/LU/FullPivLU.h
index ef93ec5eb..03b6af706 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/LU/FullPivLU.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/LU/FullPivLU.h
@@ -18,7 +18,6 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
 {
   typedef MatrixXpr XprKind;
   typedef SolverStorage StorageKind;
-  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -49,12 +48,12 @@ template<typename _MatrixType> struct traits<FullPivLU<_MatrixType> >
   * The data of the LU decomposition can be directly accessed through the methods matrixLU(),
   * permutationP(), permutationQ().
   *
-  * As an example, here is how the original matrix can be retrieved:
+  * As an example, here is how the original matrix can be retrieved:
   * \include class_FullPivLU.cpp
   * Output: \verbinclude class_FullPivLU.out
   *
   * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
-  *
+  * 
   * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
   */
 template<typename _MatrixType> class FullPivLU
@@ -63,9 +62,9 @@ template<typename _MatrixType> class FullPivLU
   public:
     typedef _MatrixType MatrixType;
     typedef SolverBase<FullPivLU> Base;
-    friend class SolverBase<FullPivLU>;
 
     EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU)
+    // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
     enum {
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
@@ -219,7 +218,6 @@ template<typename _MatrixType> class FullPivLU
       return internal::image_retval<FullPivLU>(*this, originalMatrix);
     }
 
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** \return a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the LU decomposition.
       *
@@ -239,10 +237,14 @@ template<typename _MatrixType> class FullPivLU
       *
       * \sa TriangularView::solve(), kernel(), inverse()
       */
+    // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
     template<typename Rhs>
     inline const Solve<FullPivLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const;
-    #endif
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "LU is not initialized.");
+      return Solve<FullPivLU, Rhs>(*this, b.derived());
+    }
 
     /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
         the LU decomposition.
@@ -318,7 +320,7 @@ template<typename _MatrixType> class FullPivLU
       return m_usePrescribedThreshold ? m_prescribedThreshold
       // this formula comes from experimenting (see "LU precision tuning" thread on the list)
       // and turns out to be identical to Higham's formula used already in LDLt.
-          : NumTraits<Scalar>::epsilon() * RealScalar(m_lu.diagonalSize());
+                                      : NumTraits<Scalar>::epsilon() * m_lu.diagonalSize();
     }
 
     /** \returns the rank of the matrix of which *this is the LU decomposition.
@@ -409,9 +411,11 @@ template<typename _MatrixType> class FullPivLU
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
 
     template<bool Conjugate, typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
     void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
@@ -527,8 +531,8 @@ void FullPivLU<MatrixType>::computeInPlace()
       m_nonzero_pivots = k;
       for(Index i = k; i < size; ++i)
       {
-        m_rowsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
-        m_colsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+        m_rowsTranspositions.coeffRef(i) = i;
+        m_colsTranspositions.coeffRef(i) = i;
       }
       break;
     }
@@ -539,8 +543,8 @@ void FullPivLU<MatrixType>::computeInPlace()
     // Now that we've found the pivot, we need to apply the row/col swaps to
     // bring it to the location (k,k).
 
-    m_rowsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
-    m_colsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
+    m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner;
+    m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner;
     if(k != row_of_biggest_in_corner) {
       m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner));
       ++number_of_transpositions;
@@ -753,6 +757,7 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
   const Index rows = this->rows(),
               cols = this->cols(),
               nonzero_pivots = this->rank();
+  eigen_assert(rhs.rows() == rows);
   const Index smalldim = (std::min)(rows, cols);
 
   if(nonzero_pivots == 0)
@@ -802,6 +807,7 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType
 
   const Index rows = this->rows(), cols = this->cols(),
     nonzero_pivots = this->rank();
+   eigen_assert(rhs.rows() == cols);
   const Index smalldim = (std::min)(rows, cols);
 
   if(nonzero_pivots == 0)
@@ -815,19 +821,29 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType
   // Step 1
   c = permutationQ().inverse() * rhs;
 
-  // Step 2
-  m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
-      .template triangularView<Upper>()
-      .transpose()
-      .template conjugateIf<Conjugate>()
-      .solveInPlace(c.topRows(nonzero_pivots));
-
-  // Step 3
-  m_lu.topLeftCorner(smalldim, smalldim)
-      .template triangularView<UnitLower>()
-      .transpose()
-      .template conjugateIf<Conjugate>()
-      .solveInPlace(c.topRows(smalldim));
+  if (Conjugate) {
+    // Step 2
+    m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+        .template triangularView<Upper>()
+        .adjoint()
+        .solveInPlace(c.topRows(nonzero_pivots));
+    // Step 3
+    m_lu.topLeftCorner(smalldim, smalldim)
+        .template triangularView<UnitLower>()
+        .adjoint()
+        .solveInPlace(c.topRows(smalldim));
+  } else {
+    // Step 2
+    m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+        .template triangularView<Upper>()
+        .transpose()
+        .solveInPlace(c.topRows(nonzero_pivots));
+    // Step 3
+    m_lu.topLeftCorner(smalldim, smalldim)
+        .template triangularView<UnitLower>()
+        .transpose()
+        .solveInPlace(c.topRows(smalldim));
+  }
 
   // Step 4
   PermutationPType invp = permutationP().inverse().eval();
diff --git a/uppsrc/plugin/Eigen/Eigen/src/LU/InverseImpl.h b/uppsrc/plugin/Eigen/Eigen/src/LU/InverseImpl.h
index 1bab00c01..f49f23360 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/LU/InverseImpl.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/LU/InverseImpl.h
@@ -290,7 +290,6 @@ template<typename DstXprType, typename XprType>
 struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar>, Dense2Dense>
 {
   typedef Inverse<XprType> SrcXprType;
-  EIGEN_DEVICE_FUNC
   static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename XprType::Scalar> &)
   {
     Index dstRows = src.rows();
@@ -333,7 +332,6 @@ struct Assignment<DstXprType, Inverse<XprType>, internal::assign_op<typename Dst
   * \sa computeInverseAndDetWithCheck()
   */
 template<typename Derived>
-EIGEN_DEVICE_FUNC
 inline const Inverse<Derived> MatrixBase<Derived>::inverse() const
 {
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/LU/PartialPivLU.h b/uppsrc/plugin/Eigen/Eigen/src/LU/PartialPivLU.h
index b8938013a..6b10f39fa 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/LU/PartialPivLU.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/LU/PartialPivLU.h
@@ -19,7 +19,6 @@ template<typename _MatrixType> struct traits<PartialPivLU<_MatrixType> >
 {
   typedef MatrixXpr XprKind;
   typedef SolverStorage StorageKind;
-  typedef int StorageIndex;
   typedef traits<_MatrixType> BaseTraits;
   enum {
     Flags = BaseTraits::Flags & RowMajorBit,
@@ -80,9 +79,8 @@ template<typename _MatrixType> class PartialPivLU
 
     typedef _MatrixType MatrixType;
     typedef SolverBase<PartialPivLU> Base;
-    friend class SolverBase<PartialPivLU>;
-
     EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU)
+    // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int
     enum {
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
@@ -154,7 +152,6 @@ template<typename _MatrixType> class PartialPivLU
       return m_p;
     }
 
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method returns the solution x to the equation Ax=b, where A is the matrix of which
       * *this is the LU decomposition.
       *
@@ -172,10 +169,14 @@ template<typename _MatrixType> class PartialPivLU
       *
       * \sa TriangularView::solve(), inverse(), computeInverse()
       */
+    // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion.
     template<typename Rhs>
     inline const Solve<PartialPivLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const;
-    #endif
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+      return Solve<PartialPivLU, Rhs>(*this, b.derived());
+    }
 
     /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
         the LU decomposition.
@@ -230,6 +231,8 @@ template<typename _MatrixType> class PartialPivLU
       * Step 3: replace c by the solution x to Ux = c.
       */
 
+      eigen_assert(rhs.rows() == m_lu.rows());
+
       // Step 1
       dst = permutationP() * rhs;
 
@@ -243,21 +246,26 @@ template<typename _MatrixType> class PartialPivLU
     template<bool Conjugate, typename RhsType, typename DstType>
     EIGEN_DEVICE_FUNC
     void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const {
-     /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P.
+     /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P (adjoints instead of transposes if Conjugate).
       * So we proceed as follows:
-      * Step 1: compute c as the solution to L^T c = b
-      * Step 2: replace c by the solution x to U^T x = c.
-      * Step 3: update  c = P^-1 c.
+      * Step 1: compute c as the solution to U^T c = b.
+      * Step 2: replace c by the solution x to L^T x = c.
+      * Step 3: update c = P^-1 c.
       */
 
       eigen_assert(rhs.rows() == m_lu.cols());
 
-      // Step 1
-      dst = m_lu.template triangularView<Upper>().transpose()
-                .template conjugateIf<Conjugate>().solve(rhs);
-      // Step 2
-      m_lu.template triangularView<UnitLower>().transpose()
-          .template conjugateIf<Conjugate>().solveInPlace(dst);
+      if (Conjugate) {
+        // Step 1
+        dst = m_lu.template triangularView<Upper>().adjoint().solve(rhs);
+        // Step 2
+        m_lu.template triangularView<UnitLower>().adjoint().solveInPlace(dst);
+      } else {
+        // Step 1
+        dst = m_lu.template triangularView<Upper>().transpose().solve(rhs);
+        // Step 2
+        m_lu.template triangularView<UnitLower>().transpose().solveInPlace(dst);
+      }
       // Step 3
       dst = permutationP().transpose() * dst;
     }
@@ -331,18 +339,17 @@ PartialPivLU<MatrixType>::PartialPivLU(EigenBase<InputType>& matrix)
 namespace internal {
 
 /** \internal This is the blocked version of fullpivlu_unblocked() */
-template<typename Scalar, int StorageOrder, typename PivIndex, int SizeAtCompileTime=Dynamic>
+template<typename Scalar, int StorageOrder, typename PivIndex>
 struct partial_lu_impl
 {
-  static const int UnBlockedBound = 16;
-  static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound;
-  static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic;
-  // Remaining rows and columns at compile-time:
-  static const int RRows = SizeAtCompileTime==2 ? 1 : Dynamic;
-  static const int RCols = SizeAtCompileTime==2 ? 1 : Dynamic;
-  typedef Matrix<Scalar, ActualSizeAtCompileTime, ActualSizeAtCompileTime, StorageOrder> MatrixType;
-  typedef Ref<MatrixType> MatrixTypeRef;
-  typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > BlockType;
+  // FIXME add a stride to Map, so that the following mapping becomes easier,
+  // another option would be to create an expression being able to automatically
+  // wrap any Map, Matrix, and Block expressions as a unique type, but since that's exactly
+  // a Map + stride, why not add a stride to Map, and convenient ctors from a Matrix,
+  // and Block.
+  typedef Map<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > MapLU;
+  typedef Block<MapLU, Dynamic, Dynamic> MatrixType;
+  typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
   typedef typename MatrixType::RealScalar RealScalar;
 
   /** \internal performs the LU decomposition in-place of the matrix \a lu
@@ -355,22 +362,19 @@ struct partial_lu_impl
     *
     * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
     */
-  static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
+  static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
   {
     typedef scalar_score_coeff_op<Scalar> Scoring;
     typedef typename Scoring::result_type Score;
     const Index rows = lu.rows();
     const Index cols = lu.cols();
     const Index size = (std::min)(rows,cols);
-    // For small compile-time matrices it is worth processing the last row separately:
-    //  speedup: +100% for 2x2, +10% for others.
-    const Index endk = UnBlockedAtCompileTime ? size-1 : size;
     nb_transpositions = 0;
     Index first_zero_pivot = -1;
-    for(Index k = 0; k < endk; ++k)
+    for(Index k = 0; k < size; ++k)
     {
-      int rrows = internal::convert_index<int>(rows-k-1);
-      int rcols = internal::convert_index<int>(cols-k-1);
+      Index rrows = rows-k-1;
+      Index rcols = cols-k-1;
 
       Index row_of_biggest_in_col;
       Score biggest_in_corner
@@ -387,7 +391,9 @@ struct partial_lu_impl
           ++nb_transpositions;
         }
 
-        lu.col(k).tail(fix<RRows>(rrows)) /= lu.coeff(k,k);
+        // FIXME shall we introduce a safe quotient expression in case 1/lu.coeff(k,k)
+        // overflows but the actual quotient does not?
+        lu.col(k).tail(rrows) /= lu.coeff(k,k);
       }
       else if(first_zero_pivot==-1)
       {
@@ -397,18 +403,8 @@ struct partial_lu_impl
       }
 
       if(k<rows-1)
-        lu.bottomRightCorner(fix<RRows>(rrows),fix<RCols>(rcols)).noalias() -= lu.col(k).tail(fix<RRows>(rrows)) * lu.row(k).tail(fix<RCols>(rcols));
+        lu.bottomRightCorner(rrows,rcols).noalias() -= lu.col(k).tail(rrows) * lu.row(k).tail(rcols);
     }
-
-    // special handling of the last entry
-    if(UnBlockedAtCompileTime)
-    {
-      Index k = endk;
-      row_transpositions[k] = PivIndex(k);
-      if (Scoring()(lu(k, k)) == Score(0) && first_zero_pivot == -1)
-        first_zero_pivot = k;
-    }
-
     return first_zero_pivot;
   }
 
@@ -424,17 +420,18 @@ struct partial_lu_impl
     * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
     *
     * \note This very low level interface using pointers, etc. is to:
-    *   1 - reduce the number of instantiations to the strict minimum
-    *   2 - avoid infinite recursion of the instantiations with Block<Block<Block<...> > >
+    *   1 - reduce the number of instantiations to the strict minimum
+    *   2 - avoid infinite recursion of the instantiations with Block<Block<Block<...> > >
     */
   static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256)
   {
-    MatrixTypeRef lu = MatrixType::Map(lu_data,rows, cols, OuterStride<>(luStride));
+    MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols);
+    MatrixType lu(lu1,0,0,rows,cols);
 
     const Index size = (std::min)(rows,cols);
 
     // if the matrix is too small, no blocking:
-    if(UnBlockedAtCompileTime || size<=UnBlockedBound)
+    if(size<=16)
     {
       return unblocked_lu(lu, row_transpositions, nb_transpositions);
     }
@@ -460,12 +457,12 @@ struct partial_lu_impl
       //                          A00 | A01 | A02
       // lu  = A_0 | A_1 | A_2 =  A10 | A11 | A12
       //                          A20 | A21 | A22
-      BlockType A_0 = lu.block(0,0,rows,k);
-      BlockType A_2 = lu.block(0,k+bs,rows,tsize);
-      BlockType A11 = lu.block(k,k,bs,bs);
-      BlockType A12 = lu.block(k,k+bs,bs,tsize);
-      BlockType A21 = lu.block(k+bs,k,trows,bs);
-      BlockType A22 = lu.block(k+bs,k+bs,trows,tsize);
+      BlockType A_0(lu,0,0,rows,k);
+      BlockType A_2(lu,0,k+bs,rows,tsize);
+      BlockType A11(lu,k,k,bs,bs);
+      BlockType A12(lu,k,k+bs,bs,tsize);
+      BlockType A21(lu,k+bs,k,trows,bs);
+      BlockType A22(lu,k+bs,k+bs,trows,tsize);
 
       PivIndex nb_transpositions_in_panel;
       // recursively call the blocked LU algorithm on [A11^T A21^T]^T
@@ -508,9 +505,7 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t
   eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
 
   partial_lu_impl
-    < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor,
-      typename TranspositionType::StorageIndex,
-      EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)>
+    <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::StorageIndex>
     ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions);
 }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Amd.h b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Amd.h
index 7ca3f33b1..f91ecb24e 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Amd.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Amd.h
@@ -2,22 +2,32 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 /*
+
 NOTE: this routine has been adapted from the CSparse library:
 
 Copyright (c) 2006, Timothy A. Davis.
 http://www.suitesparse.com
 
-The author of CSparse, Timothy A. Davis., has executed a license with Google LLC
-to permit distribution of this code and derivative works as part of Eigen under
-the Mozilla Public License v. 2.0, as stated at the top of this file.
+CSparse is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+CSparse is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this Module; if not, write to the Free Software
+Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+
 */
 
+#include "../Core/util/NonMPL2.h"
+
 #ifndef EIGEN_SPARSE_AMD_H
 #define EIGEN_SPARSE_AMD_H
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h
index 8e339a704..da85b4d6e 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -13,119 +13,115 @@
 //   Davis (davis@cise.ufl.edu), University of Florida.  The algorithm was
 //   developed in collaboration with John Gilbert, Xerox PARC, and Esmond
 //   Ng, Oak Ridge National Laboratory.
-//
+// 
 //     Date:
-//
+// 
 //   September 8, 2003.  Version 2.3.
-//
+// 
 //     Acknowledgements:
-//
+// 
 //   This work was supported by the National Science Foundation, under
 //   grants DMS-9504974 and DMS-9803599.
-//
+// 
 //     Notice:
-//
+// 
 //   Copyright (c) 1998-2003 by the University of Florida.
 //   All Rights Reserved.
-//
+// 
 //   THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
 //   EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
-//
+// 
 //   Permission is hereby granted to use, copy, modify, and/or distribute
 //   this program, provided that the Copyright, this License, and the
 //   Availability of the original version is retained on all copies and made
 //   accessible to the end-user of any code or package that includes COLAMD
-//   or any modified version of COLAMD.
-//
+//   or any modified version of COLAMD. 
+// 
 //     Availability:
-//
+// 
 //   The colamd/symamd library is available at
-//
+// 
 //       http://www.suitesparse.com
 
-
+  
 #ifndef EIGEN_COLAMD_H
 #define EIGEN_COLAMD_H
 
 namespace internal {
-
-namespace Colamd {
-
 /* Ensure that debugging is turned off: */
 #ifndef COLAMD_NDEBUG
 #define COLAMD_NDEBUG
 #endif /* NDEBUG */
-
-
 /* ========================================================================== */
 /* === Knob and statistics definitions ====================================== */
 /* ========================================================================== */
 
 /* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
-const int NKnobs = 20;
+#define COLAMD_KNOBS 20
 
 /* number of output statistics.  Only stats [0..6] are currently used. */
-const int NStats = 20;
+#define COLAMD_STATS 20 
 
-/* Indices into knobs and stats array. */
-enum KnobsStatsIndex {
-  /* knobs [0] and stats [0]: dense row knob and output statistic. */
-  DenseRow = 0,
+/* knobs [0] and stats [0]: dense row knob and output statistic. */
+#define COLAMD_DENSE_ROW 0
 
-  /* knobs [1] and stats [1]: dense column knob and output statistic. */
-  DenseCol = 1,
+/* knobs [1] and stats [1]: dense column knob and output statistic. */
+#define COLAMD_DENSE_COL 1
 
-  /* stats [2]: memory defragmentation count output statistic */
-  DefragCount = 2,
+/* stats [2]: memory defragmentation count output statistic */
+#define COLAMD_DEFRAG_COUNT 2
 
-  /* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
-  Status = 3,
+/* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
+#define COLAMD_STATUS 3
 
-  /* stats [4..6]: error info, or info on jumbled columns */
-  Info1 = 4,
-  Info2 = 5,
-  Info3 = 6
-};
+/* stats [4..6]: error info, or info on jumbled columns */ 
+#define COLAMD_INFO1 4
+#define COLAMD_INFO2 5
+#define COLAMD_INFO3 6
 
 /* error codes returned in stats [3]: */
-enum Status {
-  Ok = 0,
-  OkButJumbled = 1,
-  ErrorANotPresent = -1,
-  ErrorPNotPresent = -2,
-  ErrorNrowNegative = -3,
-  ErrorNcolNegative = -4,
-  ErrorNnzNegative = -5,
-  ErrorP0Nonzero = -6,
-  ErrorATooSmall = -7,
-  ErrorColLengthNegative = -8,
-  ErrorRowIndexOutOfBounds = -9,
-  ErrorOutOfMemory = -10,
-  ErrorInternalError = -999
-};
+#define COLAMD_OK       (0)
+#define COLAMD_OK_BUT_JUMBLED     (1)
+#define COLAMD_ERROR_A_not_present    (-1)
+#define COLAMD_ERROR_p_not_present    (-2)
+#define COLAMD_ERROR_nrow_negative    (-3)
+#define COLAMD_ERROR_ncol_negative    (-4)
+#define COLAMD_ERROR_nnz_negative   (-5)
+#define COLAMD_ERROR_p0_nonzero     (-6)
+#define COLAMD_ERROR_A_too_small    (-7)
+#define COLAMD_ERROR_col_length_negative  (-8)
+#define COLAMD_ERROR_row_index_out_of_bounds  (-9)
+#define COLAMD_ERROR_out_of_memory    (-10)
+#define COLAMD_ERROR_internal_error   (-999)
+
 /* ========================================================================== */
 /* === Definitions ========================================================== */
 /* ========================================================================== */
 
-template <typename IndexType>
-IndexType ones_complement(const IndexType r) {
-  return (-(r)-1);
-}
+#define ONES_COMPLEMENT(r) (-(r)-1)
 
 /* -------------------------------------------------------------------------- */
-const int Empty = -1;
+
+#define COLAMD_EMPTY (-1)
 
 /* Row and column status */
-enum RowColumnStatus {
-  Alive = 0,
-  Dead = -1
-};
+#define ALIVE (0)
+#define DEAD  (-1)
 
 /* Column status */
-enum ColumnStatus {
-  DeadPrincipal = -1,
-  DeadNonPrincipal = -2
-};
+#define DEAD_PRINCIPAL    (-1)
+#define DEAD_NON_PRINCIPAL  (-2)
+
+/* Macros for row and column status update and checking. */
+#define ROW_IS_DEAD(r)      ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
+#define ROW_IS_MARKED_DEAD(row_mark)  (row_mark < ALIVE)
+#define ROW_IS_ALIVE(r)     (Row [r].shared2.mark >= ALIVE)
+#define COL_IS_DEAD(c)      (Col [c].start < ALIVE)
+#define COL_IS_ALIVE(c)     (Col [c].start >= ALIVE)
+#define COL_IS_DEAD_PRINCIPAL(c)  (Col [c].start == DEAD_PRINCIPAL)
+#define KILL_ROW(r)     { Row [r].shared2.mark = DEAD ; }
+#define KILL_PRINCIPAL_COL(c)   { Col [c].start = DEAD_PRINCIPAL ; }
+#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; }
 
 /* ========================================================================== */
 /* === Colamd reporting mechanism =========================================== */
@@ -133,9 +129,9 @@ enum ColumnStatus {
 
 // == Row and Column structures ==
 template <typename IndexType>
-struct ColStructure
+struct colamd_col
 {
-  IndexType start ;   /* index for A of first row in this column, or Dead */
+  IndexType start ;   /* index for A of first row in this column, or DEAD */
   /* if column is dead */
   IndexType length ;  /* number of rows in this column */
   union
@@ -163,21 +159,11 @@ struct ColStructure
     IndexType degree_next ; /* next column, if col is in a degree list */
     IndexType hash_next ;   /* next column, if col is in a hash list */
   } shared4 ;
-
-  inline bool is_dead() const { return start < Alive; }
-
-  inline bool is_alive() const { return start >= Alive; }
-
-  inline bool is_dead_principal() const { return start == DeadPrincipal; }
-
-  inline void kill_principal() { start = DeadPrincipal; }
-
-  inline void kill_non_principal() { start = DeadNonPrincipal; }
-
+  
 };
-
+ 
 template <typename IndexType>
-struct RowStructure
+struct Colamd_Row
 {
   IndexType start ;   /* index for A of first col in this row */
   IndexType length ;  /* number of principal columns in this row */
@@ -191,19 +177,13 @@ struct RowStructure
     IndexType mark ;  /* for computing set differences and marking dead rows*/
     IndexType first_column ;/* first column in row (used in garbage collection) */
   } shared2 ;
-
-  inline bool is_dead() const { return shared2.mark < Alive; }
-
-  inline bool is_alive() const { return shared2.mark >= Alive; }
-
-  inline void kill() { shared2.mark = Dead; }
-
+  
 };
-
+ 
 /* ========================================================================== */
 /* === Colamd recommended memory size ======================================= */
 /* ========================================================================== */
-
+ 
 /*
   The recommended length Alen of the array A passed to colamd is given by
   the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro.  It returns -1 if any
@@ -212,41 +192,41 @@ struct RowStructure
   required for the Col and Row arrays, respectively, which are internal to
   colamd.  An additional n_col space is the minimal amount of "elbow room",
   and nnz/5 more space is recommended for run time efficiency.
-
+  
   This macro is not needed when using symamd.
-
+  
   Explicit typecast to IndexType added Sept. 23, 2002, COLAMD version 2.2, to avoid
   gcc -pedantic warning messages.
 */
 template <typename IndexType>
-inline IndexType colamd_c(IndexType n_col)
-{ return IndexType( ((n_col) + 1) * sizeof (ColStructure<IndexType>) / sizeof (IndexType) ) ; }
+inline IndexType colamd_c(IndexType n_col) 
+{ return IndexType( ((n_col) + 1) * sizeof (colamd_col<IndexType>) / sizeof (IndexType) ) ; }
 
 template <typename IndexType>
 inline IndexType  colamd_r(IndexType n_row)
-{ return IndexType(((n_row) + 1) * sizeof (RowStructure<IndexType>) / sizeof (IndexType)); }
+{ return IndexType(((n_row) + 1) * sizeof (Colamd_Row<IndexType>) / sizeof (IndexType)); }
 
 // Prototypes of non-user callable routines
 template <typename IndexType>
-static IndexType init_rows_cols (IndexType n_row, IndexType n_col, RowStructure<IndexType> Row [], ColStructure<IndexType> col [], IndexType A [], IndexType p [], IndexType stats[NStats] );
+static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> col [], IndexType A [], IndexType p [], IndexType stats[COLAMD_STATS] ); 
 
 template <typename IndexType>
-static void init_scoring (IndexType n_row, IndexType n_col, RowStructure<IndexType> Row [], ColStructure<IndexType> Col [], IndexType A [], IndexType head [], double knobs[NKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg);
+static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType head [], double knobs[COLAMD_KNOBS], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg);
 
 template <typename IndexType>
-static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, RowStructure<IndexType> Row [], ColStructure<IndexType> Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree);
+static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree);
 
 template <typename IndexType>
-static void order_children (IndexType n_col, ColStructure<IndexType> Col [], IndexType p []);
+static void order_children (IndexType n_col, colamd_col<IndexType> Col [], IndexType p []);
 
 template <typename IndexType>
-static void detect_super_cols (ColStructure<IndexType> Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ;
+static void detect_super_cols (colamd_col<IndexType> Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ;
 
 template <typename IndexType>
-static IndexType garbage_collection (IndexType n_row, IndexType n_col, RowStructure<IndexType> Row [], ColStructure<IndexType> Col [], IndexType A [], IndexType *pfree) ;
+static IndexType garbage_collection (IndexType n_row, IndexType n_col, Colamd_Row<IndexType> Row [], colamd_col<IndexType> Col [], IndexType A [], IndexType *pfree) ;
 
 template <typename IndexType>
-static inline  IndexType clear_mark (IndexType n_row, RowStructure<IndexType> Row [] ) ;
+static inline  IndexType clear_mark (IndexType n_row, Colamd_Row<IndexType> Row [] ) ;
 
 /* === No debugging ========================================================= */
 
@@ -260,37 +240,37 @@ static inline  IndexType clear_mark (IndexType n_row, RowStructure<IndexType> Ro
 
 
 /**
- * \brief Returns the recommended value of Alen
- *
- * Returns recommended value of Alen for use by colamd.
- * Returns -1 if any input argument is negative.
- * The use of this routine or macro is optional.
- * Note that the macro uses its arguments   more than once,
- * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED.
- *
+ * \brief Returns the recommended value of Alen 
+ * 
+ * Returns recommended value of Alen for use by colamd.  
+ * Returns -1 if any input argument is negative.  
+ * The use of this routine or macro is optional.  
+ * Note that the macro uses its arguments   more than once, 
+ * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED.  
+ * 
  * \param nnz nonzeros in A
  * \param n_row number of rows in A
  * \param n_col number of columns in A
  * \return recommended value of Alen for use by colamd
  */
 template <typename IndexType>
-inline IndexType recommended ( IndexType nnz, IndexType n_row, IndexType n_col)
+inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType n_col)
 {
   if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0)
     return (-1);
   else
-    return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5));
+    return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); 
 }
 
 /**
  * \brief set default parameters  The use of this routine is optional.
- *
- * Colamd: rows with more than (knobs [DenseRow] * n_col)
+ * 
+ * Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col)
  * entries are removed prior to ordering.  Columns with more than
- * (knobs [DenseCol] * n_row) entries are removed prior to
- * ordering, and placed last in the output column ordering.
+ * (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to
+ * ordering, and placed last in the output column ordering. 
  *
- * DenseRow and DenseCol are defined as 0 and 1,
+ * COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1,
  * respectively, in colamd.h.  Default values of these two knobs
  * are both 0.5.  Currently, only knobs [0] and knobs [1] are
  * used, but future versions may use more knobs.  If so, they will
@@ -299,37 +279,37 @@ inline IndexType recommended ( IndexType nnz, IndexType n_row, IndexType n_col)
  * not need to change, assuming that you either use
  * colamd_set_defaults, or pass a (double *) NULL pointer as the
  * knobs array to colamd or symamd.
- *
+ * 
  * \param knobs parameter settings for colamd
  */
 
-static inline void set_defaults(double knobs[NKnobs])
+static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS])
 {
   /* === Local variables ================================================== */
-
+  
   int i ;
 
   if (!knobs)
   {
     return ;      /* no knobs to initialize */
   }
-  for (i = 0 ; i < NKnobs ; i++)
+  for (i = 0 ; i < COLAMD_KNOBS ; i++)
   {
     knobs [i] = 0 ;
   }
-  knobs [Colamd::DenseRow] = 0.5 ;  /* ignore rows over 50% dense */
-  knobs [Colamd::DenseCol] = 0.5 ;  /* ignore columns over 50% dense */
+  knobs [COLAMD_DENSE_ROW] = 0.5 ;  /* ignore rows over 50% dense */
+  knobs [COLAMD_DENSE_COL] = 0.5 ;  /* ignore columns over 50% dense */
 }
 
-/**
+/** 
  * \brief  Computes a column ordering using the column approximate minimum degree ordering
- *
+ * 
  * Computes a column ordering (Q) of A such that P(AQ)=LU or
  * (AQ)'AQ=LL' have less fill-in and require fewer floating point
  * operations than factorizing the unpermuted matrix A or A'A,
  * respectively.
- *
- *
+ * 
+ * 
  * \param n_row number of rows in A
  * \param n_col number of columns in A
  * \param Alen, size of the array A
@@ -339,143 +319,143 @@ static inline void set_defaults(double knobs[NKnobs])
  * \param stats colamd output statistics and error codes
  */
 template <typename IndexType>
-static bool compute_ordering(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[NKnobs], IndexType stats[NStats])
+static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[COLAMD_KNOBS], IndexType stats[COLAMD_STATS])
 {
   /* === Local variables ================================================== */
-
+  
   IndexType i ;     /* loop index */
   IndexType nnz ;     /* nonzeros in A */
   IndexType Row_size ;    /* size of Row [], in integers */
   IndexType Col_size ;    /* size of Col [], in integers */
   IndexType need ;      /* minimum required length of A */
-  Colamd::RowStructure<IndexType> *Row ;   /* pointer into A of Row [0..n_row] array */
-  Colamd::ColStructure<IndexType> *Col ;   /* pointer into A of Col [0..n_col] array */
+  Colamd_Row<IndexType> *Row ;   /* pointer into A of Row [0..n_row] array */
+  colamd_col<IndexType> *Col ;   /* pointer into A of Col [0..n_col] array */
   IndexType n_col2 ;    /* number of non-dense, non-empty columns */
   IndexType n_row2 ;    /* number of non-dense, non-empty rows */
   IndexType ngarbage ;    /* number of garbage collections performed */
   IndexType max_deg ;   /* maximum row degree */
-  double default_knobs [NKnobs] ; /* default knobs array */
-
-
+  double default_knobs [COLAMD_KNOBS] ; /* default knobs array */
+  
+  
   /* === Check the input arguments ======================================== */
-
+  
   if (!stats)
   {
     COLAMD_DEBUG0 (("colamd: stats not present\n")) ;
     return (false) ;
   }
-  for (i = 0 ; i < NStats ; i++)
+  for (i = 0 ; i < COLAMD_STATS ; i++)
   {
     stats [i] = 0 ;
   }
-  stats [Colamd::Status] = Colamd::Ok ;
-  stats [Colamd::Info1] = -1 ;
-  stats [Colamd::Info2] = -1 ;
-
+  stats [COLAMD_STATUS] = COLAMD_OK ;
+  stats [COLAMD_INFO1] = -1 ;
+  stats [COLAMD_INFO2] = -1 ;
+  
   if (!A)   /* A is not present */
   {
-    stats [Colamd::Status] = Colamd::ErrorANotPresent ;
+    stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
     COLAMD_DEBUG0 (("colamd: A not present\n")) ;
     return (false) ;
   }
-
+  
   if (!p)   /* p is not present */
   {
-    stats [Colamd::Status] = Colamd::ErrorPNotPresent ;
+    stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
     COLAMD_DEBUG0 (("colamd: p not present\n")) ;
     return (false) ;
   }
-
+  
   if (n_row < 0)  /* n_row must be >= 0 */
   {
-    stats [Colamd::Status] = Colamd::ErrorNrowNegative ;
-    stats [Colamd::Info1] = n_row ;
+    stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ;
+    stats [COLAMD_INFO1] = n_row ;
     COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ;
     return (false) ;
   }
-
+  
   if (n_col < 0)  /* n_col must be >= 0 */
   {
-    stats [Colamd::Status] = Colamd::ErrorNcolNegative ;
-    stats [Colamd::Info1] = n_col ;
+    stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
+    stats [COLAMD_INFO1] = n_col ;
     COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ;
     return (false) ;
   }
-
+  
   nnz = p [n_col] ;
   if (nnz < 0)  /* nnz must be >= 0 */
   {
-    stats [Colamd::Status] = Colamd::ErrorNnzNegative ;
-    stats [Colamd::Info1] = nnz ;
+    stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
+    stats [COLAMD_INFO1] = nnz ;
     COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ;
     return (false) ;
   }
-
+  
   if (p [0] != 0)
   {
-    stats [Colamd::Status] = Colamd::ErrorP0Nonzero ;
-    stats [Colamd::Info1] = p [0] ;
+    stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ;
+    stats [COLAMD_INFO1] = p [0] ;
     COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ;
     return (false) ;
   }
-
+  
   /* === If no knobs, set default knobs =================================== */
-
+  
   if (!knobs)
   {
-    set_defaults (default_knobs) ;
+    colamd_set_defaults (default_knobs) ;
     knobs = default_knobs ;
   }
-
+  
   /* === Allocate the Row and Col arrays from array A ===================== */
-
+  
   Col_size = colamd_c (n_col) ;
   Row_size = colamd_r (n_row) ;
   need = 2*nnz + n_col + Col_size + Row_size ;
-
+  
   if (need > Alen)
   {
     /* not enough space in array A to perform the ordering */
-    stats [Colamd::Status] = Colamd::ErrorATooSmall ;
-    stats [Colamd::Info1] = need ;
-    stats [Colamd::Info2] = Alen ;
+    stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ;
+    stats [COLAMD_INFO1] = need ;
+    stats [COLAMD_INFO2] = Alen ;
     COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen));
     return (false) ;
   }
-
+  
   Alen -= Col_size + Row_size ;
-  Col = (ColStructure<IndexType> *) &A [Alen] ;
-  Row = (RowStructure<IndexType> *) &A [Alen + Col_size] ;
+  Col = (colamd_col<IndexType> *) &A [Alen] ;
+  Row = (Colamd_Row<IndexType> *) &A [Alen + Col_size] ;
 
   /* === Construct the row and column data structures ===================== */
-
-  if (!Colamd::init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
+  
+  if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
   {
     /* input matrix is invalid */
     COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ;
     return (false) ;
   }
-
+  
   /* === Initialize scores, kill dense rows/columns ======================= */
 
-  Colamd::init_scoring (n_row, n_col, Row, Col, A, p, knobs,
+  Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs,
 		&n_row2, &n_col2, &max_deg) ;
-
+  
   /* === Order the supercolumns =========================================== */
-
-  ngarbage = Colamd::find_ordering (n_row, n_col, Alen, Row, Col, A, p,
+  
+  ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p,
 			    n_col2, max_deg, 2*nnz) ;
-
+  
   /* === Order the non-principal columns ================================== */
-
-  Colamd::order_children (n_col, Col, p) ;
-
+  
+  Eigen::internal::order_children (n_col, Col, p) ;
+  
   /* === Return statistics in stats ======================================= */
-
-  stats [Colamd::DenseRow] = n_row - n_row2 ;
-  stats [Colamd::DenseCol] = n_col - n_col2 ;
-  stats [Colamd::DefragCount] = ngarbage ;
-  COLAMD_DEBUG0 (("colamd: done.\n")) ;
+  
+  stats [COLAMD_DENSE_ROW] = n_row - n_row2 ;
+  stats [COLAMD_DENSE_COL] = n_col - n_col2 ;
+  stats [COLAMD_DEFRAG_COUNT] = ngarbage ;
+  COLAMD_DEBUG0 (("colamd: done.\n")) ; 
   return (true) ;
 }
 
@@ -485,6 +465,7 @@ static bool compute_ordering(IndexType n_row, IndexType n_col, IndexType Alen, I
 
 /* There are no user-callable routines beyond this point in the file */
 
+
 /* ========================================================================== */
 /* === init_rows_cols ======================================================= */
 /* ========================================================================== */
@@ -504,11 +485,11 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
 
     IndexType n_row,      /* number of rows of A */
     IndexType n_col,      /* number of columns of A */
-    RowStructure<IndexType> Row [],    /* of size n_row+1 */
-    ColStructure<IndexType> Col [],    /* of size n_col+1 */
+    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
+    colamd_col<IndexType> Col [],    /* of size n_col+1 */
     IndexType A [],     /* row indices of A, of size Alen */
     IndexType p [],     /* pointers to columns in A, of size n_col+1 */
-    IndexType stats [NStats]  /* colamd statistics */
+    IndexType stats [COLAMD_STATS]  /* colamd statistics */ 
     )
 {
   /* === Local variables ================================================== */
@@ -531,24 +512,24 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
     if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200
     {
       /* column pointers must be non-decreasing */
-      stats [Colamd::Status] = Colamd::ErrorColLengthNegative ;
-      stats [Colamd::Info1] = col ;
-      stats [Colamd::Info2] = Col [col].length ;
+      stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
+      stats [COLAMD_INFO1] = col ;
+      stats [COLAMD_INFO2] = Col [col].length ;
       COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ;
       return (false) ;
     }
 
     Col [col].shared1.thickness = 1 ;
     Col [col].shared2.score = 0 ;
-    Col [col].shared3.prev = Empty ;
-    Col [col].shared4.degree_next = Empty ;
+    Col [col].shared3.prev = COLAMD_EMPTY ;
+    Col [col].shared4.degree_next = COLAMD_EMPTY ;
   }
 
   /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
 
   /* === Scan columns, compute row degrees, and check row indices ========= */
 
-  stats [Info3] = 0 ;  /* number of duplicate or unsorted row indices*/
+  stats [COLAMD_INFO3] = 0 ;  /* number of duplicate or unsorted row indices*/
 
   for (row = 0 ; row < n_row ; row++)
   {
@@ -570,10 +551,10 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
       /* make sure row indices within range */
       if (row < 0 || row >= n_row)
       {
-	stats [Colamd::Status] = Colamd::ErrorRowIndexOutOfBounds ;
-	stats [Colamd::Info1] = col ;
-	stats [Colamd::Info2] = row ;
-	stats [Colamd::Info3] = n_row ;
+	stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
+	stats [COLAMD_INFO1] = col ;
+	stats [COLAMD_INFO2] = row ;
+	stats [COLAMD_INFO3] = n_row ;
 	COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ;
 	return (false) ;
       }
@@ -582,10 +563,10 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
       {
 	/* row index are unsorted or repeated (or both), thus col */
 	/* is jumbled.  This is a notice, not an error condition. */
-	stats [Colamd::Status] = Colamd::OkButJumbled ;
-	stats [Colamd::Info1] = col ;
-	stats [Colamd::Info2] = row ;
-	(stats [Colamd::Info3]) ++ ;
+	stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
+	stats [COLAMD_INFO1] = col ;
+	stats [COLAMD_INFO2] = row ;
+	(stats [COLAMD_INFO3]) ++ ;
 	COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col));
       }
 
@@ -623,7 +604,7 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
 
   /* === Create row form ================================================== */
 
-  if (stats [Status] == OkButJumbled)
+  if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
   {
     /* if cols jumbled, watch for repeated row indices */
     for (col = 0 ; col < n_col ; col++)
@@ -665,7 +646,7 @@ static IndexType init_rows_cols  /* returns true if OK, or false otherwise */
 
   /* === See if we need to re-create columns ============================== */
 
-  if (stats [Status] == OkButJumbled)
+  if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
   {
     COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ;
 
@@ -720,11 +701,11 @@ static void init_scoring
 
     IndexType n_row,      /* number of rows of A */
     IndexType n_col,      /* number of columns of A */
-    RowStructure<IndexType> Row [],    /* of size n_row+1 */
-    ColStructure<IndexType> Col [],    /* of size n_col+1 */
+    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
+    colamd_col<IndexType> Col [],    /* of size n_col+1 */
     IndexType A [],     /* column form and row form of A */
     IndexType head [],    /* of size n_col+1 */
-    double knobs [NKnobs],/* parameters */
+    double knobs [COLAMD_KNOBS],/* parameters */
     IndexType *p_n_row2,    /* number of non-dense, non-empty rows */
     IndexType *p_n_col2,    /* number of non-dense, non-empty columns */
     IndexType *p_max_deg    /* maximum row degree */
@@ -751,8 +732,8 @@ static void init_scoring
 
   /* === Extract knobs ==================================================== */
 
-  dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseRow] * n_col), n_col)) ;
-  dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseCol] * n_row), n_row)) ;
+  dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ;
+  dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ;
   COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
   max_deg = 0 ;
   n_col2 = n_col ;
@@ -769,7 +750,7 @@ static void init_scoring
     {
       /* this is a empty column, kill and order it last */
       Col [c].shared2.order = --n_col2 ;
-      Col[c].kill_principal() ;
+      KILL_PRINCIPAL_COL (c) ;
     }
   }
   COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ;
@@ -780,7 +761,7 @@ static void init_scoring
   for (c = n_col-1 ; c >= 0 ; c--)
   {
     /* skip any dead columns */
-    if (Col[c].is_dead())
+    if (COL_IS_DEAD (c))
     {
       continue ;
     }
@@ -796,7 +777,7 @@ static void init_scoring
       {
 	Row [*cp++].shared1.degree-- ;
       }
-      Col[c].kill_principal() ;
+      KILL_PRINCIPAL_COL (c) ;
     }
   }
   COLAMD_DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ;
@@ -810,7 +791,7 @@ static void init_scoring
     if (deg > dense_row_count || deg == 0)
     {
       /* kill a dense or empty row */
-      Row[r].kill() ;
+      KILL_ROW (r) ;
       --n_row2 ;
     }
     else
@@ -832,7 +813,7 @@ static void init_scoring
   for (c = n_col-1 ; c >= 0 ; c--)
   {
     /* skip dead column */
-    if (Col[c].is_dead())
+    if (COL_IS_DEAD (c))
     {
       continue ;
     }
@@ -845,7 +826,7 @@ static void init_scoring
       /* get a row */
       row = *cp++ ;
       /* skip if dead */
-      if (Row[row].is_dead())
+      if (ROW_IS_DEAD (row))
       {
 	continue ;
       }
@@ -864,7 +845,7 @@ static void init_scoring
       /* and have already been killed) */
       COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ;
       Col [c].shared2.order = --n_col2 ;
-      Col[c].kill_principal() ;
+      KILL_PRINCIPAL_COL (c) ;
     }
     else
     {
@@ -889,7 +870,7 @@ static void init_scoring
   /* clear the hash buckets */
   for (c = 0 ; c <= n_col ; c++)
   {
-    head [c] = Empty ;
+    head [c] = COLAMD_EMPTY ;
   }
   min_score = n_col ;
   /* place in reverse order, so low column indices are at the front */
@@ -897,7 +878,7 @@ static void init_scoring
   for (c = n_col-1 ; c >= 0 ; c--)
   {
     /* only add principal columns to degree lists */
-    if (Col[c].is_alive())
+    if (COL_IS_ALIVE (c))
     {
       COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n",
 		      c, Col [c].shared2.score, min_score, n_col)) ;
@@ -910,16 +891,16 @@ static void init_scoring
       COLAMD_ASSERT (min_score <= n_col) ;
       COLAMD_ASSERT (score >= 0) ;
       COLAMD_ASSERT (score <= n_col) ;
-      COLAMD_ASSERT (head [score] >= Empty) ;
+      COLAMD_ASSERT (head [score] >= COLAMD_EMPTY) ;
 
       /* now add this column to dList at proper score location */
       next_col = head [score] ;
-      Col [c].shared3.prev = Empty ;
+      Col [c].shared3.prev = COLAMD_EMPTY ;
       Col [c].shared4.degree_next = next_col ;
 
       /* if there already was a column with the same score, set its */
       /* previous pointer to this new column */
-      if (next_col != Empty)
+      if (next_col != COLAMD_EMPTY)
       {
 	Col [next_col].shared3.prev = c ;
       }
@@ -958,8 +939,8 @@ static IndexType find_ordering /* return the number of garbage collections */
     IndexType n_row,      /* number of rows of A */
     IndexType n_col,      /* number of columns of A */
     IndexType Alen,     /* size of A, 2*nnz + n_col or larger */
-    RowStructure<IndexType> Row [],    /* of size n_row+1 */
-    ColStructure<IndexType> Col [],    /* of size n_col+1 */
+    Colamd_Row<IndexType> Row [],    /* of size n_row+1 */
+    colamd_col<IndexType> Col [],    /* of size n_col+1 */
     IndexType A [],     /* column form and row form of A */
     IndexType head [],    /* of size n_col+1 */
     IndexType n_col2,     /* Remaining columns to order */
@@ -1005,7 +986,7 @@ static IndexType find_ordering /* return the number of garbage collections */
   /* === Initialization and clear mark ==================================== */
 
   max_mark = INT_MAX - n_col ;  /* INT_MAX defined in <limits.h> */
-  tag_mark = Colamd::clear_mark (n_row, Row) ;
+  tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
   min_score = 0 ;
   ngarbage = 0 ;
   COLAMD_DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ;
@@ -1020,10 +1001,10 @@ static IndexType find_ordering /* return the number of garbage collections */
     /* make sure degree list isn't empty */
     COLAMD_ASSERT (min_score >= 0) ;
     COLAMD_ASSERT (min_score <= n_col) ;
-    COLAMD_ASSERT (head [min_score] >= Empty) ;
+    COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ;
 
     /* get pivot column from head of minimum degree list */
-    while (min_score < n_col && head [min_score] == Empty)
+    while (min_score < n_col && head [min_score] == COLAMD_EMPTY)
     {
       min_score++ ;
     }
@@ -1031,12 +1012,12 @@ static IndexType find_ordering /* return the number of garbage collections */
     COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ;
     next_col = Col [pivot_col].shared4.degree_next ;
     head [min_score] = next_col ;
-    if (next_col != Empty)
+    if (next_col != COLAMD_EMPTY)
     {
-      Col [next_col].shared3.prev = Empty ;
+      Col [next_col].shared3.prev = COLAMD_EMPTY ;
     }
 
-    COLAMD_ASSERT (Col[pivot_col].is_alive()) ;
+    COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ;
     COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
 
     /* remember score for defrag check */
@@ -1055,12 +1036,12 @@ static IndexType find_ordering /* return the number of garbage collections */
     needed_memory = numext::mini(pivot_col_score, n_col - k) ;
     if (pfree + needed_memory >= Alen)
     {
-      pfree = Colamd::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
+      pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
       ngarbage++ ;
       /* after garbage collection we will have enough */
       COLAMD_ASSERT (pfree + needed_memory < Alen) ;
       /* garbage collection has wiped out the Row[].shared2.mark array */
-      tag_mark = Colamd::clear_mark (n_row, Row) ;
+      tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
 
     }
 
@@ -1083,9 +1064,9 @@ static IndexType find_ordering /* return the number of garbage collections */
     {
       /* get a row */
       row = *cp++ ;
-      COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", Row[row].is_alive(), row)) ;
+      COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
       /* skip if row is dead */
-      if (Row[row].is_dead())
+      if (ROW_IS_DEAD (row))
       {
 	continue ;
       }
@@ -1097,7 +1078,7 @@ static IndexType find_ordering /* return the number of garbage collections */
 	col = *rp++ ;
 	/* add the column, if alive and untagged */
 	col_thickness = Col [col].shared1.thickness ;
-	if (col_thickness > 0 && Col[col].is_alive())
+	if (col_thickness > 0 && COL_IS_ALIVE (col))
 	{
 	  /* tag column in pivot row */
 	  Col [col].shared1.thickness = -col_thickness ;
@@ -1124,7 +1105,7 @@ static IndexType find_ordering /* return the number of garbage collections */
       /* may be killing an already dead row */
       row = *cp++ ;
       COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ;
-      Row[row].kill() ;
+      KILL_ROW (row) ;
     }
 
     /* === Select a row index to use as the new pivot row =============== */
@@ -1139,7 +1120,7 @@ static IndexType find_ordering /* return the number of garbage collections */
     else
     {
       /* there is no pivot row, since it is of zero length */
-      pivot_row = Empty ;
+      pivot_row = COLAMD_EMPTY ;
       COLAMD_ASSERT (pivot_row_length == 0) ;
     }
     COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
@@ -1176,7 +1157,7 @@ static IndexType find_ordering /* return the number of garbage collections */
     while (rp < rp_end)
     {
       col = *rp++ ;
-      COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ;
+      COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
       COLAMD_DEBUG3 (("Col: %d\n", col)) ;
 
       /* clear tags used to construct pivot row pattern */
@@ -1191,8 +1172,8 @@ static IndexType find_ordering /* return the number of garbage collections */
       next_col = Col [col].shared4.degree_next ;
       COLAMD_ASSERT (cur_score >= 0) ;
       COLAMD_ASSERT (cur_score <= n_col) ;
-      COLAMD_ASSERT (cur_score >= Empty) ;
-      if (prev_col == Empty)
+      COLAMD_ASSERT (cur_score >= COLAMD_EMPTY) ;
+      if (prev_col == COLAMD_EMPTY)
       {
 	head [cur_score] = next_col ;
       }
@@ -1200,7 +1181,7 @@ static IndexType find_ordering /* return the number of garbage collections */
       {
 	Col [prev_col].shared4.degree_next = next_col ;
       }
-      if (next_col != Empty)
+      if (next_col != COLAMD_EMPTY)
       {
 	Col [next_col].shared3.prev = prev_col ;
       }
@@ -1213,12 +1194,12 @@ static IndexType find_ordering /* return the number of garbage collections */
       {
 	/* get a row */
 	row = *cp++ ;
+	row_mark = Row [row].shared2.mark ;
 	/* skip if dead */
-	if (Row[row].is_dead())
+	if (ROW_IS_MARKED_DEAD (row_mark))
 	{
 	  continue ;
 	}
-  row_mark = Row [row].shared2.mark ;
 	COLAMD_ASSERT (row != pivot_row) ;
 	set_difference = row_mark - tag_mark ;
 	/* check if the row has been seen yet */
@@ -1234,7 +1215,7 @@ static IndexType find_ordering /* return the number of garbage collections */
 	if (set_difference == 0)
 	{
 	  COLAMD_DEBUG3 (("aggressive absorption. Row: %d\n", row)) ;
-	  Row[row].kill() ;
+	  KILL_ROW (row) ;
 	}
 	else
 	{
@@ -1256,7 +1237,7 @@ static IndexType find_ordering /* return the number of garbage collections */
     {
       /* get a column */
       col = *rp++ ;
-      COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ;
+      COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
       hash = 0 ;
       cur_score = 0 ;
       cp = &A [Col [col].start] ;
@@ -1271,12 +1252,12 @@ static IndexType find_ordering /* return the number of garbage collections */
 	/* get a row */
 	row = *cp++ ;
 	COLAMD_ASSERT(row >= 0 && row < n_row) ;
+	row_mark = Row [row].shared2.mark ;
 	/* skip if dead */
-	if (Row [row].is_dead())
+	if (ROW_IS_MARKED_DEAD (row_mark))
 	{
 	  continue ;
 	}
-  row_mark = Row [row].shared2.mark ;
 	COLAMD_ASSERT (row_mark > tag_mark) ;
 	/* compact the column */
 	*new_cp++ = row ;
@@ -1297,7 +1278,7 @@ static IndexType find_ordering /* return the number of garbage collections */
       {
 	COLAMD_DEBUG4 (("further mass elimination. Col: %d\n", col)) ;
 	/* nothing left but the pivot row in this column */
-	Col[col].kill_principal() ;
+	KILL_PRINCIPAL_COL (col) ;
 	pivot_row_degree -= Col [col].shared1.thickness ;
 	COLAMD_ASSERT (pivot_row_degree >= 0) ;
 	/* order it */
@@ -1321,7 +1302,7 @@ static IndexType find_ordering /* return the number of garbage collections */
 	COLAMD_ASSERT (hash <= n_col) ;
 
 	head_column = head [hash] ;
-	if (head_column > Empty)
+	if (head_column > COLAMD_EMPTY)
 	{
 	  /* degree list "hash" is non-empty, use prev (shared3) of */
 	  /* first column in degree list as head of hash bucket */
@@ -1338,7 +1319,7 @@ static IndexType find_ordering /* return the number of garbage collections */
 
 	/* save hash function in Col [col].shared3.hash */
 	Col [col].shared3.hash = (IndexType) hash ;
-	COLAMD_ASSERT (Col[col].is_alive()) ;
+	COLAMD_ASSERT (COL_IS_ALIVE (col)) ;
       }
     }
 
@@ -1348,11 +1329,11 @@ static IndexType find_ordering /* return the number of garbage collections */
 
     COLAMD_DEBUG3 (("** Supercolumn detection phase. **\n")) ;
 
-    Colamd::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ;
+    Eigen::internal::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ;
 
     /* === Kill the pivotal column ====================================== */
 
-    Col[pivot_col].kill_principal() ;
+    KILL_PRINCIPAL_COL (pivot_col) ;
 
     /* === Clear mark =================================================== */
 
@@ -1360,7 +1341,7 @@ static IndexType find_ordering /* return the number of garbage collections */
     if (tag_mark >= max_mark)
     {
       COLAMD_DEBUG2 (("clearing tag_mark\n")) ;
-      tag_mark = Colamd::clear_mark (n_row, Row) ;
+      tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
     }
 
     /* === Finalize the new pivot row, and column scores ================ */
@@ -1376,7 +1357,7 @@ static IndexType find_ordering /* return the number of garbage collections */
     {
       col = *rp++ ;
       /* skip dead columns */
-      if (Col[col].is_dead())
+      if (COL_IS_DEAD (col))
       {
 	continue ;
       }
@@ -1410,11 +1391,11 @@ static IndexType find_ordering /* return the number of garbage collections */
       COLAMD_ASSERT (min_score <= n_col) ;
       COLAMD_ASSERT (cur_score >= 0) ;
       COLAMD_ASSERT (cur_score <= n_col) ;
-      COLAMD_ASSERT (head [cur_score] >= Empty) ;
+      COLAMD_ASSERT (head [cur_score] >= COLAMD_EMPTY) ;
       next_col = head [cur_score] ;
       Col [col].shared4.degree_next = next_col ;
-      Col [col].shared3.prev = Empty ;
-      if (next_col != Empty)
+      Col [col].shared3.prev = COLAMD_EMPTY ;
+      if (next_col != COLAMD_EMPTY)
       {
 	Col [next_col].shared3.prev = col ;
       }
@@ -1467,7 +1448,7 @@ static inline  void order_children
   /* === Parameters ======================================================= */
 
   IndexType n_col,      /* number of columns of A */
-  ColStructure<IndexType> Col [],    /* of size n_col+1 */
+  colamd_col<IndexType> Col [],    /* of size n_col+1 */
   IndexType p []      /* p [0 ... n_col-1] is the column permutation*/
   )
 {
@@ -1483,15 +1464,15 @@ static inline  void order_children
   for (i = 0 ; i < n_col ; i++)
   {
     /* find an un-ordered non-principal column */
-    COLAMD_ASSERT (col_is_dead(Col, i)) ;
-    if (!Col[i].is_dead_principal() && Col [i].shared2.order == Empty)
+    COLAMD_ASSERT (COL_IS_DEAD (i)) ;
+    if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == COLAMD_EMPTY)
     {
       parent = i ;
       /* once found, find its principal parent */
       do
       {
 	parent = Col [parent].shared1.parent ;
-      } while (!Col[parent].is_dead_principal()) ;
+      } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
 
       /* now, order all un-ordered non-principal columns along path */
       /* to this parent.  collapse tree at the same time */
@@ -1501,7 +1482,7 @@ static inline  void order_children
 
       do
       {
-	COLAMD_ASSERT (Col [c].shared2.order == Empty) ;
+	COLAMD_ASSERT (Col [c].shared2.order == COLAMD_EMPTY) ;
 
 	/* order this column */
 	Col [c].shared2.order = order++ ;
@@ -1512,9 +1493,9 @@ static inline  void order_children
 	c = Col [c].shared1.parent ;
 
 	/* continue until we hit an ordered column.  There are */
-	/* guaranteed not to be anymore unordered columns */
+	/* guaranteed not to be anymore unordered columns */
 	/* above an ordered column */
-      } while (Col [c].shared2.order == Empty) ;
+      } while (Col [c].shared2.order == COLAMD_EMPTY) ;
 
       /* re-order the super_col parent to largest order for this group */
       Col [parent].shared2.order = order ;
@@ -1566,8 +1547,8 @@ template <typename IndexType>
 static void detect_super_cols
 (
   /* === Parameters ======================================================= */
-
-  ColStructure<IndexType> Col [],    /* of size n_col+1 */
+  
+  colamd_col<IndexType> Col [],    /* of size n_col+1 */
   IndexType A [],     /* row indices of A */
   IndexType head [],    /* head of degree lists and hash buckets */
   IndexType row_start,    /* pointer to set of columns to check */
@@ -1597,7 +1578,7 @@ static void detect_super_cols
   while (rp < rp_end)
   {
     col = *rp++ ;
-    if (Col[col].is_dead())
+    if (COL_IS_DEAD (col))
     {
       continue ;
     }
@@ -1609,7 +1590,7 @@ static void detect_super_cols
     /* === Get the first column in this hash bucket ===================== */
 
     head_column = head [hash] ;
-    if (head_column > Empty)
+    if (head_column > COLAMD_EMPTY)
     {
       first_col = Col [head_column].shared3.headhash ;
     }
@@ -1620,10 +1601,10 @@ static void detect_super_cols
 
     /* === Consider each column in the hash bucket ====================== */
 
-    for (super_c = first_col ; super_c != Empty ;
+    for (super_c = first_col ; super_c != COLAMD_EMPTY ;
 	 super_c = Col [super_c].shared4.hash_next)
     {
-      COLAMD_ASSERT (Col [super_c].is_alive()) ;
+      COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ;
       COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ;
       length = Col [super_c].length ;
 
@@ -1633,10 +1614,10 @@ static void detect_super_cols
       /* === Compare super_c with all columns after it ================ */
 
       for (c = Col [super_c].shared4.hash_next ;
-	   c != Empty ; c = Col [c].shared4.hash_next)
+	   c != COLAMD_EMPTY ; c = Col [c].shared4.hash_next)
       {
 	COLAMD_ASSERT (c != super_c) ;
-	COLAMD_ASSERT (Col[c].is_alive()) ;
+	COLAMD_ASSERT (COL_IS_ALIVE (c)) ;
 	COLAMD_ASSERT (Col [c].shared3.hash == hash) ;
 
 	/* not identical if lengths or scores are different */
@@ -1654,10 +1635,10 @@ static void detect_super_cols
 	for (i = 0 ; i < length ; i++)
 	{
 	  /* the columns are "clean" (no dead rows) */
-	  COLAMD_ASSERT ( cp1->is_alive() );
-	  COLAMD_ASSERT ( cp2->is_alive() );
+	  COLAMD_ASSERT (ROW_IS_ALIVE (*cp1))  ;
+	  COLAMD_ASSERT (ROW_IS_ALIVE (*cp2))  ;
 	  /* row indices will same order for both supercols, */
-	  /* no gather scatter necessary */
+	  /* no gather scatter necessary */
 	  if (*cp1++ != *cp2++)
 	  {
 	    break ;
@@ -1677,9 +1658,9 @@ static void detect_super_cols
 
 	Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
 	Col [c].shared1.parent = super_c ;
-	Col[c].kill_non_principal() ;
+	KILL_NON_PRINCIPAL_COL (c) ;
 	/* order c later, in order_children() */
-	Col [c].shared2.order = Empty ;
+	Col [c].shared2.order = COLAMD_EMPTY ;
 	/* remove c from hash bucket */
 	Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
       }
@@ -1687,15 +1668,15 @@ static void detect_super_cols
 
     /* === Empty this hash bucket ======================================= */
 
-    if (head_column > Empty)
+    if (head_column > COLAMD_EMPTY)
     {
       /* corresponding degree list "hash" is not empty */
-      Col [head_column].shared3.headhash = Empty ;
+      Col [head_column].shared3.headhash = COLAMD_EMPTY ;
     }
     else
     {
       /* corresponding degree list "hash" is empty */
-      head [hash] = Empty ;
+      head [hash] = COLAMD_EMPTY ;
     }
   }
 }
@@ -1707,7 +1688,7 @@ static void detect_super_cols
 
 /*
   Defragments and compacts columns and rows in the workspace A.  Used when
-  all available memory has been used while performing row merging.  Returns
+  all available memory has been used while performing row merging.  Returns
   the index of the first free position in A, after garbage collection.  The
   time taken by this routine is linear is the size of the array A, which is
   itself linear in the number of nonzeros in the input matrix.
@@ -1717,11 +1698,11 @@ template <typename IndexType>
 static IndexType garbage_collection  /* returns the new value of pfree */
   (
     /* === Parameters ======================================================= */
-
+    
     IndexType n_row,      /* number of rows */
     IndexType n_col,      /* number of columns */
-    RowStructure<IndexType> Row [],    /* row info */
-    ColStructure<IndexType> Col [],    /* column info */
+    Colamd_Row<IndexType> Row [],    /* row info */
+    colamd_col<IndexType> Col [],    /* column info */
     IndexType A [],     /* A [0 ... Alen-1] holds the matrix */
     IndexType *pfree      /* &A [0] ... pfree is in use */
     )
@@ -1740,7 +1721,7 @@ static IndexType garbage_collection  /* returns the new value of pfree */
   pdest = &A[0] ;
   for (c = 0 ; c < n_col ; c++)
   {
-    if (Col[c].is_alive())
+    if (COL_IS_ALIVE (c))
     {
       psrc = &A [Col [c].start] ;
 
@@ -1751,7 +1732,7 @@ static IndexType garbage_collection  /* returns the new value of pfree */
       for (j = 0 ; j < length ; j++)
       {
 	r = *psrc++ ;
-	if (Row[r].is_alive())
+	if (ROW_IS_ALIVE (r))
 	{
 	  *pdest++ = r ;
 	}
@@ -1764,22 +1745,22 @@ static IndexType garbage_collection  /* returns the new value of pfree */
 
   for (r = 0 ; r < n_row ; r++)
   {
-    if (Row[r].is_alive())
+    if (ROW_IS_ALIVE (r))
     {
       if (Row [r].length == 0)
       {
-        /* this row is of zero length.  cannot compact it, so kill it */
-        COLAMD_DEBUG3 (("Defrag row kill\n")) ;
-        Row[r].kill() ;
+	/* this row is of zero length.  cannot compact it, so kill it */
+	COLAMD_DEBUG3 (("Defrag row kill\n")) ;
+	KILL_ROW (r) ;
       }
       else
       {
-        /* save first column index in Row [r].shared2.first_column */
-        psrc = &A [Row [r].start] ;
-        Row [r].shared2.first_column = *psrc ;
-        COLAMD_ASSERT (Row[r].is_alive()) ;
-        /* flag the start of the row with the one's complement of row */
-        *psrc = ones_complement(r) ;
+	/* save first column index in Row [r].shared2.first_column */
+	psrc = &A [Row [r].start] ;
+	Row [r].shared2.first_column = *psrc ;
+	COLAMD_ASSERT (ROW_IS_ALIVE (r)) ;
+	/* flag the start of the row with the one's complement of row */
+	*psrc = ONES_COMPLEMENT (r) ;
 
       }
     }
@@ -1795,11 +1776,11 @@ static IndexType garbage_collection  /* returns the new value of pfree */
     {
       psrc-- ;
       /* get the row index */
-      r = ones_complement(*psrc) ;
+      r = ONES_COMPLEMENT (*psrc) ;
       COLAMD_ASSERT (r >= 0 && r < n_row) ;
       /* restore first column index */
       *psrc = Row [r].shared2.first_column ;
-      COLAMD_ASSERT (Row[r].is_alive()) ;
+      COLAMD_ASSERT (ROW_IS_ALIVE (r)) ;
 
       /* move and compact the row */
       COLAMD_ASSERT (pdest <= psrc) ;
@@ -1808,7 +1789,7 @@ static IndexType garbage_collection  /* returns the new value of pfree */
       for (j = 0 ; j < length ; j++)
       {
 	c = *psrc++ ;
-	if (Col[c].is_alive())
+	if (COL_IS_ALIVE (c))
 	{
 	  *pdest++ = c ;
 	}
@@ -1840,7 +1821,7 @@ static inline  IndexType clear_mark  /* return the new value for tag_mark */
       /* === Parameters ======================================================= */
 
     IndexType n_row,    /* number of rows in A */
-    RowStructure<IndexType> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
+    Colamd_Row<IndexType> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
     )
 {
   /* === Local variables ================================================== */
@@ -1849,7 +1830,7 @@ static inline  IndexType clear_mark  /* return the new value for tag_mark */
 
   for (r = 0 ; r < n_row ; r++)
   {
-    if (Row[r].is_alive())
+    if (ROW_IS_ALIVE (r))
     {
       Row [r].shared2.mark = 0 ;
     }
@@ -1857,7 +1838,6 @@ static inline  IndexType clear_mark  /* return the new value for tag_mark */
   return (1) ;
 }
 
-} // namespace Colamd
 
-} // namespace internal
+} // namespace internal 
 #endif
diff --git a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Ordering.h b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Ordering.h
index c57897014..7ea9b14d7 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Ordering.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Ordering.h
@@ -31,13 +31,15 @@ void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat)
   for (int i = 0; i < C.rows(); i++) 
   {
       for (typename MatrixType::InnerIterator it(C, i); it; ++it)
-        it.valueRef() = typename MatrixType::Scalar(0);
+        it.valueRef() = 0.0;
   }
   symmat = C + A;
 }
     
 }
 
+#ifndef EIGEN_MPL2_ONLY
+
 /** \ingroup OrderingMethods_Module
   * \class AMDOrdering
   *
@@ -79,6 +81,8 @@ class AMDOrdering
     }
 };
 
+#endif // EIGEN_MPL2_ONLY
+
 /** \ingroup OrderingMethods_Module
   * \class NaturalOrdering
   *
@@ -129,17 +133,17 @@ class COLAMDOrdering
       StorageIndex n = StorageIndex(mat.cols());
       StorageIndex nnz = StorageIndex(mat.nonZeros());
       // Get the recommended value of Alen to be used by colamd
-      StorageIndex Alen = internal::Colamd::recommended(nnz, m, n); 
+      StorageIndex Alen = internal::colamd_recommended(nnz, m, n); 
       // Set the default parameters
-      double knobs [internal::Colamd::NKnobs]; 
-      StorageIndex stats [internal::Colamd::NStats];
-      internal::Colamd::set_defaults(knobs);
+      double knobs [COLAMD_KNOBS]; 
+      StorageIndex stats [COLAMD_STATS];
+      internal::colamd_set_defaults(knobs);
       
       IndexVector p(n+1), A(Alen); 
       for(StorageIndex i=0; i <= n; i++)   p(i) = mat.outerIndexPtr()[i];
       for(StorageIndex i=0; i < nnz; i++)  A(i) = mat.innerIndexPtr()[i];
       // Call Colamd routine to compute the ordering 
-      StorageIndex info = internal::Colamd::compute_ordering(m, n, Alen, A.data(), p.data(), knobs, stats); 
+      StorageIndex info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); 
       EIGEN_UNUSED_VARIABLE(info);
       eigen_assert( info && "COLAMD failed " );
       
diff --git a/uppsrc/plugin/Eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h b/uppsrc/plugin/Eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h
index 37426877a..160d8a523 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h
@@ -203,7 +203,7 @@ class PastixBase : public SparseSolverBase<Derived>
     
      /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the PaStiX reports a problem
       *          \c InvalidInput if the input matrix is invalid
       *
diff --git a/uppsrc/plugin/Eigen/Eigen/src/PardisoSupport/PardisoSupport.h b/uppsrc/plugin/Eigen/Eigen/src/PardisoSupport/PardisoSupport.h
index f89b79bd5..f8c7d0780 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -123,7 +123,6 @@ class PardisoImpl : public SparseSolverBase<Derived>
     };
 
     PardisoImpl()
-      : m_analysisIsOk(false), m_factorizationIsOk(false)
     {
       eigen_assert((sizeof(StorageIndex) >= sizeof(_INTEGER_t) && sizeof(StorageIndex) <= 8) && "Non-supported index type");
       m_iparm.setZero();
@@ -141,7 +140,7 @@ class PardisoImpl : public SparseSolverBase<Derived>
   
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix appears to be negative.
       */
     ComputationInfo info() const
@@ -193,7 +192,8 @@ class PardisoImpl : public SparseSolverBase<Derived>
     void pardisoInit(int type)
     {
       m_type = type;
-      bool symmetric = std::abs(m_type) < 10;
+      EIGEN_USING_STD(abs);
+      bool symmetric = abs(m_type) < 10;
       m_iparm[0] = 1;   // No solver default
       m_iparm[1] = 2;   // use Metis for the ordering
       m_iparm[2] = 0;   // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??)
@@ -386,15 +386,14 @@ class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
 {
   protected:
     typedef PardisoImpl<PardisoLU> Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
     friend class PardisoImpl< PardisoLU<MatrixType> >;
 
   public:
 
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::RealScalar RealScalar;
-
     using Base::compute;
     using Base::solve;
 
@@ -442,14 +441,14 @@ class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
 {
   protected:
     typedef PardisoImpl< PardisoLLT<MatrixType,_UpLo> > Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
     friend class PardisoImpl< PardisoLLT<MatrixType,_UpLo> >;
 
   public:
 
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::RealScalar RealScalar;
     typedef typename Base::StorageIndex StorageIndex;
     enum { UpLo = _UpLo };
     using Base::compute;
@@ -505,14 +504,14 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
 {
   protected:
     typedef PardisoImpl< PardisoLDLT<MatrixType,Options> > Base;
+    typedef typename Base::Scalar Scalar;
+    typedef typename Base::RealScalar RealScalar;
     using Base::pardisoInit;
     using Base::m_matrix;
     friend class PardisoImpl< PardisoLDLT<MatrixType,Options> >;
 
   public:
 
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::RealScalar RealScalar;
     typedef typename Base::StorageIndex StorageIndex;
     using Base::compute;
     enum { UpLo = Options&(Upper|Lower) };
diff --git a/uppsrc/plugin/Eigen/Eigen/src/QR/ColPivHouseholderQR.h b/uppsrc/plugin/Eigen/Eigen/src/QR/ColPivHouseholderQR.h
index 9b677e9bf..a7b47d55d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/QR/ColPivHouseholderQR.h
@@ -17,9 +17,6 @@ namespace internal {
 template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
  : traits<_MatrixType>
 {
-  typedef MatrixXpr XprKind;
-  typedef SolverStorage StorageKind;
-  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -49,19 +46,20 @@ template<typename _MatrixType> struct traits<ColPivHouseholderQR<_MatrixType> >
   * \sa MatrixBase::colPivHouseholderQr()
   */
 template<typename _MatrixType> class ColPivHouseholderQR
-        : public SolverBase<ColPivHouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
-    typedef SolverBase<ColPivHouseholderQR> Base;
-    friend class SolverBase<ColPivHouseholderQR>;
-
-    EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR)
     enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    // FIXME should be int
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
     typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
@@ -158,7 +156,6 @@ template<typename _MatrixType> class ColPivHouseholderQR
       computeInPlace();
     }
 
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the QR decomposition, if any exists.
       *
@@ -175,8 +172,11 @@ template<typename _MatrixType> class ColPivHouseholderQR
       */
     template<typename Rhs>
     inline const Solve<ColPivHouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const;
-    #endif
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+      return Solve<ColPivHouseholderQR, Rhs>(*this, b.derived());
+    }
 
     HouseholderSequenceType householderQ() const;
     HouseholderSequenceType matrixQ() const
@@ -402,7 +402,7 @@ template<typename _MatrixType> class ColPivHouseholderQR
       */
     RealScalar maxPivot() const { return m_maxpivot; }
 
-    /** \brief Reports whether the QR factorization was successful.
+    /** \brief Reports whether the QR factorization was successful.
       *
       * \note This function always returns \c Success. It is provided for compatibility
       * with other factorization routines.
@@ -416,10 +416,8 @@ template<typename _MatrixType> class ColPivHouseholderQR
 
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
-
-    template<bool Conjugate, typename RhsType, typename DstType>
-    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
@@ -586,6 +584,8 @@ template<typename _MatrixType>
 template<typename RhsType, typename DstType>
 void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
+  eigen_assert(rhs.rows() == rows());
+
   const Index nonzero_pivots = nonzeroPivots();
 
   if(nonzero_pivots == 0)
@@ -596,7 +596,11 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &
 
   typename RhsType::PlainObject c(rhs);
 
-  c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint() );
+  // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
+  c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs)
+                    .setLength(nonzero_pivots)
+                    .transpose()
+    );
 
   m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
       .template triangularView<Upper>()
@@ -605,31 +609,6 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &
   for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i);
   for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero();
 }
-
-template<typename _MatrixType>
-template<bool Conjugate, typename RhsType, typename DstType>
-void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
-{
-  const Index nonzero_pivots = nonzeroPivots();
-
-  if(nonzero_pivots == 0)
-  {
-    dst.setZero();
-    return;
-  }
-
-  typename RhsType::PlainObject c(m_colsPermutation.transpose()*rhs);
-
-  m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
-        .template triangularView<Upper>()
-        .transpose().template conjugateIf<Conjugate>()
-        .solveInPlace(c.topRows(nonzero_pivots));
-
-  dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots);
-  dst.bottomRows(rows()-nonzero_pivots).setZero();
-
-  dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf<!Conjugate>() );
-}
 #endif
 
 namespace internal {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/uppsrc/plugin/Eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h
index 486d3373a..34c637b70 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -16,9 +16,6 @@ namespace internal {
 template <typename _MatrixType>
 struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
     : traits<_MatrixType> {
-  typedef MatrixXpr XprKind;
-  typedef SolverStorage StorageKind;
-  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -47,21 +44,19 @@ struct traits<CompleteOrthogonalDecomposition<_MatrixType> >
   * 
   * \sa MatrixBase::completeOrthogonalDecomposition()
   */
-template <typename _MatrixType> class CompleteOrthogonalDecomposition
-          : public SolverBase<CompleteOrthogonalDecomposition<_MatrixType> >
-{
+template <typename _MatrixType>
+class CompleteOrthogonalDecomposition {
  public:
   typedef _MatrixType MatrixType;
-  typedef SolverBase<CompleteOrthogonalDecomposition> Base;
-
-  template<typename Derived>
-  friend struct internal::solve_assertion;
-
-  EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition)
   enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
   };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
   typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime>
       PermutationType;
@@ -136,9 +131,9 @@ template <typename _MatrixType> class CompleteOrthogonalDecomposition
       m_temp(matrix.cols())
   {
     computeInPlace();
-  } 
+  }
+
 
-  #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** This method computes the minimum-norm solution X to a least squares
    * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of
    * which \c *this is the complete orthogonal decomposition.
@@ -150,8 +145,11 @@ template <typename _MatrixType> class CompleteOrthogonalDecomposition
    */
   template <typename Rhs>
   inline const Solve<CompleteOrthogonalDecomposition, Rhs> solve(
-      const MatrixBase<Rhs>& b) const;
-  #endif
+      const MatrixBase<Rhs>& b) const {
+    eigen_assert(m_cpqr.m_isInitialized &&
+                 "CompleteOrthogonalDecomposition is not initialized.");
+    return Solve<CompleteOrthogonalDecomposition, Rhs>(*this, b.derived());
+  }
 
   HouseholderSequenceType householderQ(void) const;
   HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); }
@@ -160,8 +158,8 @@ template <typename _MatrixType> class CompleteOrthogonalDecomposition
    */
   MatrixType matrixZ() const {
     MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols());
-    applyZOnTheLeftInPlace<false>(Z);
-    return Z;
+    applyZAdjointOnTheLeftInPlace(Z);
+    return Z.adjoint();
   }
 
   /** \returns a reference to the matrix where the complete orthogonal
@@ -277,7 +275,6 @@ template <typename _MatrixType> class CompleteOrthogonalDecomposition
    */
   inline const Inverse<CompleteOrthogonalDecomposition> pseudoInverse() const
   {
-    eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
     return Inverse<CompleteOrthogonalDecomposition>(*this);
   }
 
@@ -356,7 +353,7 @@ template <typename _MatrixType> class CompleteOrthogonalDecomposition
   inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); }
 
   /** \brief Reports whether the complete orthogonal decomposition was
-   * successful.
+   * successful.
    *
    * \note This function always returns \c Success. It is provided for
    * compatibility
@@ -370,10 +367,7 @@ template <typename _MatrixType> class CompleteOrthogonalDecomposition
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
   template <typename RhsType, typename DstType>
-  void _solve_impl(const RhsType& rhs, DstType& dst) const;
-
-  template<bool Conjugate, typename RhsType, typename DstType>
-  void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
+  EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const;
 #endif
 
  protected:
@@ -381,22 +375,8 @@ template <typename _MatrixType> class CompleteOrthogonalDecomposition
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
   }
 
-  template<bool Transpose_, typename Rhs>
-  void _check_solve_assertion(const Rhs& b) const {
-      EIGEN_ONLY_USED_FOR_DEBUG(b);
-      eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
-      eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b");
-  }
-
   void computeInPlace();
 
-  /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or
-   *  \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate 
-   *  is set to \c true.
-   */
-  template <bool Conjugate, typename Rhs>
-  void applyZOnTheLeftInPlace(Rhs& rhs) const;
-
   /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$.
    */
   template <typename Rhs>
@@ -472,7 +452,7 @@ void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
         // Apply Z(k) to the first k rows of X_k
         m_cpqr.m_qr.topRightCorner(k, cols - rank + 1)
             .applyHouseholderOnTheRight(
-                m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k),
+                m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k),
                 &m_temp(0));
       }
       if (k != rank - 1) {
@@ -484,28 +464,6 @@ void CompleteOrthogonalDecomposition<MatrixType>::computeInPlace()
   }
 }
 
-template <typename MatrixType>
-template <bool Conjugate, typename Rhs>
-void CompleteOrthogonalDecomposition<MatrixType>::applyZOnTheLeftInPlace(
-    Rhs& rhs) const {
-  const Index cols = this->cols();
-  const Index nrhs = rhs.cols();
-  const Index rank = this->rank();
-  Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
-  for (Index k = rank-1; k >= 0; --k) {
-    if (k != rank - 1) {
-      rhs.row(k).swap(rhs.row(rank - 1));
-    }
-    rhs.middleRows(rank - 1, cols - rank + 1)
-        .applyHouseholderOnTheLeft(
-            matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf<!Conjugate>(), zCoeffs().template conjugateIf<Conjugate>()(k),
-            &temp(0));
-    if (k != rank - 1) {
-      rhs.row(k).swap(rhs.row(rank - 1));
-    }
-  }
-}
-
 template <typename MatrixType>
 template <typename Rhs>
 void CompleteOrthogonalDecomposition<MatrixType>::applyZAdjointOnTheLeftInPlace(
@@ -513,7 +471,7 @@ void CompleteOrthogonalDecomposition<MatrixType>::applyZAdjointOnTheLeftInPlace(
   const Index cols = this->cols();
   const Index nrhs = rhs.cols();
   const Index rank = this->rank();
-  Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
+  Matrix<typename MatrixType::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
   for (Index k = 0; k < rank; ++k) {
     if (k != rank - 1) {
       rhs.row(k).swap(rhs.row(rank - 1));
@@ -533,6 +491,8 @@ template <typename _MatrixType>
 template <typename RhsType, typename DstType>
 void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
     const RhsType& rhs, DstType& dst) const {
+  eigen_assert(rhs.rows() == this->rows());
+
   const Index rank = this->rank();
   if (rank == 0) {
     dst.setZero();
@@ -540,8 +500,11 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
   }
 
   // Compute c = Q^* * rhs
+  // Note that the matrix Q = H_0^* H_1^*... so its inverse is
+  // Q^* = (H_0 H_1 ...)^T
   typename RhsType::PlainObject c(rhs);
-  c.applyOnTheLeft(matrixQ().setLength(rank).adjoint());
+  c.applyOnTheLeft(
+      householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose());
 
   // Solve T z = c(1:rank, :)
   dst.topRows(rank) = matrixT()
@@ -560,45 +523,10 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl(
   // Undo permutation to get x = P^{-1} * y.
   dst = colsPermutation() * dst;
 }
-
-template<typename _MatrixType>
-template<bool Conjugate, typename RhsType, typename DstType>
-void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
-{
-  const Index rank = this->rank();
-
-  if (rank == 0) {
-    dst.setZero();
-    return;
-  }
-
-  typename RhsType::PlainObject c(colsPermutation().transpose()*rhs);
-
-  if (rank < cols()) {
-    applyZOnTheLeftInPlace<!Conjugate>(c);
-  }
-
-  matrixT().topLeftCorner(rank, rank)
-           .template triangularView<Upper>()
-           .transpose().template conjugateIf<Conjugate>()
-           .solveInPlace(c.topRows(rank));
-
-  dst.topRows(rank) = c.topRows(rank);
-  dst.bottomRows(rows()-rank).setZero();
-
-  dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() );
-}
 #endif
 
 namespace internal {
 
-template<typename MatrixType>
-struct traits<Inverse<CompleteOrthogonalDecomposition<MatrixType> > >
-  : traits<typename Transpose<typename MatrixType::PlainObject>::PlainObject>
-{
-  enum { Flags = 0 };
-};
-
 template<typename DstXprType, typename MatrixType>
 struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType> >, internal::assign_op<typename DstXprType::Scalar,typename CompleteOrthogonalDecomposition<MatrixType>::Scalar>, Dense2Dense>
 {
@@ -606,8 +534,7 @@ struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType
   typedef Inverse<CodType> SrcXprType;
   static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename CodType::Scalar> &)
   {
-    typedef Matrix<typename CodType::Scalar, CodType::RowsAtCompileTime, CodType::RowsAtCompileTime, 0, CodType::MaxRowsAtCompileTime, CodType::MaxRowsAtCompileTime> IdentityMatrixType;
-    dst = src.nestedExpression().solve(IdentityMatrixType::Identity(src.cols(), src.cols()));
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.rows()));
   }
 };
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/QR/FullPivHouseholderQR.h b/uppsrc/plugin/Eigen/Eigen/src/QR/FullPivHouseholderQR.h
index d0664a1d8..e489bddc2 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/QR/FullPivHouseholderQR.h
@@ -18,9 +18,6 @@ namespace internal {
 template<typename _MatrixType> struct traits<FullPivHouseholderQR<_MatrixType> >
  : traits<_MatrixType>
 {
-  typedef MatrixXpr XprKind;
-  typedef SolverStorage StorageKind;
-  typedef int StorageIndex;
   enum { Flags = 0 };
 };
 
@@ -58,19 +55,20 @@ struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
   * \sa MatrixBase::fullPivHouseholderQr()
   */
 template<typename _MatrixType> class FullPivHouseholderQR
-        : public SolverBase<FullPivHouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
-    typedef SolverBase<FullPivHouseholderQR> Base;
-    friend class SolverBase<FullPivHouseholderQR>;
-
-    EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR)
     enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    // FIXME should be int
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef Matrix<StorageIndex, 1,
@@ -158,7 +156,6 @@ template<typename _MatrixType> class FullPivHouseholderQR
       computeInPlace();
     }
 
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * \c *this is the QR decomposition.
       *
@@ -176,8 +173,11 @@ template<typename _MatrixType> class FullPivHouseholderQR
       */
     template<typename Rhs>
     inline const Solve<FullPivHouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const;
-    #endif
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+      return Solve<FullPivHouseholderQR, Rhs>(*this, b.derived());
+    }
 
     /** \returns Expression object representing the matrix Q
       */
@@ -392,24 +392,22 @@ template<typename _MatrixType> class FullPivHouseholderQR
       *          diagonal coefficient of U.
       */
     RealScalar maxPivot() const { return m_maxpivot; }
-
+    
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
-
-    template<bool Conjugate, typename RhsType, typename DstType>
-    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
-
+    
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
-
+    
     void computeInPlace();
-
+    
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     IntDiagSizeVectorType m_rows_transpositions;
@@ -501,15 +499,15 @@ void FullPivHouseholderQR<MatrixType>::computeInPlace()
       m_nonzero_pivots = k;
       for(Index i = k; i < size; i++)
       {
-        m_rows_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
-        m_cols_transpositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+        m_rows_transpositions.coeffRef(i) = i;
+        m_cols_transpositions.coeffRef(i) = i;
         m_hCoeffs.coeffRef(i) = Scalar(0);
       }
       break;
     }
 
-    m_rows_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
-    m_cols_transpositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
+    m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner;
+    m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner;
     if(k != row_of_biggest_in_corner) {
       m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k));
       ++number_of_transpositions;
@@ -543,6 +541,7 @@ template<typename _MatrixType>
 template<typename RhsType, typename DstType>
 void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
+  eigen_assert(rhs.rows() == rows());
   const Index l_rank = rank();
 
   // FIXME introduce nonzeroPivots() and use it here. and more generally,
@@ -555,7 +554,7 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType
 
   typename RhsType::PlainObject c(rhs);
 
-  Matrix<typename RhsType::Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols());
+  Matrix<Scalar,1,RhsType::ColsAtCompileTime> temp(rhs.cols());
   for (Index k = 0; k < l_rank; ++k)
   {
     Index remainingSize = rows()-k;
@@ -572,42 +571,6 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType
   for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i);
   for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero();
 }
-
-template<typename _MatrixType>
-template<bool Conjugate, typename RhsType, typename DstType>
-void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
-{
-  const Index l_rank = rank();
-
-  if(l_rank == 0)
-  {
-    dst.setZero();
-    return;
-  }
-
-  typename RhsType::PlainObject c(m_cols_permutation.transpose()*rhs);
-
-  m_qr.topLeftCorner(l_rank, l_rank)
-         .template triangularView<Upper>()
-         .transpose().template conjugateIf<Conjugate>()
-         .solveInPlace(c.topRows(l_rank));
-
-  dst.topRows(l_rank) = c.topRows(l_rank);
-  dst.bottomRows(rows()-l_rank).setZero();
-
-  Matrix<Scalar, 1, DstType::ColsAtCompileTime> temp(dst.cols());
-  const Index size = (std::min)(rows(), cols());
-  for (Index k = size-1; k >= 0; --k)
-  {
-    Index remainingSize = rows()-k;
-
-    dst.bottomRightCorner(remainingSize, dst.cols())
-       .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1).template conjugateIf<!Conjugate>(),
-                                  m_hCoeffs.template conjugateIf<Conjugate>().coeff(k), &temp.coeffRef(0));
-
-    dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k)));
-  }
-}
 #endif
 
 namespace internal {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/QR/HouseholderQR.h b/uppsrc/plugin/Eigen/Eigen/src/QR/HouseholderQR.h
index 801739fbd..3513d995c 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/QR/HouseholderQR.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/QR/HouseholderQR.h
@@ -14,18 +14,6 @@
 
 namespace Eigen { 
 
-namespace internal {
-template<typename _MatrixType> struct traits<HouseholderQR<_MatrixType> >
- : traits<_MatrixType>
-{
-  typedef MatrixXpr XprKind;
-  typedef SolverStorage StorageKind;
-  typedef int StorageIndex;
-  enum { Flags = 0 };
-};
-
-} // end namespace internal
-
 /** \ingroup QR_Module
   *
   *
@@ -54,19 +42,20 @@ template<typename _MatrixType> struct traits<HouseholderQR<_MatrixType> >
   * \sa MatrixBase::householderQr()
   */
 template<typename _MatrixType> class HouseholderQR
-        : public SolverBase<HouseholderQR<_MatrixType> >
 {
   public:
 
     typedef _MatrixType MatrixType;
-    typedef SolverBase<HouseholderQR> Base;
-    friend class SolverBase<HouseholderQR>;
-
-    EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR)
     enum {
+      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
       MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
       MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
     };
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    // FIXME should be int
+    typedef typename MatrixType::StorageIndex StorageIndex;
     typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, (MatrixType::Flags&RowMajorBit) ? RowMajor : ColMajor, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
     typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
     typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
@@ -132,7 +121,6 @@ template<typename _MatrixType> class HouseholderQR
       computeInPlace();
     }
 
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
     /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
       * *this is the QR decomposition, if any exists.
       *
@@ -149,8 +137,11 @@ template<typename _MatrixType> class HouseholderQR
       */
     template<typename Rhs>
     inline const Solve<HouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const;
-    #endif
+    solve(const MatrixBase<Rhs>& b) const
+    {
+      eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+      return Solve<HouseholderQR, Rhs>(*this, b.derived());
+    }
 
     /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations.
       *
@@ -213,30 +204,28 @@ template<typename _MatrixType> class HouseholderQR
 
     inline Index rows() const { return m_qr.rows(); }
     inline Index cols() const { return m_qr.cols(); }
-
+    
     /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
       * 
       * For advanced uses only.
       */
     const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
-
+    
     #ifndef EIGEN_PARSED_BY_DOXYGEN
     template<typename RhsType, typename DstType>
+    EIGEN_DEVICE_FUNC
     void _solve_impl(const RhsType &rhs, DstType &dst) const;
-
-    template<bool Conjugate, typename RhsType, typename DstType>
-    void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
     #endif
 
   protected:
-
+    
     static void check_template_parameters()
     {
       EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
     }
 
     void computeInPlace();
-
+    
     MatrixType m_qr;
     HCoeffsType m_hCoeffs;
     RowVectorType m_temp;
@@ -303,7 +292,7 @@ template<typename MatrixQR, typename HCoeffs,
   bool InnerStrideIsOne = (MatrixQR::InnerStrideAtCompileTime == 1 && HCoeffs::InnerStrideAtCompileTime == 1)>
 struct householder_qr_inplace_blocked
 {
-  // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h
+  // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h
   static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32,
       typename MatrixQR::Scalar* tempData = 0)
   {
@@ -361,10 +350,15 @@ template<typename RhsType, typename DstType>
 void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
   const Index rank = (std::min)(rows(), cols());
+  eigen_assert(rhs.rows() == rows());
 
   typename RhsType::PlainObject c(rhs);
 
-  c.applyOnTheLeft(householderQ().setLength(rank).adjoint() );
+  // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
+  c.applyOnTheLeft(householderSequence(
+    m_qr.leftCols(rank),
+    m_hCoeffs.head(rank)).transpose()
+  );
 
   m_qr.topLeftCorner(rank, rank)
       .template triangularView<Upper>()
@@ -373,25 +367,6 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c
   dst.topRows(rank) = c.topRows(rank);
   dst.bottomRows(cols()-rank).setZero();
 }
-
-template<typename _MatrixType>
-template<bool Conjugate, typename RhsType, typename DstType>
-void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
-{
-  const Index rank = (std::min)(rows(), cols());
-
-  typename RhsType::PlainObject c(rhs);
-
-  m_qr.topLeftCorner(rank, rank)
-      .template triangularView<Upper>()
-      .transpose().template conjugateIf<Conjugate>()
-      .solveInPlace(c.topRows(rank));
-
-  dst.topRows(rank) = c.topRows(rank);
-  dst.bottomRows(rows()-rank).setZero();
-
-  dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>() );
-}
 #endif
 
 /** Performs the QR factorization of the given matrix \a matrix. The result of
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/uppsrc/plugin/Eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
index 013c7ae7a..953d57c9d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -74,35 +74,13 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
     };
   public:
     SPQR() 
-      : m_analysisIsOk(false),
-        m_factorizationIsOk(false),
-        m_isRUpToDate(false),
-        m_ordering(SPQR_ORDERING_DEFAULT),
-        m_allow_tol(SPQR_DEFAULT_TOL),
-        m_tolerance (NumTraits<Scalar>::epsilon()),
-        m_cR(0),
-        m_E(0),
-        m_H(0),
-        m_HPinv(0),
-        m_HTau(0),
-        m_useDefaultThreshold(true)
+      : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
     { 
       cholmod_l_start(&m_cc);
     }
     
     explicit SPQR(const _MatrixType& matrix)
-      : m_analysisIsOk(false),
-        m_factorizationIsOk(false),
-        m_isRUpToDate(false),
-        m_ordering(SPQR_ORDERING_DEFAULT),
-        m_allow_tol(SPQR_DEFAULT_TOL),
-        m_tolerance (NumTraits<Scalar>::epsilon()),
-        m_cR(0),
-        m_E(0),
-        m_H(0),
-        m_HPinv(0),
-        m_HTau(0),
-        m_useDefaultThreshold(true)
+    : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
     {
       cholmod_l_start(&m_cc);
       compute(matrix);
@@ -242,7 +220,7 @@ class SPQR : public SparseSolverBase<SPQR<_MatrixType> >
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the sparse QR can not be computed
       */
     ComputationInfo info() const
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SVD/BDCSVD.h b/uppsrc/plugin/Eigen/Eigen/src/SVD/BDCSVD.h
index bcec45f58..a5b73f8f2 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SVD/BDCSVD.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SVD/BDCSVD.h
@@ -22,11 +22,6 @@
 // #define EIGEN_BDCSVD_DEBUG_VERBOSE
 // #define EIGEN_BDCSVD_SANITY_CHECKS
 
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-#undef eigen_internal_assert
-#define eigen_internal_assert(X) assert(X);
-#endif
-
 namespace Eigen {
 
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
@@ -39,7 +34,6 @@ namespace internal {
 
 template<typename _MatrixType> 
 struct traits<BDCSVD<_MatrixType> >
-        : traits<_MatrixType>
 {
   typedef _MatrixType MatrixType;
 };  
@@ -63,7 +57,7 @@ struct traits<BDCSVD<_MatrixType> >
  * recommended and can several order of magnitude faster.
  *
  * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations.
- * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless
+ * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless
  * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will
  * significantly degrade the accuracy.
  *
@@ -111,7 +105,7 @@ public:
    * The default constructor is useful in cases in which the user intends to
    * perform decompositions via BDCSVD::compute(const MatrixType&).
    */
-  BDCSVD() : m_algoswap(16), m_isTranspose(false), m_compU(false), m_compV(false), m_numIters(0)
+  BDCSVD() : m_algoswap(16), m_numIters(0)
   {}
 
 
@@ -218,7 +212,7 @@ public:
 
 // Method to allocate and initialize matrix and attributes
 template<typename MatrixType>
-void BDCSVD<MatrixType>::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions)
+void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions)
 {
   m_isTranspose = (cols > rows);
 
@@ -394,7 +388,7 @@ void BDCSVD<MatrixType>::structured_update(Block<MatrixXr,Dynamic,Dynamic> A, co
 //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix 
 // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
 template<typename MatrixType>
-void BDCSVD<MatrixType>::divide (Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
+void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift)
 {
   // requires rows = cols + 1;
   using std::pow;
@@ -574,7 +568,7 @@ void BDCSVD<MatrixType>::divide (Eigen::Index firstCol, Eigen::Index lastCol, Ei
 // handling of round-off errors, be consistent in ordering
 // For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf
 template <typename MatrixType>
-void BDCSVD<MatrixType>::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
+void BDCSVD<MatrixType>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V)
 {
   const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
   using std::abs;
@@ -597,7 +591,7 @@ void BDCSVD<MatrixType>::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma
   // but others are interleaved and we must ignore them at this stage.
   // To this end, let's compute a permutation skipping them:
   Index actual_n = n;
-  while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); }
+  while(actual_n>1 && diag(actual_n-1)==Literal(0)) --actual_n;
   Index m = 0; // size of the deflated problem
   for(Index k=0;k<actual_n;++k)
     if(abs(col0(k))>considerZero)
@@ -624,11 +618,13 @@ void BDCSVD<MatrixType>::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma
   std::cout << "  shift:    " << shifts.transpose() << "\n";
   
   {
+    Index actual_n = n;
+    while(actual_n>1 && abs(col0(actual_n-1))<considerZero) --actual_n;
     std::cout << "\n\n    mus:    " << mus.head(actual_n).transpose() << "\n\n";
     std::cout << "    check1 (expect0) : " << ((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
-    assert((((singVals.array()-(shifts+mus)) / singVals.array()).head(actual_n) >= 0).all());
     std::cout << "    check2 (>0)      : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n";
-    assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all());
+    std::cout << "    check3 (>0)      : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n";
+    std::cout << "    check4 (>0)      : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n";
   }
 #endif
   
@@ -656,13 +652,13 @@ void BDCSVD<MatrixType>::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma
 #endif
   
 #ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  assert(U.allFinite());
+  assert(V.allFinite());
+  assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n);
+  assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n);
   assert(m_naiveU.allFinite());
   assert(m_naiveV.allFinite());
   assert(m_computed.allFinite());
-  assert(U.allFinite());
-  assert(V.allFinite());
-//   assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
-//   assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
 #endif
   
   // Because of deflation, the singular values might not be completely sorted.
@@ -677,15 +673,6 @@ void BDCSVD<MatrixType>::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma
       if(m_compV) V.col(i).swap(V.col(i+1));
     }
   }
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-  {
-    bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all();
-    if(!singular_values_sorted)
-      std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n";
-    assert(singular_values_sorted);
-  }
-#endif
   
   // Reverse order so that singular values in increased order
   // Because of deflation, the zeros singular-values are already at the end
@@ -762,22 +749,19 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
     RealScalar mid = left + (right-left) / Literal(2);
     RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0));
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-    std::cout << "right-left = " << right-left << "\n";
-//     std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, ArrayXr(diag-left), left)
-//                            << " " << secularEq(mid-right, col0, diag, perm, ArrayXr(diag-right), right)   << "\n";
-    std::cout << "     = " << secularEq(left+RealScalar(0.000001)*(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.1)     *(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.2)     *(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.3)     *(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.4)     *(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.49)    *(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.5)     *(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.51)    *(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.6)     *(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.7)     *(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.8)     *(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.9)     *(right-left), col0, diag, perm, diag, 0)
-              << " "       << secularEq(left+RealScalar(0.999999)*(right-left), col0, diag, perm, diag, 0) << "\n";
+    std::cout << right-left << "\n";
+    std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, diag-left, left) << " " << secularEq(mid-right, col0, diag, perm, diag-right, right)   << "\n";
+    std::cout << "     = " << secularEq(0.1*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.2*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.3*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.4*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.49*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.5*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.51*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.6*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.7*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.8*(left+right), col0, diag, perm, diag, 0)
+              << " "       << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n";
 #endif
     RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right;
     
@@ -835,16 +819,13 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
       // And find mu such that f(mu)==0:
       RealScalar muZero = -a/b;
       RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift);
-
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-      assert((numext::isfinite)(fZero));
-#endif
       
       muPrev = muCur;
       fPrev = fCur;
       muCur = muZero;
       fCur = fZero;
       
+      
       if (shift == left  && (muCur < Literal(0) || muCur > right - left)) useBisection = true;
       if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true;
       if (abs(fCur)>abs(fPrev)) useBisection = true;
@@ -877,33 +858,20 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
         else
           rightShifted = -(std::numeric_limits<RealScalar>::min)();
       }
-
+      
       RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
       eigen_internal_assert(fLeft<Literal(0));
 
-#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_SANITY_CHECKS
+#if defined EIGEN_INTERNAL_DEBUGGING || defined EIGEN_BDCSVD_DEBUG_VERBOSE
       RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
 #endif
 
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-      if(!(numext::isfinite)(fLeft))
-        std::cout << "f(" << leftShifted << ") =" << fLeft << " ; " << left << " " << shift << " " << right << "\n";
-      assert((numext::isfinite)(fLeft));
 
-      if(!(numext::isfinite)(fRight))
-        std::cout << "f(" << rightShifted << ") =" << fRight << " ; " << left << " " << shift << " " << right << "\n";
-      // assert((numext::isfinite)(fRight));
-#endif
-    
 #ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
       if(!(fLeft * fRight<0))
       {
-        std::cout << "f(leftShifted) using  leftShifted=" << leftShifted << " ;  diagShifted(1:10):" << diagShifted.head(10).transpose()  << "\n ; "
-                  << "left==shift=" << bool(left==shift) << " ; left-shift = " << (left-shift) << "\n";
-        std::cout << "k=" << k << ", " <<  fLeft << " * " << fRight << " == " << fLeft * fRight << "  ;  "
-                  << "[" << left << " .. " << right << "] -> [" << leftShifted << " " << rightShifted << "], shift=" << shift
-                  << " ,  f(right)=" << secularEq(0,     col0, diag, perm, diagShifted, shift)
-                           << " == " << secularEq(right, col0, diag, perm, diag, 0) << " == " << fRight << "\n";
+        std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose()  << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n";
+        std::cout << k << " : " <<  fLeft << " * " << fRight << " == " << fLeft * fRight << "  ;  " << left << " - " << right << " -> " <<  leftShifted << " " << rightShifted << "   shift=" << shift << "\n";
       }
 #endif
       eigen_internal_assert(fLeft * fRight < Literal(0));
@@ -944,15 +912,6 @@ void BDCSVD<MatrixType>::computeSingVals(const ArrayRef& col0, const ArrayRef& d
     shifts[k] = shift;
     mus[k] = muCur;
 
-#ifdef  EIGEN_BDCSVD_DEBUG_VERBOSE
-    if(k+1<n)
-      std::cout << "found " << singVals[k] << " == " << shift << " + " << muCur << " from " << diag(k) << " .. "  << diag(k+1) << "\n";
-#endif
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-    assert(k==0 || singVals[k]>=singVals[k-1]);
-    assert(singVals[k]>=diag(k));
-#endif
-
     // perturb singular value slightly if it equals diagonal entry to avoid division by zero later
     // (deflation is supposed to avoid this from happening)
     // - this does no seem to be necessary anymore -
@@ -976,7 +935,7 @@ void BDCSVD<MatrixType>::perturbCol0
     zhat.setZero();
     return;
   }
-  Index lastIdx = perm(m-1);
+  Index last = perm(m-1);
   // The offset permits to skip deflated entries while computing zhat
   for (Index k = 0; k < n; ++k)
   {
@@ -986,43 +945,15 @@ void BDCSVD<MatrixType>::perturbCol0
     {
       // see equation (3.6)
       RealScalar dk = diag(k);
-      RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk));
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-      if(prod<0) {
-        std::cout << "k = " << k << " ;  z(k)=" << col0(k) << ", diag(k)=" << dk << "\n";
-        std::cout << "prod = " << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx) << " - " << dk << "))" << "\n";
-        std::cout << "     = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) <<  "\n";
-      }
-      assert(prod>=0);
-#endif
+      RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk));
 
       for(Index l = 0; l<m; ++l)
       {
         Index i = perm(l);
         if(i!=k)
         {
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-          if(i>=k && (l==0 || l-1>=m))
-          {
-            std::cout << "Error in perturbCol0\n";
-            std::cout << "  " << k << "/" << n << " "  << l << "/" << m << " " << i << "/" << n << " ; " << col0(k) << " " << diag(k) << " "  <<  "\n";
-            std::cout << "  " <<diag(i) << "\n";
-            Index j = (i<k /*|| l==0*/) ? i : perm(l-1);
-            std::cout << "  " << "j=" << j << "\n";
-          }
-#endif
           Index j = i<k ? i : perm(l-1);
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-          if(!(dk!=Literal(0) || diag(i)!=Literal(0)))
-          {
-            std::cout << "k=" << k << ", i=" << i << ", l=" << l << ", perm.size()=" << perm.size() << "\n";
-          }
-          assert(dk!=Literal(0) || diag(i)!=Literal(0));
-#endif
           prod *= ((singVals(j)+dk) / ((diag(i)+dk))) * ((mus(j)+(shifts(j)-dk)) / ((diag(i)-dk)));
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-          assert(prod>=0);
-#endif
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
           if(i!=k && numext::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 )
             std::cout << "     " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk))
@@ -1031,12 +962,9 @@ void BDCSVD<MatrixType>::perturbCol0
         }
       }
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-      std::cout << "zhat(" << k << ") =  sqrt( " << prod << ")  ;  " << (singVals(lastIdx) + dk) << " * " << mus(lastIdx) + shifts(lastIdx) << " - " << dk << "\n";
+      std::cout << "zhat(" << k << ") =  sqrt( " << prod << ")  ;  " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n";
 #endif
       RealScalar tmp = sqrt(prod);
-#ifdef EIGEN_BDCSVD_SANITY_CHECKS
-      assert((numext::isfinite)(tmp));
-#endif
       zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp);
     }
   }
@@ -1090,7 +1018,7 @@ void BDCSVD<MatrixType>::computeSingVecs
 // i >= 1, di almost null and zi non null.
 // We use a rotation to zero out zi applied to the left of M
 template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size)
+void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index size)
 {
   using std::abs;
   using std::sqrt;
@@ -1119,7 +1047,7 @@ void BDCSVD<MatrixType>::deflation43(Eigen::Index firstCol, Eigen::Index shift,
 // We apply two rotations to have zj = 0;
 // TODO deflation44 is still broken and not properly tested
 template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size)
+void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size)
 {
   using std::abs;
   using std::sqrt;
@@ -1146,7 +1074,7 @@ void BDCSVD<MatrixType>::deflation44(Eigen::Index firstColu , Eigen::Index first
   }
   c/=r;
   s/=r;
-  m_computed(firstColm + i, firstColm) = r;
+  m_computed(firstColm + i, firstColm) = r;  
   m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
   m_computed(firstColm + j, firstColm) = Literal(0);
 
@@ -1159,7 +1087,7 @@ void BDCSVD<MatrixType>::deflation44(Eigen::Index firstColu , Eigen::Index first
 
 // acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive]
 template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift)
+void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift)
 {
   using std::sqrt;
   using std::abs;
@@ -1220,7 +1148,6 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol,
 #endif
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
   std::cout << "to be sorted: " << diag.transpose() << "\n\n";
-  std::cout << "            : " << col0.transpose() << "\n\n";
 #endif
   {
     // Check for total deflation
@@ -1311,7 +1238,7 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol,
        if( (diag(i) - diag(i-1)) < NumTraits<RealScalar>::epsilon()*maxDiag )
       {
 #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
-        std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i-1) << " == " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*/*diag(i)*/maxDiag << "\n";
+        std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits<RealScalar>::epsilon()*diag(i) << "\n";
 #endif
         eigen_internal_assert(abs(diag(i) - diag(i-1))<epsilon_coarse && " diagonal entries are not properly sorted");
         deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i-1, i, length);
@@ -1330,7 +1257,7 @@ void BDCSVD<MatrixType>::deflation(Eigen::Index firstCol, Eigen::Index lastCol,
 #endif
 }//end deflation
 
-#if !defined(EIGEN_GPUCC)
+#ifndef __CUDACC__
 /** \svd_module
   *
   * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SVD/JacobiSVD.h b/uppsrc/plugin/Eigen/Eigen/src/SVD/JacobiSVD.h
index 2b6891105..43488b1e0 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SVD/JacobiSVD.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SVD/JacobiSVD.h
@@ -425,7 +425,6 @@ struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
 
 template<typename _MatrixType, int QRPreconditioner> 
 struct traits<JacobiSVD<_MatrixType,QRPreconditioner> >
-        : traits<_MatrixType>
 {
   typedef _MatrixType MatrixType;
 };
@@ -611,7 +610,7 @@ template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
 };
 
 template<typename MatrixType, int QRPreconditioner>
-void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions)
+void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, unsigned int computationOptions)
 {
   eigen_assert(rows >= 0 && cols >= 0);
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SVD/SVDBase.h b/uppsrc/plugin/Eigen/Eigen/src/SVD/SVDBase.h
index 34d5c9dd3..53da28488 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SVD/SVDBase.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SVD/SVDBase.h
@@ -17,18 +17,6 @@
 #define EIGEN_SVDBASE_H
 
 namespace Eigen {
-
-namespace internal {
-template<typename Derived> struct traits<SVDBase<Derived> >
- : traits<Derived>
-{
-  typedef MatrixXpr XprKind;
-  typedef SolverStorage StorageKind;
-  typedef int StorageIndex;
-  enum { Flags = 0 };
-};
-}
-
 /** \ingroup SVD_Module
  *
  *
@@ -56,18 +44,15 @@ template<typename Derived> struct traits<SVDBase<Derived> >
  * terminate in finite (and reasonable) time.
  * \sa class BDCSVD, class JacobiSVD
  */
-template<typename Derived> class SVDBase
- : public SolverBase<SVDBase<Derived> >
+template<typename Derived>
+class SVDBase
 {
-public: 
-   
-  template<typename Derived_>
-  friend struct internal::solve_assertion;
 
+public:
   typedef typename internal::traits<Derived>::MatrixType MatrixType;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-  typedef typename Eigen::internal::traits<SVDBase>::StorageIndex StorageIndex;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3
   enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
@@ -209,7 +194,6 @@ public:
   inline Index rows() const { return m_rows; }
   inline Index cols() const { return m_cols; }
   
-  #ifdef EIGEN_PARSED_BY_DOXYGEN
   /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
     *
     * \param b the right-hand-side of the equation to solve.
@@ -221,15 +205,17 @@ public:
     */
   template<typename Rhs>
   inline const Solve<Derived, Rhs>
-  solve(const MatrixBase<Rhs>& b) const;
-  #endif
-
+  solve(const MatrixBase<Rhs>& b) const
+  {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
+    return Solve<Derived, Rhs>(derived(), b.derived());
+  }
+  
   #ifndef EIGEN_PARSED_BY_DOXYGEN
   template<typename RhsType, typename DstType>
+  EIGEN_DEVICE_FUNC
   void _solve_impl(const RhsType &rhs, DstType &dst) const;
-
-  template<bool Conjugate, typename RhsType, typename DstType>
-  void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const;
   #endif
 
 protected:
@@ -238,14 +224,6 @@ protected:
   {
     EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
   }
-
-  template<bool Transpose_, typename Rhs>
-  void _check_solve_assertion(const Rhs& b) const {
-      EIGEN_ONLY_USED_FOR_DEBUG(b);
-      eigen_assert(m_isInitialized && "SVD is not initialized.");
-      eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice).");
-      eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b");
-  }
   
   // return true if already allocated
   bool allocate(Index rows, Index cols, unsigned int computationOptions) ;
@@ -268,10 +246,6 @@ protected:
     : m_isInitialized(false),
       m_isAllocated(false),
       m_usePrescribedThreshold(false),
-      m_computeFullU(false),
-      m_computeThinU(false),
-      m_computeFullV(false),
-      m_computeThinV(false),
       m_computationOptions(0),
       m_rows(-1), m_cols(-1), m_diagSize(0)
   {
@@ -286,30 +260,17 @@ template<typename Derived>
 template<typename RhsType, typename DstType>
 void SVDBase<Derived>::_solve_impl(const RhsType &rhs, DstType &dst) const
 {
+  eigen_assert(rhs.rows() == rows());
+
   // A = U S V^*
   // So A^{-1} = V S^{-1} U^*
 
-  Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
+  Matrix<Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
   Index l_rank = rank();
   tmp.noalias() =  m_matrixU.leftCols(l_rank).adjoint() * rhs;
   tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
   dst = m_matrixV.leftCols(l_rank) * tmp;
 }
-
-template<typename Derived>
-template<bool Conjugate, typename RhsType, typename DstType>
-void SVDBase<Derived>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const
-{
-  // A = U S V^*
-  // So  A^{-*} = U S^{-1} V^*
-  // And A^{-T} = U_conj S^{-1} V^T
-  Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime, RhsType::MaxColsAtCompileTime> tmp;
-  Index l_rank = rank();
-
-  tmp.noalias() =  m_matrixV.leftCols(l_rank).transpose().template conjugateIf<Conjugate>() * rhs;
-  tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
-  dst = m_matrixU.template conjugateIf<!Conjugate>().leftCols(l_rank) * tmp;
-}
 #endif
 
 template<typename MatrixType>
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SVD/UpperBidiagonalization.h b/uppsrc/plugin/Eigen/Eigen/src/SVD/UpperBidiagonalization.h
index 997defc47..11ac847e1 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SVD/UpperBidiagonalization.h
@@ -127,7 +127,7 @@ void upperbidiagonalization_inplace_unblocked(MatrixType& mat,
        .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]);
     // apply householder transform to remaining part of mat on the left
     mat.bottomRightCorner(remainingRows-1, remainingCols)
-       .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).adjoint(), mat.coeff(k,k+1), tempData);
+       .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData);
   }
 }
 
@@ -202,7 +202,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
       {
         SubColumnType y_k( Y.col(k).tail(remainingCols) );
         
-        // let's use the beginning of column k of Y as a temporary vector
+        // let's use the beginning of column k of Y as a temporary vector
         SubColumnType tmp( Y.col(k).head(k) );
         y_k.noalias()  = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck
         tmp.noalias()  = V_k1.adjoint()  * v_k;
@@ -231,7 +231,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A,
       {
         SubColumnType x_k ( X.col(k).tail(remainingRows-1) );
         
-        // let's use the beginning of column k of X as a temporary vectors
+        // let's use the beginning of column k of X as temporary vectors
         // note that tmp0 and tmp1 overlaps
         SubColumnType tmp0 ( X.col(k).head(k) ),
                       tmp1 ( X.col(k).head(k+1) );
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h
index 06edb8688..369e6804a 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h
@@ -80,19 +80,11 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
 
     /** Default constructor */
     SimplicialCholeskyBase()
-      : m_info(Success),
-        m_factorizationIsOk(false),
-        m_analysisIsOk(false),
-        m_shiftOffset(0),
-        m_shiftScale(1)
+      : m_info(Success), m_shiftOffset(0), m_shiftScale(1)
     {}
 
     explicit SimplicialCholeskyBase(const MatrixType& matrix)
-      : m_info(Success),
-        m_factorizationIsOk(false),
-        m_analysisIsOk(false),
-        m_shiftOffset(0),
-        m_shiftScale(1)
+      : m_info(Success), m_shiftOffset(0), m_shiftScale(1)
     {
       derived().compute(matrix);
     }
@@ -109,7 +101,7 @@ class SimplicialCholeskyBase : public SparseSolverBase<Derived>
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
index 72e1740c1..7b6183d08 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
@@ -2,21 +2,46 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 /*
-NOTE: these functions have been adapted from the LDL library:
+
+NOTE: these functions have been adapted from the LDL library:
 
 LDL Copyright (c) 2005 by Timothy A. Davis.  All Rights Reserved.
 
-The author of LDL, Timothy A. Davis., has executed a license with Google LLC
-to permit distribution of this code and derivative works as part of Eigen under
-the Mozilla Public License v. 2.0, as stated at the top of this file.
+LDL License:
+
+    Your use or distribution of LDL or any modified version of
+    LDL implies that you agree to this License.
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+    USA
+
+    Permission is hereby granted to use or copy this program under the
+    terms of the GNU LGPL, provided that the Copyright, this License,
+    and the Availability of the original version is retained on all copies.
+    User documentation of any code that uses this code or any modified
+    version of this code must cite the Copyright, this License, the
+    Availability note, and "Used by permission." Permission to modify
+    the code and to distribute modified code is granted, provided the
+    Copyright, this License, and the Availability note are retained,
+    and a notice that the code was modified is included.
  */
 
+#include "../Core/util/NonMPL2.h"
+
 #ifndef EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
 #define EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
 
@@ -97,7 +122,7 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
   for(StorageIndex k = 0; k < size; ++k)
   {
     // compute nonzero pattern of kth row of L, in topological order
-    y[k] = Scalar(0);                     // Y(0:k) is now all zero
+    y[k] = 0.0;                     // Y(0:k) is now all zero
     StorageIndex top = size;               // stack for pattern is empty
     tags[k] = k;                    // mark node k as visited
     m_nonZerosPerCol[k] = 0;        // count of nonzeros in column k of L
@@ -121,12 +146,12 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
     /* compute numerical values kth row of L (a sparse triangular solve) */
 
     RealScalar d = numext::real(y[k]) * m_shiftScale + m_shiftOffset;    // get D(k,k), apply the shift function, and clear Y(k)
-    y[k] = Scalar(0);
+    y[k] = 0.0;
     for(; top < size; ++top)
     {
       Index i = pattern[top];       /* pattern[top:n-1] is pattern of L(:,k) */
       Scalar yi = y[i];             /* get and clear Y(i) */
-      y[i] = Scalar(0);
+      y[i] = 0.0;
 
       /* the nonzero entry L(k,i) */
       Scalar l_ki;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/CompressedStorage.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/CompressedStorage.h
index acd986fab..d89fa0dae 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/CompressedStorage.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/CompressedStorage.h
@@ -207,22 +207,6 @@ class CompressedStorage
       return m_values[id];
     }
 
-    void moveChunk(Index from, Index to, Index chunkSize)
-    {
-      eigen_internal_assert(to+chunkSize <= m_size);
-      if(to>from && from+chunkSize>to)
-      {
-        // move backward
-        internal::smart_memmove(m_values+from,  m_values+from+chunkSize,  m_values+to);
-        internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to);
-      }
-      else
-      {
-        internal::smart_copy(m_values+from,  m_values+from+chunkSize,  m_values+to);
-        internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to);
-      }
-    }
-
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       Index k = 0;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseAssign.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseAssign.h
index 905485c88..18352a847 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseAssign.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseAssign.h
@@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
     // eval without temporary
     dst.resize(src.rows(), src.cols());
     dst.setZero();
-    dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
+    dst.reserve((std::max)(src.rows(),src.cols())*2);
     for (Index j=0; j<outerEvaluationSize; ++j)
     {
       dst.startVec(j);
@@ -107,7 +107,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src)
     
     DstXprType temp(src.rows(), src.cols());
 
-    temp.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2));
+    temp.reserve((std::max)(src.rows(),src.cols())*2);
     for (Index j=0; j<outerEvaluationSize; ++j)
     {
       temp.startVec(j);
@@ -134,8 +134,8 @@ struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse>
 };
 
 // Generic Sparse to Dense assignment
-template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
-struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense, Weak>
+template< typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense>
 {
   static void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
   {
@@ -153,73 +153,6 @@ struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense, Weak>
   }
 };
 
-// Specialization for dense ?= dense +/- sparse and dense ?= sparse +/- dense
-template<typename DstXprType, typename Func1, typename Func2>
-struct assignment_from_dense_op_sparse
-{
-  template<typename SrcXprType, typename InitialFunc>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/)
-  {
-    #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN
-    EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN
-    #endif
-
-    call_assignment_no_alias(dst, src.lhs(), Func1());
-    call_assignment_no_alias(dst, src.rhs(), Func2());
-  }
-
-  // Specialization for dense1 = sparse + dense2; -> dense1 = dense2; dense1 += sparse;
-  template<typename Lhs, typename Rhs, typename Scalar>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  typename internal::enable_if<internal::is_same<typename internal::evaluator_traits<Rhs>::Shape,DenseShape>::value>::type
-  run(DstXprType &dst, const CwiseBinaryOp<internal::scalar_sum_op<Scalar,Scalar>, const Lhs, const Rhs> &src,
-      const internal::assign_op<typename DstXprType::Scalar,Scalar>& /*func*/)
-  {
-    #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN
-    EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN
-    #endif
-
-    // Apply the dense matrix first, then the sparse one.
-    call_assignment_no_alias(dst, src.rhs(), Func1());
-    call_assignment_no_alias(dst, src.lhs(), Func2());
-  }
-
-  // Specialization for dense1 = sparse - dense2; -> dense1 = -dense2; dense1 += sparse;
-  template<typename Lhs, typename Rhs, typename Scalar>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  typename internal::enable_if<internal::is_same<typename internal::evaluator_traits<Rhs>::Shape,DenseShape>::value>::type
-  run(DstXprType &dst, const CwiseBinaryOp<internal::scalar_difference_op<Scalar,Scalar>, const Lhs, const Rhs> &src,
-      const internal::assign_op<typename DstXprType::Scalar,Scalar>& /*func*/)
-  {
-    #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN
-    EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN
-    #endif
-
-    // Apply the dense matrix first, then the sparse one.
-    call_assignment_no_alias(dst, -src.rhs(), Func1());
-    call_assignment_no_alias(dst,  src.lhs(), add_assign_op<typename DstXprType::Scalar,typename Lhs::Scalar>());
-  }
-};
-
-#define EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(ASSIGN_OP,BINOP,ASSIGN_OP2) \
-  template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar> \
-  struct Assignment<DstXprType, CwiseBinaryOp<internal::BINOP<Scalar,Scalar>, const Lhs, const Rhs>, internal::ASSIGN_OP<typename DstXprType::Scalar,Scalar>, \
-                    Sparse2Dense, \
-                    typename internal::enable_if<   internal::is_same<typename internal::evaluator_traits<Lhs>::Shape,DenseShape>::value \
-                                                 || internal::is_same<typename internal::evaluator_traits<Rhs>::Shape,DenseShape>::value>::type> \
-    : assignment_from_dense_op_sparse<DstXprType, internal::ASSIGN_OP<typename DstXprType::Scalar,typename Lhs::Scalar>, internal::ASSIGN_OP2<typename DstXprType::Scalar,typename Rhs::Scalar> > \
-  {}
-
-EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op,    scalar_sum_op,add_assign_op);
-EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_sum_op,add_assign_op);
-EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_sum_op,sub_assign_op);
-
-EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op,    scalar_difference_op,sub_assign_op);
-EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_difference_op,sub_assign_op);
-EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_difference_op,add_assign_op);
-
-
 // Specialization for "dst = dec.solve(rhs)"
 // NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error
 template<typename DstXprType, typename DecType, typename RhsType, typename Scalar>
@@ -246,22 +179,35 @@ struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse>
 {
   typedef typename DstXprType::StorageIndex StorageIndex;
   typedef typename DstXprType::Scalar Scalar;
+  typedef Array<StorageIndex,Dynamic,1> ArrayXI;
+  typedef Array<Scalar,Dynamic,1> ArrayXS;
+  template<int Options>
+  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
+      dst.resize(dstRows, dstCols);
 
-  template<int Options, typename AssignFunc>
-  static void run(SparseMatrix<Scalar,Options,StorageIndex> &dst, const SrcXprType &src, const AssignFunc &func)
-  { dst.assignDiagonal(src.diagonal(), func); }
+    Index size = src.diagonal().size();
+    dst.makeCompressed();
+    dst.resizeNonZeros(size);
+    Map<ArrayXI>(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1);
+    Map<ArrayXI>(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size));
+    Map<ArrayXS>(dst.valuePtr(), size) = src.diagonal();
+  }
   
   template<typename DstDerived>
   static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
-  { dst.derived().diagonal() = src.diagonal(); }
+  {
+    dst.diagonal() = src.diagonal();
+  }
   
-  template<typename DstDerived>
-  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
-  { dst.derived().diagonal() += src.diagonal(); }
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.diagonal() += src.diagonal(); }
   
-  template<typename DstDerived>
-  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
-  { dst.derived().diagonal() -= src.diagonal(); }
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
+  { dst.diagonal() -= src.diagonal(); }
 };
 } // end namespace internal
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseBlock.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseBlock.h
index db5090257..511e92b2f 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseBlock.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseBlock.h
@@ -164,7 +164,7 @@ public:
       }
       else
       {
-        if(m_matrix.isCompressed() && nnz!=block_size)
+        if(m_matrix.isCompressed())
         {
           // no need to realloc, simply copy the tail at its respective position and insert tmp
           matrix.data().resize(start + nnz + tail_size);
@@ -326,6 +326,46 @@ private:
 
 //----------
 
+/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+  * is col-major (resp. row-major).
+  */
+template<typename Derived>
+typename SparseMatrixBase<Derived>::InnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer)
+{ return InnerVectorReturnType(derived(), outer); }
+
+/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+  * is col-major (resp. row-major). Read-only.
+  */
+template<typename Derived>
+const typename SparseMatrixBase<Derived>::ConstInnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer) const
+{ return ConstInnerVectorReturnType(derived(), outer); }
+
+/** \returns a block made of \a outerSize consecutive columns (resp. rows) of the matrix \c *this,
+  * starting at index \a outerStart, if \c *this is col-major (resp. row-major).
+  */
+template<typename Derived>
+typename SparseMatrixBase<Derived>::InnerVectorsReturnType
+SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
+{
+  return Block<Derived,Dynamic,Dynamic,true>(derived(),
+                                             IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+                                             IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
+
+/** \returns a block made of \a outerSize consecutive columns (resp. rows) of the matrix \c *this,
+  * starting at index \a outerStart, if \c *this is col-major (resp. row-major). Read-only.
+  */
+template<typename Derived>
+const typename SparseMatrixBase<Derived>::ConstInnerVectorsReturnType
+SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
+{
+  return Block<const Derived,Dynamic,Dynamic,true>(derived(),
+                                                  IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+                                                  IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+
+}
+
 /** Generic implementation of sparse Block expression.
   * Real-only.
   */
@@ -463,25 +503,22 @@ template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::InnerVectorInnerIterator
  : public EvalIterator
 {
-  // NOTE MSVC fails to compile if we don't explicitely "import" IsRowMajor from unary_evaluator
-  //      because the base class EvalIterator has a private IsRowMajor enum too. (bug #1786)
-  // NOTE We cannot call it IsRowMajor because it would shadow unary_evaluator::IsRowMajor
-  enum { XprIsRowMajor = unary_evaluator::IsRowMajor };
+  enum { IsRowMajor = unary_evaluator::IsRowMajor };
   const XprType& m_block;
   Index m_end;
 public:
 
   EIGEN_STRONG_INLINE InnerVectorInnerIterator(const unary_evaluator& aEval, Index outer)
-    : EvalIterator(aEval.m_argImpl, outer + (XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())),
+    : EvalIterator(aEval.m_argImpl, outer + (IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())),
       m_block(aEval.m_block),
-      m_end(XprIsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
+      m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows())
   {
-    while( (EvalIterator::operator bool()) && (EvalIterator::index() < (XprIsRowMajor ? m_block.startCol() : m_block.startRow())) )
+    while( (EvalIterator::operator bool()) && (EvalIterator::index() < (IsRowMajor ? m_block.startCol() : m_block.startRow())) )
       EvalIterator::operator++();
   }
 
-  inline StorageIndex index() const { return EvalIterator::index() - convert_index<StorageIndex>(XprIsRowMajor ? m_block.startCol() : m_block.startRow()); }
-  inline Index outer()  const { return EvalIterator::outer() - (XprIsRowMajor ? m_block.startRow() : m_block.startCol()); }
+  inline StorageIndex index() const { return EvalIterator::index() - convert_index<StorageIndex>(IsRowMajor ? m_block.startCol() : m_block.startRow()); }
+  inline Index outer()  const { return EvalIterator::outer() - (IsRowMajor ? m_block.startRow() : m_block.startCol()); }
   inline Index row()    const { return EvalIterator::row()   - m_block.startRow(); }
   inline Index col()    const { return EvalIterator::col()   - m_block.startCol(); }
 
@@ -491,8 +528,7 @@ public:
 template<typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
 class unary_evaluator<Block<ArgType,BlockRows,BlockCols,InnerPanel>, IteratorBased>::OuterVectorInnerIterator
 {
-  // NOTE see above
-  enum { XprIsRowMajor = unary_evaluator::IsRowMajor };
+  enum { IsRowMajor = unary_evaluator::IsRowMajor };
   const unary_evaluator& m_eval;
   Index m_outerPos;
   const Index m_innerIndex;
@@ -502,9 +538,9 @@ public:
 
   EIGEN_STRONG_INLINE OuterVectorInnerIterator(const unary_evaluator& aEval, Index outer)
     : m_eval(aEval),
-      m_outerPos( (XprIsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ),
-      m_innerIndex(XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
-      m_end(XprIsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()),
+      m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ),
+      m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
+      m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()),
       m_it(m_eval.m_argImpl, m_outerPos)
   {
     EIGEN_UNUSED_VARIABLE(outer);
@@ -515,10 +551,10 @@ public:
       ++(*this);
   }
 
-  inline StorageIndex index() const { return convert_index<StorageIndex>(m_outerPos - (XprIsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); }
+  inline StorageIndex index() const { return convert_index<StorageIndex>(m_outerPos - (IsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); }
   inline Index outer()  const { return 0; }
-  inline Index row()    const { return XprIsRowMajor ? 0 : index(); }
-  inline Index col()    const { return XprIsRowMajor ? index() : 0; }
+  inline Index row()    const { return IsRowMajor ? 0 : index(); }
+  inline Index col()    const { return IsRowMajor ? index() : 0; }
 
   inline Scalar value() const { return m_it.value(); }
   inline Scalar& valueRef() { return m_it.valueRef(); }
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCompressedBase.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCompressedBase.h
index 6a2c7a8ce..5ccb46656 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCompressedBase.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -128,28 +128,6 @@ class SparseCompressedBase
   protected:
     /** Default constructor. Do nothing. */
     SparseCompressedBase() {}
-
-    /** \internal return the index of the coeff at (row,col) or just before if it does not exist.
-      * This is an analogue of std::lower_bound.
-      */
-    internal::LowerBoundIndex lower_bound(Index row, Index col) const
-    {
-      eigen_internal_assert(row>=0 && row<this->rows() && col>=0 && col<this->cols());
-
-      const Index outer = Derived::IsRowMajor ? row : col;
-      const Index inner = Derived::IsRowMajor ? col : row;
-
-      Index start = this->outerIndexPtr()[outer];
-      Index end = this->isCompressed() ? this->outerIndexPtr()[outer+1] : this->outerIndexPtr()[outer] + this->innerNonZeroPtr()[outer];
-      eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist");
-      internal::LowerBoundIndex p;
-      p.value = std::lower_bound(this->innerIndexPtr()+start, this->innerIndexPtr()+end,inner) - this->innerIndexPtr();
-      p.found = (p.value<end) && (this->innerIndexPtr()[p.value]==inner);
-      return p;
-    }
-
-    friend struct internal::evaluator<SparseCompressedBase<Derived> >;
-
   private:
     template<typename OtherDerived> explicit SparseCompressedBase(const SparseCompressedBase<OtherDerived>&);
 };
@@ -207,14 +185,6 @@ class SparseCompressedBase<Derived>::InnerIterator
     }
 
     inline InnerIterator& operator++() { m_id++; return *this; }
-    inline InnerIterator& operator+=(Index i) { m_id += i ; return *this; }
-
-    inline InnerIterator operator+(Index i) 
-    { 
-        InnerIterator result = *this;
-        result += i;
-        return result;
-    }
 
     inline const Scalar& value() const { return m_values[m_id]; }
     inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
@@ -275,14 +245,6 @@ class SparseCompressedBase<Derived>::ReverseInnerIterator
     }
 
     inline ReverseInnerIterator& operator--() { --m_id; return *this; }
-    inline ReverseInnerIterator& operator-=(Index i) { m_id -= i; return *this; }
-
-    inline ReverseInnerIterator operator-(Index i) 
-    {
-        ReverseInnerIterator result = *this;
-        result -= i;
-        return result;
-    }
 
     inline const Scalar& value() const { return m_values[m_id-1]; }
     inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
@@ -355,8 +317,17 @@ protected:
 
   Index find(Index row, Index col) const
   {
-    internal::LowerBoundIndex p = m_matrix->lower_bound(row,col);
-    return p.found ? p.value : Dynamic;
+    eigen_internal_assert(row>=0 && row<m_matrix->rows() && col>=0 && col<m_matrix->cols());
+
+    const Index outer = Derived::IsRowMajor ? row : col;
+    const Index inner = Derived::IsRowMajor ? col : row;
+
+    Index start = m_matrix->outerIndexPtr()[outer];
+    Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer];
+    eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist");
+    const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr();
+
+    return ((p<end) && (m_matrix->innerIndexPtr()[p]==inner)) ? p : Dynamic;
   }
 
   const Derived *m_matrix;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index 6130bab43..e315e3550 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -101,7 +101,7 @@ public:
       }
       else
       {
-        m_value = Scalar(0); // this is to avoid a compilation warning
+        m_value = 0; // this is to avoid a compilation warning
         m_id = -1;
       }
       return *this;
@@ -212,7 +212,8 @@ public:
 
   enum {
     CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
-    Flags = XprType::Flags
+    // Expose storage order of the sparse expression
+    Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit)
   };
 
   explicit binary_evaluator(const XprType& xpr)
@@ -299,7 +300,8 @@ public:
 
   enum {
     CoeffReadCost = evaluator<Lhs>::CoeffReadCost + evaluator<Rhs>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
-    Flags = XprType::Flags
+    // Expose storage order of the sparse expression
+    Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit)
   };
 
   explicit binary_evaluator(const XprType& xpr)
@@ -531,7 +533,8 @@ public:
   
   enum {
     CoeffReadCost = evaluator<LhsArg>::CoeffReadCost + evaluator<RhsArg>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
-    Flags = XprType::Flags
+    // Expose storage order of the sparse expression
+    Flags = (XprType::Flags & ~RowMajorBit) | (int(RhsArg::Flags)&RowMajorBit)
   };
   
   explicit sparse_conjunction_evaluator(const XprType& xpr)
@@ -605,7 +608,8 @@ public:
   
   enum {
     CoeffReadCost = evaluator<LhsArg>::CoeffReadCost + evaluator<RhsArg>::CoeffReadCost + functor_traits<BinaryOp>::Cost,
-    Flags = XprType::Flags
+    // Expose storage order of the sparse expression
+    Flags = (XprType::Flags & ~RowMajorBit) | (int(LhsArg::Flags)&RowMajorBit)
   };
   
   explicit sparse_conjunction_evaluator(const XprType& xpr)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseDenseProduct.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseDenseProduct.h
index f005a18a1..0547db596 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -88,11 +88,10 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, A
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef evaluator<Lhs> LhsEval;
-  typedef typename LhsEval::InnerIterator LhsInnerIterator;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
   {
-    LhsEval lhsEval(lhs);
+    evaluator<Lhs> lhsEval(lhs);
     for(Index c=0; c<rhs.cols(); ++c)
     {
       for(Index j=0; j<lhs.outerSize(); ++j)
@@ -112,37 +111,16 @@ struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, t
   typedef typename internal::remove_all<SparseLhsType>::type Lhs;
   typedef typename internal::remove_all<DenseRhsType>::type Rhs;
   typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef evaluator<Lhs> LhsEval;
-  typedef typename LhsEval::InnerIterator LhsInnerIterator;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
   static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
   {
-    Index n = lhs.rows();
-    LhsEval lhsEval(lhs);
-
-#ifdef EIGEN_HAS_OPENMP
-    Eigen::initParallel();
-    Index threads = Eigen::nbThreads();
-    // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
-    // It basically represents the minimal amount of work to be done to be worth it.
-    if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000)
+    evaluator<Lhs> lhsEval(lhs);
+    for(Index j=0; j<lhs.outerSize(); ++j)
     {
-      #pragma omp parallel for schedule(dynamic,(n+threads*4-1)/(threads*4)) num_threads(threads)
-      for(Index i=0; i<n; ++i)
-        processRow(lhsEval,rhs,res,alpha,i);
+      typename Res::RowXpr res_j(res.row(j));
+      for(LhsInnerIterator it(lhsEval,j); it ;++it)
+        res_j += (alpha*it.value()) * rhs.row(it.index());
     }
-    else
-#endif
-    {
-      for(Index i=0; i<n; ++i)
-        processRow(lhsEval, rhs, res, alpha, i);
-    }
-  }
-
-  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha, Index i)
-  {
-    typename Res::RowXpr res_i(res.row(i));
-    for(LhsInnerIterator it(lhsEval,i); it ;++it)
-      res_i += (alpha*it.value()) * rhs.row(it.index());
   }
 };
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrix.h
index e0910a2cb..a5396538b 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrix.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrix.h
@@ -21,7 +21,7 @@ namespace Eigen {
   * This class implements a more versatile variants of the common \em compressed row/column storage format.
   * Each colmun's (resp. row) non zeros are stored as a pair of value with associated row (resp. colmiun) index.
   * All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra
-  * space in between the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero
+  * space in between the nonzeros of two successive columns (resp. rows) such that insertion of new non-zero
   * can be done with limited memory reallocation and copies.
   *
   * A call to the function makeCompressed() turns the matrix into the standard \em compressed format
@@ -99,8 +99,6 @@ class SparseMatrix
     typedef SparseCompressedBase<SparseMatrix> Base;
     using Base::convert_index;
     friend class SparseVector<_Scalar,0,_StorageIndex>;
-    template<typename, typename, typename, typename, typename>
-    friend struct internal::Assignment;
   public:
     using Base::isCompressed;
     using Base::nonZeros;
@@ -329,7 +327,8 @@ class SparseMatrix
           m_outerIndex[j] = newOuterIndex[j];
           m_innerNonZeros[j] = innerNNZ;
         }
-        m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1];
+        if(m_outerSize>0)
+          m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1];
         
         m_data.resize(m_outerIndex[m_outerSize]);
       }
@@ -504,8 +503,8 @@ class SparseMatrix
         m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; 
       }
     }
-
-    /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
+    
+    /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
     void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       prune(default_prunning_func(reference,epsilon));
@@ -606,9 +605,9 @@ class SparseMatrix
       m_outerIndex = newOuterIndex;
       if (outerChange > 0)
       {
-        StorageIndex lastIdx = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
+        StorageIndex last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
         for(Index i=m_outerSize; i<m_outerSize+outerChange+1; i++)          
-          m_outerIndex[i] = lastIdx; 
+          m_outerIndex[i] = last; 
       }
       m_outerSize += outerChange;
     }
@@ -897,113 +896,6 @@ public:
       m_data.index(p) = convert_index(inner);
       return (m_data.value(p) = Scalar(0));
     }
-protected:
-    struct IndexPosPair {
-      IndexPosPair(Index a_i, Index a_p) : i(a_i), p(a_p) {}
-      Index i;
-      Index p;
-    };
-
-    /** \internal assign \a diagXpr to the diagonal of \c *this
-      * There are different strategies:
-      *   1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector expression.
-      *   2 - otherwise, for each diagonal coeff,
-      *     2.a - if it already exists, then we update it,
-      *     2.b - otherwise, if *this is uncompressed and that the current inner-vector has empty room for at least 1 element, then we perform an in-place insertion.
-      *     2.c - otherwise, we'll have to reallocate and copy everything, so instead of doing so for each new element, it is recorded in a std::vector.
-      *   3 - at the end, if some entries failed to be inserted in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new elements.
-      * 
-      * TODO: some piece of code could be isolated and reused for a general in-place update strategy.
-      * TODO: if we start to defer the insertion of some elements (i.e., case 2.c executed once),
-      *       then it *might* be better to disable case 2.b since they will have to be copied anyway.
-      */
-    template<typename DiagXpr, typename Func>
-    void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc)
-    {
-      Index n = diagXpr.size();
-
-      const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar,Scalar> >::value;
-      if(overwrite)
-      {
-        if((this->rows()!=n) || (this->cols()!=n))
-          this->resize(n, n);
-      }
-
-      if(m_data.size()==0 || overwrite)
-      {
-        typedef Array<StorageIndex,Dynamic,1> ArrayXI;  
-        this->makeCompressed();
-        this->resizeNonZeros(n);
-        Eigen::Map<ArrayXI>(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1);
-        Eigen::Map<ArrayXI>(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n));
-        Eigen::Map<Array<Scalar,Dynamic,1> > values = this->coeffs();
-        values.setZero();
-        internal::call_assignment_no_alias(values, diagXpr, assignFunc);
-      }
-      else
-      {
-        bool isComp = isCompressed();
-        internal::evaluator<DiagXpr> diaEval(diagXpr);
-        std::vector<IndexPosPair> newEntries;
-
-        // 1 - try in-place update and record insertion failures
-        for(Index i = 0; i<n; ++i)
-        {
-          internal::LowerBoundIndex lb = this->lower_bound(i,i);
-          Index p = lb.value;
-          if(lb.found)
-          {
-            // the coeff already exists
-            assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
-          }
-          else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i]))
-          {
-            // non compressed mode with local room for inserting one element
-            m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p);
-            m_innerNonZeros[i]++;
-            m_data.value(p) = Scalar(0);
-            m_data.index(p) = StorageIndex(i);
-            assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i));
-          }
-          else
-          {
-            // defer insertion
-            newEntries.push_back(IndexPosPair(i,p));
-          }
-        }
-        // 2 - insert deferred entries
-        Index n_entries = Index(newEntries.size());
-        if(n_entries>0)
-        {
-          Storage newData(m_data.size()+n_entries);
-          Index prev_p = 0;
-          Index prev_i = 0;
-          for(Index k=0; k<n_entries;++k)
-          {
-            Index i = newEntries[k].i;
-            Index p = newEntries[k].p;
-            internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+p, newData.valuePtr()+prev_p+k);
-            internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+p, newData.indexPtr()+prev_p+k);
-            for(Index j=prev_i;j<i;++j)
-              m_outerIndex[j+1] += k;
-            if(!isComp)
-              m_innerNonZeros[i]++;
-            prev_p = p;
-            prev_i = i;
-            newData.value(p+k) = Scalar(0);
-            newData.index(p+k) = StorageIndex(i);
-            assignFunc.assignCoeff(newData.value(p+k), diaEval.coeff(i));
-          }
-          {
-            internal::smart_copy(m_data.valuePtr()+prev_p, m_data.valuePtr()+m_data.size(), newData.valuePtr()+prev_p+n_entries);
-            internal::smart_copy(m_data.indexPtr()+prev_p, m_data.indexPtr()+m_data.size(), newData.indexPtr()+prev_p+n_entries);
-            for(Index j=prev_i+1;j<=m_outerSize;++j)
-              m_outerIndex[j] += n_entries;
-          }
-          m_data.swap(newData);
-        }
-      }
-    }
 
 private:
   static void check_template_parameters()
@@ -1095,7 +987,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
   *
   * \warning The list of triplets is read multiple times (at least twice). Therefore, it is not recommended to define
   * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather
-  * be explicitly stored into a std::vector for instance.
+  * be explicitly stored into a std::vector for instance.
   */
 template<typename Scalar, int _Options, typename _StorageIndex>
 template<typename InputIterators>
@@ -1341,7 +1233,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca
     }
     
     m_data.index(p) = convert_index(inner);
-    return (m_data.value(p) = Scalar(0));
+    return (m_data.value(p) = 0);
   }
   
   if(m_data.size() != m_data.allocatedSize())
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrixBase.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrixBase.h
index 229449f02..c6b548f11 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -87,11 +87,6 @@ template<typename Derived> class SparseMatrixBase
           * we are dealing with a column-vector (if there is only one column) or with
           * a row-vector (if there is only one row). */
 
-      NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2,
-        /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
-         * and 2 for matrices.
-         */
-
       Flags = internal::traits<Derived>::Flags,
         /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
           * constructed from this one. See the \ref flags "list of flags".
@@ -355,6 +350,18 @@ template<typename Derived> class SparseMatrixBase
     const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); }
     const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); }
 
+    // inner-vector
+    typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true>       InnerVectorReturnType;
+    typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
+    InnerVectorReturnType innerVector(Index outer);
+    const ConstInnerVectorReturnType innerVector(Index outer) const;
+
+    // set of inner-vectors
+    typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
+    typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
+    InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize);
+    const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const;
+
     DenseMatrixType toDense() const
     {
       return DenseMatrixType(derived());
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseProduct.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseProduct.h
index c495a7398..4cbf68781 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseProduct.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseProduct.h
@@ -17,7 +17,7 @@ namespace Eigen {
   * The automatic pruning of the small values can be achieved by calling the pruned() function
   * in which case a totally different product algorithm is employed:
   * \code
-  * C = (A*B).pruned();             // suppress numerical zeros (exact)
+  * C = (A*B).pruned();             // suppress numerical zeros (exact)
   * C = (A*B).pruned(ref);
   * C = (A*B).pruned(ref,epsilon);
   * \endcode
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseRef.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseRef.h
index 748f87d62..d91f38f97 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseRef.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseRef.h
@@ -201,7 +201,7 @@ class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
 
     ~Ref() {
       if(m_hasCopy) {
-        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
         obj->~TPlainObjectType();
       }
     }
@@ -213,7 +213,7 @@ class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
     {
       if((Options & int(StandardCompressedFormat)) && (!expr.isCompressed()))
       {
-        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
         ::new (obj) TPlainObjectType(expr);
         m_hasCopy = true;
         Base::construct(*obj);
@@ -227,14 +227,14 @@ class Ref<const SparseMatrix<MatScalar,MatOptions,MatIndex>, Options, StrideType
     template<typename Expression>
     void construct(const Expression& expr, internal::false_type)
     {
-      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
+      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
       ::new (obj) TPlainObjectType(expr);
       m_hasCopy = true;
       Base::construct(*obj);
     }
 
   protected:
-    typename internal::aligned_storage<sizeof(TPlainObjectType), EIGEN_ALIGNOF(TPlainObjectType)>::type m_storage;
+    char m_object_bytes[sizeof(TPlainObjectType)];
     bool m_hasCopy;
 };
 
@@ -319,7 +319,7 @@ class Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType
 
     ~Ref() {
       if(m_hasCopy) {
-        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
+        TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
         obj->~TPlainObjectType();
       }
     }
@@ -335,14 +335,14 @@ class Ref<const SparseVector<MatScalar,MatOptions,MatIndex>, Options, StrideType
     template<typename Expression>
     void construct(const Expression& expr, internal::false_type)
     {
-      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(&m_storage);
+      TPlainObjectType* obj = reinterpret_cast<TPlainObjectType*>(m_object_bytes);
       ::new (obj) TPlainObjectType(expr);
       m_hasCopy = true;
       Base::construct(*obj);
     }
 
   protected:
-    typename internal::aligned_storage<sizeof(TPlainObjectType), EIGEN_ALIGNOF(TPlainObjectType)>::type m_storage;
+    char m_object_bytes[sizeof(TPlainObjectType)];
     bool m_hasCopy;
 };
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 65611b3d4..76117a010 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -453,7 +453,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
       Index r = it.row();
       Index c = it.col();
       Index ip = perm ? perm[i] : i;
-      if(Mode==(Upper|Lower))
+      if(Mode==int(Upper|Lower))
         count[StorageOrderMatch ? jp : ip]++;
       else if(r==c)
         count[ip]++;
@@ -486,7 +486,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename Matri
       StorageIndex jp = perm ? perm[j] : j;
       StorageIndex ip = perm ? perm[i] : i;
       
-      if(Mode==(Upper|Lower))
+      if(Mode==int(Upper|Lower))
       {
         Index k = count[StorageOrderMatch ? jp : ip]++;
         dest.innerIndexPtr()[k] = StorageOrderMatch ? ip : jp;
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseUtil.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseUtil.h
index ceb936887..74df0d496 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseUtil.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseUtil.h
@@ -140,14 +140,6 @@ struct SparseSelfAdjointShape { static std::string debugName() { return "SparseS
 template<> struct glue_shapes<SparseShape,SelfAdjointShape> { typedef SparseSelfAdjointShape type;  };
 template<> struct glue_shapes<SparseShape,TriangularShape > { typedef SparseTriangularShape  type;  };
 
-// return type of SparseCompressedBase::lower_bound;
-struct LowerBoundIndex {
-  LowerBoundIndex() : value(-1), found(false) {}
-  LowerBoundIndex(Index val, bool ok) : value(val), found(ok) {}
-  Index value;
-  bool found;
-};
-
 } // end namespace internal
 
 /** \ingroup SparseCore_Module
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseVector.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseVector.h
index 05779be68..19b0fbc9d 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseVector.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseVector.h
@@ -281,7 +281,7 @@ class SparseVector
     }
 
     /** Swaps the values of \c *this and \a other.
-      * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only.
+      * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only.
       * \sa SparseMatrixBase::swap()
       */
     inline void swap(SparseVector& other)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU.h b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU.h
index 090993adc..87f0efe37 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU.h
@@ -26,7 +26,7 @@ template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixURetu
   * This class implements the supernodal LU factorization for general matrices.
   * It uses the main techniques from the sequential SuperLU package 
   * (http://crd-legacy.lbl.gov/~xiaoye/SuperLU/). It handles transparently real 
-  * and complex arithmetic with single and double precision, depending on the 
+  * and complex arithmetic with single and double precision, depending on the 
   * scalar type of your input matrix. 
   * The code has been optimized to provide BLAS-3 operations during supernode-panel updates. 
   * It benefits directly from the built-in high-performant Eigen BLAS routines. 
@@ -193,7 +193,7 @@ class SparseLU : public SparseSolverBase<SparseLU<_MatrixType,_OrderingType> >,
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance
       *          \c InvalidInput if the input matrix is invalid
       *
@@ -501,6 +501,7 @@ void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
   
   m_isInitialized = true;
   
+  
   // Apply the column permutation computed in analyzepattern()
   //   m_mat = matrix * m_perm_c.inverse(); 
   m_mat = matrix;
@@ -703,8 +704,8 @@ struct SparseLUMatrixLReturnType : internal::no_assignment_operator
   typedef typename MappedSupernodalType::Scalar Scalar;
   explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
   { }
-  Index rows() const { return m_mapL.rows(); }
-  Index cols() const { return m_mapL.cols(); }
+  Index rows() { return m_mapL.rows(); }
+  Index cols() { return m_mapL.cols(); }
   template<typename Dest>
   void solveInPlace( MatrixBase<Dest> &X) const
   {
@@ -720,8 +721,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
   SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU)
   : m_mapL(mapL),m_mapU(mapU)
   { }
-  Index rows() const { return m_mapL.rows(); }
-  Index cols() const { return m_mapL.cols(); }
+  Index rows() { return m_mapL.rows(); }
+  Index cols() { return m_mapL.cols(); }
 
   template<typename Dest>   void solveInPlace(MatrixBase<Dest> &X) const
   {
@@ -744,9 +745,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator
       }
       else
       {
-        // FIXME: the following lines should use Block expressions and not Map!
         Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+        Map< Matrix<Scalar,Dynamic,Dest::ColsAtCompileTime, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
         U = A.template triangularView<Upper>().solve(U);
       }
 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_Memory.h b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_Memory.h
index 349bfd585..4dc42e87b 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_Memory.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_Memory.h
@@ -51,7 +51,7 @@ inline Index LUTempSpace(Index&m, Index& w)
 
 
 /** 
-  * Expand the existing storage to accommodate more fill-ins
+  * Expand the existing storage to accommodate more fill-ins
   * \param vec Valid pointer to the vector to allocate or expand
   * \param[in,out] length  At input, contain the current length of the vector that is to be increased. At output, length of the newly allocated vector
   * \param[in] nbElts Current number of elements in the factors
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
index 8583b1b69..721e1883b 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -75,12 +75,12 @@ class MappedSuperNodalMatrix
     /**
      * Number of rows
      */
-    Index rows() const { return m_row; }
+    Index rows() { return m_row; }
     
     /**
      * Number of columns
      */
-    Index cols() const { return m_col; }
+    Index cols() { return m_col; }
     
     /**
      * Return the array of nonzero values packed by column
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h
index 5a2c941b4..c98b30e32 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h
@@ -151,7 +151,7 @@ Index SparseLUImpl<Scalar,StorageIndex>::column_dfs(const Index m, const Index j
         StorageIndex ito = glu.xlsub(fsupc+1);
         glu.xlsub(jcolm1) = ito; 
         StorageIndex istop = ito + jptr - jm1ptr; 
-        xprune(jcolm1) = istop; // initialize xprune(jcol-1)
+        xprune(jcolm1) = istop; // initialize xprune(jcol-1)
         glu.xlsub(jcol) = istop; 
         
         for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
@@ -166,7 +166,7 @@ Index SparseLUImpl<Scalar,StorageIndex>::column_dfs(const Index m, const Index j
   // Tidy up the pointers before exit
   glu.xsup(nsuper+1) = jcolp1; 
   glu.supno(jcolp1) = nsuper; 
-  xprune(jcol) = StorageIndex(nextl);  // Initialize upper bound for pruning
+  xprune(jcol) = StorageIndex(nextl);  // Initialize upper bound for pruning
   glu.xlsub(jcolp1) = StorageIndex(nextl); 
   
   return 0; 
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
index e37c2fe0d..95ba7413f 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
@@ -215,7 +215,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const
         if(RK==4){ a3 = pload<Packet>(A3+i+(I+1)*PacketSize);  }\
                    pstore(C0+i+(I)*PacketSize, c0);
         
-        // aggressive vectorization and peeling
+        // aggressive vectorization and peeling
         for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
         {
           EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL2");
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h
index f052001c8..822cf32c3 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@@ -38,7 +38,7 @@ namespace internal {
  * \brief Performs numeric block updates (sup-panel) in topological order.
  * 
  * Before entering this routine, the original nonzeros in the panel
- * were already copied into the spa[m,w]
+ * were already copied into the spa[m,w]
  * 
  * \param m number of rows in the matrix
  * \param w Panel size
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseQR/SparseQR.h b/uppsrc/plugin/Eigen/Eigen/src/SparseQR/SparseQR.h
index d1fb96f5c..7409fcae9 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SparseQR/SparseQR.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SparseQR/SparseQR.h
@@ -41,16 +41,15 @@ namespace internal {
 /**
   * \ingroup SparseQR_Module
   * \class SparseQR
-  * \brief Sparse left-looking QR factorization with numerical column pivoting
+  * \brief Sparse left-looking rank-revealing QR factorization
   * 
-  * This class implements a left-looking QR decomposition of sparse matrices
-  * with numerical column pivoting.
-  * When a column has a norm less than a given tolerance
+  * This class implements a left-looking rank-revealing QR decomposition 
+  * of sparse matrices. When a column has a norm less than a given tolerance
   * it is implicitly permuted to the end. The QR factorization thus obtained is 
   * given by A*P = Q*R where R is upper triangular or trapezoidal. 
   * 
   * P is the column permutation which is the product of the fill-reducing and the
-  * numerical permutations. Use colsPermutation() to get it.
+  * rank-revealing permutations. Use colsPermutation() to get it.
   * 
   * Q is the orthogonal matrix represented as products of Householder reflectors. 
   * Use matrixQ() to get an expression and matrixQ().adjoint() to get the adjoint.
@@ -65,17 +64,6 @@ namespace internal {
   * 
   * \implsparsesolverconcept
   *
-  * The numerical pivoting strategy and default threshold are the same as in SuiteSparse QR, and
-  * detailed in the following paper:
-  * <i>
-  * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
-  * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011.
-  * </i>
-  * Even though it is qualified as "rank-revealing", this strategy might fail for some 
-  * rank deficient problems. When this class is used to solve linear or least-square problems
-  * it is thus strongly recommended to check the accuracy of the computed solution. If it
-  * failed, it usually helps to increase the threshold with setPivotThreshold.
-  * 
   * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
   * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix.
   * 
@@ -343,7 +331,7 @@ void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
   m_R.resize(m, n);
   m_Q.resize(m, diagSize);
   
-  // Allocate space for nonzero elements: rough estimation
+  // Allocate space for nonzero elements : rough estimation
   m_R.reserve(2*mat.nonZeros()); //FIXME Get a more accurate estimation through symbolic factorization with the etree
   m_Q.reserve(2*mat.nonZeros());
   m_hcoeffs.resize(diagSize);
@@ -652,8 +640,7 @@ struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived
       // Compute res = Q * other column by column
       for(Index j = 0; j < res.cols(); j++)
       {
-        Index start_k = internal::is_identity<Derived>::value ? numext::mini(j,diagSize-1) : diagSize-1;
-        for (Index k = start_k; k >=0; k--)
+        for (Index k = diagSize-1; k >=0; k--)
         {
           Scalar tau = Scalar(0);
           tau = m_qr.m_Q.col(k).dot(res.col(j));
diff --git a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdDeque.h b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdDeque.h
index 045da7b4d..af158f425 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdDeque.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdDeque.h
@@ -36,7 +36,7 @@ namespace std \
     deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \
     deque(const deque& c) : deque_base(c) {}  \
     explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
-    deque(iterator start_, iterator end_) : deque_base(start_, end_) {}  \
+    deque(iterator start, iterator end) : deque_base(start, end) {}  \
     deque& operator=(const deque& x) {  \
       deque_base::operator=(x);  \
       return *this;  \
@@ -62,7 +62,7 @@ namespace std {
     : deque_base(first, last, a) {} \
     deque(const deque& c) : deque_base(c) {}  \
     explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
-    deque(iterator start_, iterator end_) : deque_base(start_, end_) {}  \
+    deque(iterator start, iterator end) : deque_base(start, end) {}  \
     deque& operator=(const deque& x) {  \
       deque_base::operator=(x);  \
       return *this;  \
@@ -98,8 +98,10 @@ namespace std {
   { return deque_base::insert(position,x); }
   void insert(const_iterator position, size_type new_size, const value_type& x)
   { deque_base::insert(position, new_size, x); }
-#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2)
+#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) && !EIGEN_GNUC_AT_LEAST(10, 1)
   // workaround GCC std::deque implementation
+  // GCC 10.1 doesn't let us access _Deque_impl _M_impl anymore and we have to
+  // fall-back to the default case
   void resize(size_type new_size, const value_type& x)
   {
     if (new_size < deque_base::size())
@@ -108,7 +110,7 @@ namespace std {
       deque_base::insert(deque_base::end(), new_size - deque_base::size(), x);
   }
 #else
-  // either GCC 4.1 or non-GCC
+  // either non-GCC, GCC older than 4.2, or GCC 10.1 and newer
   // default implementation which should always work.
   void resize(size_type new_size, const value_type& x)
   {
diff --git a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdList.h b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdList.h
index 8ba3fada0..e1eba4985 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdList.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdList.h
@@ -35,7 +35,7 @@ namespace std \
     list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \
     list(const list& c) : list_base(c) {}  \
     explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
-    list(iterator start_, iterator end_) : list_base(start_, end_) {}  \
+    list(iterator start, iterator end) : list_base(start, end) {}  \
     list& operator=(const list& x) {  \
       list_base::operator=(x);  \
       return *this;  \
@@ -62,7 +62,7 @@ namespace std
     : list_base(first, last, a) {} \
     list(const list& c) : list_base(c) {}  \
     explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
-    list(iterator start_, iterator end_) : list_base(start_, end_) {}  \
+    list(iterator start, iterator end) : list_base(start, end) {}  \
     list& operator=(const list& x) {  \
     list_base::operator=(x);  \
     return *this; \
diff --git a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdVector.h b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdVector.h
index 9fcf19bce..ec22821d2 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdVector.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdVector.h
@@ -36,7 +36,7 @@ namespace std \
     vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : vector_base(first, last, a) {} \
     vector(const vector& c) : vector_base(c) {}  \
     explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
-    vector(iterator start_, iterator end_) : vector_base(start_, end_) {}  \
+    vector(iterator start, iterator end) : vector_base(start, end) {}  \
     vector& operator=(const vector& x) {  \
       vector_base::operator=(x);  \
       return *this;  \
@@ -62,7 +62,7 @@ namespace std {
     : vector_base(first, last, a) {} \
     vector(const vector& c) : vector_base(c) {}  \
     explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
-    vector(iterator start_, iterator end_) : vector_base(start_, end_) {}  \
+    vector(iterator start, iterator end) : vector_base(start, end) {}  \
     vector& operator=(const vector& x) {  \
       vector_base::operator=(x);  \
       return *this;  \
diff --git a/uppsrc/plugin/Eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h b/uppsrc/plugin/Eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h
index 354e33de5..7261c7d0f 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -352,7 +352,7 @@ class SuperLUBase : public SparseSolverBase<Derived>
     
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
diff --git a/uppsrc/plugin/Eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h b/uppsrc/plugin/Eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h
index e3a333f80..91c09ab13 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -10,16 +10,6 @@
 #ifndef EIGEN_UMFPACKSUPPORT_H
 #define EIGEN_UMFPACKSUPPORT_H
 
-// for compatibility with super old version of umfpack,
-// not sure this is really needed, but this is harmless.
-#ifndef SuiteSparse_long
-#ifdef UF_long
-#define SuiteSparse_long UF_long
-#else
-#error neither SuiteSparse_long nor UF_long are defined
-#endif
-#endif
-
 namespace Eigen {
 
 /* TODO extract L, extract U, compute det, etc... */
@@ -27,85 +17,42 @@ namespace Eigen {
 // generic double/complex<double> wrapper functions:
 
 
- // Defaults
-inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int)
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double)
 { umfpack_di_defaults(control); }
 
-inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, int)
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>)
 { umfpack_zi_defaults(control); }
 
-inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, SuiteSparse_long)
-{ umfpack_dl_defaults(control); }
-
-inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long)
-{ umfpack_zl_defaults(control); }
-
-// Report info
-inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, int)
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double)
 { umfpack_di_report_info(control, info);}
 
-inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>, int)
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>)
 { umfpack_zi_report_info(control, info);}
 
-inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, SuiteSparse_long)
-{ umfpack_dl_report_info(control, info);}
-
-inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>, SuiteSparse_long)
-{ umfpack_zl_report_info(control, info);}
-
-// Report status
-inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, int)
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double)
 { umfpack_di_report_status(control, status);}
 
-inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, int)
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>)
 { umfpack_zi_report_status(control, status);}
 
-inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, SuiteSparse_long)
-{ umfpack_dl_report_status(control, status);}
-
-inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, SuiteSparse_long)
-{ umfpack_zl_report_status(control, status);}
-
-// report control
-inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int)
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], double)
 { umfpack_di_report_control(control);}
 
-inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, int)
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>)
 { umfpack_zi_report_control(control);}
 
-inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, SuiteSparse_long)
-{ umfpack_dl_report_control(control);}
-
-inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long)
-{ umfpack_zl_report_control(control);}
-
-// Free numeric
-inline void umfpack_free_numeric(void **Numeric, double, int)
+inline void umfpack_free_numeric(void **Numeric, double)
 { umfpack_di_free_numeric(Numeric); *Numeric = 0; }
 
-inline void umfpack_free_numeric(void **Numeric, std::complex<double>, int)
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>)
 { umfpack_zi_free_numeric(Numeric); *Numeric = 0; }
 
-inline void umfpack_free_numeric(void **Numeric, double, SuiteSparse_long)
-{ umfpack_dl_free_numeric(Numeric); *Numeric = 0; }
-
-inline void umfpack_free_numeric(void **Numeric, std::complex<double>, SuiteSparse_long)
-{ umfpack_zl_free_numeric(Numeric); *Numeric = 0; }
-
-// Free symbolic
-inline void umfpack_free_symbolic(void **Symbolic, double, int)
+inline void umfpack_free_symbolic(void **Symbolic, double)
 { umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; }
 
-inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, int)
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>)
 { umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; }
 
-inline void umfpack_free_symbolic(void **Symbolic, double, SuiteSparse_long)
-{ umfpack_dl_free_symbolic(Symbolic); *Symbolic = 0; }
-
-inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, SuiteSparse_long)
-{ umfpack_zl_free_symbolic(Symbolic); *Symbolic = 0; }
-
-// Symbolic
 inline int umfpack_symbolic(int n_row,int n_col,
                             const int Ap[], const int Ai[], const double Ax[], void **Symbolic,
                             const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
@@ -119,21 +66,7 @@ inline int umfpack_symbolic(int n_row,int n_col,
 {
   return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
 }
-inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col,
-                                          const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], void **Symbolic,
-                                          const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
-{
-  return umfpack_dl_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info);
-}
 
-inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col,
-                                          const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[], void **Symbolic,
-                                          const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
-{
-  return umfpack_zl_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
-}
-
-// Numeric
 inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[],
                             void *Symbolic, void **Numeric,
                             const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
@@ -147,21 +80,7 @@ inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex<d
 {
   return umfpack_zi_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
 }
-inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[],
-                                        void *Symbolic, void **Numeric,
-                                        const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
-{
-  return umfpack_dl_numeric(Ap,Ai,Ax,Symbolic,Numeric,Control,Info);
-}
 
-inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[],
-                                        void *Symbolic, void **Numeric,
-                                        const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
-{
-  return umfpack_zl_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
-}
-
-// solve
 inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[],
                           double X[], const double B[], void *Numeric,
                           const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
@@ -176,21 +95,6 @@ inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::co
   return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
 }
 
-inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[],
-                                      double X[], const double B[], void *Numeric,
-                                      const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
-{
-  return umfpack_dl_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info);
-}
-
-inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex<double> Ax[],
-                                      std::complex<double> X[], const std::complex<double> B[], void *Numeric,
-                                      const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
-{
-  return umfpack_zl_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
-}
-
-// Get Lunz
 inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double)
 {
   return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
@@ -201,19 +105,6 @@ inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_
   return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
 }
 
-inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col,
-                                          SuiteSparse_long *nz_udiag, void *Numeric, double)
-{
-  return umfpack_dl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
-}
-
-inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col,
-                                          SuiteSparse_long *nz_udiag, void *Numeric, std::complex<double>)
-{
-  return umfpack_zl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
-}
-
-// Get Numeric
 inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[],
                                int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric)
 {
@@ -229,45 +120,18 @@ inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex<double> Lx[], in
   return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
                                 Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
 }
-inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], double Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], double Ux[],
-                                            SuiteSparse_long P[], SuiteSparse_long Q[], double Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric)
-{
-  return umfpack_dl_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric);
-}
 
-inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], std::complex<double> Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], std::complex<double> Ux[],
-                                            SuiteSparse_long P[], SuiteSparse_long Q[], std::complex<double> Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric)
-{
-  double& lx0_real = numext::real_ref(Lx[0]);
-  double& ux0_real = numext::real_ref(Ux[0]);
-  double& dx0_real = numext::real_ref(Dx[0]);
-  return umfpack_zl_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
-                                Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
-}
-
-// Get Determinant
-inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int)
+inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
 {
   return umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info);
 }
 
-inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int)
+inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
 {
   double& mx_real = numext::real_ref(*Mx);
   return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
 }
 
-inline SuiteSparse_long umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long)
-{
-  return umfpack_dl_get_determinant(Mx,Ex,NumericHandle,User_Info);
-}
-
-inline SuiteSparse_long umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long)
-{
-  double& mx_real = numext::real_ref(*Mx);
-  return umfpack_zl_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
-}
-
 
 /** \ingroup UmfPackSupport_Module
   * \brief A sparse LU factorization and solver based on UmfPack
@@ -300,7 +164,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
     typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
     typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
     typedef SparseMatrix<Scalar> LUMatrixType;
-    typedef SparseMatrix<Scalar,ColMajor,StorageIndex> UmfpackMatrixType;
+    typedef SparseMatrix<Scalar,ColMajor,int> UmfpackMatrixType;
     typedef Ref<const UmfpackMatrixType, StandardCompressedFormat> UmfpackMatrixRef;
     enum {
       ColsAtCompileTime = MatrixType::ColsAtCompileTime,
@@ -328,8 +192,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
 
     ~UmfPackLU()
     {
-      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(), StorageIndex());
-      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar(), StorageIndex());
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
     }
 
     inline Index rows() const { return mp_matrix.rows(); }
@@ -337,7 +201,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
 
     /** \brief Reports whether previous computation was successful.
       *
-      * \returns \c Success if computation was successful,
+      * \returns \c Success if computation was successful,
       *          \c NumericalIssue if the matrix.appears to be negative.
       */
     ComputationInfo info() const
@@ -377,8 +241,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
     template<typename InputMatrixType>
     void compute(const InputMatrixType& matrix)
     {
-      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex());
-      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
       grab(matrix.derived());
       analyzePattern_impl();
       factorize_impl();
@@ -393,8 +257,8 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
     template<typename InputMatrixType>
     void analyzePattern(const InputMatrixType& matrix)
     {
-      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex());
-      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
+      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
+      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
 
       grab(matrix.derived());
 
@@ -445,7 +309,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
     {
       eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
       if(m_numeric)
-        umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex());
+        umfpack_free_numeric(&m_numeric,Scalar());
 
       grab(matrix.derived());
 
@@ -456,28 +320,28 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
       *
       * \sa umfpackControl()
       */
-    void printUmfpackControl()
+    void umfpackReportControl()
     {
-      umfpack_report_control(m_control.data(), Scalar(),StorageIndex());
+      umfpack_report_control(m_control.data(), Scalar());
     }
 
     /** Prints statistics collected by UmfPack.
       *
       * \sa analyzePattern(), compute()
       */
-    void printUmfpackInfo()
+    void umfpackReportInfo()
     {
       eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
-      umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar(),StorageIndex());
+      umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar());
     }
 
     /** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical factorization).
       *
       * \sa analyzePattern(), compute()
       */
-    void printUmfpackStatus() {
+    void umfpackReportStatus() {
       eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
-      umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar(),StorageIndex());
+      umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar());
     }
 
     /** \internal */
@@ -498,13 +362,13 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
       m_symbolic              = 0;
       m_extractedDataAreDirty = true;
 
-      umfpack_defaults(m_control.data(), Scalar(),StorageIndex());
+      umfpack_defaults(m_control.data(), Scalar());
     }
 
     void analyzePattern_impl()
     {
-      m_fact_errorCode = umfpack_symbolic(internal::convert_index<StorageIndex>(mp_matrix.rows()),
-                                          internal::convert_index<StorageIndex>(mp_matrix.cols()),
+      m_fact_errorCode = umfpack_symbolic(internal::convert_index<int>(mp_matrix.rows()),
+                                          internal::convert_index<int>(mp_matrix.cols()),
                                           mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
                                           &m_symbolic, m_control.data(), m_umfpackInfo.data());
 
@@ -544,7 +408,7 @@ class UmfPackLU : public SparseSolverBase<UmfPackLU<_MatrixType> >
 
     // cached data to reduce reallocation, etc.
     mutable LUMatrixType m_l;
-    StorageIndex m_fact_errorCode;
+    int m_fact_errorCode;
     UmfpackControl m_control;
     mutable UmfpackInfo m_umfpackInfo;
 
@@ -574,7 +438,7 @@ void UmfPackLU<MatrixType>::extractData() const
   if (m_extractedDataAreDirty)
   {
     // get size of the data
-    StorageIndex lnz, unz, rows, cols, nz_udiag;
+    int lnz, unz, rows, cols, nz_udiag;
     umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
 
     // allocate data
@@ -600,7 +464,7 @@ template<typename MatrixType>
 typename UmfPackLU<MatrixType>::Scalar UmfPackLU<MatrixType>::determinant() const
 {
   Scalar det;
-  umfpack_get_determinant(&det, 0, m_numeric, 0, StorageIndex());
+  umfpack_get_determinant(&det, 0, m_numeric, 0);
   return det;
 }
 
@@ -613,6 +477,7 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
   eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet");
   eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
 
+  int errorCode;
   Scalar* x_ptr = 0;
   Matrix<Scalar,Dynamic,1> x_tmp;
   if(x.innerStride()!=1)
@@ -624,10 +489,9 @@ bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBas
   {
     if(x.innerStride()==1)
       x_ptr = &x.col(j).coeffRef(0);
-    StorageIndex errorCode = umfpack_solve(UMFPACK_A,
-                                mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
-                                x_ptr, &b.const_cast_derived().col(j).coeffRef(0),
-                                m_numeric, m_control.data(), m_umfpackInfo.data());
+    errorCode = umfpack_solve(UMFPACK_A,
+        mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+        x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), m_umfpackInfo.data());
     if(x.innerStride()!=1)
       x.col(j) = x_tmp;
     if (errorCode!=0)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/misc/lapacke.h b/uppsrc/plugin/Eigen/Eigen/src/misc/lapacke.h
index 3d8e24f5a..8c7e79b03 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/misc/lapacke.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/misc/lapacke.h
@@ -43,6 +43,10 @@
 #include "lapacke_config.h"
 #endif
 
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
 #include <stdlib.h>
 
 #ifndef lapack_int
@@ -104,11 +108,6 @@ lapack_complex_double lapack_make_complex_double( double re, double im );
 
 #endif
 
-
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
 #ifndef LAPACKE_malloc
 #define LAPACKE_malloc( size ) malloc( size )
 #endif
diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h
index 73d5f51c8..1f8a531af 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h
@@ -75,32 +75,6 @@ max
   return (max)(Derived::PlainObject::Constant(rows(), cols(), other));
 }
 
-/** \returns an expression of the coefficient-wise absdiff of \c *this and \a other
-  *
-  * Example: \include Cwise_absolute_difference.cpp
-  * Output: \verbinclude Cwise_absolute_difference.out
-  *
-  * \sa absolute_difference()
-  */
-EIGEN_MAKE_CWISE_BINARY_OP(absolute_difference,absolute_difference)
-
-/** \returns an expression of the coefficient-wise absolute_difference of \c *this and scalar \a other
-  *
-  * \sa absolute_difference()
-  */
-EIGEN_DEVICE_FUNC
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_absolute_difference_op<Scalar,Scalar>, const Derived,
-                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-absolute_difference
-#else
-(absolute_difference)
-#endif
-(const Scalar &other) const
-{
-  return (absolute_difference)(Derived::PlainObject::Constant(rows(), cols(), other));
-}
-
 /** \returns an expression of the coefficient-wise power of \c *this to the given array of \a exponents.
   *
   * This function computes the coefficient-wise power.
@@ -340,9 +314,9 @@ polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedN> &n) const
   *
   * It returns the Riemann zeta function of two arguments \c *this and \a q:
   *
+  * \param *this is the exponent, it must be > 1
   * \param q is the shift, it must be > 0
   *
-  * \note *this is the exponent, it must be > 1.
   * \note This function supports only float and double scalar types. To support other scalar types, the user has
   * to provide implementations of zeta(T,T) for any scalar type T to be supported.
   *
diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h
index 59a4ee6a0..ebaa3f192 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h
@@ -10,7 +10,6 @@ typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> Inverse
 typedef CwiseUnaryOp<internal::scalar_boolean_not_op<Scalar>, const Derived> BooleanNotReturnType;
 
 typedef CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> ExpReturnType;
-typedef CwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived> Expm1ReturnType;
 typedef CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> LogReturnType;
 typedef CwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived> Log1pReturnType;
 typedef CwiseUnaryOp<internal::scalar_log10_op<Scalar>, const Derived> Log10ReturnType;
@@ -21,18 +20,11 @@ typedef CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived> AcosReturn
 typedef CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived> AsinReturnType;
 typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturnType;
 typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
-typedef CwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived> LogisticReturnType;
 typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
-#if EIGEN_HAS_CXX11_MATH
-typedef CwiseUnaryOp<internal::scalar_atanh_op<Scalar>, const Derived> AtanhReturnType;
-typedef CwiseUnaryOp<internal::scalar_asinh_op<Scalar>, const Derived> AsinhReturnType;
-typedef CwiseUnaryOp<internal::scalar_acosh_op<Scalar>, const Derived> AcoshReturnType;
-#endif
 typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
 typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
 typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType;
 typedef CwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived> RoundReturnType;
-typedef CwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived> RintReturnType;
 typedef CwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived> FloorReturnType;
 typedef CwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived> CeilReturnType;
 typedef CwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived> IsNaNReturnType;
@@ -98,20 +90,6 @@ exp() const
   return ExpReturnType(derived());
 }
 
-/** \returns an expression of the coefficient-wise exponential of *this minus 1.
-  *
-  * In exact arithmetic, \c x.expm1() is equivalent to \c x.exp() - 1,
-  * however, with finite precision, this function is much more accurate when \c x is close to zero.
-  *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_expm1">Math functions</a>, exp()
-  */
-EIGEN_DEVICE_FUNC
-inline const Expm1ReturnType
-expm1() const
-{
-  return Expm1ReturnType(derived());
-}
-
 /** \returns an expression of the coefficient-wise logarithm of *this.
   *
   * This function computes the coefficient-wise logarithm. The function MatrixBase::log() in the
@@ -120,7 +98,7 @@ expm1() const
   * Example: \include Cwise_log.cpp
   * Output: \verbinclude Cwise_log.out
   *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log">Math functions</a>, log()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log">Math functions</a>, exp()
   */
 EIGEN_DEVICE_FUNC
 inline const LogReturnType
@@ -333,7 +311,7 @@ sinh() const
   * Example: \include Cwise_cosh.cpp
   * Output: \verbinclude Cwise_cosh.out
   *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tanh(), sinh(), cosh()
+  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tanh(), sinh(), cosh()
   */
 EIGEN_DEVICE_FUNC
 inline const CoshReturnType
@@ -342,50 +320,6 @@ cosh() const
   return CoshReturnType(derived());
 }
 
-#if EIGEN_HAS_CXX11_MATH
-/** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this.
-  *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atanh">Math functions</a>, atanh(), asinh(), acosh()
-  */
-EIGEN_DEVICE_FUNC
-inline const AtanhReturnType
-atanh() const
-{
-  return AtanhReturnType(derived());
-}
-
-/** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this.
-  *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asinh">Math functions</a>, atanh(), asinh(), acosh()
-  */
-EIGEN_DEVICE_FUNC
-inline const AsinhReturnType
-asinh() const
-{
-  return AsinhReturnType(derived());
-}
-
-/** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this.
-  *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acosh">Math functions</a>, atanh(), asinh(), acosh()
-  */
-EIGEN_DEVICE_FUNC
-inline const AcoshReturnType
-acosh() const
-{
-  return AcoshReturnType(derived());
-}
-#endif
-
-/** \returns an expression of the coefficient-wise logistic of *this.
-  */
-EIGEN_DEVICE_FUNC
-inline const LogisticReturnType
-logistic() const
-{
-  return LogisticReturnType(derived());
-}
-
 /** \returns an expression of the coefficient-wise inverse of *this.
   *
   * Example: \include Cwise_inverse.cpp
@@ -428,20 +362,6 @@ cube() const
   return CubeReturnType(derived());
 }
 
-/** \returns an expression of the coefficient-wise rint of *this.
-  *
-  * Example: \include Cwise_rint.cpp
-  * Output: \verbinclude Cwise_rint.out
-  *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_rint">Math functions</a>, ceil(), floor()
-  */
-EIGEN_DEVICE_FUNC
-inline const RintReturnType
-rint() const
-{
-  return RintReturnType(derived());
-}
-
 /** \returns an expression of the coefficient-wise round of *this.
   *
   * Example: \include Cwise_round.cpp
@@ -551,12 +471,14 @@ typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaRe
 typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
 typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
 typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
-typedef CwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived> NdtriReturnType;
 
 /** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|).
   *
   * \specialfunctions_module
   *
+  * Example: \include Cwise_lgamma.cpp
+  * Output: \verbinclude Cwise_lgamma.out
+  *
   * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
   * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar
   * type T to be supported.
@@ -592,6 +514,9 @@ digamma() const
   *
   * \specialfunctions_module
   *
+  * Example: \include Cwise_erf.cpp
+  * Output: \verbinclude Cwise_erf.out
+  *
   * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
   * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar
   * type T to be supported.
@@ -610,6 +535,9 @@ erf() const
   *
   * \specialfunctions_module
   *
+  * Example: \include Cwise_erfc.cpp
+  * Output: \verbinclude Cwise_erfc.out
+  *
   * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
   * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar
   * type T to be supported.
@@ -622,23 +550,3 @@ erfc() const
 {
   return ErfcReturnType(derived());
 }
-
-/** \returns an expression of the coefficient-wise inverse of the CDF of the Normal distribution function
-  * function of *this.
-  *
-  * \specialfunctions_module
-  * 
-  * In other words, considering `x = ndtri(y)`, it returns the argument, x, for which the area under the
-  * Gaussian probability density function (integrated from minus infinity to x) is equal to y.
-  *
-  * \note This function supports only float and double scalar types. To support other scalar types,
-  * the user has to provide implementations of ndtri(T) for any scalar type T to be supported.
-  *
-  * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ndtri">Math functions</a>
-  */
-EIGEN_DEVICE_FUNC
-inline const NdtriReturnType
-ndtri() const
-{
-  return NdtriReturnType(derived());
-}
diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/BlockMethods.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/BlockMethods.h
index 935a604b6..ac35a0086 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/plugins/BlockMethods.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/plugins/BlockMethods.h
@@ -40,126 +40,68 @@ typedef const VectorBlock<const Derived> ConstSegmentReturnType;
 template<int Size> struct FixedSegmentReturnType { typedef VectorBlock<Derived, Size> Type; };
 template<int Size> struct ConstFixedSegmentReturnType { typedef const VectorBlock<const Derived, Size> Type; };
 
-/// \internal inner-vector
-typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true>       InnerVectorReturnType;
-typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
-
-/// \internal set of inner-vectors
-typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
-typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
-
 #endif // not EIGEN_PARSED_BY_DOXYGEN
 
-/// \returns an expression of a block in \c *this with either dynamic or fixed sizes.
+/// \returns a dynamic-size expression of a block in *this.
 ///
-/// \param  startRow  the first row in the block
-/// \param  startCol  the first column in the block
-/// \param  blockRows number of rows in the block, specified at either run-time or compile-time
-/// \param  blockCols number of columns in the block, specified at either run-time or compile-time
-/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
-/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+/// \param startRow the first row in the block
+/// \param startCol the first column in the block
+/// \param blockRows the number of rows in the block
+/// \param blockCols the number of columns in the block
 ///
-/// Example using runtime (aka dynamic) sizes: \include MatrixBase_block_int_int_int_int.cpp
+/// Example: \include MatrixBase_block_int_int_int_int.cpp
 /// Output: \verbinclude MatrixBase_block_int_int_int_int.out
 ///
-/// \newin{3.4}:
-///
-/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic.
-/// Here is an example with a fixed number of rows \c NRows and dynamic number of columns \c cols:
-/// \code
-/// mat.block(i,j,fix<NRows>,cols)
-/// \endcode
-///
-/// This function thus fully covers the features offered by the following overloads block<NRows,NCols>(Index, Index),
-/// and block<NRows,NCols>(Index, Index, Index, Index) that are thus obsolete. Indeed, this generic version avoids
-/// redundancy, it preserves the argument order, and prevents the need to rely on the template keyword in templated code.
-///
-/// but with less redundancy and more consistency as it does not modify the argument order
-/// and seamlessly enable hybrid fixed/dynamic sizes.
-///
-/// \note Even in the case that the returned expression has dynamic size, in the case
+/// \note Even though the returned expression has dynamic size, in the case
 /// when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
 /// which means that evaluating it does not cause a dynamic memory allocation.
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa class Block, fix, fix<N>(int)
+/// \sa class Block, block(Index,Index)
 ///
-template<typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-#else
-typename FixedBlockXpr<...,...>::Type
-#endif
-block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols)
+EIGEN_DEVICE_FUNC
+inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols)
 {
-  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type(
-            derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols));
+  return BlockXpr(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/// This is the const version of block(Index,Index,NRowsType,NColsType)
-template<typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-#else
-const typename ConstFixedBlockXpr<...,...>::Type
-#endif
-block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) const
+/// This is the const version of block(Index,Index,Index,Index).
+EIGEN_DEVICE_FUNC
+inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const
 {
-  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type(
-            derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols));
+  return ConstBlockXpr(derived(), startRow, startCol, blockRows, blockCols);
 }
 
 
 
-/// \returns a expression of a top-right corner of \c *this with either dynamic or fixed sizes.
+
+/// \returns a dynamic-size expression of a top-right corner of *this.
 ///
 /// \param cRows the number of rows in the corner
 /// \param cCols the number of columns in the corner
-/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
-/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
-/// Example with dynamic sizes: \include MatrixBase_topRightCorner_int_int.cpp
+/// Example: \include MatrixBase_topRightCorner_int_int.cpp
 /// Output: \verbinclude MatrixBase_topRightCorner_int_int.out
 ///
-/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
-template<typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-#else
-typename FixedBlockXpr<...,...>::Type
-#endif
-topRightCorner(NRowsType cRows, NColsType cCols)
+EIGEN_DEVICE_FUNC
+inline BlockXpr topRightCorner(Index cRows, Index cCols)
 {
-  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
+  return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
-/// This is the const version of topRightCorner(NRowsType, NColsType).
-template<typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-#else
-const typename ConstFixedBlockXpr<...,...>::Type
-#endif
-topRightCorner(NRowsType cRows, NColsType cCols) const
+/// This is the const version of topRightCorner(Index, Index).
+EIGEN_DEVICE_FUNC
+inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const
 {
-  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
+  return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
-/// \returns an expression of a fixed-size top-right corner of \c *this.
+/// \returns an expression of a fixed-size top-right corner of *this.
 ///
 /// \tparam CRows the number of rows in the corner
 /// \tparam CCols the number of columns in the corner
@@ -172,21 +114,21 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block, block<int,int>(Index,Index)
 ///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedBlockXpr<CRows,CCols>::Type topRightCorner()
+EIGEN_DEVICE_FUNC
+inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner()
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
 }
 
 /// This is the const version of topRightCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner() const
+EIGEN_DEVICE_FUNC
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner() const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - CCols);
 }
 
-/// \returns an expression of a top-right corner of \c *this.
+/// \returns an expression of a top-right corner of *this.
 ///
 /// \tparam CRows number of rows in corner as specified at compile-time
 /// \tparam CCols number of columns in corner as specified at compile-time
@@ -206,67 +148,46 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block
 ///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols)
+inline typename FixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
 /// This is the const version of topRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols) const
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topRightCorner(Index cRows, Index cCols) const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
 }
 
 
 
-/// \returns an expression of a top-left corner of \c *this  with either dynamic or fixed sizes.
+/// \returns a dynamic-size expression of a top-left corner of *this.
 ///
 /// \param cRows the number of rows in the corner
 /// \param cCols the number of columns in the corner
-/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
-/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include MatrixBase_topLeftCorner_int_int.cpp
 /// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
 ///
-/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
-template<typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-#else
-typename FixedBlockXpr<...,...>::Type
-#endif
-topLeftCorner(NRowsType cRows, NColsType cCols)
+EIGEN_DEVICE_FUNC
+inline BlockXpr topLeftCorner(Index cRows, Index cCols)
 {
-  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
+  return BlockXpr(derived(), 0, 0, cRows, cCols);
 }
 
 /// This is the const version of topLeftCorner(Index, Index).
-template<typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-#else
-const typename ConstFixedBlockXpr<...,...>::Type
-#endif
-topLeftCorner(NRowsType cRows, NColsType cCols) const
+EIGEN_DEVICE_FUNC
+inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const
 {
-  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
+  return ConstBlockXpr(derived(), 0, 0, cRows, cCols);
 }
 
-/// \returns an expression of a fixed-size top-left corner of \c *this.
+/// \returns an expression of a fixed-size top-left corner of *this.
 ///
 /// The template parameters CRows and CCols are the number of rows and columns in the corner.
 ///
@@ -275,24 +196,24 @@ topLeftCorner(NRowsType cRows, NColsType cCols) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner()
+EIGEN_DEVICE_FUNC
+inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner()
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
 }
 
 /// This is the const version of topLeftCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner() const
+EIGEN_DEVICE_FUNC
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner() const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0);
 }
 
-/// \returns an expression of a top-left corner of \c *this.
+/// \returns an expression of a top-left corner of *this.
 ///
 /// \tparam CRows number of rows in corner as specified at compile-time
 /// \tparam CCols number of columns in corner as specified at compile-time
@@ -312,69 +233,46 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block
 ///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols)
+inline typename FixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
 
 /// This is the const version of topLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols) const
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type topLeftCorner(Index cRows, Index cCols) const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), 0, 0, cRows, cCols);
 }
 
 
 
-/// \returns an expression of a bottom-right corner of \c *this  with either dynamic or fixed sizes.
+/// \returns a dynamic-size expression of a bottom-right corner of *this.
 ///
 /// \param cRows the number of rows in the corner
 /// \param cCols the number of columns in the corner
-/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
-/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include MatrixBase_bottomRightCorner_int_int.cpp
 /// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
 ///
-/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
-template<typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-#else
-typename FixedBlockXpr<...,...>::Type
-#endif
-bottomRightCorner(NRowsType cRows, NColsType cCols)
+EIGEN_DEVICE_FUNC
+inline BlockXpr bottomRightCorner(Index cRows, Index cCols)
 {
-  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols),
-                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
+  return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/// This is the const version of bottomRightCorner(NRowsType, NColsType).
-template<typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-#else
-const typename ConstFixedBlockXpr<...,...>::Type
-#endif
-bottomRightCorner(NRowsType cRows, NColsType cCols) const
+/// This is the const version of bottomRightCorner(Index, Index).
+EIGEN_DEVICE_FUNC
+inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const
 {
-  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols),
-                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
+  return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
-/// \returns an expression of a fixed-size bottom-right corner of \c *this.
+/// \returns an expression of a fixed-size bottom-right corner of *this.
 ///
 /// The template parameters CRows and CCols are the number of rows and columns in the corner.
 ///
@@ -383,24 +281,24 @@ bottomRightCorner(NRowsType cRows, NColsType cCols) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
+EIGEN_DEVICE_FUNC
+inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner()
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
 /// This is the const version of bottomRightCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner() const
+EIGEN_DEVICE_FUNC
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner() const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, cols() - CCols);
 }
 
-/// \returns an expression of a bottom-right corner of \c *this.
+/// \returns an expression of a bottom-right corner of *this.
 ///
 /// \tparam CRows number of rows in corner as specified at compile-time
 /// \tparam CCols number of columns in corner as specified at compile-time
@@ -420,69 +318,46 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block
 ///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols)
+inline typename FixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
 /// This is the const version of bottomRightCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols) const
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomRightCorner(Index cRows, Index cCols) const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
 }
 
 
 
-/// \returns an expression of a bottom-left corner of \c *this  with either dynamic or fixed sizes.
+/// \returns a dynamic-size expression of a bottom-left corner of *this.
 ///
 /// \param cRows the number of rows in the corner
 /// \param cCols the number of columns in the corner
-/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
-/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
 /// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
 ///
-/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
-template<typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-#else
-typename FixedBlockXpr<...,...>::Type
-#endif
-bottomLeftCorner(NRowsType cRows, NColsType cCols)
+EIGEN_DEVICE_FUNC
+inline BlockXpr bottomLeftCorner(Index cRows, Index cCols)
 {
-  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), rows() - internal::get_runtime_value(cRows), 0,
-                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
+  return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/// This is the const version of bottomLeftCorner(NRowsType, NColsType).
-template<typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-#else
-typename ConstFixedBlockXpr<...,...>::Type
-#endif
-bottomLeftCorner(NRowsType cRows, NColsType cCols) const
+/// This is the const version of bottomLeftCorner(Index, Index).
+EIGEN_DEVICE_FUNC
+inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const
 {
-  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), rows() - internal::get_runtime_value(cRows), 0,
-                        internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
+  return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
-/// \returns an expression of a fixed-size bottom-left corner of \c *this.
+/// \returns an expression of a fixed-size bottom-left corner of *this.
 ///
 /// The template parameters CRows and CCols are the number of rows and columns in the corner.
 ///
@@ -491,24 +366,24 @@ bottomLeftCorner(NRowsType cRows, NColsType cCols) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner()
+EIGEN_DEVICE_FUNC
+inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner()
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
 }
 
 /// This is the const version of bottomLeftCorner<int, int>().
 template<int CRows, int CCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner() const
+EIGEN_DEVICE_FUNC
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner() const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - CRows, 0);
 }
 
-/// \returns an expression of a bottom-left corner of \c *this.
+/// \returns an expression of a bottom-left corner of *this.
 ///
 /// \tparam CRows number of rows in corner as specified at compile-time
 /// \tparam CCols number of columns in corner as specified at compile-time
@@ -528,66 +403,45 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 /// \sa class Block
 ///
 template<int CRows, int CCols>
-EIGEN_STRONG_INLINE
-typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols)
+inline typename FixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols)
 {
   return typename FixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
 /// This is the const version of bottomLeftCorner<int, int>(Index, Index).
 template<int CRows, int CCols>
-EIGEN_STRONG_INLINE
-const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols) const
+inline const typename ConstFixedBlockXpr<CRows,CCols>::Type bottomLeftCorner(Index cRows, Index cCols) const
 {
   return typename ConstFixedBlockXpr<CRows,CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
 }
 
 
 
-/// \returns a block consisting of the top rows of \c *this.
+/// \returns a block consisting of the top rows of *this.
 ///
 /// \param n the number of rows in the block
-/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
 ///
 /// Example: \include MatrixBase_topRows_int.cpp
 /// Output: \verbinclude MatrixBase_topRows_int.out
 ///
-/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments.
-/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
-template<typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-#else
-typename NRowsBlockXpr<...>::Type
-#endif
-topRows(NRowsType n)
+EIGEN_DEVICE_FUNC
+inline RowsBlockXpr topRows(Index n)
 {
-  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-            (derived(), 0, 0, internal::get_runtime_value(n), cols());
+  return RowsBlockXpr(derived(), 0, 0, n, cols());
 }
 
-/// This is the const version of topRows(NRowsType).
-template<typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-#else
-const typename ConstNRowsBlockXpr<...>::Type
-#endif
-topRows(NRowsType n) const
+/// This is the const version of topRows(Index).
+EIGEN_DEVICE_FUNC
+inline ConstRowsBlockXpr topRows(Index n) const
 {
-  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-            (derived(), 0, 0, internal::get_runtime_value(n), cols());
+  return ConstRowsBlockXpr(derived(), 0, 0, n, cols());
 }
 
-/// \returns a block consisting of the top rows of \c *this.
+/// \returns a block consisting of the top rows of *this.
 ///
 /// \tparam N the number of rows in the block as specified at compile-time
 /// \param n the number of rows in the block as specified at run-time
@@ -600,69 +454,50 @@ topRows(NRowsType n) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename NRowsBlockXpr<N>::Type topRows(Index n = N)
+EIGEN_DEVICE_FUNC
+inline typename NRowsBlockXpr<N>::Type topRows(Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
 /// This is the const version of topRows<int>().
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
+EIGEN_DEVICE_FUNC
+inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
 }
 
 
 
-/// \returns a block consisting of the bottom rows of \c *this.
+/// \returns a block consisting of the bottom rows of *this.
 ///
 /// \param n the number of rows in the block
-/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
 ///
 /// Example: \include MatrixBase_bottomRows_int.cpp
 /// Output: \verbinclude MatrixBase_bottomRows_int.out
 ///
-/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments.
-/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
-template<typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-#else
-typename NRowsBlockXpr<...>::Type
-#endif
-bottomRows(NRowsType n)
+EIGEN_DEVICE_FUNC
+inline RowsBlockXpr bottomRows(Index n)
 {
-  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-            (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols());
+  return RowsBlockXpr(derived(), rows() - n, 0, n, cols());
 }
 
-/// This is the const version of bottomRows(NRowsType).
-template<typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-#else
-const typename ConstNRowsBlockXpr<...>::Type
-#endif
-bottomRows(NRowsType n) const
+/// This is the const version of bottomRows(Index).
+EIGEN_DEVICE_FUNC
+inline ConstRowsBlockXpr bottomRows(Index n) const
 {
-  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-            (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols());
+  return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols());
 }
 
-/// \returns a block consisting of the bottom rows of \c *this.
+/// \returns a block consisting of the bottom rows of *this.
 ///
 /// \tparam N the number of rows in the block as specified at compile-time
 /// \param n the number of rows in the block as specified at run-time
@@ -675,70 +510,51 @@ bottomRows(NRowsType n) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
+EIGEN_DEVICE_FUNC
+inline typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
 /// This is the const version of bottomRows<int>().
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
+EIGEN_DEVICE_FUNC
+inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
 }
 
 
 
-/// \returns a block consisting of a range of rows of \c *this.
+/// \returns a block consisting of a range of rows of *this.
 ///
 /// \param startRow the index of the first row in the block
 /// \param n the number of rows in the block
-/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
 ///
 /// Example: \include DenseBase_middleRows_int.cpp
 /// Output: \verbinclude DenseBase_middleRows_int.out
 ///
-/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments.
-/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
-template<typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-#else
-typename NRowsBlockXpr<...>::Type
-#endif
-middleRows(Index startRow, NRowsType n)
+EIGEN_DEVICE_FUNC
+inline RowsBlockXpr middleRows(Index startRow, Index n)
 {
-  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-            (derived(), startRow, 0, internal::get_runtime_value(n), cols());
+  return RowsBlockXpr(derived(), startRow, 0, n, cols());
 }
 
-/// This is the const version of middleRows(Index,NRowsType).
-template<typename NRowsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-#else
-const typename ConstNRowsBlockXpr<...>::Type
-#endif
-middleRows(Index startRow, NRowsType n) const
+/// This is the const version of middleRows(Index,Index).
+EIGEN_DEVICE_FUNC
+inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const
 {
-  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
-            (derived(), startRow, 0, internal::get_runtime_value(n), cols());
+  return ConstRowsBlockXpr(derived(), startRow, 0, n, cols());
 }
 
-/// \returns a block consisting of a range of rows of \c *this.
+/// \returns a block consisting of a range of rows of *this.
 ///
 /// \tparam N the number of rows in the block as specified at compile-time
 /// \param startRow the index of the first row in the block
@@ -752,69 +568,50 @@ middleRows(Index startRow, NRowsType n) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
+EIGEN_DEVICE_FUNC
+inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
 {
   return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
 /// This is the const version of middleRows<int>().
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
+EIGEN_DEVICE_FUNC
+inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
 {
   return typename ConstNRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
 }
 
 
 
-/// \returns a block consisting of the left columns of \c *this.
+/// \returns a block consisting of the left columns of *this.
 ///
 /// \param n the number of columns in the block
-/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include MatrixBase_leftCols_int.cpp
 /// Output: \verbinclude MatrixBase_leftCols_int.out
 ///
-/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments.
-/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
-template<typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-#else
-typename NColsBlockXpr<...>::Type
-#endif
-leftCols(NColsType n)
+EIGEN_DEVICE_FUNC
+inline ColsBlockXpr leftCols(Index n)
 {
-  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), 0, 0, rows(), internal::get_runtime_value(n));
+  return ColsBlockXpr(derived(), 0, 0, rows(), n);
 }
 
-/// This is the const version of leftCols(NColsType).
-template<typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-#else
-const typename ConstNColsBlockXpr<...>::Type
-#endif
-leftCols(NColsType n) const
+/// This is the const version of leftCols(Index).
+EIGEN_DEVICE_FUNC
+inline ConstColsBlockXpr leftCols(Index n) const
 {
-  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), 0, 0, rows(), internal::get_runtime_value(n));
+  return ConstColsBlockXpr(derived(), 0, 0, rows(), n);
 }
 
-/// \returns a block consisting of the left columns of \c *this.
+/// \returns a block consisting of the left columns of *this.
 ///
 /// \tparam N the number of columns in the block as specified at compile-time
 /// \param n the number of columns in the block as specified at run-time
@@ -827,69 +624,50 @@ leftCols(NColsType n) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename NColsBlockXpr<N>::Type leftCols(Index n = N)
+EIGEN_DEVICE_FUNC
+inline typename NColsBlockXpr<N>::Type leftCols(Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
 /// This is the const version of leftCols<int>().
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
+EIGEN_DEVICE_FUNC
+inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
 }
 
 
 
-/// \returns a block consisting of the right columns of \c *this.
+/// \returns a block consisting of the right columns of *this.
 ///
 /// \param n the number of columns in the block
-/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include MatrixBase_rightCols_int.cpp
 /// Output: \verbinclude MatrixBase_rightCols_int.out
 ///
-/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments.
-/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
-template<typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-#else
-typename NColsBlockXpr<...>::Type
-#endif
-rightCols(NColsType n)
+EIGEN_DEVICE_FUNC
+inline ColsBlockXpr rightCols(Index n)
 {
-  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n));
+  return ColsBlockXpr(derived(), 0, cols() - n, rows(), n);
 }
 
-/// This is the const version of rightCols(NColsType).
-template<typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-#else
-const typename ConstNColsBlockXpr<...>::Type
-#endif
-rightCols(NColsType n) const
+/// This is the const version of rightCols(Index).
+EIGEN_DEVICE_FUNC
+inline ConstColsBlockXpr rightCols(Index n) const
 {
-  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n));
+  return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n);
 }
 
-/// \returns a block consisting of the right columns of \c *this.
+/// \returns a block consisting of the right columns of *this.
 ///
 /// \tparam N the number of columns in the block as specified at compile-time
 /// \param n the number of columns in the block as specified at run-time
@@ -902,70 +680,51 @@ rightCols(NColsType n) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename NColsBlockXpr<N>::Type rightCols(Index n = N)
+EIGEN_DEVICE_FUNC
+inline typename NColsBlockXpr<N>::Type rightCols(Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
 /// This is the const version of rightCols<int>().
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
+EIGEN_DEVICE_FUNC
+inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
 }
 
 
 
-/// \returns a block consisting of a range of columns of \c *this.
+/// \returns a block consisting of a range of columns of *this.
 ///
 /// \param startCol the index of the first column in the block
 /// \param numCols the number of columns in the block
-/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
 ///
 /// Example: \include DenseBase_middleCols_int.cpp
 /// Output: \verbinclude DenseBase_middleCols_int.out
 ///
-/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments.
-/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
-template<typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-#else
-typename NColsBlockXpr<...>::Type
-#endif
-middleCols(Index startCol, NColsType numCols)
+EIGEN_DEVICE_FUNC
+inline ColsBlockXpr middleCols(Index startCol, Index numCols)
 {
-  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols));
+  return ColsBlockXpr(derived(), 0, startCol, rows(), numCols);
 }
 
-/// This is the const version of middleCols(Index,NColsType).
-template<typename NColsType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-#else
-const typename ConstNColsBlockXpr<...>::Type
-#endif
-middleCols(Index startCol, NColsType numCols) const
+/// This is the const version of middleCols(Index,Index).
+EIGEN_DEVICE_FUNC
+inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
 {
-  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
-            (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols));
+  return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols);
 }
 
-/// \returns a block consisting of a range of columns of \c *this.
+/// \returns a block consisting of a range of columns of *this.
 ///
 /// \tparam N the number of columns in the block as specified at compile-time
 /// \param startCol the index of the first column in the block
@@ -979,26 +738,26 @@ middleCols(Index startCol, NColsType numCols) const
 ///
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
+EIGEN_DEVICE_FUNC
+inline typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
 {
   return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
 /// This is the const version of middleCols<int>().
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
+EIGEN_DEVICE_FUNC
+inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
 {
   return typename ConstNColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
 }
 
 
 
-/// \returns a fixed-size expression of a block of \c *this.
+/// \returns a fixed-size expression of a block in *this.
 ///
 /// The template parameters \a NRows and \a NCols are the number of
 /// rows and columns in the block.
@@ -1009,35 +768,29 @@ typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) con
 /// Example: \include MatrixBase_block_int_int.cpp
 /// Output: \verbinclude MatrixBase_block_int_int.out
 ///
-/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic
-/// block(Index,Index,NRowsType,NColsType), here is the one-to-one equivalence:
-/// \code
-/// mat.template block<NRows,NCols>(i,j)  <-->  mat.block(i,j,fix<NRows>,fix<NCols>)
-/// \endcode
-///
 /// \note since block is a templated member, the keyword template has to be used
 /// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
 template<int NRows, int NCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol)
+EIGEN_DEVICE_FUNC
+inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol)
 {
   return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
 }
 
 /// This is the const version of block<>(Index, Index). */
 template<int NRows, int NCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol) const
+EIGEN_DEVICE_FUNC
+inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol) const
 {
   return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol);
 }
 
-/// \returns an expression of a block of \c *this.
+/// \returns an expression of a block in *this.
 ///
 /// \tparam NRows number of rows in block as specified at compile-time
 /// \tparam NCols number of columns in block as specified at compile-time
@@ -1052,25 +805,14 @@ const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index
 /// \a NRows is \a Dynamic, and the same for the number of columns.
 ///
 /// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
-/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out
-///
-/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic
-/// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence:
-/// \code
-/// mat.template block<NRows,NCols>(i,j,rows,cols)     <-->  mat.block(i,j,fix<NRows>(rows),fix<NCols>(cols))
-/// \endcode
-/// If we known that, e.g., NRows==Dynamic and NCols!=Dynamic, then the equivalence becomes:
-/// \code
-/// mat.template block<Dynamic,NCols>(i,j,rows,NCols)  <-->  mat.block(i,j,rows,fix<NCols>)
-/// \endcode
+/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out
 ///
 EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), class Block
+/// \sa class Block, block(Index,Index,Index,Index)
 ///
 template<int NRows, int NCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
+inline typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                   Index blockRows, Index blockCols)
 {
   return typename FixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
@@ -1078,14 +820,13 @@ typename FixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
 
 /// This is the const version of block<>(Index, Index, Index, Index).
 template<int NRows, int NCols>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
+inline const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index startCol,
                                                               Index blockRows, Index blockCols) const
 {
   return typename ConstFixedBlockXpr<NRows,NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
 }
 
-/// \returns an expression of the \a i-th column of \c *this. Note that the numbering starts at 0.
+/// \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0.
 ///
 /// Example: \include MatrixBase_col.cpp
 /// Output: \verbinclude MatrixBase_col.out
@@ -1093,20 +834,20 @@ const typename ConstFixedBlockXpr<NRows,NCols>::Type block(Index startRow, Index
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major)
 /**
   * \sa row(), class Block */
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-ColXpr col(Index i)
+EIGEN_DEVICE_FUNC
+inline ColXpr col(Index i)
 {
   return ColXpr(derived(), i);
 }
 
 /// This is the const version of col().
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-ConstColXpr col(Index i) const
+EIGEN_DEVICE_FUNC
+inline ConstColXpr col(Index i) const
 {
   return ConstColXpr(derived(), i);
 }
 
-/// \returns an expression of the \a i-th row of \c *this. Note that the numbering starts at 0.
+/// \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0.
 ///
 /// Example: \include MatrixBase_row.cpp
 /// Output: \verbinclude MatrixBase_row.out
@@ -1114,166 +855,109 @@ ConstColXpr col(Index i) const
 EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major)
 /**
   * \sa col(), class Block */
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-RowXpr row(Index i)
+EIGEN_DEVICE_FUNC
+inline RowXpr row(Index i)
 {
   return RowXpr(derived(), i);
 }
 
 /// This is the const version of row(). */
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-ConstRowXpr row(Index i) const
+EIGEN_DEVICE_FUNC
+inline ConstRowXpr row(Index i) const
 {
   return ConstRowXpr(derived(), i);
 }
 
-/// \returns an expression of a segment (i.e. a vector block) in \c *this with either dynamic or fixed sizes.
+/// \returns a dynamic-size expression of a segment (i.e. a vector block) in *this.
 ///
 /// \only_for_vectors
 ///
 /// \param start the first coefficient in the segment
 /// \param n the number of coefficients in the segment
-/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
 ///
 /// Example: \include MatrixBase_segment_int_int.cpp
 /// Output: \verbinclude MatrixBase_segment_int_int.out
 ///
-/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments.
-/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
-/// \note Even in the case that the returned expression has dynamic size, in the case
+/// \note Even though the returned expression has dynamic size, in the case
 /// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
 /// which means that evaluating it does not cause a dynamic memory allocation.
 ///
-/// \sa block(Index,Index,NRowsType,NColsType), fix<N>, fix<N>(int), class Block
+/// \sa class Block, segment(Index)
 ///
-template<typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-#else
-typename FixedSegmentReturnType<...>::Type
-#endif
-segment(Index start, NType n)
+EIGEN_DEVICE_FUNC
+inline SegmentReturnType segment(Index start, Index n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-            (derived(), start, internal::get_runtime_value(n));
+  return SegmentReturnType(derived(), start, n);
 }
 
 
-/// This is the const version of segment(Index,NType).
-template<typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-#else
-const typename ConstFixedSegmentReturnType<...>::Type
-#endif
-segment(Index start, NType n) const
+/// This is the const version of segment(Index,Index).
+EIGEN_DEVICE_FUNC
+inline ConstSegmentReturnType segment(Index start, Index n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-            (derived(), start, internal::get_runtime_value(n));
+  return ConstSegmentReturnType(derived(), start, n);
 }
 
-/// \returns an expression of the first coefficients of \c *this with either dynamic or fixed sizes.
+/// \returns a dynamic-size expression of the first coefficients of *this.
 ///
 /// \only_for_vectors
 ///
 /// \param n the number of coefficients in the segment
-/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
 ///
 /// Example: \include MatrixBase_start_int.cpp
 /// Output: \verbinclude MatrixBase_start_int.out
 ///
-/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments.
-/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
-/// \note Even in the case that the returned expression has dynamic size, in the case
+/// \note Even though the returned expression has dynamic size, in the case
 /// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
 /// which means that evaluating it does not cause a dynamic memory allocation.
 ///
 /// \sa class Block, block(Index,Index)
 ///
-template<typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-#else
-typename FixedSegmentReturnType<...>::Type
-#endif
-head(NType n)
+EIGEN_DEVICE_FUNC
+inline SegmentReturnType head(Index n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-              (derived(), 0, internal::get_runtime_value(n));
+  return SegmentReturnType(derived(), 0, n);
 }
 
-/// This is the const version of head(NType).
-template<typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-#else
-const typename ConstFixedSegmentReturnType<...>::Type
-#endif
-head(NType n) const
+/// This is the const version of head(Index).
+EIGEN_DEVICE_FUNC
+inline ConstSegmentReturnType head(Index n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-            (derived(), 0, internal::get_runtime_value(n));
+  return ConstSegmentReturnType(derived(), 0, n);
 }
 
-/// \returns an expression of a last coefficients of \c *this with either dynamic or fixed sizes.
+/// \returns a dynamic-size expression of the last coefficients of *this.
 ///
 /// \only_for_vectors
 ///
 /// \param n the number of coefficients in the segment
-/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
 ///
 /// Example: \include MatrixBase_end_int.cpp
 /// Output: \verbinclude MatrixBase_end_int.out
 ///
-/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments.
-/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
-///
-/// \note Even in the case that the returned expression has dynamic size, in the case
+/// \note Even though the returned expression has dynamic size, in the case
 /// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
 /// which means that evaluating it does not cause a dynamic memory allocation.
 ///
 /// \sa class Block, block(Index,Index)
 ///
-template<typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-#else
-typename FixedSegmentReturnType<...>::Type
-#endif
-tail(NType n)
+EIGEN_DEVICE_FUNC
+inline SegmentReturnType tail(Index n)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-            (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n));
+  return SegmentReturnType(derived(), this->size() - n, n);
 }
 
 /// This is the const version of tail(Index).
-template<typename NType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-#else
-const typename ConstFixedSegmentReturnType<...>::Type
-#endif
-tail(NType n) const
+EIGEN_DEVICE_FUNC
+inline ConstSegmentReturnType tail(Index n) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
-            (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n));
+  return ConstSegmentReturnType(derived(), this->size() - n, n);
 }
 
 /// \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
@@ -1290,11 +974,11 @@ tail(NType n) const
 /// Example: \include MatrixBase_template_int_segment.cpp
 /// Output: \verbinclude MatrixBase_template_int_segment.out
 ///
-/// \sa segment(Index,NType), class Block
+/// \sa class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
+EIGEN_DEVICE_FUNC
+inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), start, n);
@@ -1302,14 +986,14 @@ typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
 
 /// This is the const version of segment<int>(Index).
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
+EIGEN_DEVICE_FUNC
+inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), start, n);
 }
 
-/// \returns a fixed-size expression of the first coefficients of \c *this.
+/// \returns a fixed-size expression of the first coefficients of *this.
 ///
 /// \only_for_vectors
 ///
@@ -1322,11 +1006,11 @@ typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
 /// Example: \include MatrixBase_template_int_start.cpp
 /// Output: \verbinclude MatrixBase_template_int_start.out
 ///
-/// \sa head(NType), class Block
+/// \sa class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedSegmentReturnType<N>::Type head(Index n = N)
+EIGEN_DEVICE_FUNC
+inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), 0, n);
@@ -1334,14 +1018,14 @@ typename FixedSegmentReturnType<N>::Type head(Index n = N)
 
 /// This is the const version of head<int>().
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
+EIGEN_DEVICE_FUNC
+inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), 0, n);
 }
 
-/// \returns a fixed-size expression of the last coefficients of \c *this.
+/// \returns a fixed-size expression of the last coefficients of *this.
 ///
 /// \only_for_vectors
 ///
@@ -1354,11 +1038,11 @@ typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
 /// Example: \include MatrixBase_template_int_end.cpp
 /// Output: \verbinclude MatrixBase_template_int_end.out
 ///
-/// \sa tail(NType), class Block
+/// \sa class Block
 ///
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename FixedSegmentReturnType<N>::Type tail(Index n = N)
+EIGEN_DEVICE_FUNC
+inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename FixedSegmentReturnType<N>::Type(derived(), size() - n);
@@ -1366,78 +1050,9 @@ typename FixedSegmentReturnType<N>::Type tail(Index n = N)
 
 /// This is the const version of tail<int>.
 template<int N>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
+EIGEN_DEVICE_FUNC
+inline typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
 {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n);
 }
-
-/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-/// is col-major (resp. row-major).
-///
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-InnerVectorReturnType innerVector(Index outer)
-{ return InnerVectorReturnType(derived(), outer); }
-
-/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-/// is col-major (resp. row-major). Read-only.
-///
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const ConstInnerVectorReturnType innerVector(Index outer) const
-{ return ConstInnerVectorReturnType(derived(), outer); }
-
-/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-/// is col-major (resp. row-major).
-///
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-InnerVectorsReturnType
-innerVectors(Index outerStart, Index outerSize)
-{
-  return Block<Derived,Dynamic,Dynamic,true>(derived(),
-                                             IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
-                                             IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-
-}
-
-/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-/// is col-major (resp. row-major). Read-only.
-///
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-const ConstInnerVectorsReturnType
-innerVectors(Index outerStart, Index outerSize) const
-{
-  return Block<const Derived,Dynamic,Dynamic,true>(derived(),
-                                                  IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
-                                                  IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-
-}
-
-/** \returns the i-th subvector (column or vector) according to the \c Direction
-  * \sa subVectors()
-  */
-template<DirectionType Direction>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename internal::conditional<Direction==Vertical,ColXpr,RowXpr>::type
-subVector(Index i)
-{
-  return typename internal::conditional<Direction==Vertical,ColXpr,RowXpr>::type(derived(),i);
-}
-
-/** This is the const version of subVector(Index) */
-template<DirectionType Direction>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-typename internal::conditional<Direction==Vertical,ConstColXpr,ConstRowXpr>::type
-subVector(Index i) const
-{
-  return typename internal::conditional<Direction==Vertical,ConstColXpr,ConstRowXpr>::type(derived(),i);
-}
-
-/** \returns the number of subvectors (rows or columns) in the direction \c Direction
-  * \sa subVector(Index)
-  */
-template<DirectionType Direction>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Index subVectors() const
-{ return (Direction==Vertical)?cols():rows(); }
-
diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h
index 42ff901ca..89f4faaac 100644
--- a/uppsrc/plugin/Eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ b/uppsrc/plugin/Eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h
@@ -64,49 +64,6 @@ cast() const
   return typename CastXpr<NewType>::Type(derived());
 }
 
-template<int N> struct ShiftRightXpr {
-  typedef CwiseUnaryOp<internal::scalar_shift_right_op<Scalar, N>, const Derived> Type;
-};
-
-/// \returns an expression of \c *this with the \a Scalar type arithmetically
-/// shifted right by \a N bit positions.
-///
-/// The template parameter \a N specifies the number of bit positions to shift.
-///
-EIGEN_DOC_UNARY_ADDONS(cast,conversion function)
-///
-/// \sa class CwiseUnaryOp
-///
-template<int N>
-EIGEN_DEVICE_FUNC
-typename ShiftRightXpr<N>::Type
-shift_right() const
-{
-  return typename ShiftRightXpr<N>::Type(derived());
-}
-
-
-template<int N> struct ShiftLeftXpr {
-  typedef CwiseUnaryOp<internal::scalar_shift_left_op<Scalar, N>, const Derived> Type;
-};
-
-/// \returns an expression of \c *this with the \a Scalar type logically
-/// shifted left by \a N bit positions.
-///
-/// The template parameter \a N specifies the number of bit positions to shift.
-///
-EIGEN_DOC_UNARY_ADDONS(cast,conversion function)
-///
-/// \sa class CwiseUnaryOp
-///
-template<int N>
-EIGEN_DEVICE_FUNC
-typename ShiftLeftXpr<N>::Type
-shift_left() const
-{
-  return typename ShiftLeftXpr<N>::Type(derived());
-}
-
 /// \returns an expression of the complex conjugate of \c *this.
 ///
 EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
@@ -119,20 +76,6 @@ conjugate() const
   return ConjugateReturnType(derived());
 }
 
-/// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise.
-///
-EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate)
-///
-/// \sa conjugate()
-template<bool Cond>
-EIGEN_DEVICE_FUNC
-inline typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type
-conjugateIf() const
-{
-  typedef typename internal::conditional<Cond,ConjugateReturnType,const Derived&>::type ReturnType;
-  return ReturnType(derived());
-}
-
 /// \returns a read-only expression of the real part of \c *this.
 ///
 EIGEN_DOC_UNARY_ADDONS(real,real part function)
diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/IndexedViewMethods.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/IndexedViewMethods.h
deleted file mode 100644
index 5bfb19ac6..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/plugins/IndexedViewMethods.h
+++ /dev/null
@@ -1,262 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#if !defined(EIGEN_PARSED_BY_DOXYGEN)
-
-// This file is automatically included twice to generate const and non-const versions
-
-#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
-#define EIGEN_INDEXED_VIEW_METHOD_CONST const
-#define EIGEN_INDEXED_VIEW_METHOD_TYPE  ConstIndexedViewType
-#else
-#define EIGEN_INDEXED_VIEW_METHOD_CONST
-#define EIGEN_INDEXED_VIEW_METHOD_TYPE IndexedViewType
-#endif
-
-#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
-protected:
-
-// define some aliases to ease readability
-
-template<typename Indices>
-struct IvcRowType : public internal::IndexedViewCompatibleType<Indices,RowsAtCompileTime> {};
-
-template<typename Indices>
-struct IvcColType : public internal::IndexedViewCompatibleType<Indices,ColsAtCompileTime> {};
-
-template<typename Indices>
-struct IvcType : public internal::IndexedViewCompatibleType<Indices,SizeAtCompileTime> {};
-
-typedef typename internal::IndexedViewCompatibleType<Index,1>::type IvcIndex;
-
-template<typename Indices>
-typename IvcRowType<Indices>::type
-ivcRow(const Indices& indices) const {
-  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,RowsAtCompileTime>(derived().rows()),Specialized);
-}
-
-template<typename Indices>
-typename IvcColType<Indices>::type
-ivcCol(const Indices& indices) const {
-  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,ColsAtCompileTime>(derived().cols()),Specialized);
-}
-
-template<typename Indices>
-typename IvcColType<Indices>::type
-ivcSize(const Indices& indices) const {
-  return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic<Index,SizeAtCompileTime>(derived().size()),Specialized);
-}
-
-public:
-
-#endif
-
-template<typename RowIndices, typename ColIndices>
-struct EIGEN_INDEXED_VIEW_METHOD_TYPE {
-  typedef IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,
-                      typename IvcRowType<RowIndices>::type,
-                      typename IvcColType<ColIndices>::type> type;
-};
-
-// This is the generic version
-
-template<typename RowIndices, typename ColIndices>
-typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
-  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsIndexedView,
-  typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type >::type
-operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type
-            (derived(), ivcRow(rowIndices), ivcCol(colIndices));
-}
-
-// The following overload returns a Block<> object
-
-template<typename RowIndices, typename ColIndices>
-typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
-  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsBlock,
-  typename internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::BlockType>::type
-operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  typedef typename internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::BlockType BlockType;
-  typename IvcRowType<RowIndices>::type actualRowIndices = ivcRow(rowIndices);
-  typename IvcColType<ColIndices>::type actualColIndices = ivcCol(colIndices);
-  return BlockType(derived(),
-                   internal::first(actualRowIndices),
-                   internal::first(actualColIndices),
-                   internal::size(actualRowIndices),
-                   internal::size(actualColIndices));
-}
-
-// The following overload returns a Scalar
-
-template<typename RowIndices, typename ColIndices>
-typename internal::enable_if<internal::valid_indexed_view_overload<RowIndices,ColIndices>::value
-  && internal::traits<typename EIGEN_INDEXED_VIEW_METHOD_TYPE<RowIndices,ColIndices>::type>::ReturnAsScalar,
-  CoeffReturnType >::type
-operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols()));
-}
-
-#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
-
-// The following three overloads are needed to handle raw Index[N] arrays.
-
-template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndices>
-IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],typename IvcColType<ColIndices>::type>
-operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],typename IvcColType<ColIndices>::type>
-                    (derived(), rowIndices, ivcCol(colIndices));
-}
-
-template<typename RowIndices, typename ColIndicesT, std::size_t ColIndicesN>
-IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcRowType<RowIndices>::type, const ColIndicesT (&)[ColIndicesN]>
-operator()(const RowIndices& rowIndices, const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcRowType<RowIndices>::type,const ColIndicesT (&)[ColIndicesN]>
-                    (derived(), ivcRow(rowIndices), colIndices);
-}
-
-template<typename RowIndicesT, std::size_t RowIndicesN, typename ColIndicesT, std::size_t ColIndicesN>
-IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN], const ColIndicesT (&)[ColIndicesN]>
-operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const RowIndicesT (&)[RowIndicesN],const ColIndicesT (&)[ColIndicesN]>
-                    (derived(), rowIndices, colIndices);
-}
-
-#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE
-
-// Overloads for 1D vectors/arrays
-
-template<typename Indices>
-typename internal::enable_if<
-  IsRowMajor && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
-  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type> >::type
-operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,typename IvcType<Indices>::type>
-            (derived(), IvcIndex(0), ivcCol(indices));
-}
-
-template<typename Indices>
-typename internal::enable_if<
-  (!IsRowMajor) && (!(internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1 || internal::is_valid_index_type<Indices>::value)),
-  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex> >::type
-operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,typename IvcType<Indices>::type,IvcIndex>
-            (derived(), ivcRow(indices), IvcIndex(0));
-}
-
-template<typename Indices>
-typename internal::enable_if<
-  (internal::get_compile_time_incr<typename IvcType<Indices>::type>::value==1) && (!internal::is_valid_index_type<Indices>::value) && (!symbolic::is_symbolic<Indices>::value),
-  VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value> >::type
-operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  typename IvcType<Indices>::type actualIndices = ivcSize(indices);
-  return VectorBlock<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,internal::array_size<Indices>::value>
-            (derived(), internal::first(actualIndices), internal::size(actualIndices));
-}
-
-template<typename IndexType>
-typename internal::enable_if<symbolic::is_symbolic<IndexType>::value, CoeffReturnType >::type
-operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  return Base::operator()(internal::eval_expr_given_size(id,size()));
-}
-
-#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE
-
-template<typename IndicesT, std::size_t IndicesN>
-typename internal::enable_if<IsRowMajor,
-  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,const IndicesT (&)[IndicesN]> >::type
-operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,IvcIndex,const IndicesT (&)[IndicesN]>
-            (derived(), IvcIndex(0), indices);
-}
-
-template<typename IndicesT, std::size_t IndicesN>
-typename internal::enable_if<!IsRowMajor,
-  IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const IndicesT (&)[IndicesN],IvcIndex> >::type
-operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return IndexedView<EIGEN_INDEXED_VIEW_METHOD_CONST Derived,const IndicesT (&)[IndicesN],IvcIndex>
-            (derived(), indices, IvcIndex(0));
-}
-
-#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE
-
-#undef EIGEN_INDEXED_VIEW_METHOD_CONST
-#undef EIGEN_INDEXED_VIEW_METHOD_TYPE
-
-#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
-#define EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
-#include "IndexedViewMethods.h"
-#undef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS
-#endif
-
-#else // EIGEN_PARSED_BY_DOXYGEN
-
-/**
-  * \returns a generic submatrix view defined by the rows and columns indexed \a rowIndices and \a colIndices respectively.
-  *
-  * Each parameter must either be:
-  *  - An integer indexing a single row or column
-  *  - Eigen::all indexing the full set of respective rows or columns in increasing order
-  *  - An ArithmeticSequence as returned by the Eigen::seq and Eigen::seqN functions
-  *  - Any %Eigen's vector/array of integers or expressions
-  *  - Plain C arrays: \c int[N]
-  *  - And more generally any type exposing the following two member functions:
-  * \code
-  * <integral type> operator[](<integral type>) const;
-  * <integral type> size() const;
-  * \endcode
-  * where \c <integral \c type>  stands for any integer type compatible with Eigen::Index (i.e. \c std::ptrdiff_t).
-  *
-  * The last statement implies compatibility with \c std::vector, \c std::valarray, \c std::array, many of the Range-v3's ranges, etc.
-  *
-  * If the submatrix can be represented using a starting position \c (i,j) and positive sizes \c (rows,columns), then this
-  * method will returns a Block object after extraction of the relevant information from the passed arguments. This is the case
-  * when all arguments are either:
-  *  - An integer
-  *  - Eigen::all
-  *  - An ArithmeticSequence with compile-time increment strictly equal to 1, as returned by Eigen::seq(a,b), and Eigen::seqN(a,N).
-  *
-  * Otherwise a more general IndexedView<Derived,RowIndices',ColIndices'> object will be returned, after conversion of the inputs
-  * to more suitable types \c RowIndices' and \c ColIndices'.
-  *
-  * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter.
-  *
-  * See also this <a href="https://stackoverflow.com/questions/46110917/eigen-replicate-items-along-one-dimension-without-useless-allocations">question</a> and its answer for an example of how to duplicate coefficients.
-  *
-  * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index)
-  */
-template<typename RowIndices, typename ColIndices>
-IndexedView_or_Block
-operator()(const RowIndices& rowIndices, const ColIndices& colIndices);
-
-/** This is an overload of operator()(const RowIndices&, const ColIndices&) for 1D vectors or arrays
-  *
-  * \only_for_vectors
-  */
-template<typename Indices>
-IndexedView_or_VectorBlock
-operator()(const Indices& indices);
-
-#endif  // EIGEN_PARSED_BY_DOXYGEN
diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/ReshapedMethods.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/ReshapedMethods.h
deleted file mode 100644
index 482a6b045..000000000
--- a/uppsrc/plugin/Eigen/Eigen/src/plugins/ReshapedMethods.h
+++ /dev/null
@@ -1,149 +0,0 @@
-
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-
-/// \returns an expression of \c *this with reshaped sizes.
-///
-/// \param nRows the number of rows in the reshaped expression, specified at either run-time or compile-time, or AutoSize
-/// \param nCols the number of columns in the reshaped expression, specified at either run-time or compile-time, or AutoSize
-/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor),
-///               or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor.
-/// \tparam NRowsType the type of the value handling the number of rows, typically Index.
-/// \tparam NColsType the type of the value handling the number of columns, typically Index.
-///
-/// Dynamic size example: \include MatrixBase_reshaped_int_int.cpp
-/// Output: \verbinclude MatrixBase_reshaped_int_int.out
-///
-/// The number of rows \a nRows and columns \a nCols can also be specified at compile-time by passing Eigen::fix<N>,
-/// or Eigen::fix<N>(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic.
-/// Here is an example with a fixed number of rows and columns:
-/// \include MatrixBase_reshaped_fixed.cpp
-/// Output: \verbinclude MatrixBase_reshaped_fixed.out
-///
-/// Finally, one of the sizes parameter can be automatically deduced from the other one by passing AutoSize as in the following example:
-/// \include MatrixBase_reshaped_auto.cpp
-/// Output: \verbinclude MatrixBase_reshaped_auto.out
-/// AutoSize does preserve compile-time sizes when possible, i.e., when the sizes of the input are known at compile time \b and
-/// that the other size is passed at compile-time using Eigen::fix<N> as above.
-///
-/// \sa class Reshaped, fix, fix<N>(int)
-///
-template<int Order = ColMajor, typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC
-inline Reshaped<Derived,...>
-reshaped(NRowsType nRows, NColsType nCols);
-
-/// This is the const version of reshaped(NRowsType,NColsType).
-template<int Order = ColMajor, typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC
-inline const Reshaped<const Derived,...>
-reshaped(NRowsType nRows, NColsType nCols) const;
-
-/// \returns an expression of \c *this with columns (or rows) stacked to a linear column vector
-///
-/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor),
-///               or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor.
-///
-/// This overloads is essentially a shortcut for `A.reshaped<Order>(AutoSize,fix<1>)`.
-///
-/// - If `Order==ColMajor` (the default), then it returns a column-vector from the stacked columns of \c *this.
-/// - If `Order==RowMajor`, then it returns a column-vector from the stacked rows of \c *this.
-/// - If `Order==AutoOrder`, then it returns a column-vector with elements stacked following the storage order of \c *this.
-///   This mode is the recommended one when the particular ordering of the element is not relevant.
-///
-/// Example:
-/// \include MatrixBase_reshaped_to_vector.cpp
-/// Output: \verbinclude MatrixBase_reshaped_to_vector.out
-///
-/// If you want more control, you can still fall back to reshaped(NRowsType,NColsType).
-///
-/// \sa reshaped(NRowsType,NColsType), class Reshaped
-///
-template<int Order = ColMajor>
-EIGEN_DEVICE_FUNC
-inline Reshaped<Derived,...>
-reshaped();
-
-/// This is the const version of reshaped().
-template<int Order = ColMajor>
-EIGEN_DEVICE_FUNC
-inline const Reshaped<const Derived,...>
-reshaped() const;
-
-#else
-
-// This file is automatically included twice to generate const and non-const versions
-
-#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
-#define EIGEN_RESHAPED_METHOD_CONST const
-#else
-#define EIGEN_RESHAPED_METHOD_CONST
-#endif
-
-#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
-
-// This part is included once
-
-#endif
-
-template<typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC
-inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
-                internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
-                internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value>
-reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST
-{
-  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
-                  internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
-                  internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value>
-                (derived(),
-                 internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()),
-                 internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size()));
-}
-
-template<int Order, typename NRowsType, typename NColsType>
-EIGEN_DEVICE_FUNC
-inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
-                internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
-                internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value,
-                internal::get_compiletime_reshape_order<Flags,Order>::value>
-reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST
-{
-  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
-                  internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
-                  internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value,
-                  internal::get_compiletime_reshape_order<Flags,Order>::value>
-                (derived(),
-                 internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()),
-                 internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size()));
-}
-
-// Views as linear vectors
-
-EIGEN_DEVICE_FUNC
-inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,SizeAtCompileTime,1>
-reshaped() EIGEN_RESHAPED_METHOD_CONST
-{
-  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,SizeAtCompileTime,1>(derived(),size(),1);
-}
-
-template<int Order>
-EIGEN_DEVICE_FUNC
-inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
-                internal::get_compiletime_reshape_order<Flags,Order>::value>
-reshaped() EIGEN_RESHAPED_METHOD_CONST
-{
-  EIGEN_STATIC_ASSERT(Order==RowMajor || Order==ColMajor || Order==AutoOrder, INVALID_TEMPLATE_PARAMETER);
-  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
-                  internal::get_compiletime_reshape_order<Flags,Order>::value>
-                (derived(), size(), 1);
-}
-
-#undef EIGEN_RESHAPED_METHOD_CONST
-
-#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
-#define EIGEN_RESHAPED_METHOD_2ND_PASS
-#include "ReshapedMethods.h"
-#undef EIGEN_RESHAPED_METHOD_2ND_PASS
-#endif
-
-#endif // EIGEN_PARSED_BY_DOXYGEN
diff --git a/uppsrc/plugin/Eigen/srcdoc.tpp/Eigen_en-us.tpp b/uppsrc/plugin/Eigen/srcdoc.tpp/Eigen_en-us.tpp
index 8b935cdde..bef98af12 100644
--- a/uppsrc/plugin/Eigen/srcdoc.tpp/Eigen_en-us.tpp
+++ b/uppsrc/plugin/Eigen/srcdoc.tpp/Eigen_en-us.tpp
@@ -17,9 +17,8 @@ vectors, numerical solvers and related algorithms.]&]
 [s0; [C2 -|Matrix2d res `= a`*b;-|// Just multiply them using `*]&]
 [s0;#2 &]
 [s0;#2 &]
-[s0;# [2 Eigen package is a wrapper of Eigen library, updated to master 
-branch ][^https`:`/`/gitlab`.com`/libeigen`/eigen`/`-`/commit`/c1d944dd913d05180b7d2d1229072c9c52a11f29^2 c
-ommit C1D944DD][2  (9/May/2020). It includes the library and helper 
+[s0;# [2 Eigen package is a wrapper of Eigen library, updated to stable 
+release 3.3.8 (5/Oct/2020). It includes the library and helper 
 functions to integrate better Eigen with U`+`+. Starting from 
 the 3.1.1 version, it is licensed under the ][^http`:`/`/www`.mozilla`.org`/MPL`/2`.0`/^2 M
 PL2][2 , which is a simple weak copyleft license. Common questions 
diff --git a/uppsrc/plugin/Eigen/unsupported/CMakeLists.txt b/uppsrc/plugin/Eigen/unsupported/CMakeLists.txt
new file mode 100644
index 000000000..9a5666105
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_subdirectory(Eigen)
+add_subdirectory(doc EXCLUDE_FROM_ALL)
+if(BUILD_TESTING)
+  if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
+    add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
+  else()
+    add_subdirectory(test EXCLUDE_FROM_ALL)
+  endif()
+endif()
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/AdolcForward b/uppsrc/plugin/Eigen/unsupported/Eigen/AdolcForward
index 9b8d3cd1a..15f5f0731 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/AdolcForward
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/AdolcForward
@@ -40,7 +40,7 @@
 # undef realloc
 #endif
 
-#include "../../Eigen/Core"
+#include <Eigen/Core>
 
 namespace Eigen {
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/AlignedVector3 b/uppsrc/plugin/Eigen/unsupported/Eigen/AlignedVector3
index 4fa1842ac..47a86d4c0 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/AlignedVector3
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/AlignedVector3
@@ -10,9 +10,7 @@
 #ifndef EIGEN_ALIGNED_VECTOR3
 #define EIGEN_ALIGNED_VECTOR3
 
-#include "../../Eigen/Geometry"
-
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include <Eigen/Geometry>
 
 namespace Eigen {
 
@@ -78,9 +76,6 @@ template<typename _Scalar> class AlignedVector3
     { return m_coeffs.coeffRef(index);}
 
 
-    inline AlignedVector3()
-    {}
-
     inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z)
       : m_coeffs(x, y, z, Scalar(0))
     {}
@@ -134,9 +129,6 @@ template<typename _Scalar> class AlignedVector3
     inline AlignedVector3 operator-(const AlignedVector3& other) const
     { return AlignedVector3(m_coeffs - other.m_coeffs); }
 
-    inline AlignedVector3 operator-() const
-    { return AlignedVector3(-m_coeffs); }
-
     inline AlignedVector3 operator-=(const AlignedVector3& other)
     { m_coeffs -= other.m_coeffs; return *this; }
 
@@ -229,6 +221,4 @@ struct evaluator<AlignedVector3<Scalar> >
 
 }
 
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
-
 #endif // EIGEN_ALIGNED_VECTOR3
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/ArpackSupport b/uppsrc/plugin/Eigen/unsupported/Eigen/ArpackSupport
index 28c95ffa2..a0d4820e1 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/ArpackSupport
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/ArpackSupport
@@ -9,7 +9,7 @@
 #ifndef EIGEN_ARPACKSUPPORT_MODULE_H
 #define EIGEN_ARPACKSUPPORT_MODULE_H
 
-#include "../../Eigen/Core"
+#include <Eigen/Core>
 
 /** \defgroup ArpackSupport_Module Arpack support module
   *
@@ -20,12 +20,12 @@
   * \endcode
   */
 
-#include "../../Eigen/SparseCholesky"
+#include <Eigen/SparseCholesky>
 
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
 #include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h"
 
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
 
 #endif // EIGEN_ARPACKSUPPORT_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/AutoDiff b/uppsrc/plugin/Eigen/unsupported/Eigen/AutoDiff
index 7a4ff460c..abf5b7d67 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/AutoDiff
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/AutoDiff
@@ -28,17 +28,11 @@ namespace Eigen {
 //@{
 
 }
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
-
 
 #include "src/AutoDiff/AutoDiffScalar.h"
 // #include "src/AutoDiff/AutoDiffVector.h"
 #include "src/AutoDiff/AutoDiffJacobian.h"
 
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
-
-
-
 namespace Eigen {
 //@}
 }
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/BVH b/uppsrc/plugin/Eigen/unsupported/Eigen/BVH
index 666c9835f..0161a5402 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/BVH
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/BVH
@@ -10,9 +10,9 @@
 #ifndef EIGEN_BVH_MODULE_H
 #define EIGEN_BVH_MODULE_H
 
-#include "../../Eigen/Core"
-#include "../../Eigen/Geometry"
-#include "../../Eigen/StdVector"
+#include <Eigen/Core>
+#include <Eigen/Geometry>
+#include <Eigen/StdVector>
 #include <algorithm>
 #include <queue>
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CMakeLists.txt b/uppsrc/plugin/Eigen/unsupported/Eigen/CMakeLists.txt
new file mode 100644
index 000000000..631a06014
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CMakeLists.txt
@@ -0,0 +1,32 @@
+set(Eigen_HEADERS 
+  AdolcForward
+  AlignedVector3
+  ArpackSupport
+  AutoDiff
+  BVH
+  EulerAngles
+  FFT
+  IterativeSolvers 
+  KroneckerProduct
+  LevenbergMarquardt
+  MatrixFunctions 
+  MoreVectorization
+  MPRealSupport
+  NonLinearOptimization
+  NumericalDiff
+  OpenGLSupport
+  Polynomials
+  Skyline 
+  SparseExtra
+  SpecialFunctions
+  Splines
+  )
+
+install(FILES
+  ${Eigen_HEADERS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
+  )
+
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+
+add_subdirectory(CXX11)
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/Tensor b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/Tensor
index 2640f9565..bb6523d15 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/Tensor
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/Tensor
@@ -13,11 +13,21 @@
 
 #include "../../../Eigen/Core"
 
-#if EIGEN_HAS_CXX11
+#ifdef EIGEN_USE_SYCL
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+#include <SYCL/sycl.hpp>
+#include <map>
+#include <memory>
+#include <utility>
+#endif
+
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
 
 #include "../SpecialFunctions"
-
-#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
 #include "src/util/CXX11Meta.h"
 #include "src/util/MaxSizeVector.h"
 
@@ -36,7 +46,6 @@
 #include <cmath>
 #include <cstddef>
 #include <cstring>
-#include <random>
 
 #ifdef _WIN32
 typedef __int16 int16_t;
@@ -45,10 +54,12 @@ typedef __int32 int32_t;
 typedef unsigned __int32 uint32_t;
 typedef __int64 int64_t;
 typedef unsigned __int64 uint64_t;
-#include <windows.h>
 #else
 #include <stdint.h>
-#include <unistd.h>
+#endif
+
+#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900
+#include <random>
 #endif
 
 #ifdef _WIN32
@@ -59,19 +70,17 @@ typedef unsigned __int64 uint64_t;
 #include <time.h>
 #endif
 
-#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
+#ifdef EIGEN_USE_THREADS
 #include "ThreadPool"
 #endif
 
 #ifdef EIGEN_USE_GPU
-  #include <iostream>
-  #if defined(EIGEN_USE_HIP)
-    #include <hip/hip_runtime.h>
-  #else
-    #include <cuda_runtime.h>
-  #endif
-  #include <atomic>
-  #include <unistd.h>
+#include <iostream>
+#include <cuda_runtime.h>
+#if __cplusplus >= 201103L
+#include <atomic>
+#include <unistd.h>
+#endif
 #endif
 
 #include "src/Tensor/TensorMacros.h"
@@ -81,10 +90,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorCostModel.h"
 #include "src/Tensor/TensorDeviceDefault.h"
 #include "src/Tensor/TensorDeviceThreadPool.h"
-#include "src/Tensor/TensorDeviceGpu.h"
-#ifndef gpu_assert
-#define gpu_assert(x)
-#endif
+#include "src/Tensor/TensorDeviceCuda.h"
 #include "src/Tensor/TensorDeviceSycl.h"
 #include "src/Tensor/TensorIndexList.h"
 #include "src/Tensor/TensorDimensionList.h"
@@ -97,19 +103,18 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorGlobalFunctions.h"
 
 #include "src/Tensor/TensorBase.h"
-#include "src/Tensor/TensorBlock.h"
 
 #include "src/Tensor/TensorEvaluator.h"
 #include "src/Tensor/TensorExpr.h"
 #include "src/Tensor/TensorReduction.h"
-#include "src/Tensor/TensorReductionGpu.h"
+#include "src/Tensor/TensorReductionCuda.h"
 #include "src/Tensor/TensorArgMax.h"
 #include "src/Tensor/TensorConcatenation.h"
 #include "src/Tensor/TensorContractionMapper.h"
 #include "src/Tensor/TensorContractionBlocking.h"
 #include "src/Tensor/TensorContraction.h"
 #include "src/Tensor/TensorContractionThreadPool.h"
-#include "src/Tensor/TensorContractionGpu.h"
+#include "src/Tensor/TensorContractionCuda.h"
 #include "src/Tensor/TensorConversion.h"
 #include "src/Tensor/TensorConvolution.h"
 #include "src/Tensor/TensorFFT.h"
@@ -131,15 +136,8 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorGenerator.h"
 #include "src/Tensor/TensorAssign.h"
 #include "src/Tensor/TensorScan.h"
-#include "src/Tensor/TensorTrace.h"
-
-#ifdef EIGEN_USE_SYCL
-#include "src/Tensor/TensorReductionSycl.h"
-#include "src/Tensor/TensorConvolutionSycl.h"
-#include "src/Tensor/TensorContractionSycl.h"
-#include "src/Tensor/TensorScanSycl.h"
-#endif
 
+#include "src/Tensor/TensorSycl.h"
 #include "src/Tensor/TensorExecutor.h"
 #include "src/Tensor/TensorDevice.h"
 
@@ -151,7 +149,6 @@ typedef unsigned __int64 uint64_t;
 
 #include "src/Tensor/TensorIO.h"
 
-#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
 
-#endif  // EIGEN_HAS_CXX11
 //#endif // EIGEN_CXX11_TENSOR_MODULE
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/TensorSymmetry b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/TensorSymmetry
index b09c5e472..fb1b0c0fb 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/TensorSymmetry
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/TensorSymmetry
@@ -10,9 +10,9 @@
 #ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE
 #define EIGEN_CXX11_TENSORSYMMETRY_MODULE
 
-#include "Tensor"
+#include <unsupported/Eigen/CXX11/Tensor>
 
-#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
 
 #include "src/util/CXX11Meta.h"
 
@@ -33,7 +33,7 @@
 #include "src/TensorSymmetry/StaticSymmetry.h"
 #include "src/TensorSymmetry/DynamicSymmetry.h"
 
-#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
 
 #endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/ThreadPool b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/ThreadPool
index 71a6afe39..09d637e9a 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/ThreadPool
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/ThreadPool
@@ -12,7 +12,7 @@
 
 #include "../../../Eigen/Core"
 
-#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
 
 /** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module
   *
@@ -44,32 +44,22 @@
 #include <thread>
 #include <functional>
 #include <memory>
-#include <utility>
-
-// There are non-parenthesized calls to "max" in the  <unordered_map> header,
-// which trigger a check in test/main.h causing compilation to fail.
-// We work around the check here by removing the check for max in
-// the case where we have to emulate thread_local.
-#ifdef max
-#undef max
-#endif
-#include <unordered_map>
 
 #include "src/util/CXX11Meta.h"
 #include "src/util/MaxSizeVector.h"
 
 #include "src/ThreadPool/ThreadLocal.h"
 #include "src/ThreadPool/ThreadYield.h"
-#include "src/ThreadPool/ThreadCancel.h"
 #include "src/ThreadPool/EventCount.h"
 #include "src/ThreadPool/RunQueue.h"
 #include "src/ThreadPool/ThreadPoolInterface.h"
 #include "src/ThreadPool/ThreadEnvironment.h"
-#include "src/ThreadPool/Barrier.h"
+#include "src/ThreadPool/SimpleThreadPool.h"
 #include "src/ThreadPool/NonBlockingThreadPool.h"
 
 #endif
 
-#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
 
 #endif // EIGEN_CXX11_THREADPOOL_MODULE
+
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/README.md b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
index 9b6f14204..da70fa216 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -83,7 +83,7 @@ large enough to hold all the data.
 
     // You can also map fixed-size tensors.  Here we get a 1d view of
     // the 2d fixed-size tensor.
-    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+    TensorFixedSize<float, Sizes<4, 3>> t_4x3;
     TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
 
 
@@ -430,11 +430,8 @@ This is exactly the same as not inserting a `device()` call.
 
 #### Evaluating with a Thread Pool
 
-    // Create the Eigen ThreadPool
-    Eigen::ThreadPool pool(8 /* number of threads in pool */)
-
     // Create the Eigen ThreadPoolDevice.
-    Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */);
+    Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */);
 
     // Now just use the device when evaluating expressions.
     Eigen::Tensor<float, 2> c(30, 50);
@@ -1178,58 +1175,6 @@ Reduce a tensor using a user-defined reduction operator.  See `SumReducer`
 in TensorFunctors.h for information on how to implement a reduction operator.
 
 
-## Trace
-
-A *Trace* operation returns a tensor with fewer dimensions than the original
-tensor. It returns a tensor whose elements are the sum of the elements of the
-original tensor along the main diagonal for a list of specified dimensions, the
-"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions
-are passed as an input parameter to the operation, are of type `<TensorType>::``Dimensions`
-, and have the same requirements when passed as an input parameter. In addition,
-the trace dimensions must have the same size.
-
-Example: Trace along 2 dimensions.
-
-    // Create a tensor of 3 dimensions
-    Eigen::Tensor<int, 3> a(2, 2, 3);
-    a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}});
-    // Specify the dimensions along which the trace will be computed.
-    // In this example, the trace can only be computed along the dimensions
-    // with indices 0 and 1
-    Eigen::array<int, 2> dims({0, 1});
-    // The output tensor contains all but the trace dimensions.
-    Tensor<int, 1> a_trace = a.trace(dims);
-    cout << "a_trace:" << endl;
-    cout << a_trace << endl;
-    =>
-    a_trace:
-    11
-    13
-    15
-
-
-### `<Operation> trace(const Dimensions& new_dims)`
-### `<Operation> trace()`
-
-As a special case, if no parameter is passed to the operation, trace is computed
-along *all* dimensions of the input tensor.
-
-Example: Trace along all dimensions.
-
-    // Create a tensor of 3 dimensions, with all dimensions having the same size.
-    Eigen::Tensor<int, 3> a(3, 3, 3);
-    a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}},
-                {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
-                {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}});
-    // Result is a zero dimension tensor
-    Tensor<int, 0> a_trace = a.trace();
-    cout<<"a_trace:"<<endl;
-    cout<<a_trace<<endl;
-    =>
-    a_trace:
-    42
-
-
 ## Scan Operations
 
 A *Scan* operation returns a tensor with the same dimensions as the original
@@ -1630,81 +1575,81 @@ dimension in RowMajor layout.
 
 For example, given the following input tensor:
 
-    Eigen::Tensor<float, 2, DataLayout> tensor(3,4);
-    tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
-                      {4.0f, 5.0f, 6.0f, 7.0f},
-                      {8.0f, 9.0f, 10.0f, 11.0f}});
+  Eigen::Tensor<float, 2, DataLayout> tensor(3,4);
+  tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
+                    {4.0f, 5.0f, 6.0f, 7.0f},
+                    {8.0f, 9.0f, 10.0f, 11.0f}});
 
-    cout << "tensor: " << endl << tensor << endl;
-    =>
-    tensor:
-     0   1   2   3
-     4   5   6   7
-     8   9  10  11
+  cout << "tensor: " << endl << tensor << endl;
+=>
+tensor:
+ 0   1   2   3
+ 4   5   6   7
+ 8   9  10  11
 
 Six 2x2 patches can be extracted and indexed using the following code:
 
-    Eigen::Tensor<float, 3, DataLayout> patch;
-    Eigen::array<ptrdiff_t, 2> patch_dims;
-    patch_dims[0] = 2;
-    patch_dims[1] = 2;
-    patch = tensor.extract_patches(patch_dims);
-    for (int k = 0; k < 6; ++k) {
-      cout << "patch index: " << k << endl;
-      for (int i = 0; i < 2; ++i) {
-    	for (int j = 0; j < 2; ++j) {
-    	  if (DataLayout == ColMajor) {
-    		cout << patch(i, j, k) << " ";
-    	  } else {
-    		cout << patch(k, i, j) << " ";
-    	  }
-    	}
-    	cout << endl;
+  Eigen::Tensor<float, 3, DataLayout> patch;
+  Eigen::array<ptrdiff_t, 2> patch_dims;
+  patch_dims[0] = 2;
+  patch_dims[1] = 2;
+  patch = tensor.extract_patches(patch_dims);
+  for (int k = 0; k < 6; ++k) {
+    cout << "patch index: " << k << endl;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        if (DataLayout == ColMajor) {
+          cout << patch(i, j, k) << " ";
+        } else {
+          cout << patch(k, i, j) << " ";
+        }
       }
+      cout << endl;
     }
+  }
 
 This code results in the following output when the data layout is ColMajor:
 
-    patch index: 0
-    0 1
-    4 5
-    patch index: 1
-    4 5
-    8 9
-    patch index: 2
-    1 2
-    5 6
-    patch index: 3
-    5 6
-    9 10
-    patch index: 4
-    2 3
-    6 7
-    patch index: 5
-    6 7
-    10 11
+patch index: 0
+0 1
+4 5
+patch index: 1
+4 5
+8 9
+patch index: 2
+1 2
+5 6
+patch index: 3
+5 6
+9 10
+patch index: 4
+2 3
+6 7
+patch index: 5
+6 7
+10 11
 
 This code results in the following output when the data layout is RowMajor:
 (NOTE: the set of patches is the same as in ColMajor, but are indexed differently).
 
-    patch index: 0
-    0 1
-    4 5
-    patch index: 1
-    1 2
-    5 6
-    patch index: 2
-    2 3
-    6 7
-    patch index: 3
-    4 5
-    8 9
-    patch index: 4
-    5 6
-    9 10
-    patch index: 5
-    6 7
-    10 11
+patch index: 0
+0 1
+4 5
+patch index: 1
+1 2
+5 6
+patch index: 2
+2 3
+6 7
+patch index: 3
+4 5
+8 9
+patch index: 4
+5 6
+9 10
+patch index: 5
+6 7
+10 11
 
 ### `<Operation>  extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)`
 
@@ -1736,30 +1681,28 @@ sizes:
  *) columns: 5
  *) batch:   7
 
-    Tensor<float, 4> tensor(2,3,5,7);
-    Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  Tensor<float, 4> tensor(2,3,5,7);
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
 
 2x2 image patches can be extracted and indexed using the following code:
 
 *) 2D patch: ColMajor (patch indexed by second-to-last dimension)
-
-    Tensor<float, 5> twod_patch;
-    twod_patch = tensor.extract_image_patches<2, 2>();
-    // twod_patch.dimension(0) == 2
-    // twod_patch.dimension(1) == 2
-    // twod_patch.dimension(2) == 2
-    // twod_patch.dimension(3) == 3*5
-    // twod_patch.dimension(4) == 7
+  Tensor<float, 5> twod_patch;
+  twod_patch = tensor.extract_image_patches<2, 2>();
+  // twod_patch.dimension(0) == 2
+  // twod_patch.dimension(1) == 2
+  // twod_patch.dimension(2) == 2
+  // twod_patch.dimension(3) == 3*5
+  // twod_patch.dimension(4) == 7
 
 *) 2D patch: RowMajor (patch indexed by the second dimension)
-
-    Tensor<float, 5, RowMajor> twod_patch_row_major;
-    twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>();
-    // twod_patch_row_major.dimension(0) == 7
-    // twod_patch_row_major.dimension(1) == 3*5
-    // twod_patch_row_major.dimension(2) == 2
-    // twod_patch_row_major.dimension(3) == 2
-    // twod_patch_row_major.dimension(4) == 2
+  Tensor<float, 5, RowMajor> twod_patch_row_major;
+  twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>();
+  // twod_patch_row_major.dimension(0) == 7
+  // twod_patch_row_major.dimension(1) == 3*5
+  // twod_patch_row_major.dimension(2) == 2
+  // twod_patch_row_major.dimension(3) == 2
+  // twod_patch_row_major.dimension(4) == 2
 
 ## Special Operations
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
index 17cee495f..00295a255 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -112,7 +112,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
       // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
@@ -398,21 +398,6 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
     }
 
-    #if EIGEN_HAS_RVALUE_REFERENCES
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Tensor(Self&& other)
-      : Tensor()
-    {
-      m_storage.swap(other.m_storage);
-    }
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE Tensor& operator=(Self&& other)
-    {
-      m_storage.swap(other.m_storage);
-      return *this;
-    }
-    #endif
-
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other)
     {
@@ -477,18 +462,6 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp
       // Nothing to do: rank 0 tensors have fixed size
     }
 
-#ifdef EIGEN_HAS_INDEX_LIST
-    template <typename FirstType, typename... OtherTypes>
-    EIGEN_DEVICE_FUNC
-    void resize(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
-      array<Index, NumIndices> dims;
-      for (int i = 0; i < NumIndices; ++i) {
-        dims[i] = static_cast<Index>(dimensions[i]);
-      }
-      resize(dims);
-    }
-#endif
-
     /** Custom Dimension */
 #ifdef EIGEN_HAS_SFINAE
     template<typename CustomDimension,
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
index 91a6f8d6c..d06f40cd8 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -37,7 +37,7 @@ struct traits<TensorIndexTupleOp<XprType> > : public traits<XprType>
 template<typename XprType>
 struct eval<TensorIndexTupleOp<XprType>, Eigen::Dense>
 {
-  typedef const TensorIndexTupleOp<XprType>EIGEN_DEVICE_REF type;
+  typedef const TensorIndexTupleOp<XprType>& type;
 };
 
 template<typename XprType>
@@ -82,23 +82,16 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
 
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
   static const int NumDims = internal::array_size<Dimensions>::value;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
     PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
     BlockAccess = false,
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device) { }
 
@@ -106,7 +99,7 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
     return m_impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -124,13 +117,7 @@ struct TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device>
     return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
@@ -160,7 +147,7 @@ struct traits<TensorTupleReducerOp<ReduceOp, Dims, XprType> > : public traits<Xp
 template<typename ReduceOp, typename Dims, typename XprType>
 struct eval<TensorTupleReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense>
 {
-  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>EIGEN_DEVICE_REF type;
+  typedef const TensorTupleReducerOp<ReduceOp, Dims, XprType>& type;
 };
 
 template<typename ReduceOp, typename Dims, typename XprType>
@@ -185,7 +172,7 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTupleReducerOp(const XprType& expr,
                                                           const ReduceOp& reduce_op,
-                                                          const Index return_dim,
+                                                          const int return_dim,
                                                           const Dims& reduce_dims)
       : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
 
@@ -200,12 +187,12 @@ class TensorTupleReducerOp : public TensorBase<TensorTupleReducerOp<ReduceOp, Di
   const Dims& reduce_dims() const { return m_reduce_dims; }
 
   EIGEN_DEVICE_FUNC
-  Index return_dim() const { return m_return_dim; }
+  int return_dim() const { return m_return_dim; }
 
   protected:
     typename XprType::Nested m_xpr;
     const ReduceOp m_reduce_op;
-    const Index m_return_dim;
+    const int m_return_dim;
     const Dims m_reduce_dims;
 };
 
@@ -222,29 +209,21 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
   typedef typename TensorEvaluator<const TensorIndexTupleOp<ArgType> , Device>::Dimensions InputDimensions;
   static const int NumDims = internal::array_size<InputDimensions>::value;
   typedef array<Index, NumDims> StrideDims;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-  typedef StorageMemory<TupleType, Device> TupleStorageMem;
 
   enum {
-    IsAligned         = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
-    PacketAccess      = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
-    BlockAccess       = false,
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
-    Layout            = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = false
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess = false,
+    Layout = TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_orig_impl(op.expression(), device),
         m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device),
-        m_return_dim(op.return_dim())
-  {
+        m_return_dim(op.return_dim()) {
+
     gen_strides(m_orig_impl.dimensions(), m_strides);
     if (Layout == static_cast<int>(ColMajor)) {
       const Index total_size = internal::array_prod(m_orig_impl.dimensions());
@@ -252,18 +231,15 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     } else {
       const Index total_size = internal::array_prod(m_orig_impl.dimensions());
       m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
-    }    
-    // If m_return_dim is not a valid index, returns 1 or this can crash on Windows.
-    m_stride_div = ((m_return_dim >= 0) &&
-                    (m_return_dim < static_cast<Index>(m_strides.size())))
-                   ? m_strides[m_return_dim] : 1;
+    }
+    m_stride_div = m_strides[m_return_dim];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
     return m_impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -276,13 +252,7 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
     return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-#ifdef EIGEN_USE_SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-    m_orig_impl.bind(cgh);
-  }
-#endif
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
@@ -318,7 +288,7 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi
  protected:
   TensorEvaluator<const TensorIndexTupleOp<ArgType>, Device> m_orig_impl;
   TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexTupleOp<ArgType> >, Device> m_impl;
-  const Index m_return_dim;
+  const int m_return_dim;
   StrideDims m_strides;
   Index m_stride_mod;
   Index m_stride_div;
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
index 72f072cf2..166be200c 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -34,7 +34,6 @@ struct traits<TensorAssignOp<LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
   static const int Layout = internal::traits<LhsXprType>::Layout;
-  typedef typename traits<LhsXprType>::PointerType PointerType;
 
   enum {
     Flags = 0
@@ -68,8 +67,6 @@ class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType>
   typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
 
-  static const int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
       : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
 
@@ -97,41 +94,20 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  static const int NumDims = XprType::NumDims;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned         = TensorEvaluator<LeftArgType, Device>::IsAligned &
-                        TensorEvaluator<RightArgType, Device>::IsAligned,
-    PacketAccess      = TensorEvaluator<LeftArgType, Device>::PacketAccess &
-                        TensorEvaluator<RightArgType, Device>::PacketAccess,
-    BlockAccess       = TensorEvaluator<LeftArgType, Device>::BlockAccess &
-                        TensorEvaluator<RightArgType, Device>::BlockAccess,
-    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
-                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
-    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
-    RawAccess         = TensorEvaluator<LeftArgType, Device>::RawAccess
+    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
-      RightTensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
       m_leftImpl(op.lhsExpression(), device),
       m_rightImpl(op.rhsExpression(), device)
   {
-    EIGEN_STATIC_ASSERT(
-        (static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
-         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
-        YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
   }
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
@@ -142,7 +118,7 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     return m_rightImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
@@ -151,18 +127,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     // by the rhs to the lhs.
     return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
   }
-
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType, EvalSubExprsCallback done) {
-    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
-      m_rightImpl.evalSubExprsIfNeededAsync(
-          m_leftImpl.data(), [done](bool need_assign) { done(need_assign); });
-    });
-  }
-#endif  // EIGEN_USE_THREADS
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
@@ -172,7 +136,6 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
     m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) {
-
     const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
     const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
     m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
@@ -200,41 +163,12 @@ struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device>
            TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    return internal::TensorBlockResourceRequirements::merge(
-        m_leftImpl.getResourceRequirements(),
-        m_rightImpl.getResourceRequirements());
-  }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
-      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
-    if (TensorEvaluator<LeftArgType, Device>::RawAccess &&
-        m_leftImpl.data() != NULL) {
-      // If destination has raw data access, we pass it as a potential
-      // destination for a block descriptor evaluation.
-      desc.template AddDestinationBuffer<Layout>(
-          /*dst_base=*/m_leftImpl.data() + desc.offset(),
-          /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
-    }
-
-    RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true);
-    // If block was evaluated into a destination, there is no need to do assignment.
-    if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
-      m_leftImpl.writeBlock(desc, block);
-    }
-    block.cleanup();
-  }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_leftImpl.bind(cgh);
-    m_rightImpl.bind(cgh);
-  }
-#endif
-
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); }
 
  private:
   TensorEvaluator<LeftArgType, Device> m_leftImpl;
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index bb0969f49..f573608d9 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -20,7 +20,7 @@ namespace Eigen {
   * \brief The tensor base class.
   *
   * This class is the common parent of the Tensor and TensorMap class, thus
-  * making it possible to use either class interchangeably in expressions.
+  * making it possible to use either class interchangeably in expressions.
   */
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 // FIXME Doxygen does not like the inheritance with different template parameters
@@ -135,78 +135,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_digamma_op<Scalar>());
     }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0_op<Scalar>, const Derived>
-    bessel_i0() const {
-      return unaryExpr(internal::scalar_bessel_i0_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0e_op<Scalar>, const Derived>
-    bessel_i0e() const {
-      return unaryExpr(internal::scalar_bessel_i0e_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1_op<Scalar>, const Derived>
-    bessel_i1() const {
-      return unaryExpr(internal::scalar_bessel_i1_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1e_op<Scalar>, const Derived>
-    bessel_i1e() const {
-      return unaryExpr(internal::scalar_bessel_i1e_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j0_op<Scalar>, const Derived>
-    bessel_j0() const {
-      return unaryExpr(internal::scalar_bessel_j0_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y0_op<Scalar>, const Derived>
-    bessel_y0() const {
-      return unaryExpr(internal::scalar_bessel_y0_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j1_op<Scalar>, const Derived>
-    bessel_j1() const {
-      return unaryExpr(internal::scalar_bessel_j1_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y1_op<Scalar>, const Derived>
-    bessel_y1() const {
-      return unaryExpr(internal::scalar_bessel_y1_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0_op<Scalar>, const Derived>
-    bessel_k0() const {
-      return unaryExpr(internal::scalar_bessel_k0_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0e_op<Scalar>, const Derived>
-    bessel_k0e() const {
-      return unaryExpr(internal::scalar_bessel_k0e_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1_op<Scalar>, const Derived>
-    bessel_k1() const {
-      return unaryExpr(internal::scalar_bessel_k1_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1e_op<Scalar>, const Derived>
-    bessel_k1e() const {
-      return unaryExpr(internal::scalar_bessel_k1e_op<Scalar>());
-    }
-
     // igamma(a = this, x = other)
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived>
@@ -214,20 +142,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>());
     }
 
-    // igamma_der_a(a = this, x = other)
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_igamma_der_a_op<Scalar>, const Derived, const OtherDerived>
-    igamma_der_a(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op<Scalar>());
-    }
-
-    // gamma_sample_der_alpha(alpha = this, sample = other)
-    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_gamma_sample_der_alpha_op<Scalar>, const Derived, const OtherDerived>
-    gamma_sample_der_alpha(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op<Scalar>());
-    }
-
     // igammac(a = this, x = other)
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived>
@@ -262,15 +176,9 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived>
-    ndtri() const {
-      return unaryExpr(internal::scalar_ndtri_op<Scalar>());
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sigmoid_op<Scalar>, const Derived>
     sigmoid() const {
-      return unaryExpr(internal::scalar_logistic_op<Scalar>());
+      return unaryExpr(internal::scalar_sigmoid_op<Scalar>());
     }
 
     EIGEN_DEVICE_FUNC
@@ -279,12 +187,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_exp_op<Scalar>());
     }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived>
-    expm1() const {
-      return unaryExpr(internal::scalar_expm1_op<Scalar>());
-    }
-
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
     log() const {
@@ -304,17 +206,9 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_clamp_op<Scalar>, const Derived>
-    clip(Scalar min, Scalar max) const {
-      return unaryExpr(internal::scalar_clamp_op<Scalar>(min, max));
-    }
-
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const typename internal::conditional<NumTraits<CoeffReturnType>::IsComplex,
-                                                             TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
-                                                             Derived>::type
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>
     conjugate() const {
-      return choose(Cond<NumTraits<CoeffReturnType>::IsComplex>(), unaryExpr(internal::scalar_conjugate_op<Scalar>()), derived());
+      return unaryExpr(internal::scalar_conjugate_op<Scalar>());
     }
 
     EIGEN_DEVICE_FUNC
@@ -407,13 +301,10 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return cwiseMin(constant(threshold));
     }
 
-    template<typename NewType>
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const typename internal::conditional<internal::is_same<NewType, CoeffReturnType>::value,
-                                                             Derived,
-                                                             TensorConversionOp<NewType, const Derived> >::type
+    template <typename NewType> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorConversionOp<NewType, const Derived>
     cast() const {
-      return choose(Cond<internal::is_same<NewType, CoeffReturnType>::value>(), derived(), TensorConversionOp<NewType, const Derived>(derived()));
+      return TensorConversionOp<NewType, const Derived>(derived());
     }
 
     EIGEN_DEVICE_FUNC
@@ -422,12 +313,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_round_op<Scalar>());
     }
 
-    EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived>
-    rint() const {
-      return unaryExpr(internal::scalar_rint_op<Scalar>());
-    }
-
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived>
     ceil() const {
@@ -596,15 +481,9 @@ class TensorBase<Derived, ReadOnlyAccessors>
     typedef Eigen::IndexPair<Index> DimensionPair;
 
     template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>
+    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived>
     contract(const OtherDerived& other, const Dimensions& dims) const {
-      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>(derived(), other.derived(), dims);
-    }
-
-    template<typename OtherDerived, typename Dimensions, typename OutputKernel> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>
-    contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const {
-      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>(derived(), other.derived(), dims, output_kernel);
+      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived>(derived(), other.derived(), dims);
     }
 
     // Convolutions.
@@ -617,8 +496,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
     // Fourier transforms
     template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>
-    fft(const FFT& dims) const {
-      return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), dims);
+    fft(const FFT& fft) const {
+      return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft);
     }
 
     // Scan.
@@ -705,26 +584,26 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::AndReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+    const TensorReductionOp<internal::AndReducer, const Dims, const TensorConversionOp<bool, const Derived> >
     all(const Dims& dims) const {
       return cast<bool>().reduce(dims, internal::AndReducer());
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+    const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
     all() const {
       DimensionList<Index, NumDimensions> in_dims;
       return cast<bool>().reduce(in_dims, internal::AndReducer());
     }
 
     template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::OrReducer, const Dims, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+    const TensorReductionOp<internal::OrReducer, const Dims, const TensorConversionOp<bool, const Derived> >
     any(const Dims& dims) const {
       return cast<bool>().reduce(dims, internal::OrReducer());
     }
 
     EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const typename internal::conditional<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> >::type >
+    const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const TensorConversionOp<bool, const Derived> >
     any() const {
       DimensionList<Index, NumDimensions> in_dims;
       return cast<bool>().reduce(in_dims, internal::OrReducer());
@@ -736,7 +615,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
       const array<Index, NumDimensions>, const Derived>
     argmax() const {
       array<Index, NumDimensions> in_dims;
-      for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
       return TensorTupleReducerOp<
         internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
         const array<Index, NumDimensions>,
@@ -749,7 +628,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
       const array<Index, NumDimensions>, const Derived>
     argmin() const {
       array<Index, NumDimensions> in_dims;
-      for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d;
       return TensorTupleReducerOp<
         internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
         const array<Index, NumDimensions>,
@@ -760,7 +639,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
     const TensorTupleReducerOp<
       internal::ArgMaxTupleReducer<Tuple<Index, CoeffReturnType> >,
       const array<Index, 1>, const Derived>
-    argmax(const Index return_dim) const {
+    argmax(const int return_dim) const {
       array<Index, 1> in_dims;
       in_dims[0] = return_dim;
       return TensorTupleReducerOp<
@@ -773,7 +652,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
     const TensorTupleReducerOp<
       internal::ArgMinTupleReducer<Tuple<Index, CoeffReturnType> >,
       const array<Index, 1>, const Derived>
-    argmin(const Index return_dim) const {
+    argmin(const int return_dim) const {
       array<Index, 1> in_dims;
       in_dims[0] = return_dim;
       return TensorTupleReducerOp<
@@ -788,22 +667,10 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
     }
 
-    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorTraceOp<const Dims, const Derived>
-    trace(const Dims& dims) const {
-      return TensorTraceOp<const Dims, const Derived>(derived(), dims);
-    }
-
-    const TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>
-    trace() const {
-      DimensionList<Index, NumDimensions> in_dims;
-      return TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims);
-    }
-
     template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorBroadcastingOp<const Broadcast, const Derived>
-    broadcast(const Broadcast& bcast) const {
-      return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), bcast);
+    broadcast(const Broadcast& broadcast) const {
+      return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), broadcast);
     }
 
     template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -911,8 +778,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorShufflingOp<const Shuffle, const Derived>
-    shuffle(const Shuffle& shfl) const {
-      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
+    shuffle(const Shuffle& shuffle) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
     }
     template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorStridingOp<const Strides, const Derived>
@@ -953,8 +820,7 @@ class TensorBase<Derived, ReadOnlyAccessors>
   protected:
     template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
     template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
-    // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
-    template <typename OtherDerived, int AccessLevel> friend class Eigen::TensorBase;
+    template <typename OtherDerived, int AccessLevel> friend class TensorBase;
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
 };
@@ -970,8 +836,7 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
 
     template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
     template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
-    // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
-    template <typename OtherDerived, int OtherAccessLevel> friend class Eigen::TensorBase;
+    template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase;
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& setZero() {
@@ -1109,13 +974,13 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
 
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     const TensorShufflingOp<const Shuffle, const Derived>
-    shuffle(const Shuffle& shfl) const {
-      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
+    shuffle(const Shuffle& shuffle) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shuffle);
     }
     template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
     TensorShufflingOp<const Shuffle, Derived>
-    shuffle(const Shuffle& shfl) {
-      return TensorShufflingOp<const Shuffle, Derived>(derived(), shfl);
+    shuffle(const Shuffle& shuffle) {
+      return TensorShufflingOp<const Shuffle, Derived>(derived(), shuffle);
     }
 
     template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -1131,14 +996,8 @@ class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
 
     // Select the device on which to evaluate the expression.
     template <typename DeviceType>
-    TensorDevice<Derived, DeviceType> device(const DeviceType& dev) {
-      return TensorDevice<Derived, DeviceType>(dev, derived());
-    }
-
-    // Select the async device on which to evaluate the expression.
-    template <typename DeviceType, typename DoneCallback>
-    TensorAsyncDevice<Derived, DeviceType, DoneCallback> device(const DeviceType& dev, DoneCallback done) {
-      return TensorAsyncDevice<Derived, DeviceType, DoneCallback>(dev, derived(), std::move(done));
+    TensorDevice<Derived, DeviceType> device(const DeviceType& device) {
+      return TensorDevice<Derived, DeviceType>(device, derived());
     }
 
  protected:
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
deleted file mode 100644
index 1e55d12c4..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
+++ /dev/null
@@ -1,1559 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
-#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
-
-namespace Eigen {
-namespace internal {
-
-// -------------------------------------------------------------------------- //
-// Forward declarations for templates defined below.
-template <typename Scalar, typename IndexType, int NumDims, int Layout>
-class TensorBlockIO;
-
-// -------------------------------------------------------------------------- //
-// Helper function to compute strides for densely stored buffer of given
-// dimensions.
-
-// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
-// this function instead everywhere.
-template <int Layout, typename IndexType, int NumDims>
-EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
-    const DSizes<IndexType, NumDims>& dimensions) {
-  DSizes<IndexType, NumDims> strides;
-  if (NumDims == 0) return strides;
-
-  // TODO(ezhulenev): Use templates to unroll this loop (similar to
-  // h_array_reduce in CXX11meta.h)? Benchmark it.
-  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-    strides[0] = 1;
-    for (int i = 1; i < NumDims; ++i) {
-      strides[i] = strides[i - 1] * dimensions[i - 1];
-    }
-  } else {
-    strides[NumDims - 1] = 1;
-    for (int i = NumDims - 2; i >= 0; --i) {
-      strides[i] = strides[i + 1] * dimensions[i + 1];
-    }
-  }
-
-  return strides;
-}
-
-template <int Layout, typename IndexType, size_t NumDims>
-EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
-    const Eigen::array<IndexType, NumDims>& dimensions) {
-  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
-}
-
-template <int Layout, std::ptrdiff_t... Indices>
-EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
-    const Sizes<Indices...>& sizes) {
-  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
-}
-
-// -------------------------------------------------------------------------- //
-
-// Tensor block shape type defines what are the shape preference for the blocks
-// extracted from the larger tensor.
-//
-// Example: blocks of 100 elements from the large 100x100 tensor:
-// - tensor: 100x100
-// - target_block_size: 100
-//
-// TensorBlockShapeType:
-//  - kUniformAllDims: 100 blocks of size 10x10
-//  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
-//                      or row major layout)
-enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
-
-struct TensorBlockResourceRequirements {
-  TensorBlockShapeType shape_type;  // target block shape
-  size_t size;                      // target block size
-  TensorOpCost cost_per_coeff;      // cost of computing a single block element
-
-#ifdef EIGEN_HIPCC
-  // For HIPCC, we need to explicitly declare as a "device fun", the constructor
-  // which is implicitly invoked in the "merge" / "any" routines. else HIPCC
-  // errors out complaining about the lack of a matching constructor
-  EIGEN_DEVICE_FUNC
-  TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
-				  TensorOpCost cost_)
-    : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
-  {}
-#endif
-
-  template <typename Scalar>
-  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
-      TensorBlockShapeType shape_type, size_t size_in_bytes,
-      TensorOpCost cost) {
-    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
-    return {shape_type, size, cost};
-  }
-
-  template <typename Scalar>
-  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
-      TensorBlockShapeType shape_type, size_t size_in_bytes) {
-    // This default cost per coefficient is valid for most materialized tensor
-    // block evaluation implementations, because they typically just read
-    // coefficients from the underlying tensor storage, and write to the tensor
-    // block buffer (scratch or destination memory, reads and writes have linear
-    // access pattern). We ignore the fixed cost of block evaluation, because in
-    // practice it should negligible.
-    //
-    // Lazy block evaluation adds the cost of calling a functor for each
-    // coefficient.
-    //
-    // All non-trivial block evaluation implementations must provide their own
-    // cost approximation (e.g. shuffling inner dimension has a much higher cost
-    // because it reads memory randomly, although the total number of moved
-    // bytes is the same).
-    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
-                                    {/*bytes_loaded=*/sizeof(Scalar),
-                                     /*bytes_stored=*/sizeof(Scalar),
-                                     /*compute_cycles=*/0});
-  }
-
-  template <typename Scalar>
-  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
-      size_t size_in_bytes) {
-    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
-                                    size_in_bytes);
-  }
-
-  template <typename Scalar>
-  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
-      size_t size_in_bytes) {
-    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
-                                    size_in_bytes);
-  }
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
-  merge(const TensorBlockResourceRequirements& lhs,
-        const TensorBlockResourceRequirements& rhs) {
-    return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
-            merge(lhs.size, rhs.size),                       // size
-            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
-  }
-
-  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
-      TensorOpCost cost) {
-    cost_per_coeff += cost;
-    return *this;
-  }
-
-  // This is a resource requirement that should be returned from expressions
-  // that do not have any block evaluation preference (e.g. default tensor
-  // expression with raw buffer access).
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
-    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
-  }
-
- private:
-  using Requirements = TensorBlockResourceRequirements;
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
-    return numext::maxi(lhs_size, rhs_size);
-  }
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorBlockShapeType
-  merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
-    return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
-            rhs == TensorBlockShapeType::kSkewedInnerDims)
-               ? TensorBlockShapeType::kSkewedInnerDims
-               : TensorBlockShapeType::kUniformAllDims;
-  }
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
-                                                TensorOpCost rhs_cost) {
-    return lhs_cost + rhs_cost;
-  }
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockDescriptor specifies a block offset within a tensor and the block
-// sizes along each of the tensor dimensions.
-
-template <int NumDims, typename IndexType = Eigen::Index>
-class TensorBlockDescriptor {
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-
-  // If we evaluate a Tensor assignment, and expression on the left, already has
-  // a memory buffer, then we might do performance optimization, and evaluate
-  // the root expression directly into the final output memory. Some time it's
-  // possible to reuse it for materializing subexpressions inside an expression
-  // tree, to to avoid dynamic memory allocation.
-  //
-  // The pointer type of the underlying storage is erased, because passing
-  // Scalar type through all the expression evaluation layers is way too many
-  // templates. In practice destination buffer type should always match the
-  // evaluated expression scalar type.
-  class DestinationBuffer {
-   public:
-    enum DestinationBufferKind : int {
-      // The above explicit specification of "int" as the enum basetype is
-      // needed to get around a HIPCC link error ("the field type is not
-      // amp-compatible")
-      // which is issued for class members with the enum type.
-      // TODO(rocm):
-      // remove the "int" basetype once HIPCC has been fixed to not error out
-      // in the above scenario.
-
-      // Destination buffer is not defined (`m_data` == nullptr).
-      kEmpty,
-
-      // Tensor block defined by an owning tensor block descriptor can fit
-      // contiguously into the destination buffer. In this case it's safe to
-      // materialize tensor block in the destination buffer, wrap it in a
-      // TensorMap, and use to build Eigen expression on top of it.
-      kContiguous,
-
-      // Destination buffer strides do not match strides of the contiguously
-      // stored block, and it's impossible to define a TensorMap over this
-      // buffer. However if we are evaluating a root of an expression tree, we
-      // still can materialize an output into this destination, because we can
-      // guarantee that no one will ever access it through block API.
-      //
-      // In theory it is possible to build valid TensorStriding<TensorMap>
-      // expression on top of this destination buffer, however it has
-      // inefficient coeff/packet access, and defeats the purpose of fast block
-      // evaluation API.
-      kStrided
-    };
-
-    template <typename Scalar>
-    Scalar* data() const {
-      eigen_assert(m_data_type_size == sizeof(Scalar));
-      return static_cast<Scalar*>(m_data);
-    }
-
-    const Dimensions& strides() const { return m_strides; }
-    const DestinationBufferKind& kind() const { return m_kind; }
-
-   private:
-    friend class TensorBlockDescriptor;
-
-    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
-
-    template <typename Scalar>
-    DestinationBuffer(Scalar* data, const Dimensions& strides,
-                      DestinationBufferKind kind)
-        : m_data(static_cast<void*>(data)),
-          m_data_type_size(sizeof(Scalar)),
-          m_strides(strides),
-          m_kind(kind) {}
-
-    template <int Layout, typename Scalar>
-    static DestinationBuffer make(const TensorBlockDescriptor& desc,
-                                  Scalar* data, const Dimensions& strides) {
-      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
-    }
-
-    template <int Layout>
-    static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
-                                      const Dimensions& strides) {
-      const Dimensions& desc_dims = desc.dimensions();
-      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
-      for (int i = 0; i < NumDims; ++i) {
-        if (desc_dims[i] == 1) continue;
-        if (desc_strides[i] != strides[i]) return kStrided;
-      }
-      return kContiguous;
-    }
-
-    // Storage pointer is type erased, to reduce template bloat, but we still
-    // keep the size of the underlying element type for error checking.
-    void* m_data;
-    size_t m_data_type_size;
-
-    // Destination buffer dimensions always match the dimensions of a tensor
-    // block descriptor it belongs to, however strides might be different.
-    Dimensions m_strides;
-
-    DestinationBufferKind m_kind;
-  };
-
-  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
-                        const DestinationBuffer& destination)
-      : m_offset(offset),
-        m_dimensions(dimensions),
-        m_destination(destination) {}
-
-  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
-      : m_offset(offset),
-        m_dimensions(dimensions),
-        m_destination(DestinationBuffer()) {}
-
-  IndexType offset() const { return m_offset; }
-  const Dimensions& dimensions() const { return m_dimensions; }
-  IndexType dimension(int index) const { return m_dimensions[index]; }
-  IndexType size() const { return array_prod<IndexType>(m_dimensions); }
-
-  const DestinationBuffer& destination() const { return m_destination; }
-
-  template <int Layout, typename Scalar>
-  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
-    eigen_assert(dst_base != NULL);
-    m_destination =
-        DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
-  }
-
-  template <int Layout, typename Scalar, typename DstStridesIndexType>
-  void AddDestinationBuffer(
-      Scalar* dst_base,
-      const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
-    // DSizes constructor will do index type promotion if it's safe.
-    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
-  }
-
-  TensorBlockDescriptor& DropDestinationBuffer() {
-    m_destination.m_data = NULL;
-    m_destination.m_kind = DestinationBuffer::kEmpty;
-    return *this;
-  }
-
-  bool HasDestinationBuffer() const {
-    return m_destination.kind() != DestinationBuffer::kEmpty;
-  }
-
-  // Returns a copy of `*this` with updated offset.
-  TensorBlockDescriptor WithOffset(IndexType offset) const {
-    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
-  }
-
- private:
-  // Offset and dimensions are immutable after construction. Block descriptor
-  // can only be mutated by adding or dropping destination.
-  const IndexType m_offset;
-  const Dimensions m_dimensions;
-  DestinationBuffer m_destination;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
-
-template <int NumDims, int Layout, typename IndexType = Eigen::Index>
-class TensorBlockMapper {
-  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
-
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-
-  TensorBlockMapper() = default;
-  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
-                    const TensorBlockResourceRequirements& requirements)
-      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
-    // Compute block dimensions and the total number of blocks.
-    InitializeBlockDimensions();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
-    return m_total_block_count;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
-    return m_block_dimensions.TotalSize();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
-  blockDimensions() const {
-    return m_block_dimensions;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
-  blockDescriptor(IndexType block_index) const {
-    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
-
-    IndexType offset = 0;
-    DSizes<IndexType, NumDims> dimensions;
-
-    if (NumDims == 0) return BlockDescriptor(offset, dimensions);
-
-    // Iterate outer -> inner dimensions.
-    for (int i = NumDims - 1; i >= 0; --i) {
-      const int dim = isColMajor ? i : NumDims - i - 1;
-
-      const IndexType idx = block_index / m_block_strides[dim];
-      block_index -= idx * m_block_strides[dim];
-
-      const IndexType coord = idx * m_block_dimensions[dim];
-      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
-                                     m_block_dimensions[dim]);
-      offset += coord * m_tensor_strides[dim];
-    }
-
-    return {offset, dimensions};
-  }
-
- private:
-  void InitializeBlockDimensions() {
-    // Requested block shape and size.
-    const TensorBlockShapeType shape_type = m_requirements.shape_type;
-    IndexType target_block_size =
-        numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
-
-    IndexType tensor_size = m_tensor_dimensions.TotalSize();
-
-    // Corner case: one of the dimensions is zero. Logic below is too complex
-    // to handle this case on a general basis, just use unit block size.
-    // Note: we must not yield blocks with zero dimensions (recipe for
-    // overflows/underflows, divisions by zero and NaNs later).
-    if (tensor_size == 0) {
-      for (int i = 0; i < NumDims; ++i) {
-        m_block_dimensions[i] = 1;
-      }
-      m_total_block_count = 0;
-      return;
-    }
-
-    // If tensor fits into a target block size, evaluate it as a single block.
-    if (tensor_size <= target_block_size) {
-      m_block_dimensions = m_tensor_dimensions;
-      m_total_block_count = 1;
-      // The only valid block index is `0`, and in this case we do not need
-      // to compute real strides for tensor or blocks (see blockDescriptor).
-      for (int i = 0; i < NumDims; ++i) {
-        m_tensor_strides[i] = 0;
-        m_block_strides[i] = 1;
-      }
-      return;
-    }
-
-    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
-
-    // Block shape skewed towards inner dimension.
-    if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
-      IndexType coeff_to_allocate = target_block_size;
-
-      for (int i = 0; i < NumDims; ++i) {
-        const int dim = isColMajor ? i : NumDims - i - 1;
-        m_block_dimensions[dim] =
-            numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
-        coeff_to_allocate = divup(
-            coeff_to_allocate,
-            numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
-      }
-      eigen_assert(coeff_to_allocate == 1);
-
-    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
-      // Tensor will not fit within 'target_block_size' budget: calculate tensor
-      // block dimension sizes based on "square" dimension size target.
-      const IndexType dim_size_target = convert_index<IndexType>(
-          std::pow(static_cast<float>(target_block_size),
-                   1.0f / static_cast<float>(m_block_dimensions.rank())));
-
-      for (int i = 0; i < NumDims; ++i) {
-        // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
-        // a multiple of the packet size. Note that reducing
-        // 'block_dim_size' in this manner can increase the number of
-        // blocks, and so will amplify any per-block overhead.
-        m_block_dimensions[i] =
-            numext::mini(dim_size_target, m_tensor_dimensions[i]);
-      }
-
-      // Add any un-allocated coefficients to inner dimension(s).
-      IndexType total_size = m_block_dimensions.TotalSize();
-      for (int i = 0; i < NumDims; ++i) {
-        const int dim = isColMajor ? i : NumDims - i - 1;
-
-        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
-          const IndexType total_size_other_dims =
-              total_size / m_block_dimensions[dim];
-          const IndexType alloc_avail =
-              divup<IndexType>(target_block_size, total_size_other_dims);
-          if (alloc_avail == m_block_dimensions[dim]) {
-            // Insufficient excess coefficients to allocate.
-            break;
-          }
-          m_block_dimensions[dim] =
-              numext::mini(m_tensor_dimensions[dim], alloc_avail);
-          total_size = total_size_other_dims * m_block_dimensions[dim];
-        }
-      }
-
-    } else {
-      eigen_assert(false);  // unknown block shape
-    }
-
-    eigen_assert(m_block_dimensions.TotalSize() >=
-                 numext::mini<IndexType>(target_block_size,
-                                         m_tensor_dimensions.TotalSize()));
-
-    // Calculate block counts by dimension and total block count.
-    DSizes<IndexType, NumDims> block_count;
-    for (int i = 0; i < NumDims; ++i) {
-      block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
-    }
-    m_total_block_count = array_prod(block_count);
-
-    // Calculate block strides (used for enumerating blocks).
-    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
-    m_block_strides = strides<Layout>(block_count);
-  }
-
-  DSizes<IndexType, NumDims> m_tensor_dimensions;
-  TensorBlockResourceRequirements m_requirements;
-
-  DSizes<IndexType, NumDims> m_block_dimensions;
-  IndexType m_total_block_count;
-
-  DSizes<IndexType, NumDims> m_tensor_strides;
-  DSizes<IndexType, NumDims> m_block_strides;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockScratchAllocator is responsible for allocating temporary buffers
-// for block evaluation (output or input block materialization). Given that
-// Eigen expression traversal order is deterministic, all temporary allocations
-// are happening in the same order, and usually have exactly the same size.
-// Scratch allocator keeps a trace of all dynamic allocations, and after the
-// first block evaluation is completed, we should be able to reuse all the
-// temporary buffers for the next block evaluation.
-
-template <typename Device>
-class TensorBlockScratchAllocator {
- public:
-  explicit TensorBlockScratchAllocator(const Device& device)
-      : m_device(device), m_allocation_index(0) {}
-
-  ~TensorBlockScratchAllocator() {
-    for (size_t i = 0; i < m_allocations.size(); ++i) {
-      m_device.deallocate(m_allocations[i].ptr);
-    }
-  }
-
-  void* allocate(size_t size) {
-    // TODO(ezhulenev): Remove when replaced with inlined vector.
-    if (m_allocations.capacity() == 0) m_allocations.reserve(8);
-
-    // Check if we already have an existing allocation att current index.
-    const int num_allocations = static_cast<int>(m_allocations.size());
-    const bool has_allocation = m_allocation_index < num_allocations;
-
-    // Allocation index can't be larger than the number of allocations.
-    eigen_assert(m_allocation_index <= num_allocations);
-
-    // If we have existing allocation, and its size is larger or equal to
-    // requested size, we do nothing.
-
-    // If current allocation can't fit requested size, we deallocate it, and
-    // replace with a larger allocation.
-    if (has_allocation && m_allocations[m_allocation_index].size < size) {
-      m_device.deallocate(m_allocations[m_allocation_index].ptr);
-      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
-      m_allocations[m_allocation_index].size = size;
-    }
-
-    // Make a new allocation if we don't have and existing one.
-    if (!has_allocation) {
-      Allocation allocation;
-      allocation.ptr = m_device.allocate(size);
-      allocation.size = size;
-      m_allocations.push_back(allocation);
-    }
-
-    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
-    eigen_assert(m_allocations[m_allocation_index].size >= size);
-
-    return m_allocations[m_allocation_index++].ptr;
-  }
-
-  void reset() { m_allocation_index = 0; }
-
- private:
-  struct Allocation {
-    void* ptr;
-    size_t size;
-  };
-
-  const Device& m_device;
-  int m_allocation_index;
-  // TODO(ezhulenev): This should be an inlined vector.
-  std::vector<Allocation> m_allocations;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockKind represents all possible block kinds, that can be produced by
-// TensorEvaluator::evalBlock function.
-enum TensorBlockKind {
-  // Tensor block that is a lazy expression that must be assigned to a
-  // destination using TensorBlockAssign.
-  kExpr,
-
-  // Tensor block that is a view into a memory buffer owned by an underlying
-  // Tensor expression (e.g. it can be a view into a Tensor buffer).
-  kView,
-
-  // Tensor block that was materialized in a scratch memory buffer, allocated
-  // with TensorBlockScratchAllocator. This block must be copied to a
-  // destination, similar to a block of `kExpr` type.
-  kMaterializedInScratch,
-
-  // Tensor block that was materialized directly into the final output memory
-  // buffer. For example if the left side of an assignment is a Tensor, we can
-  // directly materialize the block in the destination memory.
-  //
-  // If strides in the output buffer do not match tensor block strides, the
-  // Tensor expression will be invalid, and should not be used by
-  // TensorBlockAssign or for constructing another block expression.
-  kMaterializedInOutput
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockNotImplemented should be used to defined TensorBlock typedef in
-// TensorEvaluators that do not support block evaluation.
-
-class TensorBlockNotImplemented {
- public:
-  typedef void XprType;
-};
-
-// -------------------------------------------------------------------------- //
-// XprScalar extracts Scalar type from the Eigen expressions (if expression type
-// is not void). It's required to be able to define lazy block expression for
-// argument types, that do not support block evaluation.
-
-template <typename XprType>
-struct XprScalar {
-  typedef typename XprType::Scalar type;
-};
-template <>
-struct XprScalar<void> {
-  typedef void type;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorMaterializedBlock is a fully evaluated block of the original tensor,
-// and XprType is just a TensorMap over the data. This block type is typically
-// used to materialize blocks of tensor expressions, that can't be efficiently
-// represented as lazy Tensor expressions with fast coeff/packet operations,
-// e.g. we materialize all broadcasts into evaluated blocks.
-//
-// TensorMaterializedBlock does not own its memory buffer, it's either a memory
-// buffer that backs the original expression (e.g. block is just a view into a
-// Tensor), or a memory buffer allocated with scratch allocator, and in this
-// case the scratch allocator will deallocate it at the end of block based
-// expression execution.
-//
-// If the block was evaluated directly into the output buffer, and strides in
-// the output buffer do not match block strides, the TensorMap expression will
-// be invalid, and should never be used in block assignment or any other tensor
-// expression.
-
-template <typename Scalar, int NumDims, int Layout,
-          typename IndexType = Eigen::Index>
-class TensorMaterializedBlock {
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
-
-  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
-                          const Dimensions& dimensions, bool valid_expr = true)
-      : m_kind(kind),
-        m_data(data),
-        m_dimensions(dimensions),
-        m_expr(m_data, m_dimensions),
-        m_valid_expr(valid_expr) {
-    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
-                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
-                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
-  }
-
-  TensorBlockKind kind() const { return m_kind; }
-  // NOTE(ezhulenev): Returning XprType by value like in other block types
-  // causes asan failures. The theory is that XprType::Nested doesn't work
-  // properly for TensorMap.
-  const XprType& expr() const {
-    eigen_assert(m_valid_expr);
-    return m_expr;
-  }
-  const Scalar* data() const { return m_data; }
-  void cleanup() {}
-
-  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
-
-  // TensorMaterializedBlock can be backed by different types of storage:
-  //
-  //   (1) Contiguous block of memory allocated with scratch allocator.
-  //   (2) Contiguous block of memory reused from tensor block descriptor
-  //       destination buffer.
-  //   (3) Strided block of memory reused from tensor block descriptor
-  //       destination buffer.
-  //
-  class Storage {
-   public:
-    Scalar* data() const { return m_data; }
-    const Dimensions& dimensions() const { return m_dimensions; }
-    const Dimensions& strides() const { return m_strides; }
-
-    TensorMaterializedBlock AsTensorMaterializedBlock() const {
-      return TensorMaterializedBlock(
-          m_materialized_in_output
-              ? internal::TensorBlockKind::kMaterializedInOutput
-              : internal::TensorBlockKind::kMaterializedInScratch,
-          m_data, m_dimensions, !m_strided_storage);
-    }
-
-   private:
-    friend class TensorMaterializedBlock;
-
-    Storage(Scalar* data, const Dimensions& dimensions,
-            const Dimensions& strides, bool materialized_in_output,
-            bool strided_storage)
-        : m_data(data),
-          m_dimensions(dimensions),
-          m_strides(strides),
-          m_materialized_in_output(materialized_in_output),
-          m_strided_storage(strided_storage) {}
-
-    Scalar* m_data;
-    Dimensions m_dimensions;
-    Dimensions m_strides;
-    bool m_materialized_in_output;
-    bool m_strided_storage;
-  };
-
-  // Creates a storage for materialized block either from the block descriptor
-  // destination buffer, or allocates a new buffer with scratch allocator.
-  template <typename TensorBlockScratch>
-  EIGEN_STRONG_INLINE static Storage prepareStorage(
-      TensorBlockDesc& desc, TensorBlockScratch& scratch,
-      bool allow_strided_storage = false) {
-    // Try to reuse destination as an output block buffer.
-    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
-
-    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
-      Scalar* buffer = desc.destination().template data<Scalar>();
-      desc.DropDestinationBuffer();
-      return Storage(buffer, desc.dimensions(),
-                     internal::strides<Layout>(desc.dimensions()),
-                     /*materialized_in_output=*/true,
-                     /*strided_storage=*/false);
-
-    } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
-               allow_strided_storage) {
-      Scalar* buffer = desc.destination().template data<Scalar>();
-      desc.DropDestinationBuffer();
-      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
-                     /*materialized_in_output=*/true, /*strided_storage=*/true);
-
-    } else {
-      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
-      return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
-                     internal::strides<Layout>(desc.dimensions()),
-                     /*materialized_in_output=*/false,
-                     /*strided_storage=*/false);
-    }
-  }
-
-  // Creates a materialized block for the given descriptor from a memory buffer.
-  template <typename DataDimensions, typename TensorBlockScratch>
-  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
-      const Scalar* data, const DataDimensions& data_dims,
-      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
-    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
-
-    // If a tensor block dimensions covers a contiguous block of the underlying
-    // memory, we can skip block buffer memory allocation, and construct a block
-    // from existing `data` memory buffer.
-    //
-    // Example: (RowMajor layout)
-    //   data_dims:          [11, 12, 13, 14]
-    //   desc.dimensions():  [1,   1,  3, 14]
-    //
-    // In this case we can construct a TensorBlock starting at
-    // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
-    static const bool is_col_major = Layout == ColMajor;
-
-    // Find out how many inner dimensions have a matching size.
-    int num_matching_inner_dims = 0;
-    for (int i = 0; i < NumDims; ++i) {
-      int dim = is_col_major ? i : NumDims - i - 1;
-      if (data_dims[dim] != desc.dimensions()[dim]) break;
-      ++num_matching_inner_dims;
-    }
-
-    // All the outer dimensions must be of size `1`, except a single dimension
-    // before the matching inner dimension (`3` in the example above).
-    bool can_use_direct_access = true;
-    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
-      int dim = is_col_major ? i : NumDims - i - 1;
-      if (desc.dimension(dim) != 1) {
-        can_use_direct_access = false;
-        break;
-      }
-    }
-
-    if (can_use_direct_access) {
-      const Scalar* block_start = data + desc.offset();
-      return TensorMaterializedBlock(internal::TensorBlockKind::kView,
-                                     block_start, desc.dimensions());
-
-    } else {
-      // Reuse destination buffer or allocate new buffer with scratch allocator.
-      const Storage storage = prepareStorage(desc, scratch);
-
-      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
-          TensorBlockIO;
-      typedef typename TensorBlockIO::Dst TensorBlockIODst;
-      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
-
-      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
-                           data, desc.offset());
-      TensorBlockIODst dst(storage.dimensions(), storage.strides(),
-                           storage.data());
-
-      TensorBlockIO::Copy(dst, src);
-      return storage.AsTensorMaterializedBlock();
-    }
-  }
-
- private:
-  TensorBlockKind m_kind;
-  const Scalar* m_data;
-  Dimensions m_dimensions;
-  XprType m_expr;
-  bool m_valid_expr;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
-// functor to the blocks produced by the underlying Tensor expression.
-
-template <typename UnaryOp, typename ArgTensorBlock>
-class TensorCwiseUnaryBlock {
-  static const bool NoArgBlockAccess =
-      internal::is_void<typename ArgTensorBlock::XprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
-      type XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
-      : m_arg_block(arg_block), m_functor(functor) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-
-  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
-  const Scalar* data() const { return NULL; }
-  void cleanup() { m_arg_block.cleanup(); }
-
- private:
-  ArgTensorBlock m_arg_block;
-  UnaryOp m_functor;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp
-// functor to the blocks produced by the underlying Tensor expression.
-
-template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
-class TensorCwiseBinaryBlock {
-  static const bool NoArgBlockAccess =
-      internal::is_void<typename LhsTensorBlock::XprType>::value ||
-      internal::is_void<typename RhsTensorBlock::XprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
-                          const typename RhsTensorBlock::XprType> >::type
-      XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
-                         const RhsTensorBlock& right_block,
-                         const BinaryOp& functor)
-      : m_left_block(left_block),
-        m_right_block(right_block),
-        m_functor(functor) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-
-  XprType expr() const {
-    return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
-  }
-
-  const Scalar* data() const { return NULL; }
-
-  void cleanup() {
-    m_left_block.cleanup();
-    m_right_block.cleanup();
-  }
-
- private:
-  LhsTensorBlock m_left_block;
-  RhsTensorBlock m_right_block;
-  BinaryOp m_functor;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorUnaryExprBlock is a lazy tensor expression block that can construct
-// an arbitrary tensor expression from a block of the underlying type (this is a
-// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
-
-template <typename BlockFactory, typename ArgTensorBlock>
-class TensorUnaryExprBlock {
-  typedef typename ArgTensorBlock::XprType ArgXprType;
-  static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
-                       const BlockFactory& factory)
-      : m_arg_block(arg_block), m_factory(factory) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
-  const Scalar* data() const { return NULL; }
-  void cleanup() { m_arg_block.cleanup(); }
-
- private:
-  ArgTensorBlock m_arg_block;
-  BlockFactory m_factory;
-};
-
-// -------------------------------------------------------------------------- //
-// TensorTernaryExprBlock is a lazy tensor expression block that can construct
-// an arbitrary tensor expression from three blocks of the underlying type.
-
-template <typename BlockFactory, typename Arg1TensorBlock,
-          typename Arg2TensorBlock, typename Arg3TensorBlock>
-class TensorTernaryExprBlock {
-  typedef typename Arg1TensorBlock::XprType Arg1XprType;
-  typedef typename Arg2TensorBlock::XprType Arg2XprType;
-  typedef typename Arg3TensorBlock::XprType Arg3XprType;
-
-  static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
-                                       internal::is_void<Arg2XprType>::value ||
-                                       internal::is_void<Arg3XprType>::value;
-
- public:
-  typedef typename conditional<
-      NoArgBlockAccess, void,
-      typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
-                                              Arg3XprType>::type>::type XprType;
-
-  typedef typename XprScalar<XprType>::type Scalar;
-
-  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
-                         const Arg2TensorBlock& arg2_block,
-                         const Arg3TensorBlock& arg3_block,
-                         const BlockFactory& factory)
-      : m_arg1_block(arg1_block),
-        m_arg2_block(arg2_block),
-        m_arg3_block(arg3_block),
-        m_factory(factory) {}
-
-  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
-  XprType expr() const {
-    return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
-                          m_arg3_block.expr());
-  }
-  const Scalar* data() const { return NULL; }
-  void cleanup() {
-    m_arg1_block.cleanup();
-    m_arg2_block.cleanup();
-    m_arg3_block.cleanup();
-  }
-
- private:
-  Arg1TensorBlock m_arg1_block;
-  Arg2TensorBlock m_arg2_block;
-  Arg3TensorBlock m_arg3_block;
-  BlockFactory m_factory;
-};
-
-// -------------------------------------------------------------------------- //
-// StridedLinearBufferCopy provides a method to copy data between two linear
-// buffers with different strides, with optimized paths for scatter/gather.
-
-template <typename Scalar, typename IndexType>
-class StridedLinearBufferCopy {
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum {
-    Vectorizable = packet_traits<Scalar>::Vectorizable,
-    PacketSize = packet_traits<Scalar>::size
-  };
-
- public:
-  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
-  enum class Kind {
-    Linear = 0,       // src_stride == 1 && dst_stride == 1
-    Scatter = 1,      // src_stride == 1 && dst_stride != 1
-    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
-    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
-    Gather = 4,       // dst_stride == 1
-    Random = 5        // everything else
-  };
-
-  struct Dst {
-    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
-
-    IndexType offset;
-    IndexType stride;
-    Scalar* data;
-  };
-
-  struct Src {
-    Src(IndexType o, IndexType s, const Scalar* d)
-        : offset(o), stride(s), data(d) {}
-
-    IndexType offset;
-    IndexType stride;
-    const Scalar* data;
-  };
-
-  template <typename StridedLinearBufferCopy::Kind kind>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
-                                                        const Src& src,
-                                                        const size_t count) {
-    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
-              src.data);
-  }
-
- private:
-  template <typename StridedLinearBufferCopy::Kind kind>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const IndexType count, const IndexType dst_offset,
-      const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
-      const IndexType src_offset, const IndexType src_stride,
-      const Scalar* EIGEN_RESTRICT src_data) {
-    const Scalar* src = &src_data[src_offset];
-    Scalar* dst = &dst_data[dst_offset];
-
-    if (!Vectorizable) {
-      for (Index i = 0; i < count; ++i) {
-        dst[i * dst_stride] = src[i * src_stride];
-      }
-      return;
-    }
-
-    const IndexType vectorized_size = count - PacketSize;
-    IndexType i = 0;
-
-    if (kind == StridedLinearBufferCopy::Kind::Linear) {
-      // ******************************************************************** //
-      // Linear copy from `src` to `dst`.
-      const IndexType unrolled_size = count - 4 * PacketSize;
-      eigen_assert(src_stride == 1 && dst_stride == 1);
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
-        for (int j = 0; j < 4; ++j) {
-          Packet p = ploadu<Packet>(src + i + j * PacketSize);
-          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
-        }
-      }
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = ploadu<Packet>(src + i);
-        pstoreu<Scalar, Packet>(dst + i, p);
-      }
-      for (; i < count; ++i) {
-        dst[i] = src[i];
-      }
-      // ******************************************************************** //
-    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
-      // Scatter from `src` to `dst`.
-      eigen_assert(src_stride == 1 && dst_stride != 1);
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = ploadu<Packet>(src + i);
-        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
-      }
-      for (; i < count; ++i) {
-        dst[i * dst_stride] = src[i];
-      }
-      // ******************************************************************** //
-    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
-      // Fill `dst` with value at `*src`.
-      eigen_assert(src_stride == 0 && dst_stride == 1);
-      const IndexType unrolled_size = count - 4 * PacketSize;
-      Packet p = pload1<Packet>(src);
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
-        for (int j = 0; j < 4; ++j) {
-          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
-        }
-      }
-      for (; i <= vectorized_size; i += PacketSize) {
-        pstoreu<Scalar, Packet>(dst + i, p);
-      }
-      for (; i < count; ++i) {
-        dst[i] = *src;
-      }
-      // ******************************************************************** //
-    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
-      // Scatter `*src` into `dst`.
-      eigen_assert(src_stride == 0 && dst_stride != 1);
-      Packet p = pload1<Packet>(src);
-      for (; i <= vectorized_size; i += PacketSize) {
-        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
-      }
-      for (; i < count; ++i) {
-        dst[i * dst_stride] = *src;
-      }
-      // ******************************************************************** //
-    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
-      // Gather from `src` into `dst`.
-      eigen_assert(dst_stride == 1);
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
-        pstoreu<Scalar, Packet>(dst + i, p);
-      }
-      for (; i < count; ++i) {
-        dst[i] = src[i * src_stride];
-      }
-      // ******************************************************************** //
-    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
-      // Random.
-      for (; i < count; ++i) {
-        dst[i * dst_stride] = src[i * src_stride];
-      }
-    } else {
-      eigen_assert(false);
-    }
-  }
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block.
-// It's possible to specify src->dst dimension mapping for the copy operation.
-// Dimensions of `dst` specify how many elements have to be copied, for the
-// `src` we need to know only stride to navigate through source memory buffer.
-
-template <typename Scalar, typename IndexType, int NumDims, int Layout>
-class TensorBlockIO {
-  static const bool IsColMajor = (Layout == ColMajor);
-
-  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
-
- public:
-  typedef DSizes<IndexType, NumDims> Dimensions;
-  typedef DSizes<int, NumDims> DimensionsMap;
-
-  struct Dst {
-    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
-        IndexType dst_offset = 0)
-        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
-
-    Dimensions dims;
-    Dimensions strides;
-    Scalar* data;
-    IndexType offset;
-  };
-
-  struct Src {
-    Src(const Dimensions& src_strides, const Scalar* src,
-        IndexType src_offset = 0)
-        : strides(src_strides), data(src), offset(src_offset) {}
-
-    Dimensions strides;
-    const Scalar* data;
-    IndexType offset;
-  };
-
-  // Copies data to `dst` from `src`, using provided dimensions mapping:
-  //
-  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
-  //
-  // Returns the number of copied elements.
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
-      const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
-    // Copy single scalar value from `src` to `dst`.
-    if (NumDims == 0) {
-      *(dst.data + dst.offset) = *(src.data + src.offset);
-      return 1;
-    }
-
-    // Both `dst` and `src` must have contiguous innermost dimension. We also
-    // accept the special case with stride '0', because it's used as a trick to
-    // implement broadcasting.
-    {
-      int inner_dim = IsColMajor ? 0 : NumDims - 1;
-      EIGEN_UNUSED_VARIABLE(inner_dim);
-      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
-      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
-    }
-
-    // Give a shorter name to `dst_to_src_dim_map`.
-    const DimensionsMap& dim_map = dst_to_src_dim_map;
-
-    // Do not squeeze reordered inner dimensions.
-    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
-
-    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
-    // block, and we write data linearly into that dimension, reading it from
-    // the src. If dimensions are reordered, we might end up reading data from
-    // the src with `stride != 1`.
-    //
-    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
-    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
-
-    // Find the innermost dimension in the dst whose size is not 1. This is the
-    // effective inner dim.
-    int num_size_one_inner_dims = 0;
-    for (int i = 0; i < num_squeezable_dims; ++i) {
-      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
-      if (dst.dims[dst_dim] != 1) break;
-      num_size_one_inner_dims++;
-    }
-
-    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
-    if (num_size_one_inner_dims == NumDims) {
-      *(dst.data + dst.offset) = *(src.data + src.offset);
-      return 1;
-    }
-
-    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
-    const int dst_stride1_dim = IsColMajor
-                                    ? num_size_one_inner_dims
-                                    : NumDims - num_size_one_inner_dims - 1;
-
-    // Dimension in the src that corresponds to the dst innermost dimension.
-    const int src_dim_for_dst_stride1_dim =
-        NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
-
-    // Size of the innermost dimension (length of contiguous blocks of memory).
-    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
-
-    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
-    // `src` memory, so we can do less linear copy calls.
-    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
-      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
-      const IndexType dst_stride = dst.strides[dst_dim];
-      const IndexType src_stride = src.strides[dim_map[dst_dim]];
-      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
-        dst_inner_dim_size *= dst.dims[dst_dim];
-        ++num_size_one_inner_dims;
-      } else {
-        break;
-      }
-    }
-
-    // Setup strides to read data from `src` and write to `dst`.
-    IndexType input_offset = src.offset;
-    IndexType output_offset = dst.offset;
-    IndexType input_stride =
-        NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
-    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
-
-    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
-    array<BlockIteratorState, at_least_1_dim> it;
-
-    // Initialize block iterator state. Squeeze away any dimension of size 1.
-    int idx = 0;  // currently initialized iterator state index
-    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
-      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
-      if (dst.dims[dst_dim] == 1) continue;
-
-      it[idx].size = dst.dims[dst_dim];
-      it[idx].input_stride = src.strides[dim_map[dst_dim]];
-      it[idx].output_stride = dst.strides[dst_dim];
-
-      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
-      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
-
-      idx++;
-    }
-
-    // Iterate copying data from src to dst.
-    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
-
-#define COPY_INNER_DIM(KIND)                                           \
-  IndexType num_copied = 0;                                            \
-  for (num_copied = 0; num_copied < block_total_size;                  \
-       num_copied += dst_inner_dim_size) {                             \
-    LinCopy::template Run<KIND>(                                       \
-        typename LinCopy::Dst(output_offset, output_stride, dst.data), \
-        typename LinCopy::Src(input_offset, input_stride, src.data),   \
-        dst_inner_dim_size);                                           \
-                                                                       \
-    for (int j = 0; j < idx; ++j) {                                    \
-      if (++it[j].count < it[j].size) {                                \
-        input_offset += it[j].input_stride;                            \
-        output_offset += it[j].output_stride;                          \
-        break;                                                         \
-      }                                                                \
-      it[j].count = 0;                                                 \
-      input_offset -= it[j].input_span;                                \
-      output_offset -= it[j].output_span;                              \
-    }                                                                  \
-  }                                                                    \
-  return num_copied;
-
-    if (input_stride == 1 && output_stride == 1) {
-      COPY_INNER_DIM(LinCopy::Kind::Linear);
-    } else if (input_stride == 1 && output_stride != 1) {
-      COPY_INNER_DIM(LinCopy::Kind::Scatter);
-    } else if (input_stride == 0 && output_stride == 1) {
-      COPY_INNER_DIM(LinCopy::Kind::FillLinear);
-    } else if (input_stride == 0 && output_stride != 1) {
-      COPY_INNER_DIM(LinCopy::Kind::FillScatter);
-    } else if (output_stride == 1) {
-      COPY_INNER_DIM(LinCopy::Kind::Gather);
-    } else {
-      COPY_INNER_DIM(LinCopy::Kind::Random);
-    }
-
-#undef COPY_INNER_DIM
-  }
-
-  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
-  // the number of copied elements.
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
-                                                              const Src& src) {
-    DimensionsMap dst_to_src_map;
-    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
-    return Copy(dst, src, dst_to_src_map);
-  }
-
- private:
-  struct BlockIteratorState {
-    BlockIteratorState()
-        : size(0),
-          count(0),
-          input_stride(0),
-          output_stride(0),
-          input_span(0),
-          output_span(0) {}
-
-    IndexType size;
-    IndexType count;
-    IndexType input_stride;
-    IndexType output_stride;
-    IndexType input_span;
-    IndexType output_span;
-  };
-
-  // Compute how many inner dimensions it's allowed to squeeze when doing IO
-  // between two tensor blocks. It's safe to squeeze inner dimensions, only
-  // if they are not reordered.
-  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
-    int num_squeezable_dims = 0;
-    for (int i = 0; i < NumDims; ++i) {
-      const int dim = IsColMajor ? i : NumDims - i - 1;
-      if (dim_map[dim] != dim) break;
-      num_squeezable_dims++;
-    }
-    return num_squeezable_dims;
-  }
-};
-
-// -------------------------------------------------------------------------- //
-// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
-// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
-//
-// Currently there is no way to write from a Tensor expression to a block of
-// memory, if dimensions are reordered. If you need to do that, you should
-// materialize a Tensor block expression into a memory buffer, and then use
-// TensorBlockIO to copy data between two memory buffers with a custom
-// `target->src` dimension map (see definition above).
-//
-// Also currently the innermost dimension of `target` must have a stride '1'
-// (contiguous in memory). This restriction could be lifted with a `pscatter`,
-// but in practice it's never needed, and there is a similar TensorBlockIO
-// workaround for that.
-//
-// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
-// where `src` is a tensor expression. Explore if it is possible to rewrite IO
-// to use expressions instead of pointers, and after that TensorBlockAssignment
-// will become an alias to IO.
-template <typename Scalar, int NumDims, typename TensorBlockExpr,
-          typename IndexType = Eigen::Index>
-class TensorBlockAssignment {
-  // We will use coeff/packet path to evaluate block expressions.
-  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
-      TensorBlockEvaluator;
-
-  typedef DSizes<IndexType, NumDims> Dimensions;
-
-  enum {
-    Vectorizable = packet_traits<Scalar>::Vectorizable,
-    PacketSize = packet_traits<Scalar>::size
-  };
-
-  template <bool Vectorizable, typename Evaluator>
-  struct InnerDimAssign {
-    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
-                                        const Evaluator& eval,
-                                        IndexType eval_offset) {
-      for (IndexType i = 0; i < count; ++i) {
-        target[i] = eval.coeff(eval_offset + i);
-      }
-    }
-  };
-
-  template <typename Evaluator>
-  struct InnerDimAssign<true, Evaluator> {
-    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
-                                        const Evaluator& eval,
-                                        IndexType eval_offset) {
-      typedef typename packet_traits<Scalar>::type Packet;
-
-      const IndexType unrolled_size = count - 4 * PacketSize;
-      const IndexType vectorized_size = count - PacketSize;
-      IndexType i = 0;
-
-      for (; i <= unrolled_size; i += 4 * PacketSize) {
-        for (int j = 0; j < 4; ++j) {
-          const IndexType idx = eval_offset + i + j * PacketSize;
-          Packet p = eval.template packet<Unaligned>(idx);
-          pstoreu<Scalar>(target + i + j * PacketSize, p);
-        }
-      }
-
-      for (; i <= vectorized_size; i += PacketSize) {
-        Packet p = eval.template packet<Unaligned>(eval_offset + i);
-        pstoreu<Scalar>(target + i, p);
-      }
-
-      for (; i < count; ++i) {
-        target[i] = eval.coeff(eval_offset + i);
-      }
-    }
-  };
-
- public:
-  struct Target {
-    Target(const Dimensions& target_dims, const Dimensions& target_strides,
-           Scalar* target_data, IndexType target_offset = 0)
-        : dims(target_dims),
-          strides(target_strides),
-          data(target_data),
-          offset(target_offset) {}
-
-    Dimensions dims;
-    Dimensions strides;
-    Scalar* data;
-    IndexType offset;
-  };
-
-  static Target target(const Dimensions& target_dims,
-                       const Dimensions& target_strides, Scalar* target_data,
-                       IndexType target_offset = 0) {
-    return Target(target_dims, target_strides, target_data, target_offset);
-  }
-
-  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
-  static Target target(
-      const DSizes<TargetDimsIndexType, NumDims>& target_dims,
-      const DSizes<TargetStridesIndexType, NumDims>& target_strides,
-      Scalar* target_data, IndexType target_offset = 0) {
-    // DSizes constructor will do index type promotion if it's safe.
-    return Target(Dimensions(target_dims), Dimensions(target_strides),
-                  target_data, target_offset);
-  }
-
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Target& target, const TensorBlockExpr& expr) {
-    // Prepare evaluator for block expression.
-    DefaultDevice default_device;
-    TensorBlockEvaluator eval(expr, default_device);
-
-    // Tensor block expression dimension should match destination dimensions.
-    eigen_assert(dimensions_match(target.dims, eval.dimensions()));
-
-    static const int Layout = TensorBlockEvaluator::Layout;
-    static const bool is_col_major = Layout == ColMajor;
-
-    // Initialize output inner dimension size based on a layout.
-    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
-    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
-    IndexType output_inner_dim_size = target.dims[inner_dim_idx];
-
-    // Target inner dimension stride must be '1'.
-    eigen_assert(target.strides[inner_dim_idx] == 1);
-
-    // Squeeze multiple inner dims into one if they are contiguous in `target`.
-    IndexType num_squeezed_dims = 0;
-    for (Index i = 1; i < NumDims; ++i) {
-      const Index dim = is_col_major ? i : NumDims - i - 1;
-      const IndexType target_stride = target.strides[dim];
-
-      if (output_inner_dim_size == target_stride) {
-        output_inner_dim_size *= target.dims[dim];
-        num_squeezed_dims++;
-      } else {
-        break;
-      }
-    }
-
-    // Initialize output block iterator state. Dimension in this array are
-    // always in inner_most -> outer_most order (col major layout).
-    array<BlockIteratorState, NumDims> it;
-
-    int idx = 0;  // currently initialized iterator state index
-    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
-      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
-
-      it[idx].count = 0;
-      it[idx].size = target.dims[dim];
-      it[idx].output_stride = target.strides[dim];
-      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
-      idx++;
-    }
-
-    // We read block expression from the beginning, and start writing data to
-    // `target` at given offset.
-    IndexType input_offset = 0;
-    IndexType output_offset = target.offset;
-
-    // Iterate copying data from `eval` to `target`.
-    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
-      // Assign to `target` at current offset.
-      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
-                     TensorBlockEvaluator>::Run(target.data + output_offset,
-                                                output_inner_dim_size, eval,
-                                                input_offset);
-
-      // Move input offset forward by the number of assigned coefficients.
-      input_offset += output_inner_dim_size;
-
-      // Update index.
-      for (int j = 0; j < idx; ++j) {
-        if (++it[j].count < it[j].size) {
-          output_offset += it[j].output_stride;
-          break;
-        }
-        it[j].count = 0;
-        output_offset -= it[j].output_span;
-      }
-    }
-  }
-
- private:
-  struct BlockIteratorState {
-    BlockIteratorState()
-        : count(0), size(0), output_stride(0), output_span(0) {}
-
-    IndexType count;
-    IndexType size;
-    IndexType output_stride;
-    IndexType output_span;
-  };
-};
-
-// -------------------------------------------------------------------------- //
-
-}  // namespace internal
-}  // namespace Eigen
-
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
index 3408f90d1..4cfe300eb 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -31,13 +31,12 @@ struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Broadcast, typename XprType>
 struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
 {
-  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
+  typedef const TensorBroadcastingOp<Broadcast, XprType>& type;
 };
 
 template<typename Broadcast, typename XprType>
@@ -55,7 +54,7 @@ struct is_input_scalar<Sizes<> > {
   static const bool value = true;
 };
 #ifndef EIGEN_EMULATE_CXX11_META_H
-template <typename std::ptrdiff_t... Indices>
+template <typename std::size_t... Indices>
 struct is_input_scalar<Sizes<Indices...> > {
   static const bool value = (Sizes<Indices...>::total_size == 1);
 };
@@ -104,58 +103,27 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  protected: //  all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout;
-  bool isCopy, nByOne, oneByN;
-  public:
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned         = true,
-    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
-    PreferBlockAccess = true,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess         = false
+    IsAligned = true,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
   };
 
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
-  // We do block based broadcasting using a trick with 2x tensor rank and 0
-  // strides. See block method implementation for details.
-  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
- typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
-      ArgTensorBlock;
-
-  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
-                                                     Layout, Index>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
-      : isCopy(false), nByOne(false), oneByN(false),
-        m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : m_broadcast(op.broadcast()),m_impl(op.expression(), device)
   {
-
     // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
     // and store the result in a scalar. Instead one should reshape the scalar into a a N-D
     // tensor with N >= 1 of 1 element first and then broadcast.
     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
     const InputDimensions& input_dims = m_impl.dimensions();
-    isCopy = true;
+    const Broadcast& broadcast = op.broadcast();
     for (int i = 0; i < NumDims; ++i) {
       eigen_assert(input_dims[i] > 0);
-      m_dimensions[i] = input_dims[i] * m_broadcast[i];
-      if (m_broadcast[i] != 1) {
-        isCopy = false;
-      }
+      m_dimensions[i] = input_dims[i] * broadcast[i];
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -173,57 +141,15 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
       }
     }
-
-    if (input_dims[0] == 1) {
-      oneByN = true;
-      for (int i = 1; i < NumDims; ++i) {
-        if (m_broadcast[i] != 1) {
-          oneByN = false;
-          break;
-        }
-      }
-    } else if (input_dims[NumDims-1] == 1) {
-      nByOne = true;
-      for (int i = 0; i < NumDims-1; ++i) {
-        if (m_broadcast[i] != 1) {
-          nByOne = false;
-          break;
-        }
-      }
-    }
-
-    // Handle special format like NCHW, its input shape is '[1, N..., 1]' and
-    // broadcast shape is '[N, 1..., N]'
-    if (!oneByN && !nByOne) {
-      if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
-        nByOne = true;
-        oneByN = true;
-        for (int i = 1; i < NumDims-1; ++i) {
-          if (m_broadcast[i] != 1) {
-            nByOne = false;
-            oneByN = false;
-            break;
-          }
-        }
-      }
-    }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType, EvalSubExprsCallback done) {
-    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
-  }
-#endif  // EIGEN_USE_THREADS
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
@@ -235,24 +161,16 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      if (isCopy) {
-        return m_impl.coeff(index);
-      } else {
-        return coeffColMajor(index);
-      }
+      return coeffColMajor(index);
     } else {
-      if (isCopy) {
-        return m_impl.coeff(index);
-      } else {
-        return coeffRowMajor(index);
-      }
+      return coeffRowMajor(index);
     }
   }
 
   // TODO: attempt to speed this up. The integer divisions and modulo are slow
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
+  {
     Index inputIndex = 0;
-    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -277,17 +195,12 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
         inputIndex += (index % m_impl.dimensions()[0]);
       }
     }
-    return inputIndex;
+    return m_impl.coeff(inputIndex);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
   {
-    return m_impl.coeff(indexColMajor(index));
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
     Index inputIndex = 0;
-    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -302,22 +215,17 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
       }
       index -= idx * m_outputStrides[i];
     }
-    if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
-      eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
+    if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims-1]);
       inputIndex += index;
     } else {
-      if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
-        eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
+      if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
       } else {
-        inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
+        inputIndex += (index % m_impl.dimensions()[NumDims-1]);
       }
     }
-    return inputIndex;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
-  {
-    return m_impl.coeff(indexRowMajor(index));
+    return m_impl.coeff(inputIndex);
   }
 
   template<int LoadMode>
@@ -328,148 +236,9 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     }
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      if (isCopy) {
-        #ifdef EIGEN_GPU_COMPILE_PHASE
-        // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
-        // unaligned loads here. The reason is unclear though.
-        return m_impl.template packet<Unaligned>(index);
-        #else
-        return m_impl.template packet<LoadMode>(index);
-        #endif
-      } else if (oneByN && !nByOne) {
-        return packetNByOne<LoadMode>(index);
-      } else if (!oneByN && nByOne) {
-        return packetOneByN<LoadMode>(index);
-      } else if (oneByN && nByOne) {
-        return packetOneByNByOne<LoadMode>(index);
-      } else {
-        return packetColMajor<LoadMode>(index);
-      }
+      return packetColMajor<LoadMode>(index);
     } else {
-      if (isCopy) {
-        #ifdef EIGEN_GPU_COMPILE_PHASE
-        // See above.
-        return m_impl.template packet<Unaligned>(index);
-        #else
-        return m_impl.template packet<LoadMode>(index);
-        #endif
-      } else if (oneByN && !nByOne) {
-        return packetOneByN<LoadMode>(index);
-      } else if (!oneByN && nByOne) {
-        return packetNByOne<LoadMode>(index);
-      } else if (oneByN && nByOne) {
-        return packetOneByNByOne<LoadMode>(index);
-      } else {
-        return packetRowMajor<LoadMode>(index);
-      }
-    }
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
-  (Index index) const
-  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
-    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    Index startDim, endDim;
-    Index inputIndex, outputOffset, batchedIndex;
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      startDim = NumDims - 1;
-      endDim = 1;
-    } else {
-      startDim = 0;
-      endDim = NumDims - 2;
-    }
-
-    batchedIndex = index % m_outputStrides[startDim];
-    inputIndex   = batchedIndex / m_outputStrides[endDim];
-    outputOffset = batchedIndex % m_outputStrides[endDim];
-
-    if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
-      values[0] = m_impl.coeff(inputIndex);
-      return internal::pload1<PacketReturnType>(values);
-    } else {
-      EIGEN_UNROLL_LOOP
-      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
-        if (outputOffset + cur < m_outputStrides[endDim]) {
-          values[i] = m_impl.coeff(inputIndex);
-        } else {
-          ++inputIndex;
-          inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
-          values[i] = m_impl.coeff(inputIndex);
-          outputOffset = 0;
-          cur = 0;
-        }
-      }
-      return internal::pload<PacketReturnType>(values);
-    }
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
-  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
-    Index dim, inputIndex;
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      dim = NumDims - 1;
-    } else {
-      dim = 0;
-    }
-
-    inputIndex = index % m_inputStrides[dim];
-    if (inputIndex + PacketSize <= m_inputStrides[dim]) {
-      return m_impl.template packet<Unaligned>(inputIndex);
-    } else {
-      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-      EIGEN_UNROLL_LOOP
-      for (int i = 0; i < PacketSize; ++i) {
-        if (inputIndex > m_inputStrides[dim]-1) {
-          inputIndex = 0;
-        }
-        values[i] = m_impl.coeff(inputIndex++);
-      }
-      return internal::pload<PacketReturnType>(values);
-    }
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
-  {
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
-
-    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    Index dim, inputIndex, outputOffset;
-
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      dim = 1;
-    } else {
-      dim = NumDims - 2;
-    }
-
-    inputIndex   = index / m_outputStrides[dim];
-    outputOffset = index % m_outputStrides[dim];
-    if (outputOffset + PacketSize <= m_outputStrides[dim]) {
-      values[0] = m_impl.coeff(inputIndex);
-      return internal::pload1<PacketReturnType>(values);
-    } else {
-      EIGEN_UNROLL_LOOP
-      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
-        if (outputOffset + cur < m_outputStrides[dim]) {
-          values[i] = m_impl.coeff(inputIndex);
-        } else {
-          values[i] = m_impl.coeff(++inputIndex);
-          outputOffset = 0;
-          cur = 0;
-        }
-      }
-      return internal::pload<PacketReturnType>(values);
+      return packetRowMajor<LoadMode>(index);
     }
   }
 
@@ -484,7 +253,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     const Index originalIndex = index;
 
     Index inputIndex = 0;
-    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -520,13 +288,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     } else {
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
-      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize; ++i) {
-        if (innermostLoc + i < m_impl.dimensions()[0]) {
-          values[i] = m_impl.coeff(inputIndex+i);
-        } else {
-          values[i] = coeffColMajor(originalIndex+i);
-        }
+        values[i] = coeffColMajor(originalIndex+i);
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
@@ -542,7 +305,6 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     const Index originalIndex = index;
 
     Index inputIndex = 0;
-    EIGEN_UNROLL_LOOP
     for (int i = 0; i < NumDims - 1; ++i) {
       const Index idx = index / m_outputStrides[i];
       if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -578,13 +340,8 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
     } else {
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndex);
-      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize; ++i) {
-        if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
-          values[i] = m_impl.coeff(inputIndex+i);
-        } else {
-          values[i] = coeffRowMajor(originalIndex+i);
-        }
+        values[i] = coeffRowMajor(originalIndex+i);
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
@@ -594,8 +351,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
     double compute_cost = TensorOpCost::AddCost<Index>();
-    if (!isCopy && NumDims > 0) {
-      EIGEN_UNROLL_LOOP
+    if (NumDims > 0) {
       for (int i = NumDims - 1; i > 0; --i) {
         compute_cost += TensorOpCost::DivCost<Index>();
         if (internal::index_statically_eq<Broadcast>(i, 1)) {
@@ -616,472 +372,14 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
-    // tensors. But this might need further tuning.
-    const size_t target_size = m_device.firstLevelCacheSize();
-    return internal::TensorBlockResourceRequirements::merge(
-        m_impl.getResourceRequirements(),
-        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    BlockBroadcastingParams params = blockBroadcastingParams(desc);
-
-    if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
-      return emptyBlock();
-    }
-
-    // Prepare storage for the materialized broadcasting result.
-    const typename TensorBlock::Storage block_storage =
-        TensorBlock::prepareStorage(desc, scratch);
-    ScalarNoConst* materialized_output = block_storage.data();
-
-    // We potentially will need to materialize input blocks.
-    size_t materialized_input_size = 0;
-    ScalarNoConst* materialized_input = NULL;
-
-    // Initialize block broadcating iterator state for outer dimensions (outer
-    // with regard to bcast dimension). Dimension in this array are always in
-    // inner_most -> outer_most order (col major layout).
-    array<BlockBroadcastingIteratorState, NumDims> it;
-    int idx = 0;
-
-    for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
-      const Index dim = IsColMajor ? i : NumDims - 1 - i;
-      it[idx].size = params.output_dims[dim];
-      it[idx].count = 0;
-      it[idx].output_stride = m_outputStrides[dim];
-      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
-      idx++;
-    }
-
-    // Write output into the beginning of `materialized_output`.
-    Index output_offset = 0;
-
-    // We will fill output block by broadcasting along the bcast dim, and
-    // iterating over outer dimension.
-    const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
-
-    for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
-      ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
-      Index bcast_offset = desc.offset() + output_offset;
-
-      // Broadcast along the bcast dimension.
-      num_output_coeffs += BroadcastBlockAlongBcastDim(
-          params, bcast_offset, scratch, bcast_output, &materialized_input,
-          &materialized_input_size);
-
-      // Switch to the next outer dimension.
-      for (int j = 0; j < idx; ++j) {
-        if (++it[j].count < it[j].size) {
-          output_offset += it[j].output_stride;
-          break;
-        }
-        it[j].count = 0;
-        output_offset -= it[j].output_span;
-      }
-    }
-
-    return block_storage.AsTensorMaterializedBlock();
-  }
-
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
   Broadcast functor() const { return m_broadcast; }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
-      cl::sycl::handler& cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
- private:
-  static const bool IsColMajor =
-      static_cast<int>(Layout) == static_cast<int>(ColMajor);
 
-  // We will build a general case block broadcasting on top of broadcasting
-  // primitive that will do broadcasting only for the inner dimension(s) along
-  // the first dimension smaller than the input size (it's called `bcast_dim`).
-  //
-  // Example:
-  //           dim:  0  1  2   (ColMajor)
-  //    input size: [9, 3, 6]
-  //    block size: [9, 2, 6]
-  //
-  // We will compute broadcasted block by iterating over the outer dimensions
-  // before `bcast_dim` (only dimension `2` in this example) and computing
-  // broadcasts along the `bcast_dim` (dimension `1` in this example).
-
-  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
-  // single block along the broadcasting dimension. Sizes and strides along the
-  // `bcast_dim` might be invalid, they will be adjusted later in
-  // `BroadcastBlockAlongBcastDim`.
-  struct BlockBroadcastingParams {
-    Dimensions input_dims;      // input expression dimensions
-    Dimensions output_dims;     // output block sizes
-    Dimensions output_strides;  // output block strides
-
-    int inner_dim_count;   // count inner dimensions matching in size
-    int bcast_dim;         // broadcasting dimension index
-    Index bcast_dim_size;  // broadcasting dimension size
-    Index inner_dim_size;  // inner dimensions size
-
-    // Block sizes and strides for the input block where all dimensions before
-    // `bcast_dim` are equal to `1`.
-    Dimensions input_block_sizes;
-    Dimensions input_block_strides;
-
-    // Block sizes and strides for blocks with extra dimensions and strides `0`.
-    BroadcastDimensions bcast_block_sizes;
-    BroadcastDimensions bcast_block_strides;
-    BroadcastDimensions bcast_input_strides;
-  };
-
-  struct BlockBroadcastingIteratorState {
-    Index size;
-    Index count;
-    Index output_stride;
-    Index output_span;
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
-  blockBroadcastingParams(TensorBlockDesc& desc) const {
-    BlockBroadcastingParams params;
-
-    params.input_dims = Dimensions(m_impl.dimensions());
-
-    // Output block sizes and strides.
-    params.output_dims = desc.dimensions();
-    params.output_strides = internal::strides<Layout>(params.output_dims);
-
-    // Find the broadcasting dimension (first dimension with output size smaller
-    // that the input size).
-    params.bcast_dim = 0;
-    params.bcast_dim_size = 1;
-    params.inner_dim_size = 1;
-
-    // Count the number of inner dimensions that have the same size in the block
-    // and in the broadcast expression.
-    params.inner_dim_count = 0;
-
-    for (int i = 0; i < NumDims; ++i) {
-      const int dim = IsColMajor ? i : NumDims - i - 1;
-
-      if (params.output_dims[dim] == m_dimensions[dim]) {
-        params.inner_dim_size *= params.output_dims[dim];
-        ++params.inner_dim_count;
-        continue;
-      }
-
-      // First non-matching dimension is the broadcasting dimension.
-      eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
-      params.bcast_dim = dim;
-      params.bcast_dim_size = params.output_dims[dim];
-      break;
-    }
-
-    // Calculate the input block size for looking into the input.
-    for (int i = 0; i < params.inner_dim_count; ++i) {
-      const int dim = IsColMajor ? i : NumDims - i - 1;
-      params.input_block_sizes[dim] = params.input_dims[dim];
-    }
-    for (int i = params.inner_dim_count; i < NumDims; ++i) {
-      const int dim = IsColMajor ? i : NumDims - i - 1;
-      params.input_block_sizes[dim] = 1;
-    }
-    params.input_block_strides =
-        internal::strides<Layout>(params.input_block_sizes);
-
-    // Broadcast with the 0-stride trick: Create 1 extra dim for each
-    // broadcast, set the input stride to 0.
-    //
-    // When ColMajor:
-    //
-    // - bcast_block_sizes:
-    //   [d_0, b_0, d_1, b_1, ...]
-    //
-    // - bcast_block_strides:
-    //   [output_block_strides[0], output_block_strides[0] * d_0,
-    //    output_block_strides[1], output_block_strides[1] * d_1,
-    //   ...]
-    //
-    // - bcast_input_strides:
-    //   [input_block_strides[0], 0,
-    //    input_block_strides[1], 0,
-    //   ...].
-    //
-    for (int i = 0; i < params.inner_dim_count; ++i) {
-      const int dim = IsColMajor ? i : NumDims - i - 1;
-
-      const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
-      const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
-
-      params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
-      params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
-      params.bcast_block_strides[copy_dim] = params.output_strides[dim];
-      params.bcast_block_strides[broadcast_dim] =
-          params.output_strides[dim] * params.input_dims[dim];
-      params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
-      params.bcast_input_strides[broadcast_dim] = 0;
-    }
-
-    for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
-      const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
-      params.bcast_block_sizes[dim] = 1;
-      params.bcast_block_strides[dim] = 0;
-      params.bcast_input_strides[dim] = 0;
-    }
-
-    return params;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
-    DSizes<Index, NumDims> dimensions;
-    for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
-    return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
-      BlockBroadcastingParams params, Index bcast_offset,
-      TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
-      ScalarNoConst** materialized_input,
-      size_t* materialized_input_size) const {
-    if (params.bcast_dim_size == 1) {
-      // We just need one block read using the ready-set values above.
-      return BroadcastBlock(
-          params.input_block_sizes, params.input_block_strides,
-          params.bcast_block_sizes, params.bcast_block_strides,
-          params.bcast_input_strides, bcast_offset, 0, scratch,
-          materialized_output, materialized_input, materialized_input_size);
-
-    } else if (params.input_dims[params.bcast_dim] == 1) {
-      // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
-      const int broadcast_bcast_dim =
-          IsColMajor ? 2 * params.inner_dim_count + 1
-                     : 2 * NumDims - 2 * params.inner_dim_count - 2;
-
-      params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
-      params.bcast_input_strides[broadcast_bcast_dim] = 0;
-      params.bcast_block_strides[broadcast_bcast_dim] =
-          params.output_strides[params.bcast_dim];
-
-      return BroadcastBlock(
-          params.input_block_sizes, params.input_block_strides,
-          params.bcast_block_sizes, params.bcast_block_strides,
-          params.bcast_input_strides, bcast_offset, 0, scratch,
-          materialized_output, materialized_input, materialized_input_size);
-
-    } else {
-      // Keep track of the total number of the coefficients written to the
-      // output block.
-      Index num_output_coeffs = 0;
-
-      // The general case. Let's denote the output block as
-      //
-      //   x[..., a:a+bcast_dim_size, :, ..., :]
-      //
-      // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
-      // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
-      // sub-blocks:
-      //
-      // (1) a:b, where b is the smallest multiple of
-      //     input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
-      //
-      // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
-      //     in [a, a+bcast_dim_size].
-      //
-      // (3) c:a+bcast_dim_size .
-      //
-      // Or, when b and c do not exist, we just need to process the whole block
-      // together.
-
-      // Find a.
-      const Index bcast_dim_left_index =
-          bcast_offset / m_outputStrides[params.bcast_dim];
-
-      // Find b and c.
-      const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
-
-      // First multiple after a. This is b when <= bcast_dim_left_index +
-      // bcast_dim_size.
-      const Index first_multiple =
-          divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
-          input_bcast_dim_size;
-
-      if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
-        // b exists, so does c. Find it.
-        const Index last_multiple =
-            (bcast_dim_left_index + params.bcast_dim_size) /
-            input_bcast_dim_size * input_bcast_dim_size;
-        const int copy_bcast_dim =
-            IsColMajor ? 2 * params.inner_dim_count
-                       : 2 * NumDims - 2 * params.inner_dim_count - 1;
-        const int broadcast_bcast_dim =
-            IsColMajor ? 2 * params.inner_dim_count + 1
-                       : 2 * NumDims - 2 * params.inner_dim_count - 2;
-
-        if (first_multiple > bcast_dim_left_index) {
-          const Index head_size = first_multiple - bcast_dim_left_index;
-          params.input_block_sizes[params.bcast_dim] = head_size;
-          params.bcast_block_sizes[copy_bcast_dim] = head_size;
-          params.bcast_input_strides[copy_bcast_dim] =
-              params.input_block_strides[params.bcast_dim];
-          params.bcast_block_strides[copy_bcast_dim] =
-              params.output_strides[params.bcast_dim];
-          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
-          params.bcast_input_strides[broadcast_bcast_dim] = 0;
-          params.bcast_block_strides[broadcast_bcast_dim] =
-              params.output_strides[params.bcast_dim] *
-              params.input_dims[params.bcast_dim];
-
-          num_output_coeffs += BroadcastBlock(
-              params.input_block_sizes, params.input_block_strides,
-              params.bcast_block_sizes, params.bcast_block_strides,
-              params.bcast_input_strides, bcast_offset, 0, scratch,
-              materialized_output, materialized_input, materialized_input_size);
-        }
-        if (first_multiple < last_multiple) {
-          params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
-          params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
-          params.bcast_input_strides[copy_bcast_dim] =
-              params.input_block_strides[params.bcast_dim];
-          params.bcast_block_strides[copy_bcast_dim] =
-              params.output_strides[params.bcast_dim];
-          params.bcast_block_sizes[broadcast_bcast_dim] =
-              (last_multiple - first_multiple) / input_bcast_dim_size;
-          params.bcast_input_strides[broadcast_bcast_dim] = 0;
-          params.bcast_block_strides[broadcast_bcast_dim] =
-              params.output_strides[params.bcast_dim] *
-              params.input_dims[params.bcast_dim];
-          const Index offset = (first_multiple - bcast_dim_left_index) *
-                               m_outputStrides[params.bcast_dim];
-
-          num_output_coeffs += BroadcastBlock(
-              params.input_block_sizes, params.input_block_strides,
-              params.bcast_block_sizes, params.bcast_block_strides,
-              params.bcast_input_strides, bcast_offset, offset, scratch,
-              materialized_output, materialized_input, materialized_input_size);
-        }
-        if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
-          const Index tail_size =
-              bcast_dim_left_index + params.bcast_dim_size - last_multiple;
-          params.input_block_sizes[params.bcast_dim] = tail_size;
-          params.bcast_block_sizes[copy_bcast_dim] = tail_size;
-          params.bcast_input_strides[copy_bcast_dim] =
-              params.input_block_strides[params.bcast_dim];
-          params.bcast_block_strides[copy_bcast_dim] =
-              params.output_strides[params.bcast_dim];
-          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
-          params.bcast_input_strides[broadcast_bcast_dim] = 0;
-          params.bcast_block_strides[broadcast_bcast_dim] =
-              params.output_strides[params.bcast_dim] *
-              params.input_dims[params.bcast_dim];
-          const Index offset = (last_multiple - bcast_dim_left_index) *
-                               m_outputStrides[params.bcast_dim];
-
-          num_output_coeffs += BroadcastBlock(
-              params.input_block_sizes, params.input_block_strides,
-              params.bcast_block_sizes, params.bcast_block_strides,
-              params.bcast_input_strides, bcast_offset, offset, scratch,
-              materialized_output, materialized_input, materialized_input_size);
-        }
-      } else {
-        // b and c do not exist.
-        const int copy_bcast_dim =
-            IsColMajor ? 2 * params.inner_dim_count
-                       : 2 * NumDims - 2 * params.inner_dim_count - 1;
-        params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
-        params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
-        params.bcast_input_strides[copy_bcast_dim] =
-            params.input_block_strides[params.bcast_dim];
-        params.bcast_block_strides[copy_bcast_dim] =
-            params.output_strides[params.bcast_dim];
-
-        num_output_coeffs += BroadcastBlock(
-            params.input_block_sizes, params.input_block_strides,
-            params.bcast_block_sizes, params.bcast_block_strides,
-            params.bcast_input_strides, bcast_offset, 0, scratch,
-            materialized_output, materialized_input, materialized_input_size);
-      }
-
-      return num_output_coeffs;
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
-      const Dimensions& input_block_sizes,
-      const Dimensions& input_block_strides,
-      const BroadcastDimensions& bcast_block_sizes,
-      const BroadcastDimensions& bcast_block_strides,
-      const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
-      Index offset, TensorBlockScratch& scratch,
-      ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
-      size_t* materialized_input_size) const {
-    // ---------------------------------------------------------------------- //
-    // Tensor block descriptor for reading block from the input.
-    const Index input_offset = bcast_offset + offset;
-    TensorBlockDesc input_desc(
-        IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
-        input_block_sizes);
-
-    ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
-
-    // ---------------------------------------------------------------------- //
-    // Materialize input block into a temporary memory buffer only if it's not
-    // already available in the arg block.
-    const ScalarNoConst* input_buffer = NULL;
-
-    if (input_block.data() != NULL) {
-      // Input block already has raw data, there is no need to materialize it.
-      input_buffer = input_block.data();
-
-    } else {
-      // Otherwise we have to do block assignment into a temporary buffer.
-
-      // Maybe reuse previously allocated buffer, or allocate a new one with a
-      // scratch allocator.
-      const size_t input_total_size = input_block_sizes.TotalSize();
-      if (*materialized_input == NULL ||
-          *materialized_input_size < input_total_size) {
-        *materialized_input_size = input_total_size;
-        void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
-        *materialized_input = static_cast<ScalarNoConst*>(mem);
-      }
-
-      typedef internal::TensorBlockAssignment<
-          ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
-          TensorBlockAssignment;
-
-      TensorBlockAssignment::Run(
-          TensorBlockAssignment::target(input_block_sizes, input_block_strides,
-                                        *materialized_input),
-          input_block.expr());
-
-      input_buffer = *materialized_input;
-    }
-
-    // ---------------------------------------------------------------------- //
-    // Copy data from materialized input block to the materialized output, using
-    // given broadcast strides (strides with zeroes).
-    typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
-        TensorBlockIO;
-
-    typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
-    typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
-                                      materialized_output + offset);
-
-    return TensorBlockIO::Copy(dst, src);
-  }
-
-protected:
-  const Device EIGEN_DEVICE_REF m_device;
-  const typename internal::remove_reference<Broadcast>::type m_broadcast;
+ protected:
+  const Broadcast m_broadcast;
   Dimensions m_dimensions;
   array<Index, NumDims> m_outputStrides;
   array<Index, NumDims> m_inputStrides;
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
index 5b28e706d..1ba7ef170 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -32,13 +32,12 @@ struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions - 1;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<DenseIndex DimId, typename XprType>
 struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense>
 {
-  typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type;
+  typedef const TensorChippingOp<DimId, XprType>& type;
 };
 
 template<DenseIndex DimId, typename XprType>
@@ -51,7 +50,6 @@ template <DenseIndex DimId>
 struct DimensionId
 {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
-    EIGEN_UNUSED_VARIABLE(dim);
     eigen_assert(dim == DimId);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const {
@@ -138,48 +136,19 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
 
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
     // slice offsets.
-    IsAligned         = false,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
-    // Chipping of outer-most dimension is a trivial operation, because we can
-    // read and write directly from the underlying tensor using single offset.
-    IsOuterChipping   = (static_cast<int>(Layout) == ColMajor && DimId == NumInputDims - 1) ||
-                        (static_cast<int>(Layout) == RowMajor && DimId == 0),
-    // Chipping inner-most dimension.
-    IsInnerChipping   = (static_cast<int>(Layout) == ColMajor && DimId == 0) ||
-                        (static_cast<int>(Layout) == RowMajor && DimId == NumInputDims - 1),
-    // Prefer block access if the underlying expression prefers it, otherwise
-    // only if chipping is not trivial.
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess ||
-                        !IsOuterChipping,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = false
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
   };
 
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef internal::TensorBlockDescriptor<NumInputDims, Index>
-      ArgTensorBlockDesc;
-  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
-      ArgTensorBlock;
-
-  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
-                                                     Layout, Index>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device)
   {
@@ -216,7 +185,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -236,20 +205,21 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
-    if (isInnerChipping()) {
+    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
+	(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(m_stride == 1);
       Index inputIndex = index * m_inputStride + m_inputOffset;
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         values[i] = m_impl.coeff(inputIndex);
         inputIndex += m_inputStride;
       }
       PacketReturnType rslt = internal::pload<PacketReturnType>(values);
       return rslt;
-    } else if (isOuterChipping()) {
-      // m_stride is always greater than index, so let's avoid the integer division.
+    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
+	       (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+      // m_stride is always greater than index, so let's avoid the integer division.
       eigen_assert(m_stride > index);
       return m_impl.template packet<LoadMode>(index + m_inputOffset);
     } else {
@@ -261,7 +231,6 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
       } else {
         // Cross the stride boundary. Fallback to slow path.
         EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-       EIGEN_UNROLL_LOOP
         for (int i = 0; i < PacketSize; ++i) {
           values[i] = coeff(index);
           ++index;
@@ -294,100 +263,29 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
            TensorOpCost(0, 0, cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    const size_t target_size = m_device.lastLevelCacheSize();
-    return internal::TensorBlockResourceRequirements::merge(
-        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
-        m_impl.getResourceRequirements());
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool root_of_expr_ast = false) const {
-    const Index chip_dim = m_dim.actualDim();
-
-    DSizes<Index, NumInputDims> input_block_dims;
-    for (int i = 0; i < NumInputDims; ++i) {
-      input_block_dims[i]
-            = i < chip_dim ? desc.dimension(i)
-            : i > chip_dim ? desc.dimension(i - 1)
-            : 1;
-    }
-
-    ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);
-
-    // Try to reuse destination buffer for materializing argument block.
-    if (desc.HasDestinationBuffer()) {
-      DSizes<Index, NumInputDims> arg_destination_strides;
-      for (int i = 0; i < NumInputDims; ++i) {
-      arg_destination_strides[i]
-            = i < chip_dim ? desc.destination().strides()[i]
-            : i > chip_dim ? desc.destination().strides()[i - 1]
-            : 0; // for dimensions of size `1` stride should never be used.
-      }
-
-      arg_desc.template AddDestinationBuffer<Layout>(
-          desc.destination().template data<ScalarNoConst>(),
-          arg_destination_strides);
-    }
-
-    ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast);
-    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
-
-    if (arg_block.data() != NULL) {
-      // Forward argument block buffer if possible.
-      return TensorBlock(arg_block.kind(), arg_block.data(),
-                           desc.dimensions());
-
-    } else {
-      // Assign argument block expression to a buffer.
-
-      // Prepare storage for the materialized chipping result.
-      const typename TensorBlock::Storage block_storage =
-          TensorBlock::prepareStorage(desc, scratch);
-
-      typedef internal::TensorBlockAssignment<
-          ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
-          TensorBlockAssignment;
-
-      TensorBlockAssignment::Run(
-          TensorBlockAssignment::target(
-              arg_desc.dimensions(),
-              internal::strides<Layout>(arg_desc.dimensions()),
-              block_storage.data()),
-          arg_block.expr());
-
-      return block_storage.AsTensorMaterializedBlock();
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
-    typename Storage::Type result = constCast(m_impl.data());
-    if (isOuterChipping() && result) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const {
+    CoeffReturnType* result = const_cast<CoeffReturnType*>(m_impl.data());
+    if (((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumDims) ||
+         (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) &&
+        result) {
       return result + m_inputOffset;
     } else {
       return NULL;
     }
   }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex;
-    if (isInnerChipping()) {
+    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
+	(static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims-1)) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(m_stride == 1);
       inputIndex = index * m_inputStride + m_inputOffset;
-    } else if (isOuterChipping()) {
-      // m_stride is always greater than index, so let's avoid the integer
-      // division.
+    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims-1) ||
+	       (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+      // m_stride is always greater than index, so let's avoid the integer division.
       eigen_assert(m_stride > index);
       inputIndex = index + m_inputOffset;
     } else {
@@ -399,25 +297,13 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device>
     return inputIndex;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const {
-    return IsInnerChipping ||
-           (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == 0) ||
-           (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const {
-    return IsOuterChipping ||
-           (static_cast<int>(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) ||
-           (static_cast<int>(Layout) == RowMajor && m_dim.actualDim() == 0);
-  }
-
   Dimensions m_dimensions;
   Index m_stride;
   Index m_inputOffset;
   Index m_inputStride;
   TensorEvaluator<ArgType, Device> m_impl;
   const internal::DimensionId<DimId> m_dim;
-  const Device EIGEN_DEVICE_REF m_device;
+  const Device& m_device;
 };
 
 
@@ -435,20 +321,14 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned     = false,
-    PacketAccess  = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess   = TensorEvaluator<ArgType, Device>::RawAccess,
-    Layout        = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess     = false
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
@@ -463,19 +343,20 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
   {
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
 
-    if (this->isInnerChipping()) {
+    if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) ||
+	(static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) {
       // m_stride is equal to 1, so let's avoid the integer division.
       eigen_assert(this->m_stride == 1);
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < PacketSize; ++i) {
         this->m_impl.coeffRef(inputIndex) = values[i];
         inputIndex += this->m_inputStride;
       }
-    } else if (this->isOuterChipping()) {
-      // m_stride is always greater than index, so let's avoid the integer division.
+    } else if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) ||
+	       (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == 0)) {
+      // m_stride is always greater than index, so let's avoid the integer division.
       eigen_assert(this->m_stride > index);
       this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
     } else {
@@ -488,7 +369,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
         // Cross stride boundary. Fallback to slow path.
         EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
         internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
-        EIGEN_UNROLL_LOOP
         for (int i = 0; i < PacketSize; ++i) {
           this->coeffRef(index) = values[i];
           ++index;
@@ -496,36 +376,6 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
       }
     }
   }
-
-  template <typename TensorBlock>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
-      const TensorBlockDesc& desc, const TensorBlock& block) {
-    assert(this->m_impl.data() != NULL);
-
-    const Index chip_dim = this->m_dim.actualDim();
-
-    DSizes<Index, NumInputDims> input_block_dims;
-    for (int i = 0; i < NumInputDims; ++i) {
-      input_block_dims[i] = i < chip_dim ? desc.dimension(i)
-                          : i > chip_dim ? desc.dimension(i - 1)
-                          : 1;
-    }
-
-    typedef TensorReshapingOp<const DSizes<Index, NumInputDims>,
-                              const typename TensorBlock::XprType>
-        TensorBlockExpr;
-
-    typedef internal::TensorBlockAssignment<Scalar, NumInputDims,
-                                            TensorBlockExpr, Index>
-        TensorBlockAssign;
-
-    TensorBlockAssign::Run(
-        TensorBlockAssign::target(
-            input_block_dims,
-            internal::strides<Layout>(this->m_impl.dimensions()),
-            this->m_impl.data(), this->srcCoeff(desc.offset())),
-        block.expr().reshape(input_block_dims));
-  }
 };
 
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
index 5968ff4b7..59bf90d93 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -37,8 +37,6 @@ struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >
   static const int NumDimensions = traits<LhsXprType>::NumDimensions;
   static const int Layout = traits<LhsXprType>::Layout;
   enum { Flags = 0 };
-  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
-                               typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
 };
 
 template<typename Axis, typename LhsXprType, typename RhsXprType>
@@ -119,23 +117,13 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned         = false,
-    PacketAccess      = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
-                        TensorEvaluator<RightArgType, Device>::PacketAccess,
-    BlockAccess       = false,
-    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
-                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
-    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
-    RawAccess         = false
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis())
   {
@@ -189,7 +177,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
   // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/)
   {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
@@ -227,13 +215,11 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
       Index left_index;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         left_index = subs[0];
-        EIGEN_UNROLL_LOOP
         for (int i = 1; i < NumDims; ++i) {
           left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
         }
       } else {
         left_index = subs[NumDims - 1];
-        EIGEN_UNROLL_LOOP
         for (int i = NumDims - 2; i >= 0; --i) {
           left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
         }
@@ -245,13 +231,11 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
       Index right_index;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         right_index = subs[0];
-        EIGEN_UNROLL_LOOP
         for (int i = 1; i < NumDims; ++i) {
           right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
         }
       } else {
         right_index = subs[NumDims - 1];
-        EIGEN_UNROLL_LOOP
         for (int i = NumDims - 2; i >= 0; --i) {
           right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
         }
@@ -264,12 +248,11 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
-    EIGEN_UNROLL_LOOP
     for (int i = 0; i < packetSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -292,15 +275,7 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy
            TensorOpCost(0, 0, compute_cost);
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-  #ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_leftImpl.bind(cgh);
-    m_rightImpl.bind(cgh);
-  }
-  #endif
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
   protected:
     Dimensions m_dimensions;
@@ -321,20 +296,12 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
   typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
   typedef typename Base::Dimensions Dimensions;
   enum {
-    IsAligned         = false,
-    PacketAccess      = TensorEvaluator<LeftArgType, Device>::PacketAccess &&
-                        TensorEvaluator<RightArgType, Device>::PacketAccess,
-    BlockAccess       = false,
-    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
-                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
-    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
-    RawAccess         = false
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device)
     : Base(op, device)
   {
@@ -377,7 +344,7 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 605d72c8d..20b29e5fd 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -21,8 +21,8 @@ namespace Eigen {
   */
 namespace internal {
 
-template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
-struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >
+template<typename Dimensions, typename LhsXprType, typename RhsXprType>
+struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
   typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
@@ -38,305 +38,53 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKern
   typedef typename remove_reference<RhsNested>::type _RhsNested;
 
   // From NumDims below.
-  static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
+  static const int NumDimensions = traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;  // lhs + rhs dims, consistent with the evaluator traits below
   static const int Layout = traits<LhsXprType>::Layout;
-  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
-                               typename traits<LhsXprType>::PointerType,
-                               typename traits<RhsXprType>::PointerType>::type
-      PointerType;
 
   enum {
     Flags = 0
   };
 };
 
-template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
-struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, Eigen::Dense>
+template<typename Dimensions, typename LhsXprType, typename RhsXprType>
+struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, Eigen::Dense>
 {
-  typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>& type;
+  typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType>& type;
 };
 
-template<typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
-struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> >::type>
+template<typename Dimensions, typename LhsXprType, typename RhsXprType>
+struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType>, 1, typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >::type>
 {
-  typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> type;
+  typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType> type;
 };
 
-template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename OutputKernelType_, typename Device_>
-struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_, OutputKernelType_>, Device_> > {
+template<typename Indices_, typename LeftArgType_, typename RightArgType_, typename Device_>
+struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_>, Device_> > {
   typedef Indices_ Indices;
   typedef LeftArgType_ LeftArgType;
   typedef RightArgType_ RightArgType;
-  typedef OutputKernelType_ OutputKernelType;
   typedef Device_ Device;
 
   // From NumDims below.
   static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
 };
 
-// Helper class to allocate and deallocate temporary memory for packed buffers.
-template <typename LhsScalar, typename RhsScalar>
-struct TensorContractionBlockMemAllocator {
-  typedef void* BlockMemHandle;
-
-  template <typename Device>
-  EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm,
-                                                   const Index bk,
-                                                   const Index bn,
-                                                   LhsScalar** lhs_block,
-                                                   RhsScalar** rhs_block) {
-    eigen_assert(lhs_block);
-    eigen_assert(rhs_block);
-    BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
-    char* block_mem = static_cast<char*>(d.allocate(sz.lhs_size + sz.rhs_size));
-    eigen_assert(block_mem);
-    *lhs_block = reinterpret_cast<LhsScalar*>(block_mem);
-    *rhs_block = reinterpret_cast<RhsScalar*>(block_mem + sz.lhs_size);
-    return block_mem;
-  }
-
-  template <typename Device>
-  EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices(
-      Device& d, const Index bm, const Index bk, const Index bn,
-      const Index num_lhs, const Index num_rhs, const Index num_slices,
-      std::vector<LhsScalar*>* lhs_blocks,
-      std::vector<RhsScalar*>* rhs_blocks) {
-    eigen_assert(num_slices > 0);
-    eigen_assert(num_lhs >= 0 && num_rhs >= 0);
-    eigen_assert(num_lhs == 0 || lhs_blocks);
-    eigen_assert(num_rhs == 0 || rhs_blocks);
-    BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
-    void* block_mem = d.allocate(
-        (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices);
-    eigen_assert(block_mem);
-    char* mem = static_cast<char*>(block_mem);
-
-    for (Index x = 0; x < num_slices; x++) {
-      if (num_lhs > 0) lhs_blocks[x].resize(num_lhs);
-      for (Index m = 0; m < num_lhs; m++) {
-        lhs_blocks[x][m] = reinterpret_cast<LhsScalar*>(mem);
-        mem += sz.lhs_size;
-      }
-      if (num_rhs > 0) rhs_blocks[x].resize(num_rhs);
-      for (Index n = 0; n < num_rhs; n++) {
-        rhs_blocks[x][n] = reinterpret_cast<RhsScalar*>(mem);
-        mem += sz.rhs_size;
-      }
-    }
-
-    return block_mem;
-  }
-
-  template <typename Device>
-  EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
-    d.deallocate(handle);
-  }
-
- private:
-  struct BlockSizes {
-    Index lhs_size;
-    Index rhs_size;
-  };
-  EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm,
-                                                              const Index bk,
-                                                              const Index bn) {
-    Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
-    BlockSizes sz;
-    sz.lhs_size = divup<Index>(bm * bk * sizeof(LhsScalar), align) * align;
-    sz.rhs_size = divup<Index>(bn * bk * sizeof(RhsScalar), align) * align;
-    return sz;
-  }
-};
-
-// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
-// ColMajor storage order. This property is guaranteed by the
-// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
-// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
-// multiplication for these blocks. Default tensor contraction uses
-// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
-// GeneralBlocPanelKernel.h for details).
-//
-// By specializing contraction kernels we can use other low level libraries to
-// perform matrix multiplication, and still rely on Eigen contraction evaluator.
-// This also includes full support in TensorContractionThreadPool, assuming that
-// underlying gemm do not use it's own threading.
-//
-// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
-//   multiplication, lhs tensor and rhs tensor respectively.
-//
-// - StorageIndex - index type for the tensor expressions. In practice almost
-//   always is Eigen::Index.
-//
-// - OutputMapper provides access to the memory of the output matrix. In
-//   practice it's always column major blas_data_mapper (it must be of ResScalar
-//   type).
-//
-// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional
-//   view into the Lhs/Rhs tensor expressions. In practice it's
-//   TensorContractionInputMapper, or some specialization of it based on the
-//   type of tensor expression (e.g. TensorImagePatchOp has optimized input
-//   mapper).
-template <typename ResScalar, typename LhsScalar, typename RhsScalar,
-    typename StorageIndex, typename OutputMapper, typename LhsMapper,
-    typename RhsMapper>
-struct TensorContractionKernel {
-  // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C`
-  // (otherwise beta should be always equal to 1).
-  enum { HasBeta = false };
-
-  EIGEN_DEVICE_FUNC
-  TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_,
-                          StorageIndex bm_, StorageIndex bk_, StorageIndex bn_)
-      : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {}
-
-  // Pack blocks of Lhs and Rhs into contiguous blocks in memory.
-  typedef LhsScalar* LhsBlock;
-  typedef RhsScalar* RhsBlock;
-
-  // Packed Lhs/Rhs block memory allocator.
-  typedef TensorContractionBlockMemAllocator<LhsScalar, RhsScalar>
-      BlockMemAllocator;
-  typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle;
-
-  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
-
-  typedef internal::gemm_pack_lhs<
-      LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr,
-      Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
-      LhsPacker;
-
-  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex,
-                                  typename RhsMapper::SubMapper, Traits::nr,
-                                  ColMajor>
-      RhsPacker;
-
-  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex,
-                                OutputMapper, Traits::mr, Traits::nr,
-      /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
-      GebpKernel;
-
-  template <typename Device>
-  EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block,
-                                            RhsBlock* rhs_block) {
-    return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block);
-  }
-
-  template <typename Device>
-  EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices(
-      Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs,
-      const StorageIndex num_slices, std::vector<LhsBlock>* lhs_blocks,
-      std::vector<RhsBlock>* rhs_blocks) {
-    return BlockMemAllocator::allocateSlices(
-        d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks);
-  }
-
-  template <typename Device>
-  EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
-    BlockMemAllocator::deallocate(d, handle);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs(
-      LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
-      const StorageIndex depth, const StorageIndex rows) {
-    LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0,
-        /*offset*/ 0);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs(
-      RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
-      const StorageIndex depth, const StorageIndex cols) {
-    RhsPacker()(*rhsBlock, data_mapper, depth, cols);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke(
-      const OutputMapper& output_mapper, const LhsBlock& lhsBlock,
-      const RhsBlock& rhsBlock, const StorageIndex rows,
-      const StorageIndex depth, const StorageIndex cols,
-      const ResScalar alpha, const ResScalar beta) {
-    // Default GEBP kernel does not support beta.
-    eigen_assert(beta == ResScalar(1));
-    static const int kComputeStrideFromBlockDimensions = -1;
-    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
-        /*strideA*/ kComputeStrideFromBlockDimensions,
-        /*strideB*/ kComputeStrideFromBlockDimensions,
-        /*offsetA*/ 0, /*offsetB*/ 0);
-  }
-
- private:
-  // These are dimensions of the original Tensors, and selected block sizes. The
-  // actual block sizes passed to all function above might be smaller because of
-  // the partial blocks at the end.
-  const StorageIndex m;
-  const StorageIndex k;
-  const StorageIndex n;
-  const StorageIndex bm;
-  const StorageIndex bk;
-  const StorageIndex bn;
-};
-
 }  // end namespace internal
 
-// Tensor contraction params that should enable to get from output matrix
-// 2-dimensional coordinates to the output tensor dimensions.
-struct TensorContractionParams {
-  // TensorContraction evaluator assumes that both tensors are in ColMajor
-  // layout, if tensors are in RowMajor evaluator swap lhs with rhs.
-  bool swapped_arguments;
-};
-
-// Output kernel allows to fuse operations into the tensor contraction.
-//
-// Examples:
-//   1. Elementwise Relu transformation following Conv2D.
-//   2. AddBias to the Conv2D output channels dimension.
-//
-// The NoOpOutputKernel implements an output kernel that does absolutely nothing.
-struct NoOpOutputKernel {
-  /**
-   * Tensor contraction evaluator calls this kernel after finishing each block
-   * of output matrix. Output blocks belong to the 2-dimensional output tensor.
-   *
-   * TensorContractionParams contains contraction dimensions information
-   * required to map output 2-d space into the expected output tensor space
-   * (potentially higher dimensional).
-   *
-   * \param[in] output_mapper Access to output tensor memory
-   * \param[in] params   Tensor contraction parameters
-   * \param[in] i        Index of a first row available through output_mapper
-   * \param[in] j        Index of a first column available through output_mapper
-   * \param[in] num_rows Number of available rows
-   * \param[in] num_cols Number of available columns
-   */
-  template <typename Index, typename Scalar>
-  EIGEN_ALWAYS_INLINE void operator()(
-      const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
-      const TensorContractionParams& params, Index i,
-      Index j, Index num_rows, Index num_cols) const {
-    EIGEN_UNUSED_VARIABLE(output_mapper);
-    EIGEN_UNUSED_VARIABLE(params);
-    EIGEN_UNUSED_VARIABLE(i);
-    EIGEN_UNUSED_VARIABLE(j);
-    EIGEN_UNUSED_VARIABLE(num_rows);
-    EIGEN_UNUSED_VARIABLE(num_cols);
-  }
-};
-
-template<typename Indices, typename LhsXprType, typename RhsXprType, typename OutputKernelType = const NoOpOutputKernel>
-class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType, OutputKernelType>, ReadOnlyAccessors>
+template<typename Indices, typename LhsXprType, typename RhsXprType>
+class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
   typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
-                                         typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
+                                                   typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(
-      const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims,
-      const OutputKernelType& output_kernel = OutputKernelType())
-      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims),
-        m_output_kernel(output_kernel) {}
+      const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims)
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {}
 
   EIGEN_DEVICE_FUNC
   const Indices& indices() const { return m_indices; }
@@ -350,14 +98,10 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
   const typename internal::remove_all<typename RhsXprType::Nested>::type&
   rhsExpression() const { return m_rhs_xpr; }
 
-  EIGEN_DEVICE_FUNC
-  const OutputKernelType& outputKernel() const { return m_output_kernel; }
-
   protected:
     typename LhsXprType::Nested m_lhs_xpr;
     typename RhsXprType::Nested m_rhs_xpr;
     const Indices m_indices;
-    const OutputKernelType m_output_kernel;
 };
 
 
@@ -367,31 +111,22 @@ struct TensorContractionEvaluatorBase
   typedef typename internal::traits<Derived>::Indices Indices;
   typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
   typedef typename internal::traits<Derived>::RightArgType RightArgType;
-  typedef typename internal::traits<Derived>::OutputKernelType OutputKernelType;
   typedef typename internal::traits<Derived>::Device Device;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef StorageMemory<Scalar, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
-    IsAligned         = true,
-    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
-    BlockAccess       = false,
-    PreferBlockAccess = false,
-    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = true
+    IsAligned = true,
+    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = true
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   // Most of the code is assuming that both input tensors are ColMajor. If the
   // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
   // If we want to compute A * B = C, where A is LHS and B is RHS, the code
@@ -401,9 +136,6 @@ struct TensorContractionEvaluatorBase
   typedef typename internal::conditional<
     static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
 
-  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluatorType;
-  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluatorType;
-
   static const int LDims =
       internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
   static const int RDims =
@@ -419,15 +151,14 @@ struct TensorContractionEvaluatorBase
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   TensorContractionEvaluatorBase(const XprType& op, const Device& device)
-      : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+    : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
                           op.lhsExpression(), op.rhsExpression()), device),
-        m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
-                           op.rhsExpression(), op.lhsExpression()), device),
+    m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(),
+                          op.rhsExpression(), op.lhsExpression()), device),
         m_device(device),
-        m_output_kernel(op.outputKernel()),
         m_result(NULL) {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
-         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+			   static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
                         YOU_MADE_A_PROGRAMMING_MISTAKE);
 
 
@@ -502,7 +233,7 @@ struct TensorContractionEvaluatorBase
     // dimensions and right non-contracting dimensions.
     m_lhs_inner_dim_contiguous = true;
     int dim_idx = 0;
-    Index nocontract_idx = 0;
+    unsigned int nocontract_idx = 0;
 
     for (int i = 0; i < LDims; i++) {
       // find if we are contracting on index i of left tensor
@@ -592,140 +323,64 @@ struct TensorContractionEvaluatorBase
         numext::swap(m_dimensions[i], m_dimensions[j]);
       }
     }
-
-    // A set of parameters that will allow output kernel to get from output
-    // tensor dimensions (i, j) into the original tensor dimensions.
-    // TODO(ezhulenev): Add parameters required to infer output tensor index for
-    // more complex contractions than 2x2 on internal dimension.
-    m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
       evalTo(m_result);
       return true;
     }
   }
 
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType dest, EvalSubExprsCallback done) {
-    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
-      m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
-        if (dest) {
-          evalToAsync(dest, [done]() { done(false); });
-        } else {
-          m_result = static_cast<EvaluatorPointerType>(
-              m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
-          evalToAsync(m_result, [done]() { done(true); });
-        }
-      });
-    });
-  }
-#endif  // EIGEN_USE_THREADS
-
-#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
-  if (this->m_lhs_inner_dim_contiguous) {                    \
-    if (this->m_rhs_inner_dim_contiguous) {                  \
-      if (this->m_rhs_inner_dim_reordered) {                 \
-        METHOD<true, true, true, ALIGNMENT> ARGS;            \
-      } else {                                               \
-        METHOD<true, true, false, ALIGNMENT> ARGS;           \
-      }                                                      \
-    } else {                                                 \
-      if (this->m_rhs_inner_dim_reordered) {                 \
-        METHOD<true, false, true, ALIGNMENT> ARGS;           \
-      } else {                                               \
-        METHOD<true, false, false, ALIGNMENT> ARGS;          \
-      }                                                      \
-    }                                                        \
-  } else {                                                   \
-    if (this->m_rhs_inner_dim_contiguous) {                  \
-      if (this->m_rhs_inner_dim_reordered) {                 \
-        METHOD<false, true, true, ALIGNMENT> ARGS;           \
-      } else {                                               \
-        METHOD<false, true, false, ALIGNMENT> ARGS;          \
-      }                                                      \
-    } else {                                                 \
-      if (this->m_rhs_inner_dim_reordered) {                 \
-        METHOD<false, false, true, ALIGNMENT> ARGS;          \
-      } else {                                               \
-        METHOD<false, false, false, ALIGNMENT> ARGS;         \
-      }                                                      \
-    }                                                        \
-  }
-
-#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \
-  if (this->m_lhs_inner_dim_contiguous) {                                    \
-    if (this->m_rhs_inner_dim_contiguous) {                                  \
-      if (this->m_rhs_inner_dim_reordered) {                                 \
-        (new METHOD<DONE, true, true, true, ALIGNMENT> ARGS)->FN;            \
-      } else {                                                               \
-        (new METHOD<DONE, true, true, false, ALIGNMENT> ARGS)->FN;           \
-      }                                                                      \
-    } else {                                                                 \
-      if (this->m_rhs_inner_dim_reordered) {                                 \
-        (new METHOD<DONE, true, false, true, ALIGNMENT> ARGS)->FN;           \
-      } else {                                                               \
-        (new METHOD<DONE, true, false, false, ALIGNMENT> ARGS)->FN;          \
-      }                                                                      \
-    }                                                                        \
-  } else {                                                                   \
-    if (this->m_rhs_inner_dim_contiguous) {                                  \
-      if (this->m_rhs_inner_dim_reordered) {                                 \
-        (new METHOD<DONE, false, true, true, ALIGNMENT> ARGS)->FN;           \
-      } else {                                                               \
-        (new METHOD<DONE, false, true, false, ALIGNMENT> ARGS)->FN;          \
-      }                                                                      \
-    } else {                                                                 \
-      if (this->m_rhs_inner_dim_reordered) {                                 \
-        (new METHOD<DONE, false, false, true, ALIGNMENT> ARGS)->FN;          \
-      } else {                                                               \
-        (new METHOD<DONE, false, false, false, ALIGNMENT> ARGS)->FN;         \
-      }                                                                      \
-    }                                                                        \
-  }
-
   EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
-   static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
-  }
-
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalToCallback>
-  void evalToAsync(Scalar* buffer, EvalToCallback done) const {
-    static_cast<const Derived*>(this)
-        ->template evalProductAsync<EvalToCallback, Unaligned>(buffer,
-                                                               std::move(done));
-  }
-#endif  // EIGEN_USE_THREADS
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
-            bool rhs_inner_dim_reordered, int Alignment>
-  void evalProductSequential(Scalar* buffer) const {
-    if (this->m_j_size == 1) {
-      this->template evalGemv<lhs_inner_dim_contiguous,
-                              rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
-                              Alignment>(buffer);
-    } else {
-      this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
-                              rhs_inner_dim_reordered, Alignment>(buffer);
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          static_cast<const Derived*>(this)->template evalProduct<true, true, true, Unaligned>(buffer);
+        }
+        else {
+          static_cast<const Derived*>(this)->template evalProduct<true, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          static_cast<const Derived*>(this)->template evalProduct<true, false, true, Unaligned>(buffer);
+        }
+        else {
+          static_cast<const Derived*>(this)->template evalProduct<true, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+    else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          static_cast<const Derived*>(this)->template evalProduct<false, true, true, Unaligned>(buffer);
+        }
+        else {
+          static_cast<const Derived*>(this)->template evalProduct<false, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          static_cast<const Derived*>(this)->template evalProduct<false, false, true, Unaligned>(buffer);
+        }
+        else {
+          static_cast<const Derived*>(this)->template evalProduct<false, false, false, Unaligned>(buffer);
+        }
+      }
     }
   }
 
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  #if !defined(EIGEN_HIPCC)
-  EIGEN_DEVICE_FUNC
-  #endif
-  void evalGemv(Scalar* buffer) const {
+  EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const {
     const Index rows = m_i_size;
     const Index cols = m_k_size;
 
@@ -763,41 +418,12 @@ struct TensorContractionEvaluatorBase
     internal::general_matrix_vector_product<Index,LhsScalar,LhsMapper,ColMajor,false,RhsScalar,RhsMapper,false>::run(
         rows, cols, lhs, rhs,
         buffer, resIncr, alpha);
-
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-    m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params,
-                    static_cast<Index>(0), static_cast<Index>(0), rows,
-                    static_cast<Index>(1));
   }
 
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  #if !defined(EIGEN_HIPCC)
-  EIGEN_DEVICE_FUNC
-  #endif
-  void evalGemm(Scalar* buffer) const {
+  EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const {
     // columns in left side, rows in right side
     const Index k = this->m_k_size;
-    this->template evalGemmPartial<lhs_inner_dim_contiguous,
-                                   rhs_inner_dim_contiguous,
-                                   rhs_inner_dim_reordered,
-                                   Alignment, true>(buffer, 0, k, 1);
-  }
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
-      bool rhs_inner_dim_reordered, int Alignment>
-  EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(
-      Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
-    evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
-                    rhs_inner_dim_reordered, Alignment,
-        /*use_output_kernel*/ false>(buffer, k_start, k_end,
-                                     num_threads);
-  }
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment, bool use_output_kernel>
-  EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
-    eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size);
-    // columns in slice on left side, rows on right side
-    const Index k_slice = k_end - k_start;
 
     // rows in left side
     const Index m = this->m_i_size;
@@ -805,9 +431,16 @@ struct TensorContractionEvaluatorBase
     // columns in right side
     const Index n = this->m_j_size;
 
-    // define data mappers for Lhs and Rhs
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+    // define mr, nr, and all of my data mapper types
     typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
     typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+    typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+    const Index nr = Traits::nr;
+    const Index mr = Traits::mr;
 
     typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
     typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
@@ -829,9 +462,11 @@ struct TensorContractionEvaluatorBase
 
     typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
 
-    typedef internal::TensorContractionKernel<
-        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
-        TensorContractionKernel;
+    // Declare GEBP packing and kernel structs
+    internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, mr, Traits::LhsProgress, ColMajor> pack_lhs;
+    internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, nr, ColMajor> pack_rhs;
+
+    internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, mr, nr, false, false> gebp;
 
     // initialize data mappers
     LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
@@ -843,69 +478,39 @@ struct TensorContractionEvaluatorBase
     OutputMapper output(buffer, m);
 
     // Sizes of the blocks to load in cache. See the Goto paper for details.
-    internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar,
-                                        Index, internal::ShardByCol>
-        blocking(k_slice, m, n, num_threads);
+    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, 1);
     const Index kc = blocking.kc();
     const Index mc = numext::mini(m, blocking.mc());
     const Index nc = numext::mini(n, blocking.nc());
+    const Index sizeA = mc * kc;
+    const Index sizeB = kc * nc;
 
-    typedef typename TensorContractionKernel::LhsBlock LhsBlock;
-    typedef typename TensorContractionKernel::RhsBlock RhsBlock;
-
-    LhsBlock blockA;
-    RhsBlock blockB;
-
-    TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc);
-
-    typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
-    const BlockMemHandle packed_mem =
-        kernel.allocate(this->m_device, &blockA, &blockB);
-
-    // If a contraction kernel does not support beta, explicitly initialize
-    // output buffer with zeroes.
-    if (!TensorContractionKernel::HasBeta) {
-      this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-    }
+    LhsScalar* blockA = static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar)));
+    RhsScalar* blockB = static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar)));
 
     for(Index i2=0; i2<m; i2+=mc)
     {
       const Index actual_mc = numext::mini(i2+mc,m)-i2;
-      for (Index k2 = k_start; k2 < k_end; k2 += kc) {
+      for (Index k2 = 0; k2 < k; k2 += kc) {
         // make sure we don't overshoot right edge of left matrix, then pack vertical panel
-        const Index actual_kc = numext::mini(k2 + kc, k_end) - k2;
-        kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
-
-        // If kernel supports beta, there is no need to initialize output
-        // buffer with zeroes.
-        const Scalar alpha = Scalar(1);
-        const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start)
-                                ? Scalar(0)
-                                : Scalar(1);
+        const Index actual_kc = numext::mini(k2 + kc, k) - k2;
+        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc, 0, 0);
 
         // series of horizontal blocks
         for (Index j2 = 0; j2 < n; j2 += nc) {
           // make sure we don't overshoot right edge of right matrix, then pack block
           const Index actual_nc = numext::mini(j2 + nc, n) - j2;
-          kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc,
-                         actual_nc);
+          pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc, 0, 0);
 
           // call gebp (matrix kernel)
           // The parameters here are copied from Eigen's GEMM implementation
-          const OutputMapper output_mapper = output.getSubMapper(i2, j2);
-          kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc,
-                        actual_nc, alpha, beta);
-
-          // We are done with this [i2, j2] output block.
-          if (use_output_kernel && k2 + kc >= k_end) {
-            m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2,
-                            actual_mc, actual_nc);
-          }
+          gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0);
         }
       }
     }
 
-    kernel.deallocate(this->m_device, packed_mem);
+    this->m_device.deallocate(blockA);
+    this->m_device.deallocate(blockB);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
@@ -931,9 +536,9 @@ struct TensorContractionEvaluatorBase
     return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; }
 
-protected:
+  protected:
   // Prevent assignment
   TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&);
   Dimensions m_dimensions;
@@ -955,25 +560,22 @@ protected:
   Index m_j_size;
   Index m_k_size;
 
-  TensorContractionParams m_tensor_contraction_params;
-
   TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
   TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
-  const Device EIGEN_DEVICE_REF m_device;
-  OutputKernelType m_output_kernel;
-  EvaluatorPointerType m_result;
+  const Device& m_device;
+  Scalar* m_result;
 };
 
 
 // evaluator for default device
-template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType, typename Device>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> :
+template<typename Indices, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> :
     public TensorContractionEvaluatorBase<
-      TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> > {
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+      TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> > {
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
   typedef TensorContractionEvaluatorBase<Self> Base;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -1010,9 +612,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) { }
 
-  template <int Alignment>
-  void evalProduct(Scalar* buffer) const {
-    TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer));
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const {
+    if (this->m_j_size == 1) {
+      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+      return;
+    }
+
+    this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
   }
 };
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
index 974feb0ad..5cf7b4f71 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -21,28 +21,14 @@ enum {
 
 
 // Default Blocking Strategy
-template<typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, int ShardingType = ShardByCol>
+template <typename LhsMapper, typename RhsMapper, typename Index, int ShardingType=ShardByCol>
 class TensorContractionBlocking {
  public:
 
- /*
-   adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
-     requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`
-     which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h`
-     which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
-     (else HIPCC will error out)
+  typedef typename LhsMapper::Scalar LhsScalar;
+  typedef typename RhsMapper::Scalar RhsScalar;
 
-   However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
-   results in NVCC erroring out with the following error
-
-   ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
-      dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function
- */
-
-  #if !defined(EIGEN_HIPCC)
-  EIGEN_DEVICE_FUNC
-  #endif
- TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) :
+  EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) :
       kc_(k), mc_(m), nc_(n)
   {
     if (ShardingType == ShardByCol) {
@@ -51,22 +37,19 @@ class TensorContractionBlocking {
     else {
       computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
     }
-
-    const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
-    kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ?
-      kc_ : (kc_ / rhs_packet_size) * rhs_packet_size;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; }
 
  private:
-  StorageIndex kc_;
-  StorageIndex mc_;
-  StorageIndex nc_;
+  Index kc_;
+  Index mc_;
+  Index nc_;
 };
 
+
 } // end namespace internal
 } // end namespace Eigen
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 3f315fedc..d65dbb40f 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -1,6 +1,1391 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
+// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#if defined(__clang__) || defined(__GNUC__)
-#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file"
-#endif
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
 
-#include "TensorContractionGpu.h"
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+
+namespace Eigen {
+
+template<typename Scalar, typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool needs_edge_check>
+__device__ EIGEN_STRONG_INLINE void
+EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
+                       const Index m_size, const Index n_size, const Index k_size) {
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  // declare and initialize 64 registers for output 8x8 block
+
+  // prefetch registers
+  Scalar lhs_pf0;
+  Scalar lhs_pf1;
+  Scalar lhs_pf2;
+  Scalar lhs_pf3;
+  Scalar lhs_pf4;
+  Scalar lhs_pf5;
+  Scalar lhs_pf6;
+  Scalar lhs_pf7;
+
+  Scalar rhs_pf0;
+  Scalar rhs_pf1;
+  Scalar rhs_pf2;
+  Scalar rhs_pf3;
+  Scalar rhs_pf4;
+  Scalar rhs_pf5;
+  Scalar rhs_pf6;
+  Scalar rhs_pf7;
+
+  // shared memory is formatted
+  // (contract idx in block, nocontract idx in block, block idx)
+  // where block idx is column major. This transposition limits the number of
+  // bank conflicts when reading the LHS. The core idea is that since the contracting
+  // index is shared by both sides, then the contracting index should be in threadIdx.x.
+
+  // On the LHS, we pad each row inside of each block with an extra element. This makes
+  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
+  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+  // conflicts on writes and also none on reads.
+
+  // storage indices
+  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
+  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
+
+  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+  // in the loading code, the following variables are important:
+  // threadIdx.x: the vertical position in an 8x8 block
+  // threadIdx.y: the vertical index of the 8x8 block in the grid
+  // threadIdx.z: the horizontal position in an 8x8 block
+  // k: the horizontal index of the 8x8 block in the grid
+  //
+  // The k parameter is implicit (it was the loop counter for a loop that went
+  // from 0 to <8), but now that loop is unrolled in the code below.
+
+  const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
+  const Index lhs_vert = base_m + load_idx_vert;
+
+#define prefetchIntoRegisters(base_k)                           \
+  {                                                             \
+    lhs_pf0 = conv(0);                                          \
+    lhs_pf1 = conv(0);                                          \
+    lhs_pf2 = conv(0);                                          \
+    lhs_pf3 = conv(0);                                          \
+    lhs_pf4 = conv(0);                                          \
+    lhs_pf5 = conv(0);                                          \
+    lhs_pf6 = conv(0);                                          \
+    lhs_pf7 = conv(0);                                          \
+                                                                \
+    rhs_pf0 = conv(0);                                          \
+    rhs_pf1 = conv(0);                                          \
+    rhs_pf2 = conv(0);                                          \
+    rhs_pf3 = conv(0);                                          \
+    rhs_pf4 = conv(0);                                          \
+    rhs_pf5 = conv(0);                                          \
+    rhs_pf6 = conv(0);                                          \
+    rhs_pf7 = conv(0);                                          \
+                                                                \
+    if (!needs_edge_check || lhs_vert < m_size) {               \
+      const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8;   \
+      const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8;   \
+      const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8;   \
+      const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8;   \
+      const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8;   \
+      const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8;   \
+      const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8;   \
+      const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8;   \
+                                                                \
+      if (!needs_edge_check || lhs_horiz_7 < k_size) {          \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
+        lhs_pf7 = lhs(lhs_vert, lhs_horiz_7);                   \
+      } else if (lhs_horiz_6 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
+      } else if (lhs_horiz_5 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
+      } else if (lhs_horiz_4 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
+      } else if (lhs_horiz_3 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
+      } else if (lhs_horiz_2 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
+      } else if (lhs_horiz_1 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
+      } else if (lhs_horiz_0 < k_size) {                        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
+      }                                                         \
+    }                                                           \
+                                                                \
+    const Index rhs_vert = base_k + load_idx_vert;              \
+    if (!needs_edge_check || rhs_vert < k_size) {               \
+      const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8;   \
+      const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8;   \
+      const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8;   \
+      const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8;   \
+      const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8;   \
+      const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8;   \
+      const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8;   \
+      const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8;   \
+                                                                \
+      if (rhs_horiz_7 < n_size) {                               \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
+        rhs_pf7 = rhs(rhs_vert, rhs_horiz_7);                   \
+      } else if (rhs_horiz_6 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
+      } else if (rhs_horiz_5 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
+      } else if (rhs_horiz_4 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
+      } else if (rhs_horiz_3 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
+      } else if (rhs_horiz_2 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
+      } else if (rhs_horiz_1 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
+      } else if (rhs_horiz_0 < n_size) {                        \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
+      }                                                         \
+    }                                                           \
+  }                                                             \
+
+#define writeRegToShmem(_)                      \
+  lhs_shmem[lhs_store_idx_0] = lhs_pf0;         \
+  rhs_shmem[rhs_store_idx_0] = rhs_pf0;         \
+                                                \
+  lhs_shmem[lhs_store_idx_1] = lhs_pf1;         \
+  rhs_shmem[rhs_store_idx_1] = rhs_pf1;         \
+                                                \
+  lhs_shmem[lhs_store_idx_2] = lhs_pf2;         \
+  rhs_shmem[rhs_store_idx_2] = rhs_pf2;         \
+                                                \
+  lhs_shmem[lhs_store_idx_3] = lhs_pf3;         \
+  rhs_shmem[rhs_store_idx_3] = rhs_pf3;         \
+                                                \
+  lhs_shmem[lhs_store_idx_4] = lhs_pf4;         \
+  rhs_shmem[rhs_store_idx_4] = rhs_pf4;         \
+                                                \
+  lhs_shmem[lhs_store_idx_5] = lhs_pf5;         \
+  rhs_shmem[rhs_store_idx_5] = rhs_pf5;         \
+                                                \
+  lhs_shmem[lhs_store_idx_6] = lhs_pf6;         \
+  rhs_shmem[rhs_store_idx_6] = rhs_pf6;         \
+                                                \
+  lhs_shmem[lhs_store_idx_7] = lhs_pf7;         \
+  rhs_shmem[rhs_store_idx_7] = rhs_pf7;         \
+
+  // declare and initialize result array
+#define res(i, j) _res_##i##j
+#define initResultRow(i)                        \
+  Scalar res(i, 0) = conv(0);                   \
+  Scalar res(i, 1) = conv(0);                   \
+  Scalar res(i, 2) = conv(0);                   \
+  Scalar res(i, 3) = conv(0);                   \
+  Scalar res(i, 4) = conv(0);                   \
+  Scalar res(i, 5) = conv(0);                   \
+  Scalar res(i, 6) = conv(0);                   \
+  Scalar res(i, 7) = conv(0);                   \
+
+  internal::scalar_cast_op<int, Scalar> conv;
+  initResultRow(0);
+  initResultRow(1);
+  initResultRow(2);
+  initResultRow(3);
+  initResultRow(4);
+  initResultRow(5);
+  initResultRow(6);
+  initResultRow(7);
+#undef initResultRow
+
+  for (Index base_k = 0; base_k < k_size; base_k += 64) {
+    // wait for previous iteration to finish with shmem. Despite common sense,
+    // the code is a bit faster with this here than at the bottom of the loop
+    __syncthreads();
+
+    prefetchIntoRegisters(base_k);
+    writeRegToShmem();
+
+    #undef prefetchIntoRegisters
+    #undef writeRegToShmem
+
+    // wait for shared mem packing to be done before starting computation
+    __syncthreads();
+
+    // compute 8x8 matrix product by outer product. This involves packing one column
+    // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+    Scalar lcol(0);
+    Scalar lcol(1);
+    Scalar lcol(2);
+    Scalar lcol(3);
+    Scalar lcol(4);
+    Scalar lcol(5);
+    Scalar lcol(6);
+    Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+    Scalar rrow(0);
+    Scalar rrow(1);
+    Scalar rrow(2);
+    Scalar rrow(3);
+    Scalar rrow(4);
+    Scalar rrow(5);
+    Scalar rrow(6);
+    Scalar rrow(7);
+
+    // Now x corresponds to k, y to m, and z to n
+    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j)                          \
+    lcol(0) = lhs_element(0, j);               \
+    rrow(0) = rhs_element(i, 0);               \
+    lcol(1) = lhs_element(1, j);               \
+    rrow(1) = rhs_element(i, 1);               \
+    lcol(2) = lhs_element(2, j);               \
+    rrow(2) = rhs_element(i, 2);               \
+    lcol(3) = lhs_element(3, j);               \
+    rrow(3) = rhs_element(i, 3);               \
+    lcol(4) = lhs_element(4, j);               \
+    rrow(4) = rhs_element(i, 4);               \
+    lcol(5) = lhs_element(5, j);               \
+    rrow(5) = rhs_element(i, 5);               \
+    lcol(6) = lhs_element(6, j);               \
+    rrow(6) = rhs_element(i, 6);               \
+    lcol(7) = lhs_element(7, j);               \
+    rrow(7) = rhs_element(i, 7);               \
+
+#define computeCol(j)                           \
+    res(0, j) += lcol(0) * rrow(j);             \
+    res(1, j) += lcol(1) * rrow(j);             \
+    res(2, j) += lcol(2) * rrow(j);             \
+    res(3, j) += lcol(3) * rrow(j);             \
+    res(4, j) += lcol(4) * rrow(j);             \
+    res(5, j) += lcol(5) * rrow(j);             \
+    res(6, j) += lcol(6) * rrow(j);             \
+    res(7, j) += lcol(7) * rrow(j);             \
+
+#define computePass(i)                          \
+    loadData(i, i);                             \
+                                                \
+    computeCol(0);                              \
+    computeCol(1);                              \
+    computeCol(2);                              \
+    computeCol(3);                              \
+    computeCol(4);                              \
+    computeCol(5);                              \
+    computeCol(6);                              \
+    computeCol(7);                              \
+
+    computePass(0);
+    computePass(1);
+    computePass(2);
+    computePass(3);
+    computePass(4);
+    computePass(5);
+    computePass(6);
+    computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+  } // end loop over k
+
+  // we've now iterated over all of the large (ie width 64) k blocks and
+  // accumulated results in registers. At this point thread (x, y, z) contains
+  // the sum across all big k blocks of the product of little k block of index (x, y)
+  // with block of index (x, z). To compute the final output, we need to reduce
+  // the 8 threads over x by summation.
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+
+#define reduceRow(i, mask)                      \
+  shuffleInc(i, 0, mask);                       \
+  shuffleInc(i, 1, mask);                       \
+  shuffleInc(i, 2, mask);                       \
+  shuffleInc(i, 3, mask);                       \
+  shuffleInc(i, 4, mask);                       \
+  shuffleInc(i, 5, mask);                       \
+  shuffleInc(i, 6, mask);                       \
+  shuffleInc(i, 7, mask);                       \
+
+#define reduceMatrix(mask)                      \
+  reduceRow(0, mask);                           \
+  reduceRow(1, mask);                           \
+  reduceRow(2, mask);                           \
+  reduceRow(3, mask);                           \
+  reduceRow(4, mask);                           \
+  reduceRow(5, mask);                           \
+  reduceRow(6, mask);                           \
+  reduceRow(7, mask);                           \
+
+  // actually perform the reduction, now each thread of index (_, y, z)
+  // contains the correct values in its registers that belong in the output
+  // block
+  reduceMatrix(1);
+  reduceMatrix(2);
+  reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+  // now we need to copy the 64 values into main memory. We can't split work
+  // among threads because all variables are in registers. There are 2 ways
+  // to do this:
+  // (1) have 1 thread do 64 writes from registers into global memory
+  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+  //     each do 8 writes into global memory. We can just overwrite the shared
+  //     memory from the problem we just solved.
+  // (2) is slightly faster than (1) due to less branching and more ILP
+
+  // TODO: won't yield much gain, but could just use currently unused shared mem
+  //       and then we won't have to sync
+  // wait for shared mem to be out of use
+  __syncthreads();
+
+#define writeResultShmem(i, j)                                          \
+  lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
+
+#define writeRow(i)                             \
+  writeResultShmem(i, 0);                       \
+  writeResultShmem(i, 1);                       \
+  writeResultShmem(i, 2);                       \
+  writeResultShmem(i, 3);                       \
+  writeResultShmem(i, 4);                       \
+  writeResultShmem(i, 5);                       \
+  writeResultShmem(i, 6);                       \
+  writeResultShmem(i, 7);                       \
+
+  if (threadIdx.x == 0) {
+    writeRow(0);
+    writeRow(1);
+    writeRow(2);
+    writeRow(3);
+    writeRow(4);
+    writeRow(5);
+    writeRow(6);
+    writeRow(7);
+  }
+#undef writeResultShmem
+#undef writeRow
+
+  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+
+  if (threadIdx.x < max_i_write) {
+    if (max_j_write == 8) {
+      // TODO: can i trade bank conflicts for coalesced writes?
+      Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
+      Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
+      Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
+      Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
+      Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
+      Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
+      Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
+      Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
+
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
+    } else {
+#pragma unroll 7
+      for (int j = 0; j < max_j_write; j++) {
+        Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
+        output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
+      }
+    }
+  }
+#undef res
+}
+
+
+template<typename Scalar, typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(512)
+EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ Scalar lhs_shmem[72 * 64];
+  __shared__ Scalar rhs_shmem[72 * 64];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  if (base_m + 63 < m_size && base_n + 63 < n_size) {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  } else {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  }
+}
+
+
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+         bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output, float2 lhs_shmem2[][16],
+                       float2 rhs_shmem2[][8], const Index m_size,
+                       const Index n_size, const Index k_size,
+                       const Index base_m, const Index base_n) {
+  typedef float Scalar;
+
+  // prefetch registers
+  float4 lhs_pf0, rhs_pf0;
+
+  float4 results[4];
+  for (int i=0; i < 4; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+
+#define prefetch_lhs(reg, row, col)                   \
+    if (!CHECK_LHS_BOUNDARY) {                        \
+      if (col < k_size) {                             \
+        reg =lhs.loadPacket<Unaligned>(row, col);     \
+      }                                               \
+    } else {                                          \
+      if (col < k_size) {                             \
+        if (row + 3 < m_size) {                       \
+          reg =lhs.loadPacket<Unaligned>(row, col);   \
+        } else if (row + 2 < m_size) {                \
+          reg.x =lhs(row + 0, col);                   \
+          reg.y =lhs(row + 1, col);                   \
+          reg.z =lhs(row + 2, col);                   \
+        } else if (row + 1 < m_size) {                \
+          reg.x =lhs(row + 0, col);                   \
+          reg.y =lhs(row + 1, col);                   \
+        } else if (row  < m_size) {                   \
+          reg.x =lhs(row + 0, col);                   \
+        }                                             \
+      }                                               \
+    }                                                 \
+
+
+  Index lhs_vert = base_m+threadIdx.x*4;
+
+  for (Index k = 0; k < k_size; k += 16) {
+    lhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf0 = internal::pset1<float4>(0);
+
+    Index lhs_horiz = threadIdx.y+k;
+    prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
+
+    Index rhs_vert = k+(threadIdx.x%4)*4;
+    Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
+
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+      } else if (rhs_vert + 2 < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+      } else if (rhs_vert  < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+      }
+    } else {
+      if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    float x1, x2 ;
+    // the following can be a bitwise operation..... some day.
+    if((threadIdx.x%8) < 4) {
+      x1 = rhs_pf0.y;
+      x2 = rhs_pf0.w;
+    } else {
+      x1 = rhs_pf0.x;
+      x2 = rhs_pf0.z;
+    }
+    x1 = __shfl_xor(x1, 4);
+    x2 = __shfl_xor(x2, 4);
+    if((threadIdx.x%8) < 4) {
+      rhs_pf0.y = x1;
+      rhs_pf0.w = x2;
+    } else {
+      rhs_pf0.x = x1;
+      rhs_pf0.z = x2;
+    }
+
+    // We have 64 features.
+    // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
+    // ...
+    // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
+    // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
+    // ...
+    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
+    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
+
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // ...
+    // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63)
+    // ...
+
+    lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
+
+
+#define add_vals(fl1, fl2, fr1, fr2)\
+    results[0].x += fl1.x * fr1.x;\
+    results[0].y += fl1.y * fr1.x;\
+    results[0].z += fl2.x * fr1.x;\
+    results[0].w += fl2.y * fr1.x;\
+\
+    results[1].x += fl1.x * fr1.y;\
+    results[1].y += fl1.y * fr1.y;\
+    results[1].z += fl2.x * fr1.y;\
+    results[1].w += fl2.y * fr1.y;\
+\
+    results[2].x += fl1.x * fr2.x;\
+    results[2].y += fl1.y * fr2.x;\
+    results[2].z += fl2.x * fr2.x;\
+    results[2].w += fl2.y * fr2.x;\
+\
+    results[3].x += fl1.x * fr2.y;\
+    results[3].y += fl1.y * fr2.y;\
+    results[3].z += fl2.x * fr2.y;\
+    results[3].w += fl2.y * fr2.y;\
+
+    __syncthreads();
+
+    // Do the multiplies.
+    #pragma unroll
+    for (int koff = 0; koff < 16; koff ++) {
+      // 32 x threads.
+      float2 fl1 = lhs_shmem2[koff][threadIdx.x];
+      float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
+
+      int start_feature = threadIdx.y * 4;
+      float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+      float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
+
+      add_vals(fl1, fl2, fr1, fr2)
+    }
+    __syncthreads();
+  }
+
+#undef prefetch_lhs
+#undef add_vals
+
+  Index horiz_base = threadIdx.y*4+base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 4; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    // CHECK LHS
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert  < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK RHS
+    /*
+    int ncols_rem = fminf(n_size- horiz_base, 4);
+    for (int i = 0; i < ncols_rem; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }*/
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base+i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+       }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base+i < n_size) {
+        if (lhs_vert < m_size)
+          output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size)
+          output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size)
+          output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size)
+          output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+// Float contraction body for one 128x64 output tile: each thread accumulates a 4-row x 8-column piece, consuming 32 k entries per loop iteration.
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+         bool CHECK_RHS_BOUNDARY>
+__device__ EIGEN_STRONG_INLINE void
+EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output, float2 lhs_shmem2[][32],
+                       float2 rhs_shmem2[][8], const Index m_size,
+                       const Index n_size, const Index k_size,
+                       const Index base_m, const Index base_n) {
+  typedef float Scalar;
+
+  // prefetch registers
+  float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
+  float4 rhs_pf0, rhs_pf1;
+  // Accumulators: results[i].{x,y,z,w} end up in output rows lhs_vert+{0,1,2,3} of column horiz_base+i.
+  float4 results[8];
+  for (int i=0; i < 8; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+  // First of the four consecutive output rows owned by this thread (the caller launches 8 x 32 threads per block).
+  Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
+  for (Index k = 0; k < k_size; k += 32) { // one 32-deep slice of the contraction dimension per iteration
+    lhs_pf0 = internal::pset1<float4>(0);
+    lhs_pf1 = internal::pset1<float4>(0);
+    lhs_pf2 = internal::pset1<float4>(0);
+    lhs_pf3 = internal::pset1<float4>(0);
+
+    rhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf1 = internal::pset1<float4>(0);
+    // Prefetch the LHS: rows lhs_vert..lhs_vert+3 at k offsets threadIdx.y/4 + {0, 8, 16, 24}; whatever is out of range keeps the 0 set above.
+     if (!CHECK_LHS_BOUNDARY) {
+      if ((threadIdx.y/4+k+24) < k_size) {
+        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+      } else if ((threadIdx.y/4+k+16) < k_size) {
+        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+      } else if ((threadIdx.y/4+k+8) < k_size) {
+        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+      } else if ((threadIdx.y/4+k) < k_size) {
+        lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+      }
+    } else {
+      // CHECK_LHS_BOUNDARY: rows are tested against m_size; near the edge only the valid rows are read, one scalar at a time.
+      if (lhs_vert + 3 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+          lhs_pf3 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+          lhs_pf2 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+          lhs_pf1 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0 =lhs.loadPacket<Unaligned>(lhs_vert, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert + 2 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+          lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert + 1 < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
+        }
+      } else if (lhs_vert < m_size) {
+        if ((threadIdx.y/4+k+24) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
+        } else if ((threadIdx.y/4+k+16) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
+        } else if ((threadIdx.y/4+k+8) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
+        } else if ((threadIdx.y/4+k) < k_size) {
+          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
+        }
+      }
+    }
+    __syncthreads();
+    Index rhs_vert = k+threadIdx.x*4; // first of the 4 consecutive k rows this thread loads from the RHS
+    Index rhs_horiz0 = threadIdx.y*2+base_n; // the two adjacent RHS columns this thread loads
+    Index rhs_horiz1 = threadIdx.y*2+1+base_n;
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // no column check in this branch; all 4 k rows are in range, so load full packets
+        rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+      } else if (rhs_vert + 2 < k_size) {
+        // no column check in this branch; only 3 k rows are in range
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+      } else if (rhs_vert  < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+      }
+    } else {
+      if (rhs_horiz1 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // CHECK_RHS_BOUNDARY, both columns in range: full packets
+          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+          rhs_pf1 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz1);
+        } else if (rhs_vert + 2 < k_size) {
+          // CHECK_RHS_BOUNDARY, both columns in range: only 3 k rows are in range
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+          rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+        } else if (k+threadIdx.x*4 + 1 < k_size) { // same value as rhs_vert + 1
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        } else if (k+threadIdx.x*4  < k_size) { // same value as rhs_vert
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        }
+      } else if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // CHECK_RHS_BOUNDARY, only the first column is in range: rhs_pf1 stays 0
+          rhs_pf0 = rhs.loadPacket<Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          // CHECK_RHS_BOUNDARY, only the first column is in range and only 3 k rows are in range
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert  < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    __syncthreads();
+    // Loaded into registers. Now stage the RHS in shared memory:
+    // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
+    // ..
+    // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
+    rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
+    // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
+    // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
+    // ..
+    rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
+    // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
+    // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
+    rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
+    // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
+    // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
+    rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
+
+    // LHS (as written by the lhs_shmem2 stores below the macro).
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // ...
+    // Row 32 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+    // Row 63 (time 31) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+
+    // add_vals: adds the outer product of 4 LHS values (a_feat1.x/.y, a_feat2.x/.y) and 8 RHS values (f1..f4 .x/.y) to results.
+#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
+      results[0].x += a_feat1.x * f1.x;\
+      results[1].x += a_feat1.x * f1.y;\
+      results[2].x += a_feat1.x * f2.x;\
+      results[3].x += a_feat1.x * f2.y;\
+      results[4].x += a_feat1.x * f3.x;\
+      results[5].x += a_feat1.x * f3.y;\
+      results[6].x += a_feat1.x * f4.x;\
+      results[7].x += a_feat1.x * f4.y;\
+\
+      results[0].y += a_feat1.y * f1.x;\
+      results[1].y += a_feat1.y * f1.y;\
+      results[2].y += a_feat1.y * f2.x;\
+      results[3].y += a_feat1.y * f2.y;\
+      results[4].y += a_feat1.y * f3.x;\
+      results[5].y += a_feat1.y * f3.y;\
+      results[6].y += a_feat1.y * f4.x;\
+      results[7].y += a_feat1.y * f4.y;\
+\
+      results[0].z += a_feat2.x * f1.x;\
+      results[1].z += a_feat2.x * f1.y;\
+      results[2].z += a_feat2.x * f2.x;\
+      results[3].z += a_feat2.x * f2.y;\
+      results[4].z += a_feat2.x * f3.x;\
+      results[5].z += a_feat2.x * f3.y;\
+      results[6].z += a_feat2.x * f4.x;\
+      results[7].z += a_feat2.x * f4.y;\
+\
+      results[0].w += a_feat2.y * f1.x;\
+      results[1].w += a_feat2.y * f1.y;\
+      results[2].w += a_feat2.y * f2.x;\
+      results[3].w += a_feat2.y * f2.y;\
+      results[4].w += a_feat2.y * f3.x;\
+      results[5].w += a_feat2.y * f3.y;\
+      results[6].w += a_feat2.y * f4.x;\
+      results[7].w += a_feat2.y * f4.y;\
+
+    lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
+    lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
+    lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
+
+    lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
+    lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
+    lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
+    lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
+
+    __syncthreads();
+
+    // Do the multiplies.
+    #pragma unroll
+    for (int koff = 0; koff < 32; koff ++) {
+      float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
+      float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
+
+      // first output column of this thread is (threadIdx.y/4) * 8, the last is start + 7.
+      int start_feature = (threadIdx.y / 4) * 8;
+
+      float2 br1 = rhs_shmem2[start_feature/2 +     (koff % 4) * 32][koff/4];
+      float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
+      float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
+      float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
+
+      add_vals(a3, a4, br1, br2, br3, br4)
+    }
+    __syncthreads();
+  } // end loop over k
+
+  // NOTE(review): add_vals is not #undef'd anywhere in this function, so the macro stays defined after it.
+  __syncthreads();
+  Index horiz_base = (threadIdx.y/4)*8+base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 8; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert  < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK RHS: only the column index is tested against n_size.
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        if (lhs_vert < m_size)
+          output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size)
+          output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size)
+          output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size)
+          output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+// Kernel entry point for float contractions: each block handles one 128x64 output tile and picks the bounds-checking variant of the internal kernel it needs.
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(256)
+EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[64*32];
+  __shared__ float2 rhs_shmem[128*8];
+  // 2-D views of the flat shared buffers, matching the float2[][32] and float2[][8] parameters of the internal kernel.
+  typedef float2 LHS_MEM[64][32];
+  typedef float2 RHS_MEM[128][8];
+
+  typedef float2 LHS_MEM16x16[32][16]; // not referenced in this kernel
+  typedef float2 RHS_MEM16x16[64][8]; // not referenced in this kernel
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 128 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+  // A side needs bounds checks when this block's tile reaches past the last row (m_size) or column (n_size).
+  bool check_rhs = (base_n + 63) >= n_size;
+  bool check_lhs128 = (base_m + 127) >= m_size;
+
+  if (!check_rhs) {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+// Kernel entry point for float contractions using 64x64 output tiles; forwards to EigenFloatContractionKernelInternal16x16 with the bounds-check flags this block needs.
+template<typename Index, typename LhsMapper,
+         typename RhsMapper, typename OutputMapper>
+__global__ void
+__launch_bounds__(256)
+EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
+                       const OutputMapper output,
+                       const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[32][16];
+  __shared__ float2 rhs_shmem[64][8];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+  // First flag is true when the tile may reach past m_size, second when it may reach past n_size.
+  if (base_m + 63 < m_size) {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+
+// Evaluator for tensor contractions on GpuDevice: evalTo() selects an evalTyped<> instantiation, which launches the CUDA contraction kernels.
+template<typename Indices, typename LeftArgType, typename RightArgType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> :
+    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, GpuDevice> > {
+
+  typedef GpuDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+
+  enum {
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+  };
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
+  typedef typename internal::conditional<
+    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
+
+  static const int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static const int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static const int ContractDims = internal::array_size<Indices>::value;
+
+  typedef array<Index, LDims> left_dim_mapper_t;
+  typedef array<Index, RDims> right_dim_mapper_t;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  static const int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  // typedefs needed in evalTo
+  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
+  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
+
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+  typedef typename LeftEvaluator::Dimensions LeftDimensions;
+  typedef typename RightEvaluator::Dimensions RightDimensions;
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
+      Base(op, device) {}
+
+  // We need to redefine this method to make nvcc happy. Returns false if the result was written to `data`, true if it was written to a newly allocated m_result.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(this->m_result);
+      return true;
+    }
+  }
+  // Maps the three runtime flags (lhs contiguous, rhs contiguous, rhs reordered) to the matching evalTyped<> instantiation; always passes Unaligned.
+  void evalTo(Scalar* buffer) const {
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, true, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<true, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, false, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<true, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+    else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, true, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<false, true, false, Unaligned>(buffer);
+        }
+      }
+      else {
+       if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, false, true, Unaligned>(buffer);
+        }
+        else {
+          evalTyped<false, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+  }
+  // Generic launcher: one block of 8x8x8 threads per 64x64 tile of the m x n output.
+  template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+    const Index m_blocks = (m + 63) / 64;
+    const Index n_blocks = (n + 63) / 64;
+    const dim3 num_blocks(m_blocks, n_blocks, 1);
+    const dim3 block_size(8, 8, 8);
+    LAUNCH_CUDA_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+    }
+  };
+  // float x float launcher: 64x64 tiles with 16x16 threads when m or n is below 768, otherwise 128x64 tiles with 8x32 threads.
+  template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
+      if (m < 768 || n < 768) {
+        const Index m_blocks = (m + 63) / 64;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(16, 16, 1);
+        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      } else {
+        const Index m_blocks = (m + 127) / 128;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(8, 32, 1);
+        LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
+      }
+    }
+  };
+  // Zeroes the output, builds the LHS/RHS/output mappers and launches the kernel. The Alignment parameter is not used in this body (the mappers hard-code Unaligned).
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalTyped(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+    EIGEN_UNUSED_VARIABLE(k)
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+                                                   LeftEvaluator, left_nocontract_t,
+                                                   contract_t, 4,
+                                                   lhs_inner_dim_contiguous,
+                                                   false, Unaligned> LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+                                                   RightEvaluator, right_nocontract_t,
+                                                   contract_t, 4,
+                                                   rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+    OutputMapper output(buffer, m);
+
+    setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte);
+    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output,  m, n, k, this->m_device);
+  }
+};
+
+} // end namespace Eigen
+
+#endif // EIGEN_USE_GPU and __CUDACC__
+#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
deleted file mode 100644
index bb990b378..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
+++ /dev/null
@@ -1,1413 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
-// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
-
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
-
-namespace Eigen {
-
-template<typename Scalar, typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool needs_edge_check>
-__device__ EIGEN_STRONG_INLINE void
-EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                               const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem,
-                       const Index m_size, const Index n_size, const Index k_size) {
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  // declare and initialize 64 registers for output 8x8 block
-
-  // prefetch registers
-  Scalar lhs_pf0;
-  Scalar lhs_pf1;
-  Scalar lhs_pf2;
-  Scalar lhs_pf3;
-  Scalar lhs_pf4;
-  Scalar lhs_pf5;
-  Scalar lhs_pf6;
-  Scalar lhs_pf7;
-
-  Scalar rhs_pf0;
-  Scalar rhs_pf1;
-  Scalar rhs_pf2;
-  Scalar rhs_pf3;
-  Scalar rhs_pf4;
-  Scalar rhs_pf5;
-  Scalar rhs_pf6;
-  Scalar rhs_pf7;
-
-  // shared memory is formatted
-  // (contract idx in block, nocontract idx in block, block idx)
-  // where block idx is column major. This transposition limits the number of
-  // bank conflicts when reading the LHS. The core idea is that since the contracting
-  // index is shared by both sides, then the contracting index should be in threadIdx.x.
-
-  // On the LHS, we pad each row inside of each block with an extra element. This makes
-  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
-  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
-
-  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
-  // conflicts on writes and also none on reads.
-
-  // storage indices
-  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
-  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
-
-  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
-  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
-  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
-  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
-  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
-  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
-  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
-  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
-
-  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
-  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
-  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
-  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
-  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
-  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
-  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
-  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
-
-  // in the loading code, the following variables are important:
-  // threadIdx.x: the vertical position in an 8x8 block
-  // threadIdx.y: the vertical index of the 8x8 block in the grid
-  // threadIdx.z: the horizontal position in an 8x8 block
-  // k: the horizontal index of the 8x8 block in the grid
-  //
-  // The k parameter is implicit (it was the loop counter for a loop that went
-  // from 0 to <8, but now that loop is unrolled in the below code.
-
-  const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
-  const Index lhs_vert = base_m + load_idx_vert;
-
-#define prefetchIntoRegisters(base_k)                           \
-  {                                                             \
-    lhs_pf0 = conv(0);                                          \
-    lhs_pf1 = conv(0);                                          \
-    lhs_pf2 = conv(0);                                          \
-    lhs_pf3 = conv(0);                                          \
-    lhs_pf4 = conv(0);                                          \
-    lhs_pf5 = conv(0);                                          \
-    lhs_pf6 = conv(0);                                          \
-    lhs_pf7 = conv(0);                                          \
-                                                                \
-    rhs_pf0 = conv(0);                                          \
-    rhs_pf1 = conv(0);                                          \
-    rhs_pf2 = conv(0);                                          \
-    rhs_pf3 = conv(0);                                          \
-    rhs_pf4 = conv(0);                                          \
-    rhs_pf5 = conv(0);                                          \
-    rhs_pf6 = conv(0);                                          \
-    rhs_pf7 = conv(0);                                          \
-                                                                \
-    if (!needs_edge_check || lhs_vert < m_size) {               \
-      const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8;   \
-      const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8;   \
-      const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8;   \
-      const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8;   \
-      const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8;   \
-      const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8;   \
-      const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8;   \
-      const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8;   \
-                                                                \
-      if (!needs_edge_check || lhs_horiz_7 < k_size) {          \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
-        lhs_pf7 = lhs(lhs_vert, lhs_horiz_7);                   \
-      } else if (lhs_horiz_6 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                   \
-      } else if (lhs_horiz_5 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                   \
-      } else if (lhs_horiz_4 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                   \
-      } else if (lhs_horiz_3 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                   \
-      } else if (lhs_horiz_2 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                   \
-      } else if (lhs_horiz_1 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                   \
-      } else if (lhs_horiz_0 < k_size) {                        \
-        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                   \
-      }                                                         \
-    }                                                           \
-                                                                \
-    const Index rhs_vert = base_k + load_idx_vert;              \
-    if (!needs_edge_check || rhs_vert < k_size) {               \
-      const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8;   \
-      const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8;   \
-      const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8;   \
-      const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8;   \
-      const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8;   \
-      const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8;   \
-      const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8;   \
-      const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8;   \
-                                                                \
-      if (rhs_horiz_7 < n_size) {                               \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
-        rhs_pf7 = rhs(rhs_vert, rhs_horiz_7);                   \
-      } else if (rhs_horiz_6 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                   \
-      } else if (rhs_horiz_5 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                   \
-      } else if (rhs_horiz_4 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                   \
-      } else if (rhs_horiz_3 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                   \
-      } else if (rhs_horiz_2 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                   \
-      } else if (rhs_horiz_1 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                   \
-      } else if (rhs_horiz_0 < n_size) {                        \
-        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                   \
-      }                                                         \
-    }                                                           \
-  }                                                             \
-
-#define writeRegToShmem(_)                      \
-  lhs_shmem[lhs_store_idx_0] = lhs_pf0;         \
-  rhs_shmem[rhs_store_idx_0] = rhs_pf0;         \
-                                                \
-  lhs_shmem[lhs_store_idx_1] = lhs_pf1;         \
-  rhs_shmem[rhs_store_idx_1] = rhs_pf1;         \
-                                                \
-  lhs_shmem[lhs_store_idx_2] = lhs_pf2;         \
-  rhs_shmem[rhs_store_idx_2] = rhs_pf2;         \
-                                                \
-  lhs_shmem[lhs_store_idx_3] = lhs_pf3;         \
-  rhs_shmem[rhs_store_idx_3] = rhs_pf3;         \
-                                                \
-  lhs_shmem[lhs_store_idx_4] = lhs_pf4;         \
-  rhs_shmem[rhs_store_idx_4] = rhs_pf4;         \
-                                                \
-  lhs_shmem[lhs_store_idx_5] = lhs_pf5;         \
-  rhs_shmem[rhs_store_idx_5] = rhs_pf5;         \
-                                                \
-  lhs_shmem[lhs_store_idx_6] = lhs_pf6;         \
-  rhs_shmem[rhs_store_idx_6] = rhs_pf6;         \
-                                                \
-  lhs_shmem[lhs_store_idx_7] = lhs_pf7;         \
-  rhs_shmem[rhs_store_idx_7] = rhs_pf7;         \
-
-  // declare and initialize result array
-#define res(i, j) _res_##i##j
-#define initResultRow(i)                        \
-  Scalar res(i, 0) = conv(0);                   \
-  Scalar res(i, 1) = conv(0);                   \
-  Scalar res(i, 2) = conv(0);                   \
-  Scalar res(i, 3) = conv(0);                   \
-  Scalar res(i, 4) = conv(0);                   \
-  Scalar res(i, 5) = conv(0);                   \
-  Scalar res(i, 6) = conv(0);                   \
-  Scalar res(i, 7) = conv(0);                   \
-
-  internal::scalar_cast_op<int, Scalar> conv;
-  initResultRow(0);
-  initResultRow(1);
-  initResultRow(2);
-  initResultRow(3);
-  initResultRow(4);
-  initResultRow(5);
-  initResultRow(6);
-  initResultRow(7);
-#undef initResultRow
-
-  for (Index base_k = 0; base_k < k_size; base_k += 64) {
-    // wait for previous iteration to finish with shmem. Despite common sense,
-    // the code is a bit faster with this here then at bottom of loop
-    __syncthreads();
-
-    prefetchIntoRegisters(base_k);
-    writeRegToShmem();
-
-    #undef prefetchIntoRegisters
-    #undef writeRegToShmem
-
-    // wait for shared mem packing to be done before starting computation
-    __syncthreads();
-
-    // compute 8x8 matrix product by outer product. This involves packing one column
-    // of LHS and one row of RHS into registers (takes 16 registers).
-
-#define lcol(i) _lcol##i
-    Scalar lcol(0);
-    Scalar lcol(1);
-    Scalar lcol(2);
-    Scalar lcol(3);
-    Scalar lcol(4);
-    Scalar lcol(5);
-    Scalar lcol(6);
-    Scalar lcol(7);
-
-#define rrow(j) _rrow##j
-    Scalar rrow(0);
-    Scalar rrow(1);
-    Scalar rrow(2);
-    Scalar rrow(3);
-    Scalar rrow(4);
-    Scalar rrow(5);
-    Scalar rrow(6);
-    Scalar rrow(7);
-
-    // Now x corresponds to k, y to m, and z to n
-    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
-    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
-
-#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
-#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
-
-#define loadData(i, j)                          \
-    lcol(0) = lhs_element(0, j);               \
-    rrow(0) = rhs_element(i, 0);               \
-    lcol(1) = lhs_element(1, j);               \
-    rrow(1) = rhs_element(i, 1);               \
-    lcol(2) = lhs_element(2, j);               \
-    rrow(2) = rhs_element(i, 2);               \
-    lcol(3) = lhs_element(3, j);               \
-    rrow(3) = rhs_element(i, 3);               \
-    lcol(4) = lhs_element(4, j);               \
-    rrow(4) = rhs_element(i, 4);               \
-    lcol(5) = lhs_element(5, j);               \
-    rrow(5) = rhs_element(i, 5);               \
-    lcol(6) = lhs_element(6, j);               \
-    rrow(6) = rhs_element(i, 6);               \
-    lcol(7) = lhs_element(7, j);               \
-    rrow(7) = rhs_element(i, 7);               \
-
-#define computeCol(j)                           \
-    res(0, j) += lcol(0) * rrow(j);             \
-    res(1, j) += lcol(1) * rrow(j);             \
-    res(2, j) += lcol(2) * rrow(j);             \
-    res(3, j) += lcol(3) * rrow(j);             \
-    res(4, j) += lcol(4) * rrow(j);             \
-    res(5, j) += lcol(5) * rrow(j);             \
-    res(6, j) += lcol(6) * rrow(j);             \
-    res(7, j) += lcol(7) * rrow(j);             \
-
-#define computePass(i)                          \
-    loadData(i, i);                             \
-                                                \
-    computeCol(0);                              \
-    computeCol(1);                              \
-    computeCol(2);                              \
-    computeCol(3);                              \
-    computeCol(4);                              \
-    computeCol(5);                              \
-    computeCol(6);                              \
-    computeCol(7);                              \
-
-    computePass(0);
-    computePass(1);
-    computePass(2);
-    computePass(3);
-    computePass(4);
-    computePass(5);
-    computePass(6);
-    computePass(7);
-
-#undef lcol
-#undef rrow
-#undef lhs_element
-#undef rhs_element
-#undef loadData
-#undef computeCol
-#undef computePass
-  } // end loop over k
-
-  // we've now iterated over all of the large (ie width 64) k blocks and
-  // accumulated results in registers. At this point thread (x, y, z) contains
-  // the sum across all big k blocks of the product of little k block of index (x, y)
-  // with block of index (y, z). To compute the final output, we need to reduce
-  // the 8 threads over y by summation.
-#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
-#else
-#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
-#endif
-
-#define reduceRow(i, mask)                      \
-  shuffleInc(i, 0, mask);                       \
-  shuffleInc(i, 1, mask);                       \
-  shuffleInc(i, 2, mask);                       \
-  shuffleInc(i, 3, mask);                       \
-  shuffleInc(i, 4, mask);                       \
-  shuffleInc(i, 5, mask);                       \
-  shuffleInc(i, 6, mask);                       \
-  shuffleInc(i, 7, mask);                       \
-
-#define reduceMatrix(mask)                      \
-  reduceRow(0, mask);                           \
-  reduceRow(1, mask);                           \
-  reduceRow(2, mask);                           \
-  reduceRow(3, mask);                           \
-  reduceRow(4, mask);                           \
-  reduceRow(5, mask);                           \
-  reduceRow(6, mask);                           \
-  reduceRow(7, mask);                           \
-
-  // actually perform the reduction, now each thread of index (_, y, z)
-  // contains the correct values in its registers that belong in the output
-  // block
-  reduceMatrix(1);
-  reduceMatrix(2);
-  reduceMatrix(4);
-
-#undef shuffleInc
-#undef reduceRow
-#undef reduceMatrix
-
-  // now we need to copy the 64 values into main memory. We can't split work
-  // among threads because all variables are in registers. There's 2 ways
-  // to do this:
-  // (1) have 1 thread do 64 writes from registers into global memory
-  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
-  //     each do 8 writes into global memory. We can just overwrite the shared
-  //     memory from the problem we just solved.
-  // (2) is slightly faster than (1) due to less branching and more ILP
-
-  // TODO: won't yield much gain, but could just use currently unused shared mem
-  //       and then we won't have to sync
-  // wait for shared mem to be out of use
-  __syncthreads();
-
-#define writeResultShmem(i, j)                                          \
-  lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \
-
-#define writeRow(i)                             \
-  writeResultShmem(i, 0);                       \
-  writeResultShmem(i, 1);                       \
-  writeResultShmem(i, 2);                       \
-  writeResultShmem(i, 3);                       \
-  writeResultShmem(i, 4);                       \
-  writeResultShmem(i, 5);                       \
-  writeResultShmem(i, 6);                       \
-  writeResultShmem(i, 7);                       \
-
-  if (threadIdx.x == 0) {
-    writeRow(0);
-    writeRow(1);
-    writeRow(2);
-    writeRow(3);
-    writeRow(4);
-    writeRow(5);
-    writeRow(6);
-    writeRow(7);
-  }
-#undef writeResultShmem
-#undef writeRow
-
-  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
-  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
-
-  if (threadIdx.x < max_i_write) {
-    if (max_j_write == 8) {
-      // TODO: can i trade bank conflicts for coalesced writes?
-      Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
-      Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
-      Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
-      Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
-      Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
-      Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
-      Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
-      Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
-
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
-      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
-    } else {
-#pragma unroll 7
-      for (int j = 0; j < max_j_write; j++) {
-        Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
-        output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
-      }
-    }
-  }
-#undef res
-}
-
-
-template<typename Scalar, typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-#if defined(EIGEN_HIPCC)
-__launch_bounds__(512, 1)
-#else
-__launch_bounds__(512)
-#endif
-EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ Scalar lhs_shmem[72 * 64];
-  __shared__ Scalar rhs_shmem[72 * 64];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  if (base_m + 63 < m_size && base_n + 63 < n_size) {
-    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
-  } else {
-    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
-         bool CHECK_RHS_BOUNDARY>
-__device__ __forceinline__ void
-EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output, float2 lhs_shmem2[][16],
-                       float2 rhs_shmem2[][8], const Index m_size,
-                       const Index n_size, const Index k_size,
-                       const Index base_m, const Index base_n) {
-
-  // prefetch registers
-  float4 lhs_pf0, rhs_pf0;
-
-  float4 results[4];
-  for (int i=0; i < 4; i++) {
-    results[i].x = results[i].y = results[i].z = results[i].w = 0;
-  }
-
-#define prefetch_lhs(reg, row, col)                            \
-    if (!CHECK_LHS_BOUNDARY) {                                 \
-      if (col < k_size) {                                      \
-        reg =lhs.template loadPacket<float4,Unaligned>(row, col);     \
-      }                                                        \
-    } else {                                                   \
-      if (col < k_size) {                                      \
-        if (row + 3 < m_size) {                                \
-          reg =lhs.template loadPacket<float4,Unaligned>(row, col);   \
-        } else if (row + 2 < m_size) {                         \
-          reg.x =lhs(row + 0, col);                            \
-          reg.y =lhs(row + 1, col);                            \
-          reg.z =lhs(row + 2, col);                            \
-        } else if (row + 1 < m_size) {                         \
-          reg.x =lhs(row + 0, col);                            \
-          reg.y =lhs(row + 1, col);                            \
-        } else if (row  < m_size) {                            \
-          reg.x =lhs(row + 0, col);                            \
-        }                                                      \
-      }                                                        \
-    }							       \
-
-  Index lhs_vert = base_m+threadIdx.x*4;
-
-  for (Index k = 0; k < k_size; k += 16) {
-
-    lhs_pf0 = internal::pset1<float4>(0);
-    rhs_pf0 = internal::pset1<float4>(0);
-
-    Index lhs_horiz = threadIdx.y+k;
-    prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
-
-    Index rhs_vert = k+(threadIdx.x%4)*4;
-    Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n;
-
-    if (!CHECK_RHS_BOUNDARY) {
-      if ((rhs_vert + 3) < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
-      } else if (rhs_vert + 2 < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-      } else if (rhs_vert + 1 < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-      } else if (rhs_vert  < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-      }
-    } else {
-      if (rhs_horiz0 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
-        } else if ((rhs_vert + 2) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        } else if ((rhs_vert + 1) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        } else if (rhs_vert  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        }
-      }
-    }
-    float x1, x2 ;
-    // the following can be a bitwise operation..... some day.
-    if((threadIdx.x%8) < 4) {
-      x1 = rhs_pf0.y;
-      x2 = rhs_pf0.w;
-    } else {
-      x1 = rhs_pf0.x;
-      x2 = rhs_pf0.z;
-    }
-    #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
-    x1 = __shfl_xor(x1, 4);
-    x2 = __shfl_xor(x2, 4);
-    #else
-    x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
-    x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
-    #endif
-    if((threadIdx.x%8) < 4) {
-      rhs_pf0.y = x1;
-      rhs_pf0.w = x2;
-    } else {
-      rhs_pf0.x = x1;
-      rhs_pf0.z = x2;
-    }
-
-    // We have 64 features.
-    // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
-    // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
-    // ...
-    // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
-    // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
-    // ...
-    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y);
-    rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w);
-
-    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // ...
-    // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
-    // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63)
-    // ...
-
-    lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
-    lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
-
-
-#define add_vals(fl1, fl2, fr1, fr2)\
-    results[0].x += fl1.x * fr1.x;\
-    results[0].y += fl1.y * fr1.x;\
-    results[0].z += fl2.x * fr1.x;\
-    results[0].w += fl2.y * fr1.x;\
-\
-    results[1].x += fl1.x * fr1.y;\
-    results[1].y += fl1.y * fr1.y;\
-    results[1].z += fl2.x * fr1.y;\
-    results[1].w += fl2.y * fr1.y;\
-\
-    results[2].x += fl1.x * fr2.x;\
-    results[2].y += fl1.y * fr2.x;\
-    results[2].z += fl2.x * fr2.x;\
-    results[2].w += fl2.y * fr2.x;\
-\
-    results[3].x += fl1.x * fr2.y;\
-    results[3].y += fl1.y * fr2.y;\
-    results[3].z += fl2.x * fr2.y;\
-    results[3].w += fl2.y * fr2.y;\
-
-    __syncthreads();
-
-    // Do the multiplies.
-    #pragma unroll
-    for (int koff = 0; koff < 16; koff ++) {
-      // 32 x threads.
-      float2 fl1 = lhs_shmem2[koff][threadIdx.x];
-      float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
-
-      int start_feature = threadIdx.y * 4;
-      float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
-      float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4];
-
-      add_vals(fl1, fl2, fr1, fr2)
-    }
-    __syncthreads();
-  }
-
-#undef prefetch_lhs
-#undef add_vals
-
-  Index horiz_base = threadIdx.y*4+base_n;
-  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
-    for (int i = 0; i < 4; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }
-  } else if (!CHECK_RHS_BOUNDARY) {
-    // CHECK LHS
-    if (lhs_vert + 3 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    } else if (lhs_vert + 2 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      }
-    } else if (lhs_vert + 1 < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      }
-    } else if (lhs_vert  < m_size) {
-      for (int i = 0; i < 4; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-      }
-    }
-  } else if (!CHECK_LHS_BOUNDARY) {
-    // CHECK RHS
-    /*
-    int ncols_rem = fminf(n_size- horiz_base, 4);
-    for (int i = 0; i < ncols_rem; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }*/
-    for (int i = 0; i < 4; i++) {
-      if (horiz_base+i < n_size) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-       }
-    }
-  } else {
-    // CHECK both boundaries.
-    for (int i = 0; i < 4; i++) {
-      if (horiz_base+i < n_size) {
-        if (lhs_vert < m_size)
-          output(lhs_vert, horiz_base + i) = results[i].x;
-        if (lhs_vert + 1 < m_size)
-          output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        if (lhs_vert + 2 < m_size)
-          output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        if (lhs_vert + 3 < m_size)
-          output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
-         bool CHECK_RHS_BOUNDARY>
-__device__ __forceinline__ void
-EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output, float2 lhs_shmem2[][32],
-                       float2 rhs_shmem2[][8], const Index m_size,
-                       const Index n_size, const Index k_size,
-                       const Index base_m, const Index base_n) {
-
-  // prefetch registers
-  float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
-  float4 rhs_pf0, rhs_pf1;
-
-  float4 results[8];
-  for (int i=0; i < 8; i++) {
-    results[i].x = results[i].y = results[i].z = results[i].w = 0;
-  }
-
-  Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32;
-  for (Index k = 0; k < k_size; k += 32) {
-    lhs_pf0 = internal::pset1<float4>(0);
-    lhs_pf1 = internal::pset1<float4>(0);
-    lhs_pf2 = internal::pset1<float4>(0);
-    lhs_pf3 = internal::pset1<float4>(0);
-
-    rhs_pf0 = internal::pset1<float4>(0);
-    rhs_pf1 = internal::pset1<float4>(0);
-
-     if (!CHECK_LHS_BOUNDARY) {
-      if ((threadIdx.y/4+k+24) < k_size) {
-        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-        lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
-      } else if ((threadIdx.y/4+k+16) < k_size) {
-        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-      } else if ((threadIdx.y/4+k+8) < k_size) {
-        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-      } else if ((threadIdx.y/4+k) < k_size) {
-        lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
-      }
-    } else {
-      // just CHECK_LHS_BOUNDARY
-      if (lhs_vert + 3 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-          lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-          lhs_pf3 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-          lhs_pf2 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
-          lhs_pf1 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0 =lhs.template loadPacket<float4,Unaligned>(lhs_vert, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert + 2 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
-          lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert + 1 < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-          lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k));
-        }
-      } else if (lhs_vert < m_size) {
-        if ((threadIdx.y/4+k+24) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-          lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24));
-        } else if ((threadIdx.y/4+k+16) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-          lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16));
-        } else if ((threadIdx.y/4+k+8) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-          lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8));
-        } else if ((threadIdx.y/4+k) < k_size) {
-          lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k));
-        }
-      }
-    }
-    __syncthreads();
-    Index rhs_vert = k+threadIdx.x*4;
-    Index rhs_horiz0 = threadIdx.y*2+base_n;
-    Index rhs_horiz1 = threadIdx.y*2+1+base_n;
-    if (!CHECK_RHS_BOUNDARY) {
-      if ((rhs_vert + 3) < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
-        rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
-      } else if (rhs_vert + 2 < k_size) {
-        // just CHECK_RHS_BOUNDARY
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-        rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
-      } else if (rhs_vert + 1 < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-      } else if (rhs_vert  < k_size) {
-        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-      }
-    } else {
-      if (rhs_horiz1 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
-          rhs_pf1 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz1);
-        } else if (rhs_vert + 2 < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-          rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
-        } else if (k+threadIdx.x*4 + 1 < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
-        } else if (k+threadIdx.x*4  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
-        }
-      } else if (rhs_horiz0 < n_size) {
-        if ((rhs_vert + 3) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0 = rhs.template loadPacket<float4,Unaligned>(rhs_vert, rhs_horiz0);
-        } else if ((rhs_vert + 2) < k_size) {
-          // just CHECK_RHS_BOUNDARY
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
-        } else if ((rhs_vert + 1) < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
-        } else if (rhs_vert  < k_size) {
-          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
-        }
-      }
-    }
-    __syncthreads();
-    // Loaded. Do computation
-    // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
-    // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
-    // ..
-    // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
-    rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
-    // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
-    // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
-    // ..
-    rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
-    // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
-    // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
-    rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
-    // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
-    // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
-    rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
-
-    // LHS.
-    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
-    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
-    // ...
-    // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
-    // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
-
-
-#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\
-      results[0].x += a_feat1.x * f1.x;\
-      results[1].x += a_feat1.x * f1.y;\
-      results[2].x += a_feat1.x * f2.x;\
-      results[3].x += a_feat1.x * f2.y;\
-      results[4].x += a_feat1.x * f3.x;\
-      results[5].x += a_feat1.x * f3.y;\
-      results[6].x += a_feat1.x * f4.x;\
-      results[7].x += a_feat1.x * f4.y;\
-\
-      results[0].y += a_feat1.y * f1.x;\
-      results[1].y += a_feat1.y * f1.y;\
-      results[2].y += a_feat1.y * f2.x;\
-      results[3].y += a_feat1.y * f2.y;\
-      results[4].y += a_feat1.y * f3.x;\
-      results[5].y += a_feat1.y * f3.y;\
-      results[6].y += a_feat1.y * f4.x;\
-      results[7].y += a_feat1.y * f4.y;\
-\
-      results[0].z += a_feat2.x * f1.x;\
-      results[1].z += a_feat2.x * f1.y;\
-      results[2].z += a_feat2.x * f2.x;\
-      results[3].z += a_feat2.x * f2.y;\
-      results[4].z += a_feat2.x * f3.x;\
-      results[5].z += a_feat2.x * f3.y;\
-      results[6].z += a_feat2.x * f4.x;\
-      results[7].z += a_feat2.x * f4.y;\
-\
-      results[0].w += a_feat2.y * f1.x;\
-      results[1].w += a_feat2.y * f1.y;\
-      results[2].w += a_feat2.y * f2.x;\
-      results[3].w += a_feat2.y * f2.y;\
-      results[4].w += a_feat2.y * f3.x;\
-      results[5].w += a_feat2.y * f3.y;\
-      results[6].w += a_feat2.y * f4.x;\
-      results[7].w += a_feat2.y * f4.y;\
-
-    lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y);
-    lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y);
-    lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y);
-    lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y);
-
-    lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w);
-    lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w);
-    lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w);
-    lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w);
-
-    __syncthreads();
-
-    // Do the multiplies.
-    #pragma unroll
-    for (int koff = 0; koff < 32; koff ++) {
-      float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
-      float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
-
-      // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
-      int start_feature = (threadIdx.y / 4) * 8;
-
-      float2 br1 = rhs_shmem2[start_feature/2 +     (koff % 4) * 32][koff/4];
-      float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4];
-      float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4];
-      float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4];
-
-      add_vals(a3, a4, br1, br2, br3, br4)
-    }
-    __syncthreads();
-  } // end loop over k
-
-  __syncthreads();
-  Index horiz_base = (threadIdx.y/4)*8+base_n;
-  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
-    for (int i = 0; i < 8; i++) {
-      output(lhs_vert, horiz_base + i) = results[i].x;
-      output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      output(lhs_vert + 3, horiz_base + i) = results[i].w;
-    }
-  } else if (!CHECK_RHS_BOUNDARY) {
-    if (lhs_vert + 3 < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    } else if (lhs_vert + 2 < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-      }
-    } else if (lhs_vert + 1 < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-      }
-    } else if (lhs_vert  < m_size) {
-      for (int i = 0; i < 8; i++) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-      }
-    }
-  } else if (!CHECK_LHS_BOUNDARY) {
-    // CHECK BOUNDARY_B
-    for (int i = 0; i < 8; i++) {
-      if (horiz_base + i < n_size) {
-        output(lhs_vert, horiz_base + i) = results[i].x;
-        output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  } else {
-    // CHECK both boundaries.
-    for (int i = 0; i < 8; i++) {
-      if (horiz_base + i < n_size) {
-        if (lhs_vert < m_size)
-          output(lhs_vert, horiz_base + i) = results[i].x;
-        if (lhs_vert + 1 < m_size)
-          output(lhs_vert + 1, horiz_base + i) = results[i].y;
-        if (lhs_vert + 2 < m_size)
-          output(lhs_vert + 2, horiz_base + i) = results[i].z;
-        if (lhs_vert + 3 < m_size)
-          output(lhs_vert + 3, horiz_base + i) = results[i].w;
-      }
-    }
-  }
-}
-
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-#if defined(EIGEN_HIPCC)
-__launch_bounds__(256, 1)
-#else
-__launch_bounds__(256)
-#endif
-EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ float2 lhs_shmem[64*32];
-  __shared__ float2 rhs_shmem[128*8];
-
-  typedef float2 LHS_MEM[64][32];
-  typedef float2 RHS_MEM[128][8];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 128 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  bool check_rhs = (base_n + 63) >= n_size;
-  bool check_lhs128 = (base_m + 127) >= m_size;
-
-  if (!check_rhs) {
-    if (!check_lhs128) {
-      // >= 128 rows left
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    }
-  } else {
-    if (!check_lhs128) {
-      // >= 128 rows left
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
-                     lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n);
-    }
-  }
-}
-
-template<typename Index, typename LhsMapper,
-         typename RhsMapper, typename OutputMapper>
-__global__ void
-#if defined(EIGEN_HIPCC)
-__launch_bounds__(256, 1)
-#else
-__launch_bounds__(256)
-#endif
-EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs,
-                       const OutputMapper output,
-                       const Index m_size, const Index n_size, const Index k_size) {
-  __shared__ float2 lhs_shmem[32][16];
-  __shared__ float2 rhs_shmem[64][8];
-
-  const Index m_block_idx = blockIdx.x;
-  const Index n_block_idx = blockIdx.y;
-
-  const Index base_m = 64 * m_block_idx;
-  const Index base_n = 64 * n_block_idx;
-
-  if (base_m + 63 < m_size) {
-    if (base_n + 63 < n_size) {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    }
-  } else {
-    if (base_n + 63 < n_size) {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    } else {
-      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
-    }
-  }
-}
-
-
-template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> :
-    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
-
-  typedef GpuDevice Device;
-
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
-  typedef TensorContractionEvaluatorBase<Self> Base;
-
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
-  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
-  typedef typename XprType::Index Index;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
-
-  enum {
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-  };
-
-  // Most of the code is assuming that both input tensors are ColMajor. If the
-  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
-  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
-  // will pretend B is LHS and A is RHS.
-  typedef typename internal::conditional<
-    static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType;
-  typedef typename internal::conditional<
-    static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType;
-
-  static const int LDims =
-      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
-  static const int RDims =
-      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
-  static const int ContractDims = internal::array_size<Indices>::value;
-
-  typedef array<Index, LDims> left_dim_mapper_t;
-  typedef array<Index, RDims> right_dim_mapper_t;
-
-  typedef array<Index, ContractDims> contract_t;
-  typedef array<Index, LDims - ContractDims> left_nocontract_t;
-  typedef array<Index, RDims - ContractDims> right_nocontract_t;
-
-  static const int NumDims = LDims + RDims - 2 * ContractDims;
-
-  typedef DSizes<Index, NumDims> Dimensions;
-
-  // typedefs needed in evalTo
-  typedef typename internal::remove_const<typename EvalLeftArgType::Scalar>::type LhsScalar;
-  typedef typename internal::remove_const<typename EvalRightArgType::Scalar>::type RhsScalar;
-
-  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
-  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
-
-  typedef typename LeftEvaluator::Dimensions LeftDimensions;
-  typedef typename RightEvaluator::Dimensions RightDimensions;
-
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) :
-      Base(op, device)
-  {
-    EIGEN_STATIC_ASSERT( (internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
-                          GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
-  }
-
-  // We need to redefine this method to make nvcc happy
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
-    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
-    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
-    if (data) {
-      evalTo(data);
-      return false;
-    } else {
-      this->m_result = static_cast<Scalar *>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
-      evalTo(this->m_result);
-      return true;
-    }
-  }
-
-  void evalTo(Scalar* buffer) const {
-    if (this->m_lhs_inner_dim_contiguous) {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<true, true, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<true, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<true, false, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<true, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-    else {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<false, true, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<false, true, false, Unaligned>(buffer);
-        }
-      }
-      else {
-       if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<false, false, true, Unaligned>(buffer);
-        }
-        else {
-          evalTyped<false, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-  }
-
-  template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels {
-    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
-    const Index m_blocks = (m + 63) / 64;
-    const Index n_blocks = (n + 63) / 64;
-    const dim3 num_blocks(m_blocks, n_blocks, 1);
-    const dim3 block_size(8, 8, 8);
-    LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-    }
-  };
-
-  template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper> struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
-    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) {
-      if (m < 768 || n < 768) {
-        const Index m_blocks = (m + 63) / 64;
-        const Index n_blocks = (n + 63) / 64;
-        const dim3 num_blocks(m_blocks, n_blocks, 1);
-        const dim3 block_size(16, 16, 1);
-        LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-      } else {
-        const Index m_blocks = (m + 127) / 128;
-        const Index n_blocks = (n + 63) / 64;
-        const dim3 num_blocks(m_blocks, n_blocks, 1);
-        const dim3 block_size(8, 32, 1);
-        LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k);
-      }
-    }
-  };
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  void evalTyped(Scalar* buffer) const {
-    // columns in left side, rows in right side
-    const Index k = this->m_k_size;
-    EIGEN_UNUSED_VARIABLE(k)
-
-    // rows in left side
-    const Index m = this->m_i_size;
-
-    // columns in right side
-    const Index n = this->m_j_size;
-
-    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar)
-    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
-
-    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
-                                                   LeftEvaluator, left_nocontract_t,
-                                                   contract_t, 4,
-                                                   lhs_inner_dim_contiguous,
-                                                   false, Unaligned> LhsMapper;
-
-    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
-                                                   RightEvaluator, right_nocontract_t,
-                                                   contract_t, 4,
-                                                   rhs_inner_dim_contiguous,
-                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
-
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
-
-    // initialize data mappers
-    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
-                  this->m_left_contracting_strides, this->m_k_strides);
-
-    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
-                  this->m_right_contracting_strides, this->m_k_strides);
-
-    OutputMapper output(buffer, m);
-
-#if defined(EIGEN_USE_HIP)
-    setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
-#else
-    setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
-#endif
-
-    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output,  m, n, k, this->m_device);
-  }
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_USE_GPU and EIGEN_GPUCC
-#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index 9ab900b4a..c28a10dd4 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -22,19 +22,8 @@ enum {
 /*
  * Implementation of the Eigen blas_data_mapper class for tensors.
  */
-/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which
-/// is scalar * for CoeffLoader.
-template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer>
-struct CoeffLoader;
 
-template <typename Scalar, typename Index, int side, typename Tensor,
-          typename nocontract_t, typename contract_t, int packet_size,
-          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
-          template <class> class MakePointer_ = MakePointer>
-class BaseTensorContractionMapper;
-
-template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_>
-struct CoeffLoader {
+template <typename Tensor, bool HasRawAccess> struct CoeffLoader {
   enum {
     DirectOffsets = false
   };
@@ -45,12 +34,6 @@ struct CoeffLoader {
     eigen_assert(false && "unsupported");
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
-  data() const {
-    eigen_assert(false && "unsupported");
-    return NULL;
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); }
 
  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -59,19 +42,12 @@ struct CoeffLoader {
     return m_tensor.template packet<LoadMode>(index);
   }
 
-  #ifdef EIGEN_USE_SYCL
-  // The placeholder accessors require to be bound to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_tensor.bind(cgh);
-  }
-  #endif
 
  private:
   const Tensor m_tensor;
 };
 
-template <typename Tensor, template <class> class MakePointer_>
-struct CoeffLoader<Tensor, true, MakePointer_> {
+template <typename Tensor> struct CoeffLoader<Tensor, true> {
   enum {
     DirectOffsets = true
   };
@@ -82,11 +58,6 @@ struct CoeffLoader<Tensor, true, MakePointer_> {
     m_data += offset;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type
-  data() const {
-    return m_data;
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); }
 
  template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -94,23 +65,15 @@ struct CoeffLoader<Tensor, true, MakePointer_> {
   {
     return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
   }
-
-  #ifdef EIGEN_USE_SYCL
-  // The placeholder accessors require to be bound to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_data.bind(cgh);
-  }
-  #endif
  private:
   typedef typename Tensor::Scalar Scalar;
-
-  typename MakePointer_<const Scalar>::Type m_data;
+  const Scalar* m_data;
 };
 
 template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
-         int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
+         int packet_size, bool inner_dim_contiguous, int Alignment>
 class SimpleTensorContractionMapper {
   public:
   EIGEN_DEVICE_FUNC
@@ -126,7 +89,7 @@ class SimpleTensorContractionMapper {
       m_k_strides(k_strides) { }
 
   enum {
-    DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets
+    DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess>::DirectOffsets
   };
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
@@ -153,7 +116,6 @@ class SimpleTensorContractionMapper {
     EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
     Index nocontract_val = left ? row : col;
     Index linidx = 0;
-    EIGEN_UNROLL_LOOP
     for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
       const Index idx = nocontract_val / m_ij_strides[i];
       linidx += idx * m_nocontract_strides[i];
@@ -170,7 +132,6 @@ class SimpleTensorContractionMapper {
 
     Index contract_val = left ? col : row;
     if(array_size<contract_t>::value > 0) {
-      EIGEN_UNROLL_LOOP
       for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
         const Index idx = contract_val / m_k_strides[i];
         linidx += idx * m_contract_strides[i];
@@ -195,7 +156,6 @@ class SimpleTensorContractionMapper {
     Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
     Index linidx[2] = {0, 0};
     if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
-      EIGEN_UNROLL_LOOP
       for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
         const Index idx0 = nocontract_val[0] / m_ij_strides[i];
         const Index idx1 = nocontract_val[1] / m_ij_strides[i];
@@ -216,7 +176,6 @@ class SimpleTensorContractionMapper {
 
     Index contract_val[2] = {left ? col : row, left ? col : row + distance};
     if (array_size<contract_t>::value> 0) {
-      EIGEN_UNROLL_LOOP
       for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
         const Index idx0 = contract_val[0] / m_k_strides[i];
         const Index idx1 = contract_val[1] / m_k_strides[i];
@@ -248,41 +207,24 @@ class SimpleTensorContractionMapper {
     return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
   }
 
-  #ifdef EIGEN_USE_SYCL
-  // The placeholder accessors require to be bound to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_tensor.bind(cgh);
-  }
-  #endif
-
-  const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const {
-    return m_tensor;
-  }
-
-  const nocontract_t& nocontract_strides() const {
-    return m_nocontract_strides;
-  }
-  const nocontract_t& ij_strides() const { return m_ij_strides; }
-  const contract_t& contract_strides() const { return m_contract_strides; }
-  const contract_t& k_strides() const { return m_k_strides; }
-
  protected:
-  CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
+  CoeffLoader<Tensor, Tensor::RawAccess> m_tensor;
   const nocontract_t m_nocontract_strides;
   const nocontract_t m_ij_strides;
   const contract_t m_contract_strides;
   const contract_t m_k_strides;
 };
 
+
 template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          int packet_size, bool inner_dim_contiguous,
-         bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
-class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_>
+         bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment>
 {
  public:
-  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, Alignment> ParentMapper;
 
   EIGEN_DEVICE_FUNC
   BaseTensorContractionMapper(const Tensor& tensor,
@@ -292,11 +234,12 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
                               const contract_t& k_strides) :
   ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
 
-  template <typename PacketT,int AlignmentType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  typename internal::enable_if<internal::unpacket_traits<PacketT>::size==packet_size,PacketT>::type
-  load(Index i, Index j) const
-  {
+  typedef typename Tensor::PacketReturnType Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+
+  template <int AlignmentType>
+  EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
     // whole method makes column major assumption
 
     // don't need to add offsets for now (because operator handles that)
@@ -311,7 +254,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
 
     const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
     const Index first = indexPair.first;
-    const Index lastIdx = indexPair.second;
+    const Index last = indexPair.second;
 
     // We can always do optimized packet reads from left hand side right now, because
     // the vertical matrix dimension on the left hand side is never contracting.
@@ -319,7 +262,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
     // been shuffled first.
     if (Tensor::PacketAccess &&
         (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
-        (lastIdx - first) == (packet_size - 1)) {
+        (last - first) == (packet_size - 1)) {
 
       return this->m_tensor.template packet<AlignmentType>(first);
     }
@@ -327,44 +270,31 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper<Scalar,
     EIGEN_ALIGN_MAX Scalar data[packet_size];
 
     data[0] = this->m_tensor.coeff(first);
-    EIGEN_UNROLL_LOOP
     for (Index k = 1; k < packet_size - 1; k += 2) {
       const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
       data[k] = this->m_tensor.coeff(internal_pair.first);
       data[k + 1] = this->m_tensor.coeff(internal_pair.second);
     }
-    data[packet_size - 1] = this->m_tensor.coeff(lastIdx);
+    data[packet_size - 1] = this->m_tensor.coeff(last);
 
-    return pload<PacketT>(data);
+    return pload<Packet>(data);
   }
 
-  template <typename PacketT,int AlignmentType>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  typename internal::enable_if<internal::unpacket_traits<PacketT>::size!=packet_size,PacketT>::type
-  load(Index i, Index j) const
-  {
-    const Index requested_packet_size = internal::unpacket_traits<PacketT>::size;
-    EIGEN_ALIGN_MAX Scalar data[requested_packet_size];
-
-    const IndexPair<Index> indexPair = this->computeIndexPair(i, j, requested_packet_size - 1);
-    const Index first = indexPair.first;
-    const Index lastIdx = indexPair.second;
-
-    data[0] = this->m_tensor.coeff(first);
-    for (Index k = 1; k < requested_packet_size - 1; k += 2) {
-      const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
-      data[k] = this->m_tensor.coeff(internal_pair.first);
-      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
-    }
-    data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx);
-
-    return pload<PacketT>(data);
-  }
-
-  template <typename PacketT,int AlignmentType>
+  template <int AlignmentType>
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
-    return this->load<PacketT,AlignmentType>(i,j);
+  EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const {
+    // whole method makes column major assumption
+
+    // don't need to add offsets for now (because operator handles that)
+    const Index half_packet_size = unpacket_traits<HalfPacket>::size;
+    if (half_packet_size == packet_size) {
+      return loadPacket<AlignmentType>(i, j);
+    }
+    EIGEN_ALIGN_MAX Scalar data[half_packet_size];
+    for (Index k = 0; k < half_packet_size; k++) {
+      data[k] = operator()(i + k, j);
+    }
+    return pload<HalfPacket>(data);
   }
 };
 
@@ -373,12 +303,11 @@ template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          bool inner_dim_contiguous,
-         bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
-class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
-  : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_>
+         bool inner_dim_reordered, int Alignment>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, inner_dim_reordered, Alignment> : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment>
 {
  public:
-  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment, MakePointer_> ParentMapper;
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous, Alignment> ParentMapper;
 
   EIGEN_DEVICE_FUNC
   BaseTensorContractionMapper(const Tensor& tensor,
@@ -388,17 +317,16 @@ class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, con
                               const contract_t& k_strides) :
   ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) { }
 
-  template <typename PacketT,int> EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+  typedef typename Tensor::PacketReturnType Packet;
+  template <int> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const {
     EIGEN_ALIGN_MAX Scalar data[1];
     data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
-    return pload<PacketT>(data);
+    return pload<typename Tensor::PacketReturnType>(data);
   }
-  template <typename PacketT,int> EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
-    EIGEN_ALIGN_MAX Scalar data[1];
-    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
-    return pload<PacketT>(data);
+  template <int AlignmentType> EIGEN_DEVICE_FUNC
+  EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const {
+    return loadPacket<AlignmentType>(i, j);  // packet_size == 1: returns the full Packet; loadPacket's int argument cannot be deduced, so it is passed explicitly
+  }
 };
 
@@ -407,12 +335,14 @@ template<typename Scalar, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          int packet_size,
-         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_=MakePointer>
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
 class TensorContractionSubMapper {
  public:
+  typedef typename Tensor::PacketReturnType Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
 
-  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> ParentMapper;
-  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Self;
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> ParentMapper;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Self;
   typedef Self LinearMapper;
 
   enum {
@@ -444,32 +374,27 @@ class TensorContractionSubMapper {
     return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
   }
 
-  template <typename PacketT>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const {
     if (UseDirectOffsets) {
-      return m_base_mapper.template loadPacket<PacketT,Alignment>(i, 0);
+      return m_base_mapper.template loadPacket<Alignment>(i, 0);
     }
-    return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, m_horiz_offset);
+    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<Alignment>(i, j);
+    }
+    return m_base_mapper.template loadPacket<Alignment>(i + m_vert_offset, j + m_horiz_offset);
   }
 
-  template <typename PacketT>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const {
     if (UseDirectOffsets) {
-      return m_base_mapper.template loadPacket<PacketT,Alignment>(i, j);
+      return m_base_mapper.template loadHalfPacket<Alignment>(i, 0);
     }
-    return m_base_mapper.template loadPacket<PacketT,Alignment>(i + m_vert_offset, j + m_horiz_offset);
+    return m_base_mapper.template loadHalfPacket<Alignment>(i + m_vert_offset, m_horiz_offset);
   }
 
-  template <typename PacketT, int AlignmentType>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
-    if (UseDirectOffsets) {
-      return m_base_mapper.template load<PacketT,AlignmentType>(i, j);
-    }
-    return m_base_mapper.template loadPacket<PacketT,AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
-  }
-
-  template <typename PacketT>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const {
     if (UseDirectOffsets) {
       m_base_mapper.storePacket(i, 0, p);
     }
@@ -485,30 +410,19 @@ class TensorContractionSubMapper {
 
   template <typename PacketT, int AlignmentType>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
-    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, Packet>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
     if (UseDirectOffsets) {
-     return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i, 0);
+     return m_base_mapper.template loadPacket<ActualAlignment>(i, 0);
     }
-    return m_base_mapper.template loadPacket<PacketT,ActualAlignment>(i + m_vert_offset, m_horiz_offset);
+    return m_base_mapper.template loadPacket<ActualAlignment>(i + m_vert_offset, m_horiz_offset);
   }
 
-  template <typename PacketT>
+  template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
     return false;
   }
 
-  #ifdef EIGEN_USE_SYCL
-  // The placeholder accessors require to be bound to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_base_mapper.bind(cgh);
-  }
-  #endif
-
-  const ParentMapper& base_mapper() const { return m_base_mapper; }
-  Index vert_offset() const { return m_vert_offset; }
-  Index horiz_offset() const { return m_horiz_offset; }
-
  private:
   ParentMapper m_base_mapper;
   const Index m_vert_offset;
@@ -520,14 +434,14 @@ template<typename Scalar_, typename Index, int side,
          typename Tensor,
          typename nocontract_t, typename contract_t,
          int packet_size,
-         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,  template <class> class MakePointer_=MakePointer>
+         bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
 class TensorContractionInputMapper
-  : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
+  : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> {
 
  public:
   typedef Scalar_ Scalar;
-  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> Base;
-  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> SubMapper;
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> Base;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size, inner_dim_contiguous, inner_dim_reordered, Alignment> SubMapper;
   typedef SubMapper VectorMapper;
 
   EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor,
@@ -545,29 +459,9 @@ class TensorContractionInputMapper
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
     return VectorMapper(*this, i, j);
   }
-  
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const {
-    return Base::m_tensor;
-  }
 };
 
 
-template <typename T> struct TensorContractionInputMapperTrait;
-
-template<typename Scalar_, typename Index_, int side_,
-         typename Tensor_,
-         typename nocontract_t_, typename contract_t_,
-         int packet_size_,
-         bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_,  template <class> class MakePointer_>
-struct TensorContractionInputMapperTrait<TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_, 
-                                                    nocontract_t_, contract_t_, packet_size_, inner_dim_contiguous_, 
-                                                    inner_dim_reordered_, Alignment_, MakePointer_> > {
-
-      typedef Tensor_ XprType;
-      static const bool  inner_dim_contiguous = inner_dim_contiguous_;
-      static const bool  inner_dim_reordered = inner_dim_reordered_;
-  };  
-
 
 }  // end namespace internal
 }  // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
deleted file mode 100644
index a6ca1777a..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
+++ /dev/null
@@ -1,1650 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
-// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorContractionSycl.h
- *
- * \brief:
- *  TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend
- *
- *****************************************************************/
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
-
-namespace Eigen {
-
-namespace TensorSycl {
-namespace internal {
-
-#ifndef EIGEN_SYCL_DISABLE_GEMV
-/*!
- * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector
- * contraction kernel on various hardware devices.
- *
- * \tparam Scalar: determines the element type of the tensor/vector
- *
- * \tparam StorageIndex  determines the Index type.
- *
- * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group
- *
- * \tparam CFactor: determines the number of contracting element to be process by each thread
- *
- * \tparam NCFactor: determines the number of non-contracting element to be process by each thread
- */
-template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
-struct TVPanelSize {
-  // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
-  // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
-  // TileSizeDimNC: determines the tile size for the non-contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor;
-  // TileSizeDimC: determines the tile size for the contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
-  // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
-  // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
-  // BC : determines if supporting bank conflict is required
-  static EIGEN_CONSTEXPR bool BC = false;
-};
-#endif
-
-/*!
- * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor
- contraction kernel on various hardware devices.
- *
- * \tparam Scalar: determines the element type of the tensor
- *
- * \tparam StorageIndex: determines the Index type.
- *
- * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the
- available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro).
- *
- * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the
- available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro).
- *
- * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered
- */
-
-template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
-struct TTPanelSize {
-  // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered
-  static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK;
-  // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the
-  // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro//
-#ifndef EIGEN_SYCL_REG_M
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
-#else
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
-#endif
-// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the
-// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro
-#ifndef EIGEN_SYCL_REG_N
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
-#else
-  static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
-#endif
-  // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension
-  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
-  // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension
-  static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
-  // TileSizeDimM: determines the tile size for the m dimension
-  static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
-  // TileSizeDimN: determines the tile size for the n dimension
-  static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
-  // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisable by packetsize
-  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs =
-      ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN));
-  // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisable by packetsize
-  static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs =
-      ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM));
-  // BC : determines if supporting bank conflict is required
-  static EIGEN_CONSTEXPR bool BC = true;
-  // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by
-  // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device doesnot have sufficient  local memory)
-  static EIGEN_CONSTEXPR bool DoubleBuffer =
-#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
-      false;
-#else
-      true;
-#endif
-};
-
-/* !
- * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
- * specialize the contraction algorithm based on device support for dedicated local memory.
- */
-enum class contraction_type { local, no_local };
-/* !
- * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private).
- */
-enum class data_source { global_mem, local_mem, private_mem };
-
-/*!
- * \brief read, a template function used for loading the data from global
- memory. This function is used to guarantee coalesced and vectorized load whenever possible
- *
- * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
- *
- * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
- vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
- contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
- when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
- *
- * \tparam PacketType:  determines the type of packet
- *
- * \tparam TensorMapper: determines the input tensor mapper type
- *
- * \tparam StorageIndex: determines the Index type
-
- * \param tensorMapper: is the input tensor
- *
- * \param NCIndex: is the non-contracting dim index
- *
- * \param CIndex is the contracting dim index
- *
- * \param ld: is the leading dimension of the flattened tensor
- */
-template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
-          typename StorageIndex>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<PacketLoad, PacketType>::type read(
-    const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
-  const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
-  const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
-  return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
-}
-
-/*!
- * \brief read, special overload of read function, when the read access is not vectorized
- *
- * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
- *
- * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
-  vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
-  contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
-  when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
- *
- * \tparam PacketType: determines the type of packet
- *
- * \tparam TensorMapper: determines the input tensor mapper type
- *
- * \tparam StorageIndex: determines the Index type
-
- * \param tensorMapper: is the input tensor
- *
- * \param NCIndex: is the non-contracting dim index
- *
- * \param CIndex: is the contracting dim index
- */
-template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!PacketLoad, PacketType>::type read(
-    const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
-  const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
-  const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
-  return tensorMapper(row, col);
-}
-
-/*!
- * \brief write, a template function used for storing the data to local memory. This function is used to guarantee
- * coalesced and vectorized store whenever possible.
- *
- * \tparam StorageIndex: determines the Index type
- *
- * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory
- *
- * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
- *
- * \tparam PacketType:  determines the type of packet
- *
- * \tparam DataScalar: determines the output data type
- *
- * \param packet_data: the data to be written in the local memory
- *
- * \param ptr: a pointer to the local memory
- *
- * \param CIndex is the contracting dim index
- */
-
-template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    typename ::Eigen::internal::enable_if<dt != data_source::global_mem, void>::type
-    write(PacketType &packet_data, DataScalar ptr) {
-  EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
-  EIGEN_UNROLL_LOOP
-  for (int i = 0; i < PacketSize; i++) {
-    *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data);
-    ptr += ld;
-  }
-}
-
-/*!
- * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function
- * is used to guarantee coalesced and vectorized store whenever possible.
- *
- * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
- *
- * \tparam PacketType:  determines the type of packet
- *
- * \tparam DataScalar: determines the output data type
- *
- * \param packet_data: the data to be written in the local memory
- *
- * \param ptr: a pointer to the local memory
- */
-
-template <data_source dt, typename PacketType, typename DataScalar>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
-    Eigen::internal::unpacket_traits<PacketType>::size != 1 && dt == data_source::global_mem, void>::type
-write(PacketType &packet_data, DataScalar *ptr) {
-  ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
-}
-
-/*!
- * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled.
- *
- * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
- *
- * \tparam PacketType:  determines the type of packet
- *
- * \tparam DataScalar: determines the output data type
- *
- * \param packet_data: the data to be written in the local memory
- *
- * \param ptr: a pointer to the local memory
- */
-template <data_source dt, typename PacketType, typename DataScalar>
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<
-    Eigen::internal::unpacket_traits<PacketType>::size == 1 && dt == data_source::global_mem, void>::type
-write(PacketType &packet_data, DataScalar *ptr) {
-  *ptr = packet_data;
-}
-
-/*!
- * \brief check_boundary: is used to check the edge condition for non-internal blocks.
- *
- * \tparam is_internal: determines if the block is internal
- */
-template <bool is_internal>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) {
-  return true;
-}
-
-/*!
- * \brief check_boundary: specialization of the check_boundary for non-internal blocks.
- *
- * \param cond: true when the data is in range. Otherwise false
- */
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) {
-  return cond;
-}
-
-/*!
- * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed
- * by each workgroup.
- *
- * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed
- *
- * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode
- *
- * \tparam PacketType:  determines the type of packet
- *
- * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be
- * packetType; Otherwise it will be scalar Type
- *
- * \param elements_per_access determines the size of each element based on OutType
- *
- * \param is_coalesced_layout  determines whether or not the Tensor data in a memory can be access coalesced and
- * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
- * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
- * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
- *
- * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the
- * Tensor Block for each workgroup
- *
- * \param c_stride  determines the stride of contracting dimension to access the next adjustment element within the
- * Tensor Block for each workgroup
- */
-template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
-struct BlockProperties {
-  static EIGEN_CONSTEXPR bool packet_load = packet_load_;
-  typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar;
-  static EIGEN_CONSTEXPR bool is_rhs = is_rhs_;
-  typedef typename Eigen::internal::conditional<packet_load, PacketType, OutScalar>::type OutType;
-  static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
-  static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs);
-  static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
-  static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
-};
-
-/*!
- * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup.  Please see
- * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup,
- * work-items
- *
- * \tparam StorageIndex: determines the StorageIndex Type
- *
- * \param linearLocalThreadId: determines the linearized location of a thread within a work-group
- *
- * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when
- * tall/skinny algorithm is used
- *
- * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of
- * the flattened tensor.
- *
- * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the
- * flattened tensor. It will be > 1 when tall/skinny algorithm is used.
- *
- * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a
- * flattened tensor. The position determines the distance of each thread within the workgroup from each other
- * independent from their global position.
- *
- * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a
- * flattened tensor. The position determines the distance of each thread within the workgroup from each other
- * independent from their global position.
- *
- * \param mGlobalOffset: determines the logical start position of each thread a thread for the m dimension on a
- * flattened tensor
- *
- * \param nGlobalOffset: determines the logical start position of each thread a thread for the n dimension on a
- * flattened tensor
- *
- * \param kSize : determine the number of the k elements of the flattened Tensor to be processed by each thread for the
- * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used.
- *
- * \param is_internal : this will determined if the thread within the work-group computes an internal block of tensor or
- * the edge blocks. When it is internal, there is no need to check the boundaries and all the if stantement can be
- * resolve by compiler.
- */
-template <typename StorageIndex>
-struct ThreadProperties {
-  const StorageIndex linearLocalThreadId;
-  const StorageIndex kGroupId;
-  const StorageIndex mGroupOffset;
-  const StorageIndex nGroupOffset;
-  const StorageIndex kGroupOffset;
-  const StorageIndex mLocalOffset;
-  const StorageIndex nLocalOffset;
-  const StorageIndex mGlobalOffset;
-  const StorageIndex nGlobalOffset;
-  StorageIndex kSize;
-  const bool is_internal;
-  // this is used to adjust the last block
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
-      const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
-      const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
-      const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
-      StorageIndex kSize_, const bool is_internal_)
-      : linearLocalThreadId(linearLocalThreadId_),
-        kGroupId(kGroupId_),
-        mGroupOffset(mGroupOffset_),
-        nGroupOffset(nGroupOffset_),
-        kGroupOffset(kGroupOffset_),
-        mLocalOffset(mLocalOffset_),
-        nLocalOffset(nLocalOffset_),
-        mGlobalOffset(mGlobalOffset_),
-        nGlobalOffset(nGlobalOffset_),
-        kSize(kSize_),
-        is_internal(is_internal_) {}
-};
-
-/*!
- * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation.
- *
- * \tparam OutScalar: determines the output scalar type
- *
- * \tparam LhsScalar: determines the left-hand-side scalar type
- *
- * \tparam RhsScalar: determines the right-hand-side scalar type
- *
- * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
- (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
- *
- * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix
- *
- * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix
- *
- * \tparam StorageIndex: determines the StorageIndex Type
- *
- * \tparam Properties: determines the Contraction Panel properties
- *
- * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
- *
- * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
- *
- * \tparam input_mapper_properties : determine if the input tensors are matrix. If they are matrix, special memory
- access is used to guarantee that always the memory access are coalesced.
- *
- * \tptaram IsFinal : determine if this is the final kernel. If so, the result will be written in a final output.
- Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny
- contraction is used. So in this case, a final reduction step is required to compute final output.
-
- * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of
- the algorithm to be used
- *
- * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
- *
- * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
- *
- * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
- *
- * \param out_res: determines the output tensor containing the contraction result
- *
- * \param groupSizeM: a logical number determining the number of work-group for m dimension
- *
- * \param groupSizeN: a logical number determining the number of work-group for n dimension
- *
- * \param numTiles: determines total number of tiles on the k dimension
- *
- * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
- */
-template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
-          typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
-          typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
-class TensorContractionKernel {
- public:
-  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
-      PacketReturnType;
-  static EIGEN_CONSTEXPR int PacketSize =
-      Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
-  static EIGEN_CONSTEXPR bool is_lhs_transposed =
-      !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous;
-  static EIGEN_CONSTEXPR bool is_rhs_transposed =
-      !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous;
-
-  typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
-                          PacketReturnType>
-      LHSBlockProperties;
-
-  typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
-                          PacketReturnType>
-      RHSBlockProperties;
-
-  static EIGEN_CONSTEXPR StorageIndex NStride =
-      contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
-
-  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
-  typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
-  typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
-  typedef
-      typename ::Eigen::internal::conditional<contraction_tp == contraction_type::local, local_ptr, private_ptr>::type
-          tile_ptr;
-  static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local
-                                                 ? Properties::TileSizeDimM + Properties::BC
-                                                 : Properties::WorkLoadPerThreadM;
-  static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local
-                                                 ? Properties::TileSizeDimN + Properties::BC
-                                                 : Properties::WorkLoadPerThreadN;
-  static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
-
-  /**
-   * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not
-   * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to
-   * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting
-   * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out
-   * different type of memory needed when local/no_local memory computation is called.
-   *
-   * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation
-   of the algorithm to be used
-   * \tparam the private memory size
-   * \param ptr the tile memory pointer type
-   */
-  template <contraction_type, StorageIndex>
-  struct MemHolder {
-    tile_ptr ptr;
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
-  };
-  /**
-   * \brief specialization of memHolder class when no local memory kernel is used.
-   */
-  template <StorageIndex MemSize>
-  struct MemHolder<contraction_type::no_local, MemSize> {
-    OutScalar ptr[MemSize] = {OutScalar{0}};
-  };
-  /**
-   * \brief TiledMemory: contains required memory pointer for loading  each tile of the TensorContraction panel from
-   * global memory to local/private memory when local/no_local algorithm used.
-   *
-   * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the
-   * selected contraction_type.
-   *
-   * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the
-   * selected contraction_type.
-   *
-   * \param lhs_extract_index: determins the position of each thread on a local memory for lhs input. When private
-   * memory is used this is set to zero as this is not applicable in case of private memory.
-   *
-   * \param rhs_extract_index: determins the position of each thread on a local memory for rhs input. When private
-   * memory is used this is set to zero as this is not applicable in case of private memory.
-   *
-   * \param lhs_scratch_compute : determines the  location to load for computation for lhs_local memory. This is the
-   * same as lhs_scratch_extract for private memory.
-   *
-   * \param rhs_scratch_compute : determines the  location to load for computation for rhs_local memory. This is the
-   * same as rhs_scratch_extract for private memory.
-   */
-  struct TiledMemory {
-    MemHolder<contraction_tp, Properties::WorkLoadPerThreadM * Properties::TileSizeDimK> lhs_scratch_extract;
-    MemHolder<contraction_tp, Properties::WorkLoadPerThreadN * Properties::TileSizeDimK> rhs_scratch_extract;
-    tile_ptr lhs_scratch_ptr_compute;
-    tile_ptr rhs_scratch_ptr_compute;
-    const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
-    const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
-    template <contraction_type tp = contraction_tp>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    TiledMemory(const ThreadProperties<StorageIndex> &, local_ptr,
-                typename ::Eigen::internal::enable_if<tp == contraction_type::no_local>::type * = 0)
-        : lhs_scratch_extract{},
-          rhs_scratch_extract{},
-          lhs_scratch_ptr_compute(lhs_scratch_extract.ptr),
-          rhs_scratch_ptr_compute(rhs_scratch_extract.ptr),
-          lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
-          rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
-
-    template <contraction_type tp = contraction_tp>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    TiledMemory(const ThreadProperties<StorageIndex> &thread_properties, local_ptr block_start_ptr,
-                typename ::Eigen::internal::enable_if<tp == contraction_type::local>::type * = 0)
-        : lhs_scratch_extract{block_start_ptr},
-          rhs_scratch_extract{lhs_scratch_extract.ptr +
-                              ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
-          lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
-          rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
-          lhs_extract_index(
-              local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
-          rhs_extract_index(
-              local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
-  };
-
-  Scratch scratch;
-  const LhsMapper lhs;
-  const RhsMapper rhs;
-  OutAccessor out_res;
-  const StorageIndex groupSizeM;
-  const StorageIndex groupSizeN;
-  const StorageIndex numTiles;
-  const TripleDim triple_dim;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
-                                                                const RhsMapper rhs_, OutAccessor out_res_,
-                                                                const StorageIndex groupSizeM_,
-                                                                const StorageIndex groupSizeN_,
-                                                                const StorageIndex numTiles_,
-                                                                const TripleDim triple_dim_)
-      : scratch(scratch_),
-        lhs(lhs_),
-        rhs(rhs_),
-        out_res(out_res_),
-        groupSizeM(groupSizeM_),
-        groupSizeN(groupSizeN_),
-        numTiles(numTiles_),
-        triple_dim(triple_dim_) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
-                                                                const RhsMapper rhs_, OutAccessor out_res_,
-                                                                const StorageIndex groupSizeM_,
-                                                                const StorageIndex numTiles_,
-                                                                const TripleDim triple_dim_)
-      : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
-    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
-    const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
-    const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
-    const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
-    const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
-    const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
-    const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
-    const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
-    const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
-    const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
-    const StorageIndex nLocalOffset = NStride * nLocalThreadId;
-    const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
-    const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
-
-    const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
-    StorageIndex kGroupOffset = kGroupId * kSizePerWG;
-    const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
-                             triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
-                             triple_dim.K - kGroupOffset >= kSizePerWG;
-    // this is used to adjust the last block
-    StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
-    // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to
-    // tile
-    kGroupOffset += kSize;
-
-    auto thread_properties =
-        ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
-                                       mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
-
-    auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
-
-    (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
-                                    : compute_panel<false>(itemID, thread_properties, out_ptr);
-  }
-  // The compute block computes the contraction operation private block for each thread and store the resutl in the
-  // privateRes memory of Each computation the compute block function is independent of local and no local concepts as
-  // it only compute the block on each thread's private memory space
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
-                                                                    PacketReturnType *privateRes) {
-    StorageIndex idx = 0;
-    EIGEN_CONSTEXPR StorageIndex lhs_stride =
-        contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
-    EIGEN_UNROLL_LOOP
-    for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
-      auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
-      StorageIndex lhs_index = 0;
-      EIGEN_UNROLL_LOOP
-      for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
-        PacketReturnType lhsPack{};
-        Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::set_packet(lhsPack,
-                                                                                             lhs_block_ptr + lhs_index);
-        privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
-
-        lhs_index += lhs_stride;
-        idx++;
-      }
-    }
-  }
-  // The store function write the computed contraction operation in the private memory of each thread to the global
-  // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base
-  // class.
-  template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
-                                                   StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) {
-    auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
-      return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
-    };
-    // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is
-    // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId*
-    // WorkLoadPerThreadN slice of N
-    EIGEN_CONSTEXPR StorageIndex GlobalNStride =
-        contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
-    EIGEN_UNROLL_LOOP
-    for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
-      // output leading dimension
-      StorageIndex outputLD = 0;
-      // When local memory is used the PrivateNstride is always 1 because the coalesed access on N is loaded into Local
-      // memory and extracting from local to global is the same as no transposed version. However, when local memory is
-      // not used and RHS is transposed we packetize the load for RHS.
-      EIGEN_UNROLL_LOOP
-      for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
-        StorageIndex globalRow = mGlobalOffset;
-        EIGEN_UNROLL_LOOP
-        for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
-          PacketReturnType privetOut = privateRes[wLPTM];
-          if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
-            // Store the final results in C. The C matrix has always M as a first StorageIndex and N as a second
-            // StorageIndex Therefore it is always coalesced layout
-            write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
-          } else {
-            EIGEN_UNROLL_LOOP
-            for (StorageIndex mId = 0; mId < PacketSize; mId++) {
-              StorageIndex mOffset = globalRow + mId;
-              if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
-                out_ptr[mOffset + outputLD] =
-                    Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::scalarize(mId, privetOut);
-              }
-            }
-          }
-          globalRow += (PacketSize * Properties::LocalThreadSizeM);
-        }
-        outputLD += triple_dim.M;
-        privateRes += Properties::WorkLoadPerThreadM / PacketSize;
-      }
-      out_ptr += (GlobalNStride * outputLD);
-
-      nGlobalOffset += (PrivateNStride * GlobalNStride);
-    }
-  }
-  // when no local memory is used the following extract_block will be enabled
-  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
-            contraction_type contract_tp = contraction_tp>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-      typename ::Eigen::internal::enable_if<contract_tp == contraction_type::no_local>::type
-      extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
-                    const StorageIndex &ncOffset, const StorageIndex cOffset) {
-    EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC =
-        InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
-    EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC =
-        InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
-    const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
-
-    auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
-      return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
-              (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
-    };
-    const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
-    StorageIndex cIndex = cOffset;
-
-    EIGEN_UNROLL_LOOP
-    for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
-      StorageIndex ncIndex = ncOffset;
-      EIGEN_UNROLL_LOOP
-      for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
-        if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
-          auto val =
-              read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
-                   InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
-
-          write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
-                data_source::private_mem>(val, private_ptr);
-        } else {
-          EIGEN_UNROLL_LOOP
-          for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
-            const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
-            const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
-            OutScalar val =
-                (ncInd < NC && cInd < triple_dim.K)
-                    ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
-                          inpt, ncInd, cInd, ld)
-                    : OutScalar(0);
-            write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
-                  data_source::private_mem>(
-                val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
-                         ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
-          }
-        }
-
-        // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So
-        // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread.
-        ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
-                      ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
-                      : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
-        private_ptr += InputBlockProperties::nc_stride;
-      }
-      // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC
-      private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
-      cIndex += InputBlockProperties::c_stride;
-    }
-  }
-  template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
-      const StorageIndex &linearLocalThreadId) {
-    const StorageIndex localThreadNC =
-        (InputBlockProperties::is_coalesced_layout)
-            ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
-            : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
-    const StorageIndex localThreadC =
-        (InputBlockProperties::is_coalesced_layout)
-            ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
-            : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
-    return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
-  }
-
-  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-      typename ::Eigen::internal::enable_if<db && ctp == contraction_type::local>::type
-      sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
-    db_offset = !db_offset;
-  }
-
-  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-      typename ::Eigen::internal::enable_if<!db && ctp == contraction_type::local>::type
-      sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
-    itemID.barrier(cl::sycl::access::fence_space::local_space);
-  }
-
-  template <contraction_type ctp = contraction_tp>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-      typename ::Eigen::internal::enable_if<ctp == contraction_type::no_local>::type
-      sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept {
-    return;
-  }
-
-  template <bool need_sync, contraction_type ctp = contraction_tp>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-      typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::no_local>::type
-      sync_thread(const cl::sycl::nd_item<1> &
-#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
-                      itemID
-#endif
-                  ) noexcept {
-#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
-    itemID.barrier(cl::sycl::access::fence_spacce::local_space);
-#else
-    return;
-#endif
-  }
-  template <bool need_sync, contraction_type ctp = contraction_tp>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-      typename ::Eigen::internal::enable_if<need_sync && ctp == contraction_type::local>::type
-      sync_thread(const cl::sycl::nd_item<1> &itemID) {
-    itemID.barrier(cl::sycl::access::fence_space::local_space);
-  }
-  template <bool need_sync>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!need_sync>::type sync_thread(
-      const cl::sycl::nd_item<1> &) {
-    return;
-  }
-
-  template <bool is_internal_block>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
-                                                                    ThreadProperties<StorageIndex> &thread_properties,
-                                                                    TiledMemory &tiled_input_block,
-                                                                    PacketReturnType *privateRes, bool &db_offset) {
-    // Tiling the Rhs block from global to local memory
-    extract_block<RHSBlockProperties, is_internal_block>(
-        rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
-        tiled_input_block.rhs_extract_index,
-        contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
-        thread_properties.kGroupOffset - thread_properties.kSize);
-
-    sync_thread<contraction_tp == contraction_type::no_local>(itemID);
-
-    // Tiling the Lhs block from global to local memory
-    extract_block<LHSBlockProperties, is_internal_block>(
-        lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
-        tiled_input_block.lhs_extract_index,
-        contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
-        thread_properties.kGroupOffset - thread_properties.kSize);
-
-    // itemID.barrier(cl::sycl::access::fence_space::local_space);
-    sync_thread<contraction_tp == contraction_type::local>(itemID);
-    // switch to compute mede
-    StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
-    StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
-    // Loop over the values of a single tile
-    for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
-      compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
-                             tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
-      lhs_offset += LSDL;
-      rhs_offset += LSDR;
-    }
-    // computing the K index for the next tile
-    thread_properties.kSize -= Properties::TileSizeDimK;
-    sync_mem(itemID, db_offset);
-  }
-
-  // when local memory is available the following compute_panel will be enabled
-  template <bool is_internal_block, typename OutPtr>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
-                                                           ThreadProperties<StorageIndex> &thread_properties,
-                                                           OutPtr out_ptr) {
-    auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
-    // Allocate register space
-    PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
-        PacketReturnType{0}};
-    bool db_offset = 0;
-
-    while (thread_properties.kSize >= Properties::TileSizeDimK) {
-      compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
-    }
-    if (thread_properties.kSize > 0) {
-      compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
-    }
-
-    // Storing the final results in the output
-    store<is_internal_block,
-          contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
-        out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
-        thread_properties.nGlobalOffset);
-  }
-  // When local memory is available the following extract_block will be enabled
-  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
-            contraction_type contract_tp = contraction_tp>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-      typename ::Eigen::internal::enable_if<contract_tp == contraction_type::local>::type
-      extract_block(const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex>& local_index,
-                    const StorageIndex &ncOffset, const StorageIndex cOffset) {
-    EIGEN_CONSTEXPR StorageIndex TileSizeDimNC =
-        InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
-    EIGEN_CONSTEXPR StorageIndex LoadPerThread =
-        InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
-    EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
-    static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
-                   (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
-                  " LocalOffset must be divisable by stride");
-    const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
-    StorageIndex localThreadNC = local_index.first;
-    StorageIndex localThreadC = local_index.second;
-    auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
-      return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
-              (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
-    };
-    EIGEN_UNROLL_LOOP
-    for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
-      const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
-      const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
-      const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
-      if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
-        auto val =
-            read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
-                 InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
-        write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
-            val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
-                     (InputBlockProperties::c_stride * localThreadC * LSD));
-      } else {
-        EIGEN_UNROLL_LOOP
-        for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
-          const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
-          const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
-          OutScalar val =
-              (nCInd < NC && cInd < triple_dim.K)
-                  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
-                        inpt, nCInd, cInd, ld)
-                  : OutScalar(0);
-
-          write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
-              val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
-                       (InputBlockProperties::is_coalesced_layout ? i : 0) +
-                       ((InputBlockProperties::c_stride * localThreadC +
-                         (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
-                        LSD));
-        }
-      }
-      localThreadNC += (InputBlockProperties::is_coalesced_layout)
-                           ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
-                           : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
-      localThreadC += (InputBlockProperties::is_coalesced_layout)
-                          ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
-                          : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
-    }
-  }
-};
-
-#ifndef EIGEN_SYCL_DISABLE_GEMV
-
-/*!
- * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special
- * case of Tensor Tensor contraction.
- *
- * \tparam OutScalar: determines the output scalar type
- *
- * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
- * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
- *
- * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs)
- *
- * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs)
- *
- * \tparam StorageIndex: determines the StorageIndex Type
- *
- * \tparam Properties: determines the Contraction Panel properties
- *
- * \tparam KFactor: determines the number of elements in K dimension in a Tile
- *
- * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
- *
- * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector
- *
- * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output.
- * Otherwise, the result of contraction will be written iin a temporary buffer.
- *
- * \param scratch: determines the local memory containing the vector block for each work-group
- *
- * \param vec: determines the vector input (tensor mapper)
- *
- * \param mat: determines the tensor input (tensor mapper)
- *
- * \param out_res: determines the output vector containing the contraction result
- *
- * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension
- *
- * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor
- *
- * \param contractDim: determines the size of non contracting dimension for the flattened tensor
- *
- */
-template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
-          typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
-struct GeneralVectorTensor {
-  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
-      PacketReturnType;
-  static EIGEN_CONSTEXPR int PacketSize =
-      Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
-  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
-
-  static EIGEN_CONSTEXPR StorageIndex OutScratchOffset =
-      KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
-
-  // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make
-  // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true.
-  typedef BlockProperties<is_lhs_vec ? false : true, is_lhs_vec ? false : true, Vectorizable, PacketReturnType>
-      VecBlockProperties;
-
-  Scratch scratch;
-  const VectorMapper vec;
-  const TensorMapper mat;
-  OutAccessor out_res;
-  const StorageIndex nonContractGroupSize;
-  const StorageIndex nonContractDim;
-  const StorageIndex contractDim;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_,
-                                                            const TensorMapper mat_, OutAccessor out_res_,
-                                                            const StorageIndex nonContractGroupSize_,
-                                                            const StorageIndex nonContractDim_,
-                                                            const StorageIndex contractDim_)
-      : scratch(scratch_),
-        vec(vec_),
-        mat(mat_),
-        out_res(out_res_),
-        nonContractGroupSize(nonContractGroupSize_),
-        nonContractDim(nonContractDim_),
-        contractDim(contractDim_) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
-    auto scratch_ptr = scratch.get_pointer();
-    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
-    StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
-                                            : linearLocalThreadId % Properties::LocalThreadSizeNC;
-    StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
-                                         : linearLocalThreadId / Properties::LocalThreadSizeNC;
-    const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
-    const StorageIndex nonContractGroupId =
-        is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
-    const StorageIndex contractGroupId =
-        is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
-    auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim);
-
-    const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
-    const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
-    auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
-    const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
-    const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
-    auto local_output = scratch_ptr + OutScratchOffset;
-    const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
-                             contractDim - contractGroupOffset >= Properties::TileSizeDimC;
-    is_internal
-        ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
-                              scratch_ptr, contractGroupOffset,
-#endif
-                              nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
-                              nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
-        : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
-                               scratch_ptr, contractGroupOffset,
-#endif
-                               nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
-                               nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
-  }
-  template <bool is_internal_block, typename OutPtr>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(
-      const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
-      OutPtr out_ptr,
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
-      OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
-#endif
-      const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
-      StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
-      StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
-    OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
-    // Reading the vector
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
-    const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
-    extract_block<VecBlockProperties, is_internal_block, KFactor,
-                  Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
-                                                                                vectorOffset, contractDim);
-
-    itemID.barrier(cl::sycl::access::fence_space::local_space);
-    auto in_scratch_ptr = scratch_ptr + contractId;
-#endif
-
-    StorageIndex privateOffsetC = 0;
-    EIGEN_UNROLL_LOOP
-    for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
-      StorageIndex privateOffsetNC = 0;
-      bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
-      auto vecScalar = *in_scratch_ptr;
-#else
-      auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
-                           ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
-                                 is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
-                           : OutScalar(0);
-#endif
-      EIGEN_UNROLL_LOOP
-      for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
-        auto matScalar = (check_boundary<is_internal_block>(
-                             contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
-                             ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
-                                              : globalNonContractDimOffset + privateOffsetNC,
-                                   is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
-                                              : globalContractDimOffset + privateOffsetC)
-                             : OutScalar(0);
-
-        outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]);
-        privateOffsetNC += Properties::LocalThreadSizeNC;
-      }
-      privateOffsetC += Properties::LocalThreadSizeC;
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
-      in_scratch_ptr += Properties::LocalThreadSizeC;
-#endif
-    }
-
-    auto out_scratch_ptr = local_output + outScratchIndex;
-    // Each block of 16*16 element in shared memory should reduce to 16*1
-    EIGEN_UNROLL_LOOP
-    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
-      *out_scratch_ptr = outScalar[j];
-
-      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
-    }
-    if (is_lhs_vec) {
-      nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
-      contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
-      outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
-    }
-
-    out_scratch_ptr = local_output + outScratchIndex;
-    EIGEN_UNROLL_LOOP
-    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
-      EIGEN_UNROLL_LOOP
-      for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
-        itemID.barrier(cl::sycl::access::fence_space::local_space);
-        if (contractId < offset) {
-          StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset);
-          *out_scratch_ptr += out_scratch_ptr[myNeigbourId];
-        }
-      }
-      // moving to next 16 by 16 block
-      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
-    }
-
-    if (contractId == 0) {
-      out_scratch_ptr = local_output + nonContractId;
-      StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
-      out_ptr += global_final_offset;
-      EIGEN_UNROLL_LOOP
-      for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
-        if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
-          auto res = *out_scratch_ptr;
-
-          *out_ptr = res;
-          out_ptr += Properties::LocalThreadSizeNC;
-        }
-        // moving to next 16 by 16 block to ge the next 16 reduced elements
-        out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
-        if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
-      }
-    }
-  }
-
-  template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
-            typename Local>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
-                                                                  const StorageIndex &linearLocalThreadId,
-                                                                  const StorageIndex &cOffset, const StorageIndex &C) {
-    local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
-    StorageIndex cIndex = cOffset;
-    for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
-      if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
-        auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
-                        InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
-                                                                                              cIndex, StorageIndex(1));
-        write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
-      } else {
-        EIGEN_UNROLL_LOOP
-        for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
-          OutScalar val =
-              (cIndex + i < C)
-                  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
-                        inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
-                  : OutScalar(0);
-          write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
-        }
-      }
-      local_ptr += InputBlockProperties::c_stride * GroupSize;
-      cIndex += InputBlockProperties::c_stride * GroupSize;
-    }
-  }
-};
-#endif
-
-#ifndef EIGEN_SYCL_DISABLE_SCALAR
-
-/*!
- * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction
- * operation, when all the dimensions are contracting dimensions. This Kernel reduces two tensors to an scalar
- *
- * \tparam OutScalar: determines the output scalar type
- *
- * \tparam LhsScalar: determines the left-hand-side scalar type
- *
- * \tparam RhsScalar: determines the right-hand-side scalar type
- *
- * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
- * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
- *
- * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix
- *
- * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix
- *
- * \tparam StorageIndex: determines the StorageIndex Type
- *
- * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
- *
- * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
- *
- * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
- *
- * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
- *
- * \param out_res: determines the output tensor containing the contraction result
- *
- * \param rng: determins the total input data size
- */
-template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
-          typename RhsMapper, typename StorageIndex, bool Vectorizable>
-struct GeneralScalarContraction {
-  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
-  Scratch scratch;
-  const LhsMapper lhs;
-  const RhsMapper rhs;
-  OutAccessor out_res;
-  const StorageIndex rng;
-
-  EIGEN_DEVICE_FUNC
-  GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_,
-                           const StorageIndex rng_)
-      : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
-
-  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) {
-    auto out_ptr = out_res.get_pointer();
-    auto scratch_ptr = scratch.get_pointer().get();
-
-    StorageIndex globalid = itemID.get_global_id(0);
-    StorageIndex localid = itemID.get_local_id(0);
-    OutScalar accumulator = OutScalar(0);
-    for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
-      accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator);
-    }
-    auto out_scratch_ptr = scratch_ptr + localid;
-    *out_scratch_ptr = accumulator;
-    for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
-      itemID.barrier(cl::sycl::access::fence_space::local_space);
-      if (localid < offset) {
-        *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
-      }
-    }
-    if (localid == 0) {
-      out_ptr[itemID.get_group(0)] = accumulator;
-    }
-  }
-};
-#endif
-
-}  // namespace internal
-}  // namespace TensorSycl
-
-template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>,
-                       Eigen::SyclDevice>
-    : public TensorContractionEvaluatorBase<TensorEvaluator<
-          const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
-  static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
-                "SYCL tensor contraction does not support output kernels.");
-
-  typedef Eigen::SyclDevice Device;
-
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
-  typedef TensorContractionEvaluatorBase<Self> Base;
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
-  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
-  typedef typename XprType::Index StorageIndex;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef typename Base::Storage Storage;
-  typedef typename Base::EvaluatorPointerType EvaluatorPointerType;
-  struct TripleDim {
-    const StorageIndex M;
-    const StorageIndex N;
-    const StorageIndex K;
-    TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
-  };
-  enum {
-    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
-    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
-    BlockAccess = false,
-  };
-
-  static EIGEN_CONSTEXPR int LDims = Base::LDims;
-  static EIGEN_CONSTEXPR int RDims = Base::RDims;
-  static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims;
-
-  typedef array<StorageIndex, LDims> left_dim_mapper_t;
-  typedef array<StorageIndex, RDims> right_dim_mapper_t;
-
-  typedef array<StorageIndex, ContractDims> contract_t;
-  typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
-  typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
-
-  static const int NumDims = LDims + RDims - 2 * ContractDims;
-
-  typedef DSizes<StorageIndex, NumDims> Dimensions;
-
-  typedef TensorEvaluator<typename Base::EvalLeftArgType, Device> LeftEvaluator;
-  typedef TensorEvaluator<typename Base::EvalRightArgType, Device> RightEvaluator;
-  typedef typename Eigen::internal::remove_const<typename LeftEvaluator::CoeffReturnType>::type LhsScalar;
-  typedef typename Eigen::internal::remove_const<typename RightEvaluator::CoeffReturnType>::type RhsScalar;
-
-  typedef typename LeftEvaluator::Dimensions LeftDimensions;
-  typedef typename RightEvaluator::Dimensions RightDimensions;
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
-  struct input_mapper_propertis {
-    static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
-    static EIGEN_CONSTEXPR bool is_rhs_matrix =
-        (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
-  };
-
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
-
-  // We need to redefine this method to make nvcc happy
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) {
-    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
-    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
-    if (!data) {
-      this->m_result = this->m_device.get(
-          static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
-      data = this->m_result;
-    }
-    evalToSycl(data);
-    return (this->m_result != NULL);
-  }
-  const Eigen::SyclDevice &device() const { return this->m_device; }
-  void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
-    if (this->m_lhs_inner_dim_contiguous) {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<true, true, true, Unaligned>(buffer);
-        } else {
-          evalTyped<true, true, false, Unaligned>(buffer);
-        }
-      } else {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<true, false, true, Unaligned>(buffer);
-        } else {
-          evalTyped<true, false, false, Unaligned>(buffer);
-        }
-      }
-    } else {
-      if (this->m_rhs_inner_dim_contiguous) {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<false, true, true, Unaligned>(buffer);
-        } else {
-          evalTyped<false, true, false, Unaligned>(buffer);
-        }
-      } else {
-        if (this->m_rhs_inner_dim_reordered) {
-          evalTyped<false, false, true, Unaligned>(buffer);
-        } else {
-          evalTyped<false, false, false, Unaligned>(buffer);
-        }
-      }
-    }
-  }
-
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
-  void evalTyped(typename Base::EvaluatorPointerType buffer) const {
-    const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
-    typedef internal::TensorContractionInputMapper<
-        LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
-        PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer>
-        LhsMapper;
-
-    typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
-                                                   right_nocontract_t, contract_t,
-                                                   PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
-                                                   rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer>
-        RhsMapper;
-
-    // initialize data mappers
-    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
-                  this->m_left_contracting_strides, this->m_k_strides);
-
-    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
-                  this->m_right_contracting_strides, this->m_k_strides);
-
-#ifndef EIGEN_SYCL_DISABLE_SCALAR
-    if (triple_dim.M == 1 && triple_dim.N == 1) {
-      launchSC(buffer, lhs, rhs, triple_dim.K);
-    } else
-#endif
-#ifndef EIGEN_SYCL_DISABLE_GEMV
-        if (triple_dim.M != 1 && triple_dim.N == 1) {
-      LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
-    } else if (triple_dim.M == 1 && triple_dim.N != 1) {
-      LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
-    } else  // This is equivalent of if (m!=1 && n!=1)
-#endif
-    {
-      typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
-          inpt_mapper_properties;
-#ifndef EIGEN_SYCL_DISABLE_SKINNY
-      bool skinny = false;
-      auto platform_name = this->device().getPlatformName();
-      // This is based on empirical calculation for AMD r9-nano and Fiji
-      if (platform_name.find("AMD") == 0) {
-        skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
-                 ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
-                  (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
-      } else {
-        skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
-                  ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
-                  ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
-      }
-      if (skinny)
-        adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
-      else
-#endif  // EIGEN_SYCL_DISABLE_SKINNY
-        adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
-    }
-  }
-
-  template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
-  void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
-                                    const TripleDim &triple_dim) const {
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
-    if (device().has_local_memory()) {
-      typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 16> PanelParameters;
-      launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
-          buffer, lhs, rhs, triple_dim);
-    }
-#endif
-#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
-    if (!(device().has_local_memory())) {
-      typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 4> PanelParameters;
-      launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
-          buffer, lhs, rhs, triple_dim);
-    }
-#endif
-  }
-
-  template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
-            typename Properties, typename LhsMapper, typename RhsMapper>
-  void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
-                const TripleDim &triple_dim) const {
-    const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
-    const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
-    const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
-    const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
-
-    const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
-    StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
-    StorageIndex groupSizeK =
-        skinny
-            ? std::max(std::min(totalTilesK,
-                                (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
-                                    (groupSizeM * groupSizeN)),
-                       StorageIndex(1))
-            : StorageIndex(1);
-
-    const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
-
-    const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
-
-    const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
-    const StorageIndex globalRange = totalGroupSize * localRange;
-
-    const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local)
-                                         ? ((Properties::DoubleBuffer + 1) *
-                                            (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
-                                               ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
-                                                (Properties::TileSizeDimN + Properties::BC))
-                                         : StorageIndex(1);
-
-    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
-    if (groupSizeK == 1) {
-      typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
-                                                            LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
-                                                            PacketAccess, input_mapper_properties, true, ct>
-          ContractKernelName;
-      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
-          lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim);
-    } else {
-      typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
-                                                            LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
-                                                            PacketAccess, input_mapper_properties, false, ct>
-          ContractKernelName;
-      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
-          device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
-      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
-
-      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
-          lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
-          triple_dim);
-
-      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
-      auto op = Op();
-      typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
-                                                               EvaluatorPointerType, Op>
-          ReductionKernel;
-
-      device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
-          tmp_global_accessor, buffer,
-          cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
-                                    Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
-                                cl::sycl::range<1>(localRange)),
-          StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK);
-
-      device().deallocate_temp(temp_pointer);
-    }
-  }
-
-#ifndef EIGEN_SYCL_DISABLE_GEMV
-  template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
-  void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
-                                    StorageIndex NC, StorageIndex C) const {
-    const StorageIndex nonContractDim = NC;
-    EIGEN_CONSTEXPR StorageIndex NCFactor = 1;
-    EIGEN_CONSTEXPR StorageIndex CFactor = 1;
-    EIGEN_CONSTEXPR StorageIndex NCWindow = 16;
-    typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor>
-        Properties;
-    const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
-    const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
-    const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
-    const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
-    const StorageIndex globalRange =
-        (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
-    const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
-    const StorageIndex scratchSize =
-        (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
-    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
-    if (cNumGroups > 1) {
-      typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
-                                                               TensorMapper, StorageIndex, Properties, CFactor, false,
-                                                               is_lhs_vec, false>
-          ContractKernelName;
-      CoeffReturnType *temp_pointer =
-          static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
-      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
-
-      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
-          vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
-
-      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
-      typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
-                                                               EvaluatorPointerType, Op>
-          ReductionKernel;
-
-      device().template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
-          tmp_global_accessor, buffer,
-          cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
-                                cl::sycl::range<1>(localRange)),
-          StorageIndex(1), Op(), nonContractDim, cNumGroups);
-
-      device().deallocate_temp(temp_pointer);
-    } else {
-      typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
-                                                               TensorMapper, StorageIndex, Properties, CFactor, false,
-                                                               is_lhs_vec, true>
-          ContractKernelName;
-      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
-          vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C);
-    }
-  }
-#endif
-
-#ifndef EIGEN_SYCL_DISABLE_SCALAR
-  template <typename LhsMapper, typename RhsMapper>
-  EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
-                                    StorageIndex K) const {
-    EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
-                          (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
-                        "The Local thread size must be a power of 2 for the reduction "
-                        "operation");
-    EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
-
-    // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread
-    // reduces at least 512 elementss individually, we get better performance.
-    const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
-    const StorageIndex global_range = num_work_group * local_range;
-
-    typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
-        CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
-        ContractKernelName;
-    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
-    if (num_work_group > 1) {
-      CoeffReturnType *temp_pointer =
-          static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
-      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
-      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
-                                                                                    thread_range, local_range, K);
-      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
-      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
-                                                          EvaluatorPointerType, StorageIndex, local_range>
-          GenericRKernel;
-      device().template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
-          tmp_global_accessor, buffer,
-          cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op());
-
-      device().deallocate_temp(temp_pointer);
-    } else {
-      device().template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
-                                                                                    local_range, K);
-    }
-  }
-#endif
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    this->m_leftImpl.cleanup();
-    this->m_rightImpl.cleanup();
-
-    if (this->m_result) {
-      this->m_device.deallocate_temp(this->m_result);
-      this->m_result = NULL;
-    }
-  }
-  // The placeholder accessors must bound to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    this->m_leftImpl.bind(cgh);
-    this->m_rightImpl.bind(cgh);
-    this->m_result.bind(cgh);
-  }
-};
-}  // namespace Eigen
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index 21be6ea42..c70dea053 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,16 +15,57 @@
 
 namespace Eigen {
 
-template<typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
-struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> :
-    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice> > {
+#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
+namespace internal {
+
+template<typename LhsScalar, typename LhsMapper, typename Index>
+struct packLhsArg {
+  LhsScalar* blockA;
+  const LhsMapper& lhs;
+  const Index m_start;
+  const Index k_start;
+  const Index mc;
+  const Index kc;
+};
+
+template<typename LhsScalar, typename RhsScalar, typename RhsMapper, typename OutputMapper, typename Index>
+struct packRhsAndKernelArg {
+  const MaxSizeVector<LhsScalar*>* blockAs;
+  RhsScalar* blockB;
+  const RhsMapper& rhs;
+  OutputMapper& output;
+  const Index m;
+  const Index k;
+  const Index n;
+  const Index mc;
+  const Index kc;
+  const Index nc;
+  const Index num_threads;
+  const Index num_blockAs;
+  const Index max_m;
+  const Index k_block_idx;
+  const Index m_block_idx;
+  const Index n_block_idx;
+  const Index m_blocks;
+  const Index n_blocks;
+  MaxSizeVector<Notification*>* kernel_notifications;
+  const MaxSizeVector<Notification*>* lhs_notifications;
+  const bool need_to_pack;
+};
+
+}  // end namespace internal
+#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
+
+template<typename Indices, typename LeftArgType, typename RightArgType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
+    public TensorContractionEvaluatorBase<TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> > {
 
   typedef ThreadPoolDevice Device;
 
-  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, Device> Self;
   typedef TensorContractionEvaluatorBase<Self> Base;
 
-  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType> XprType;
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
@@ -71,35 +112,31 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) {}
 
-  template <int Alignment>
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
   void evalProduct(Scalar* buffer) const {
-    evalProductImpl<NoCallback, Alignment>(buffer, NoCallback());
-  }
-
-  template <typename EvalToCallback, int Alignment>
-  void evalProductAsync(Scalar* buffer, EvalToCallback done) const {
-    evalProductImpl<EvalToCallback, Alignment>(buffer, std::move(done));
-  }
-
-  template <typename DoneCallback, int Alignment>
-  void evalProductImpl(Scalar* buffer, DoneCallback done) const {
-    // This function computes a lot of heuristics in multiple steps, and it
-    // also has multiple exit points. To keep it sane, readable and all in one
-    // place, sync/async execution decision is made at runtime at the very end.
-    //
-    // (1) In sync mode we allocate Context on the stack, submit computations
-    //     to the device thread pool, and block on a barrier until it is
-    //     completed.
-    //
-    // (2) In async mode we allocate Context on the heap, and after all tasks
-    //     are finished, we call provided the done callback, and delete a
-    //     context from the heap.
-    //
-    // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state
-    // and temporary buffers, requried for executing the tensor contraction.
-    // They are responsible for cleaning it up after contraction is done.
-    static const bool IsEvalInSyncMode =
-        std::is_same<DoneCallback, NoCallback>::value;
+    typedef internal::TensorContractionInputMapper<
+        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+        contract_t, internal::packet_traits<LhsScalar>::size,
+        lhs_inner_dim_contiguous, false, Unaligned>
+        LhsMapper;
+    typedef internal::TensorContractionInputMapper<
+        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+        contract_t, internal::packet_traits<RhsScalar>::size,
+        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
+        RhsMapper;
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    typedef internal::gemm_pack_lhs<LhsScalar, Index,
+                                    typename LhsMapper::SubMapper, Traits::mr,
+                                    Traits::LhsProgress, ColMajor>
+        LhsPacker;
+    typedef internal::gemm_pack_rhs<
+        RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
+        RhsPacker;
+    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
+                                  Traits::mr, Traits::nr, false, false>
+        GebpKernel;
 
     const Index m = this->m_i_size;
     const Index n = this->m_j_size;
@@ -135,14 +172,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // Again, we don't know number of threads yet, so we use 2.
     Index bm, bn, bk;
     if (shard_by_col) {
-      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                           internal::ShardByCol>
           blocking(k, m, n, 2);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                           internal::ShardByRow>
           blocking(k, m, n, 2);
       bm = blocking.mc();
@@ -158,45 +195,35 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         contractionCost(m, n, bm, bn, bk, shard_by_col, false);
     int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         static_cast<double>(n) * m, cost, this->m_device.numThreads());
-    int num_threads_by_k = numThreadsInnerDim(m, n, k);
-    if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) {
-      // We are in the scenario where it is more effective to shard by the
-      // inner dimension.
-      if (IsEvalInSyncMode) {
-        EvalShardedByInnerDimContext<DoneCallback> ctx(
-            this, num_threads_by_k, buffer, m, n, k, std::move(done));
-        ctx.template run<Alignment>();
-      } else {
-        auto* ctx = new EvalShardedByInnerDimContext<DoneCallback>(
-            this, num_threads_by_k, buffer, m, n, k, std::move(done));
-        ctx->template runAsync<Alignment>();
-      }
-
-      return;
-    }
 
     // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
     // model is not tuned. Remove this when the cost model is tuned.
     if (n == 1) num_threads = 1;
 
     if (num_threads == 1) {
-      TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential,
-                                  Unaligned, (buffer));
-      if (!IsEvalInSyncMode) done();
+      // The single-threaded algorithm should be faster in this case.
+      if (n == 1)
+        this->template evalGemv<lhs_inner_dim_contiguous,
+                                rhs_inner_dim_contiguous,
+                                rhs_inner_dim_reordered, Alignment>(buffer);
+      else
+        this->template evalGemm<lhs_inner_dim_contiguous,
+                                rhs_inner_dim_contiguous,
+                                rhs_inner_dim_reordered, Alignment>(buffer);
       return;
     }
 
     // Now that we know number of threads, recalculate sharding and blocking.
     shard_by_col = shardByCol(m, n, num_threads);
     if (shard_by_col) {
-      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                           internal::ShardByCol>
           blocking(k, m, n, num_threads);
       bm = blocking.mc();
       bn = blocking.nc();
       bk = blocking.kc();
     } else {
-      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index,
+      internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index,
                                           internal::ShardByRow>
           blocking(k, m, n, num_threads);
       bm = blocking.mc();
@@ -228,26 +255,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     Index nm = divup(nm0, gm);
     Index nn = divup(nn0, gn);
 
-    // If there is enough concurrency in the sharding dimension, we choose not
-    // to paralellize by the other dimension, and execute all kernels in sync
-    // mode. This reduces parallelism from the nm x nn down to nn
-    // (shard_by_col==true) or nm (shard_by_col==false).
-    const Index sharding_dim_tasks = shard_by_col ? nn : nm;
-    const int num_worker_threads = this->m_device.numThreadsInPool();
-
-    // With small number of threads we want to make sure that we do not reduce
-    // parallelism too much. With large number of threads we trade maximum
-    // parallelism for better memory locality.
-    const float oversharding_factor =
-        num_worker_threads <= 4  ? 8.0 :
-        num_worker_threads <= 8  ? 4.0 :
-        num_worker_threads <= 16 ? 2.0 :
-        num_worker_threads <= 32 ? 1.0 :
-        num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6;
-
-    const bool parallelize_by_sharding_dim_only =
-        sharding_dim_tasks >= oversharding_factor * num_worker_threads;
-
     // Last by not least, decide whether we want to issue both lhs and rhs
     // packing in parallel; or issue lhs packing first, and then issue rhs
     // packing when lhs packing completes (for !shard_by_col lhs and rhs are
@@ -263,139 +270,40 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // But don't do it if we will use each rhs only once. Locality seems to be
     // more important in this case.
     if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
-    // Also don't get in the way of parallelize_by_sharding_dim_only
-    // optimization.
-    if (parallelize_by_sharding_dim_only) parallel_pack = false;
 
-    // TODO(ezhulnev): With if contexpr we don't need SyncEvalParallelContext.
-    if (IsEvalInSyncMode) {
-#define CONTEXT_ARGS                                                        \
-  (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
-   nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only,      \
-   NoCallback())                                                            \
-      .run()
-      TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment,
-                                  CONTEXT_ARGS);
-#undef CONTEXT_ARGS
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides,
+                  this->m_i_strides, this->m_left_contracting_strides,
+                  this->m_k_strides);
 
-    } else {
-#define CONTEXT_ARGS                                                        \
-  (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \
-   nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only,      \
-   std::move(done))
-      TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback,
-                                        Alignment, CONTEXT_ARGS, run());
-#undef CONTEXT_ARGS
-    }
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides,
+                  this->m_j_strides, this->m_right_contracting_strides,
+                  this->m_k_strides);
+
+    Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper,
+            OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n,
+                          k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0,
+                          shard_by_col, parallel_pack)
+        .run();
   }
 
-  // ------------------------------------------------------------------------ //
-
-  // Dummy struct to represent an empty DoneCallback.
-
-  struct NoCallback {
-    void operator()() {
-      eigen_assert(false && "NoCallback should never be called");
-    }
-  };
-
-  // ------------------------------------------------------------------------ //
-
-  template <typename DoneCallback, typename Context>
-  class EvalParallelNotification;
-
-  // Synchronous evaluation notification that blocks caller thread in Wait().
-  template <typename Context>
-  class EvalParallelNotification<NoCallback, Context> {
+  // Context coordinates a single parallel gemm operation.
+  template <typename LhsPacker, typename RhsPacker, typename GebpKernel,
+            typename LhsMapper, typename RhsMapper, typename OutputMapper>
+  class Context {
    public:
-    EvalParallelNotification(Context*, NoCallback) {}
-    void Notify() { done_.Notify(); }
-    void Wait() { done_.Wait(); }
-   private:
-    Eigen::Notification done_;
-  };
-
-  // Asynchronous evaluation notification that does not block in Wait().
-  template <typename DoneCallback, typename Context>
-  class EvalParallelNotification {
-   public:
-    EvalParallelNotification(Context* ctx, DoneCallback done)
-        : ctx_(ctx), done_(std::move(done)) {}
-
-    void Notify() {
-      // Make a copy of done callback, because it will be destructed when we
-      // will delete context in the next line (EvalParallelNotification is a
-      // data member of EvalParallelContext class).
-      DoneCallback done_copy = std::move(done_);
-
-      // Delete parallel evaluation context.
-      delete ctx_;
-
-      // Now safely call the done callback.
-      done_copy();
-    }
-
-    void Wait() {}
-
-   private:
-    Context* ctx_;
-    DoneCallback done_;
-  };
-
-  // Context orchestrates sync/async parallel contraction evaluation. When it is
-  // executed in asynchronous mode, it owns all the shared state that might be
-  // accessible by block packing and kernel tasks.
-
-  template <typename DoneCallback, bool lhs_inner_dim_contiguous,
-            bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered,
-            int Alignment>
-  class EvalParallelContext {
-   public:
-    typedef internal::TensorContractionInputMapper<
-        LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
-        contract_t, internal::packet_traits<LhsScalar>::size,
-        lhs_inner_dim_contiguous, false, Unaligned>
-        LhsMapper;
-    typedef internal::TensorContractionInputMapper<
-        RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
-        contract_t, internal::packet_traits<RhsScalar>::size,
-        rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
-        RhsMapper;
-
-    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-
-    typedef internal::TensorContractionKernel<
-        Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
-        TensorContractionKernel;
-
-    typedef typename TensorContractionKernel::LhsBlock LhsBlock;
-    typedef typename TensorContractionKernel::RhsBlock RhsBlock;
-    typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
-
-    EvalParallelContext(const Self* self, int num_threads, Scalar* buffer,
-                        Index tm, Index tn, Index tk, Index bm, Index bn,
-                        Index bk, Index nm, Index nn, Index nk, Index gm,
-                        Index gn, Index nm0, Index nn0, bool shard_by_col,
-                        bool parallel_pack,
-                        bool parallelize_by_sharding_dim_only,
-                        DoneCallback done)
-        : created_by_thread_id_(std::this_thread::get_id()),
-          done_(this, std::move(done)),
-          device_(self->m_device),
-          lhs_(self->m_leftImpl, self->m_left_nocontract_strides,
-               self->m_i_strides, self->m_left_contracting_strides,
-               self->m_k_strides),
-          rhs_(self->m_rightImpl, self->m_right_nocontract_strides,
-               self->m_j_strides, self->m_right_contracting_strides,
-               self->m_k_strides),
+    Context(const Device& device, int num_threads, LhsMapper& lhs,
+            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
+            Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
+            Index gn, Index nm0, Index nn0, bool shard_by_col,
+            bool parallel_pack)
+        : device_(device),
+          lhs_(lhs),
+          rhs_(rhs),
           buffer_(buffer),
           output_(buffer, tm),
-          output_kernel_(self->m_output_kernel),
-          tensor_contraction_params_(self->m_tensor_contraction_params),
           num_threads_(num_threads),
           shard_by_col_(shard_by_col),
           parallel_pack_(parallel_pack),
-          parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only),
           m_(tm),
           n_(tn),
           k_(tk),
@@ -408,29 +316,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           gm_(gm),
           gn_(gn),
           nm0_(nm0),
-          nn0_(nn0),
-          kernel_(m_, k_, n_, bm_, bk_, bn_),
-          num_thread_local_allocations_(0),
-          // We reserve 2X more capacity for a thread local values, than the
-          // number of threads in the pool to efficiently handle task stealing
-          // by threads that are not managed by the pool.
-          thread_local_capacity(2 * (parallelize_by_sharding_dim_only_
-                                         ? device_.numThreadsInPool()
-                                         : 0)),
-          // We will use only one of the Lhs/Rhs thread local storage depending
-          // on the shard_by_col value and we parallelize by sharding dim ONLY.
-          lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity,
-                                   {*this}, {*this}),
-          rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0,
-                                   {*this}, {*this}) {
-      // These two options are mutually exclusive.
-      eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only));
-
+          nn0_(nn0)
+  {
       for (Index x = 0; x < P; x++) {
         // Normal number of notifications for k slice switch is
         // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only
         // nm_ + nn_ notifications, because they will not receive notifications
-        // from preceding kernels.
+        // from preceding kernels.
         state_switch_[x] =
             x == 0
                 ? 1
@@ -452,97 +344,57 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       }
 
       // Allocate memory for packed rhs/lhs matrices.
-      packed_mem_ = kernel_.allocateSlices(            //
-          device_,                                     //
-          /*num_lhs=*/nm0_,                            //
-          /*num_rhs=*/nn0_,                            //
-          /*num_slices=*/std::min<Index>(nk_, P - 1),  //
-          packed_lhs_, packed_rhs_);
-
-      if (parallelize_by_sharding_dim_only_) {
-        const int num_worker_threads = device_.numThreadsInPool();
-
-        if (shard_by_col) {
-          can_use_thread_local_packed_ = new std::atomic<bool>[nn_];
-          for (int i = 0; i < nn_; ++i)
-            can_use_thread_local_packed_[i].store(true,
-                                                  std::memory_order_relaxed);
-
-          Index num_blocks = num_worker_threads * gn_;
-          thread_local_pre_alocated_mem_ = kernel_.allocateSlices(  //
-              device_,                                              //
-              /*num_lhs=*/0,                                        //
-              /*num_rhs=*/num_blocks,                               //
-              /*num_slices=*/1,                                     //
-              /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_);
-
-        } else {
-          can_use_thread_local_packed_ = new std::atomic<bool>[nm_];
-          for (int i = 0; i < nm_; ++i)
-            can_use_thread_local_packed_[i].store(true,
-                                                  std::memory_order_relaxed);
-
-          Index num_blocks = num_worker_threads * gm_;
-          thread_local_pre_alocated_mem_ = kernel_.allocateSlices(  //
-              device_,                                              //
-              /*num_lhs=*/num_blocks,                               //
-              /*num_rhs=*/0,                                        //
-              /*num_slices=*/1, &lhs_thread_local_pre_allocated_,   //
-              /*rhs_blocks=*/nullptr);
+      size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+      size_t lhs_size =
+          divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align;
+      size_t rhs_size =
+          divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align;
+      packed_mem_ = static_cast<char*>(internal::aligned_malloc(
+          (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1)));
+      char* mem = static_cast<char*>(packed_mem_);
+      for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) {
+        packed_lhs_[x].resize(nm0_);
+        for (Index m = 0; m < nm0_; m++) {
+          packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem);
+          mem += lhs_size;
+        }
+        packed_rhs_[x].resize(nn0_);
+        for (Index n = 0; n < nn0_; n++) {
+          packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem);
+          mem += rhs_size;
         }
       }
     }
 
-    ~EvalParallelContext() {
+    ~Context() {
       for (Index x = 0; x < P; x++) {
         for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
         delete[] state_kernel_[x];
       }
-      kernel_.deallocate(device_, packed_mem_);
-      if (parallelize_by_sharding_dim_only_) {
-        kernel_.deallocate(device_, thread_local_pre_alocated_mem_);
-        delete[] can_use_thread_local_packed_;
-      }
+      internal::aligned_free(packed_mem_);
     }
 
     void run() {
       // Kick off packing of the first slice.
       signal_switch(0, 1);
-
       // Wait for overall completion.
-      //
-      // If parallel evaluation is executed in async mode, this is a no-op, and
-      // Wait() will return immediately. In synchronous mode it will block the
-      // caller thread until it will receive notification from last task.
-      //
-      // In async mode, last task when completed will call done callback from
-      // the same thread, and will delete this context.
-      //
-      // TODO(dvyukov): This wait can lead to deadlock if contraction is
-      // evaluated in synchronous mode. If nthreads contractions are
-      // concurrently submitted from worker threads, this wait will block all
-      // worker threads and the system will deadlock.
+      // TODO(dvyukov): this wait can lead to deadlock.
+      // If nthreads contractions are concurrently submitted from worker
+      // threads, this wait will block all worker threads and the system will
+      // deadlock.
       done_.Wait();
     }
 
    private:
-    std::thread::id created_by_thread_id_;
-
-    // This notification is specialized on the type of DoneCallback and can be
-    // blocking or non-blocking.
-    EvalParallelNotification<DoneCallback, EvalParallelContext> done_;
-
+    Notification done_;
     const Device& device_;
-    LhsMapper lhs_;
-    RhsMapper rhs_;
+    LhsMapper& lhs_;
+    RhsMapper& rhs_;
     Scalar* const buffer_;
     OutputMapper output_;
-    OutputKernelType output_kernel_;
-    TensorContractionParams tensor_contraction_params_;
     const int num_threads_;
     const bool shard_by_col_;
     const bool parallel_pack_;
-    const bool parallelize_by_sharding_dim_only_;
     // Matrix sizes.
     const Index m_;
     const Index n_;
@@ -562,8 +414,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // coarsening).
     const Index nm0_;
     const Index nn0_;
-    // Tensor contraction kernel.
-    TensorContractionKernel kernel_;
 
     // Parallelization strategy.
     //
@@ -600,215 +450,9 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // actively executing + one to track completion of kernels in the second
     // slice.
     static const Index P = 3;
-
-    // Handle to the allocated temporary storage for Lhs/Rhs blocks.
-    BlockMemHandle packed_mem_;
-    std::vector<LhsBlock> packed_lhs_[P - 1];
-    std::vector<RhsBlock> packed_rhs_[P - 1];
-
-    // If we choose to parallelize only by the sharding dimension, each thread
-    // will have it's own "thead local" (not a c++ thread local storage) memory
-    // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory
-    // can't be passed to a kernel that might execute on a different thread.
-    //
-    // In practice when we are ready to pack memory for the sharding dimension
-    // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice
-    // already computed (99% of the time), and we can pack data into the thread
-    // local storage, and guarantee that all the kernels will be executed
-    // immediately in the same thread. This significantly increases L1 cache hit
-    // ratio and reduces pressure on the memory bus.
-    //
-    // It's still possible that kernel for the K-th slice will be ready before
-    // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_
-    // and packed_rhs_ to allow kernels to be executed later on a thread
-    // different from the thread that was used for packing.
-
-    // Handle for pre-allocated thread local memory buffers.
-    BlockMemHandle thread_local_pre_alocated_mem_;
-
-    // Only one of these will be initialized depending on shard_by_col value
-    // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`).
-    std::vector<LhsBlock> lhs_thread_local_pre_allocated_;
-    std::vector<RhsBlock> rhs_thread_local_pre_allocated_;
-
-    // How many thread local blocks were already allocated.
-    std::atomic<int> num_thread_local_allocations_;
-    const int thread_local_capacity;
-
-    // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of
-    // unique threads in a system is below or equal to the number of threads in
-    // a thread pool. We will fallback on dynamic memory allocation after that.
-
-    // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its
-    // size is equal to the grain size in Lhs/Rhs sharding dimension.
-    template <typename BlockType>
-    class ThreadLocalBlocks {
-     public:
-      ThreadLocalBlocks() = default;
-
-      ThreadLocalBlocks(BlockType* base, size_t grain_size)
-          : is_pre_allocated_(true),
-            thread_local_pre_allocated_base_(base),
-            grain_size_(grain_size) {}
-
-      ThreadLocalBlocks(BlockMemHandle mem_handle,
-                        std::vector<BlockType> blocks)
-          : is_pre_allocated_(false),
-            mem_handle_(std::move(mem_handle)),
-            blocks_(std::move(blocks)) {}
-
-      BlockType& block(int grain_index) {
-        eigen_assert(grain_index >= 0);
-        eigen_assert(static_cast<size_t>(grain_index) < size());
-        return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index]
-                                 : blocks_[grain_index];
-      }
-
-      void Release(EvalParallelContext& ctx) const {
-        if (!is_pre_allocated_) {
-          ctx.kernel_.deallocate(ctx.device_, mem_handle_);
-        }
-      }
-
-      size_t size() const {
-        return is_pre_allocated_ ? grain_size_ : blocks_.size();
-      }
-
-     private:
-      bool is_pre_allocated_;
-
-      // Reuse pre-allocated thread local buffers.
-      BlockType* thread_local_pre_allocated_base_ = nullptr;
-      size_t grain_size_ = 0;
-
-      // These will be initialized only if `is_pre_allocated == false`.
-      BlockMemHandle mem_handle_{};
-      std::vector<BlockType> blocks_;
-    };
-
-    // ThreadLocalBlocksInitialize callable does custom thread local blocks
-    // initialization, and will reuse pre-allocated buffers if possible, or will
-    // dynamically allocate new memory.
-    //
-    // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly
-    // for what side do we plan to do block allocation.
-    template <typename BlockType, bool is_rhs>
-    class ThreadLocalBlocksInitialize {
-      static constexpr bool kIsLhs =
-          !is_rhs && std::is_same<BlockType, LhsBlock>::value;
-      static const bool kIsRhs =
-          is_rhs && std::is_same<BlockType, RhsBlock>::value;
-      static_assert(kIsLhs || kIsRhs, "Unkown block type");
-
-      using Blocks = ThreadLocalBlocks<BlockType>;
-
-     public:
-      ThreadLocalBlocksInitialize(EvalParallelContext& ctx)
-          : ctx_(ctx),
-            num_worker_threads_(ctx_.device_.numThreadsInPool()) {}
-
-      void operator()(Blocks& blocks) {
-        const int n = ctx_.num_thread_local_allocations_.fetch_add(
-            1, std::memory_order_relaxed);
-
-        if (n >= num_worker_threads_) {
-          ThreadLocalBlocksAllocator<is_rhs>::allocate(ctx_, blocks);
-        } else {
-          ThreadLocalBlocksAllocator<is_rhs>::reuse(ctx_, n, blocks);
-        }
-      }
-
-     private:
-      // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to
-      // TensorContractionKernel::allocateSlices into template specializations.
-      // Also explicit specializations are not allowed at class scope in C++03,
-      // EvalCtx type parameter is just a workaround for that limitation.
-      template <bool pack_rhs, typename EvalCtx = EvalParallelContext>
-      struct ThreadLocalBlocksAllocator;
-
-      template <typename EvalCtx>
-      struct ThreadLocalBlocksAllocator</*pack_rhs=*/true, EvalCtx> {
-        static void allocate(EvalCtx& ctx, Blocks& blocks) {
-          std::vector<RhsBlock> rhs_blocks;
-          BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
-              ctx.device_,
-              /*num_lhs=*/0,
-              /*num_rhs=*/ctx.gn_,
-              /*num_slices=*/1,
-              /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks);
-
-          blocks = ThreadLocalBlocks<RhsBlock>(std::move(mem_handle),
-                                               std::move(rhs_blocks));
-        }
-
-        static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
-          RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index];
-          blocks = ThreadLocalBlocks<RhsBlock>(ptr, ctx.gn_);
-        }
-      };
-
-      template <typename EvalCtx>
-      struct ThreadLocalBlocksAllocator</*pack_rhs=*/false, EvalCtx> {
-        static void allocate(EvalCtx& ctx, Blocks& blocks) {
-          std::vector<LhsBlock> lhs_blocks;
-          BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(
-              ctx.device_,
-              /*num_lhs=*/ctx.gm_,
-              /*num_rhs=*/0,
-              /*num_slices=*/1,
-              /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr);
-
-          blocks = ThreadLocalBlocks<LhsBlock>(std::move(mem_handle),
-                                               std::move(lhs_blocks));
-        }
-
-        static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
-          LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index];
-          blocks = ThreadLocalBlocks<LhsBlock>(ptr, ctx.gm_);
-        }
-      };
-
-      EvalParallelContext& ctx_;
-      const int num_worker_threads_;
-    };
-
-    template <typename BlockType>
-    class ThreadLocalBlocksRelease {
-     public:
-      using Blocks = ThreadLocalBlocks<BlockType>;
-      ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {}
-      void operator()(Blocks& blocks) { blocks.Release(ctx_); }
-
-     private:
-      EvalParallelContext& ctx_;
-    };
-
-    // ThreadLocalBlocks initialization callables.
-    using ThreadLocalLhsInit =
-        ThreadLocalBlocksInitialize<LhsBlock, /*is_rhs=*/false>;
-    using ThreadLocalRhsInit =
-        ThreadLocalBlocksInitialize<RhsBlock, /*is_rhs=*/true>;
-
-    // ThreadLocalBlocks release callables.
-    using ThreadLocalLhsRelease = ThreadLocalBlocksRelease<LhsBlock>;
-    using ThreadLocalRhsRelease = ThreadLocalBlocksRelease<RhsBlock>;
-
-    // Thread local containers for Lhs/Rhs block packs. In practice only one of
-    // them will be used, depending on the shard_by_col value.
-    Eigen::ThreadLocal<ThreadLocalBlocks<LhsBlock>, ThreadLocalLhsInit,
-                       ThreadLocalLhsRelease>
-        lhs_thread_local_blocks_;
-    Eigen::ThreadLocal<ThreadLocalBlocks<RhsBlock>, ThreadLocalRhsInit,
-                       ThreadLocalRhsRelease>
-        rhs_thread_local_blocks_;
-
-    // After a particular shard for Kth slice missed thread local execution
-    // opportunity (K-1 slice didn't complete kernels execution), we can no
-    // longer schedule K+1 and following slices in thread local mode, because
-    // there is no more guarantee that previous kernels were executed
-    // sequentially in the same thread (size is nn_ or nm_).
-    std::atomic<bool>* can_use_thread_local_packed_;
-
+    void* packed_mem_;
+    std::vector<LhsScalar*> packed_lhs_[P - 1];
+    std::vector<RhsScalar*> packed_rhs_[P - 1];
     std::atomic<uint8_t>** state_kernel_[P];
     // state_switch_ is frequently modified by worker threads, while other
     // fields are read-only after constructor. Let's move it to a separate cache
@@ -817,168 +461,69 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     std::atomic<Index> state_packing_ready_[P];
     std::atomic<Index> state_switch_[P];
 
-    LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) {
-      if (use_thread_local) {
-        eigen_assert(!shard_by_col_);
-        ThreadLocalBlocks<LhsBlock>& blocks = lhs_thread_local_blocks_.local();
-
-        Index grain_index = m1 - m * gm_;
-        return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
-      } else {
-        return packed_lhs_[k % (P - 1)][m1];
-      }
-    }
-
-    RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) {
-      if (use_thread_local) {
-        eigen_assert(shard_by_col_);
-        ThreadLocalBlocks<RhsBlock>& blocks = rhs_thread_local_blocks_.local();
-
-        Index grain_index = n1 - n * gn_;
-        return blocks.block(internal::convert_index<int>(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index?
-      } else {
-        return packed_rhs_[k % (P - 1)][n1];
-      }
-    }
-
-    // In following two methods (pack_lhs and pack_rhs), if we know for sure
-    // that we'll be able to immediately call a kernel with packed data, and do
-    // not submit it to the thread pool, we can use thread local memory for
-    // packed data.
-    //
-    // We can only reliably check it if we are running all kernels in sync mode
-    // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to
-    // run, it's guaranteed that all kernels with larger values of m (n) are
-    // also ready, because we execute them in the same order for all K slices.
-
     void pack_lhs(Index m, Index k) {
-      bool use_thread_local = false;
-
-      if (parallelize_by_sharding_dim_only_ && !shard_by_col_ &&
-          can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) {
-        if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) {
-          use_thread_local = true;
-        } else {
-          // If we can't guarantee that all kernels in `k` slice will be
-          // executed sequentially in current thread, it's no longer safe to use
-          // thread local memory in following slices along the k dimensions.
-          eigen_assert(k > 0);
-          can_use_thread_local_packed_[m].store(false,
-                                                std::memory_order_relaxed);
-        }
-      }
-
       const Index mend = m * gm_ + gm(m);
       for (Index m1 = m * gm_; m1 < mend; m1++)
-        kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local),
-                        lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+        LhsPacker()(packed_lhs_[k % (P - 1)][m1],
+                    lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
 
       if (!parallel_pack_ && shard_by_col_) {
-        assert(!use_thread_local);
         signal_packing(k);
       } else {
         signal_switch(k + 1);
-        for (Index n = nn_ - 1; n >= 0; n--) {
-          bool sync = parallelize_by_sharding_dim_only_ || n == 0;
-          signal_kernel(m, n, k, sync, use_thread_local);
-        }
+        for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0);
       }
     }
 
     void pack_rhs(Index n, Index k) {
-      bool use_thread_local = false;
-
-      if (parallelize_by_sharding_dim_only_ && shard_by_col_ &&
-          can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) {
-        if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) {
-          use_thread_local = true;
-        } else {
-          // If we can't guarantee that all kernels in `k` slice will be
-          // executed sequentially in current thread, it's no longer safe to use
-          // thread local memory in followig slices along the k dimensions.
-          eigen_assert(k > 0);
-          can_use_thread_local_packed_[n].store(false,
-                                                std::memory_order_relaxed);
-        }
-      }
-
       const Index nend = n * gn_ + gn(n);
       for (Index n1 = n * gn_; n1 < nend; n1++) {
-        if (!TensorContractionKernel::HasBeta && k == 0) {
-          // Zero the output memory in parallel, only if contraction kernel does
-          // not support `beta`. Otherwise we will pass beta 0.0 to the first
-          // call to the `TensorContractionKernel::invoke()`.
-          //
-          // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn
-          // x m) row. Safe to do here because all kernels that will write to
-          // this memory depend on completion of this task. Note: don't call
-          // device_.memset() here. device_.memset() blocks on thread pool
-          // worker thread, which can lead to underutilization and deadlocks.
+        if (k == 0) {
+          // Zero the output memory in parallel.
+          // On 10000x2x10000 mm zeroing can easily take half of time.
+          // Zero (bn x m) row. Safe to do here because all kernels that will
+          // write to this memory depend on completion of this task.
+          // Note: don't call device_.memset() here. device_.memset() blocks on
+          // thread pool worker thread, which can lead to underutilization and
+          // deadlocks.
           memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar));
         }
-        kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local),
-                        rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+        RhsPacker()(packed_rhs_[k % (P - 1)][n1],
+                    rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
       }
 
       if (parallel_pack_ || shard_by_col_) {
         signal_switch(k + 1);
-        for (Index m = nm_ - 1; m >= 0; m--) {
-          bool sync = parallelize_by_sharding_dim_only_ || m == 0;
-          signal_kernel(m, n, k, sync, use_thread_local);
-        }
+        for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0);
       } else {
-        assert(!use_thread_local);
         signal_packing(k);
       }
     }
 
-    void kernel(Index m, Index n, Index k, bool use_thread_local) {
+    void kernel(Index m, Index n, Index k) {
       // Note: order of iteration matters here. Iteration over m is innermost
-      // because we want to reuse the same packed rhs in consecutive tasks
+      // because we want to reuse the same packed rhs in consecutive tasks
       // (rhs fits into L2$ while lhs only into L3$).
       const Index nend = n * gn_ + gn(n);
       const Index mend = m * gm_ + gm(m);
-
-      // NOTE: output = alpha * LHS * RHS + beta * output.
-      const Scalar alpha = Scalar(1);
-      const Scalar beta =
-          (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1);
-
       if (shard_by_col_) {
         for (Index n1 = n * gn_; n1 < nend; n1++) {
-          for (Index m1 = m * gm_; m1 < mend; m1++) {
-            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
-            kernel_.invoke(
-                output_mapper,
-                packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
-                packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
-                bk(k), bn(n1), alpha, beta);
-
-            // We are done with the last task for the [m1, n1] block.
-            if (k + 1 == nk_) {
-              output_kernel_(output_mapper, tensor_contraction_params_,
-                             m1 * bm_, n1 * bn_, bm(m1), bn(n1));
-            }
-          }
+          for (Index m1 = m * gm_; m1 < mend; m1++)
+            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
+                         packed_lhs_[k % (P - 1)][m1],
+                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
+                         Scalar(1), -1, -1, 0, 0);
         }
       } else {
         for (Index m1 = m * gm_; m1 < mend; m1++)
           for (Index n1 = n * gn_; n1 < nend; n1++) {
-            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
-            kernel_.invoke(
-                output_mapper,
-                packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
-                packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1),
-                bk(k), bn(n1), alpha, beta);
-
-            // We are done with the last task for the [m1, n1] block.
-            if (k + 1 == nk_) {
-              output_kernel_(output_mapper, tensor_contraction_params_,
-                             m1 * bm_, n1 * bn_, bm(m1), bn(n1));
-            }
+            GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_),
+                         packed_lhs_[k % (P - 1)][m1],
+                         packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1),
+                         Scalar(1), -1, -1, 0, 0);
           }
       }
-      signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false);
+      signal_kernel(m, n, k + 1, false);
       signal_switch(k + 2);
     }
 
@@ -991,23 +536,16 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       enqueue_packing(k, shard_by_col_);
     }
 
-    void signal_kernel(Index m, Index n, Index k, bool sync,
-                       bool use_thread_local) {
+    void signal_kernel(Index m, Index n, Index k, bool sync) {
       std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
       Index s = state->load();
       eigen_assert(s > 0);
-      if (s != 1 && state->fetch_sub(1) != 1) {
-        eigen_assert(!use_thread_local);
-        return;
-      }
+      if (s != 1 && state->fetch_sub(1) != 1) return;
       state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
-      if (sync) {
-        kernel(m, n, k, use_thread_local);
-      } else {
-        eigen_assert(!use_thread_local);
-        device_.enqueueNoNotification(
-            [=]() { kernel(m, n, k, use_thread_local); });
-      }
+      if (sync)
+        kernel(m, n, k);
+      else
+        device_.enqueueNoNotification([=]() { kernel(m, n, k); });
     }
 
     void signal_switch(Index k, Index v = 1) {
@@ -1057,32 +595,11 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
         else
           pack_lhs(start, k);
       } else {
-        while (end - start > 1) {
-          Index mid = (start + end) / 2;
-          device_.enqueueNoNotification(
-              [=]() { enqueue_packing_helper(mid, end, k, rhs); });
-          end = mid;
-        }
-
-        // Decide if we want to run first packing task (start == 0) in
-        // async mode if we parallelize only by sharding dim:
-        // (1) pack_lhs and pack_rhs call signal_switch before completing
-        //     all calls to signal_kernel, which in sync mode might lead
-        //     to the execution of the first kernel of the k+1 slice, before
-        //     completing a call to the last kernel of the k slice.
-        // (2) all pack tasks for sharded dim must be executed in a thread
-        //     pool to get pre-allocated thead local buffers.
-        bool pack_async =
-          (start == 0) &&
-          (parallelize_by_sharding_dim_only_&& shard_by_col_ == rhs) &&
-          (k > 0 || std::this_thread::get_id() == created_by_thread_id_);
-
-        if (pack_async) {
-          device_.enqueueNoNotification(
-              [=]() { enqueue_packing_helper(start, end, k, rhs); });
-        } else {
-          enqueue_packing_helper(start, end, k, rhs);
-        }
+        Index mid = (start + end) / 2;
+        device_.enqueueNoNotification(
+            [=]() { enqueue_packing_helper(mid, end, k, rhs); });
+        device_.enqueueNoNotification(
+            [=]() { enqueue_packing_helper(start, mid, k, rhs); });
       }
     }
 
@@ -1094,364 +611,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
     Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
 
-    EvalParallelContext(const EvalParallelContext&) = delete;
-    void operator=(const EvalParallelContext&) = delete;
+    Context(const Context&) = delete;
+    void operator=(const Context&) = delete;
   };
 
-  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
-            bool rhs_inner_dim_reordered, int Alignment>
-  using SyncEvalParallelContext =
-      EvalParallelContext<NoCallback, lhs_inner_dim_contiguous,
-                          rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
-                          Alignment>;
-
-  // ------------------------------------------------------------------------ //
-
-  // EvalShardedByInnerDimContext orchestrates sync/async contraction
-  // evaluation, when we shard by inner dimension. When it is executed in
-  // asynchronous mode, it owns all the shared state that might be accessible by
-  // block processing tasks.
-
-  template <typename DoneCallback>
-  struct EvalShardedByInnerDimContext {
-    EvalShardedByInnerDimContext(const Self* self, int num_threads,
-                                 Scalar* result_buffer,
-                                 Index m_size, Index n_size, Index k_size,
-                                 DoneCallback done_callback)
-        : evaluator(self),
-          m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous),
-          m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous),
-          m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered),
-          result(result_buffer),
-          m(m_size),
-          n(n_size),
-          k(k_size),
-          done(std::move(done_callback)),
-          buffer_size_bytes(m * n * sizeof(Scalar)),
-          block_size(blockSize(k, num_threads)),
-          num_blocks(divup<Index>(k, block_size)),
-          num_pending_blocks(internal::convert_index<int>(num_blocks)),
-          l0_ranges(divup<Index>(num_blocks, l0_size)),
-          l0_state(l0_ranges),
-          block_buffers(num_blocks) {
-      // Keep count of pending gemm tasks for each l0 range.
-      for (int i = 0; i < l0_ranges; ++i) {
-        const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i);
-        l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks));
-      }
-
-      // Allocate temporary buffers for each block.
-      for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) {
-        Scalar* buf = block_idx == 0
-                          ? result
-                          : static_cast<Scalar*>(evaluator->m_device.allocate(
-                                buffer_size_bytes));
-        block_buffers.emplace_back(buf);
-      }
-    }
-
-    ~EvalShardedByInnerDimContext() {
-      for (Index i = 1; i < num_blocks; ++i) {
-        evaluator->m_device.deallocate(block_buffers[i]);
-      }
-    }
-
-    template <int Alignment>
-    void run() {
-      Barrier barrier(internal::convert_index<int>(num_blocks));
-      eval<Alignment>(barrier, 0, num_blocks);
-      barrier.Wait();
-
-      // Aggregate partial sums from l0 ranges.
-      aggregateL0Blocks<Alignment>();
-
-      // Apply output kernel.
-      applyOutputKernel();
-    }
-
-    template <int Alignment>
-    void runAsync() {
-      evalAsync<Alignment>(0, num_blocks);
-    }
-
-   private:
-    // The underlying GEMM kernel assumes that k is a multiple of
-    // the packet size and subtle breakage occurs if this is violated.
-    static const Index packet_size = internal::packet_traits<RhsScalar>::size;
-
-    const Self* evaluator;  // TensorContraction evaluator
-
-    // These fields required fromTENSOR_CONTRACTION_DISPATCH macro.
-    bool m_lhs_inner_dim_contiguous;
-    bool m_rhs_inner_dim_contiguous;
-    bool m_rhs_inner_dim_reordered;
-
-    Scalar* result;
-
-    Index m;
-    Index n;
-    Index k;
-
-    DoneCallback done;
-
-    // ----------------------------------------------------------------------//
-    // Algorithm parameters.
-
-    // We will compute partial results into the buffers of this size.
-    Index buffer_size_bytes;
-
-    Index block_size;
-    Index num_blocks;
-
-    // Keep track of pending tasks when evaluate in async mode.
-    std::atomic<int> num_pending_blocks;
-
-    // We compute partial gemm results in parallel, and to get the final result
-    // we need to add them all together. For the large number of threads (>= 48)
-    // this adds a very expensive sequential step at the end.
-    //
-    // We split the [0, num_blocks) into small ranges, and when a task for the
-    // block finishes its partial gemm computation, it checks if it was the last
-    // gemm in the range, and if so, it will add all blocks of the range.
-    //
-    // After all tasks done, we need to add only these pre-aggregated blocks.
-
-    // For now we use just a single level of ranges to compute pre-aggregated
-    // partial sums, but in general we can use more layers to compute tree
-    // aggregation in parallel and reduce the size of the sequential step.
-    //
-    // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make
-    // sense only if number of threads >= ~128?
-    static const Index l0_size = 4;
-    Index l0_ranges;
-
-    // Keep count of pending gemm tasks for each l0 range.
-    MaxSizeVector<std::atomic<int>> l0_state;  // [0, l0_ranges)
-
-    // Buffers allocated for each temporary block computation.
-    MaxSizeVector<Scalar*> block_buffers;  // [0, num_blocks)
-
-    template <int Alignment>
-    void processBlock(Index block_idx, Index begin, Index end) {
-      Scalar* buf = block_buffers[block_idx];
-
-      TENSOR_CONTRACTION_DISPATCH(
-          evaluator->template evalGemmPartialWithoutOutputKernel, Alignment,
-          (buf, begin, end,
-           /*num_threads=*/internal::convert_index<int>(num_blocks)));
-
-      // Check if it was the last task in l0 range.
-      const Index l0_index = block_idx / l0_size;
-      const int v = l0_state[l0_index].fetch_sub(1);
-      eigen_assert(v >= 1);
-
-      // If we processed the last block of the range, we can aggregate all
-      // partial results into the first block of the range.
-      if (v == 1) {
-        const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index);
-        const Index dst_block_idx = l0_index * l0_size;
-
-        if (rng_size == l0_size) {
-          addAllToBuffer<Alignment>(
-              m * n,
-              /*src_buf0=*/block_buffers[dst_block_idx + 1],
-              /*src_buf1=*/block_buffers[dst_block_idx + 2],
-              /*src_buf2=*/block_buffers[dst_block_idx + 3],
-              /*dst_buf= */ block_buffers[dst_block_idx]);
-        } else {
-          // Aggregate blocks of potentially incomplete last range.
-          for (int i = 1; i < rng_size; ++i) {
-            addToBuffer<Alignment>(m * n,
-                                   /*src_buf=*/block_buffers[dst_block_idx + i],
-                                   /*dst_buf=*/block_buffers[dst_block_idx]);
-          }
-        }
-      }
-    }
-
-    // Aggregate partial sums from l0 ranges.
-    template <int Alignment>
-    void aggregateL0Blocks() const {
-      Index l0_index = 1;
-
-      for (; l0_index + 2 < l0_ranges; l0_index += 3) {
-        addAllToBuffer<Alignment>(
-            m * n,
-            /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size],
-            /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size],
-            /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size],
-            /*dst_buf= */ block_buffers[0]);
-      }
-
-      for (; l0_index < l0_ranges; ++l0_index) {
-        addToBuffer<Alignment>(m * n, block_buffers[l0_index * l0_size],
-                               block_buffers[0]);
-      }
-    }
-
-    void applyOutputKernel() const {
-      typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
-      evaluator->m_output_kernel(
-          OutputMapper(result, m), evaluator->m_tensor_contraction_params,
-          static_cast<Eigen::Index>(0), static_cast<Eigen::Index>(0), m, n);
-    }
-
-    // Compute block size with accounting for potentially incomplete last block.
-    Index actualBlockSize(Index block_idx) const {
-      return block_idx + 1 < num_blocks
-                 ? block_size
-                 : k + block_size - block_size * num_blocks;
-    };
-
-    // Compute range size with accounting for potentially incomplete last range.
-    Index actualRangeSize(Index num_ranges, Index range_size,
-                          Index range_idx) const {
-      eigen_assert(range_idx < num_ranges);
-      return range_idx + 1 < num_ranges
-                 ? range_size
-                 : num_blocks + range_size - range_size * num_ranges;
-    };
-
-    template <int Alignment>
-    EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf,
-                                                Scalar* tgt_buf) {
-      const int output_packet_size =
-          internal::unpacket_traits<PacketReturnType>::size;
-      size_t i = 0;
-      const size_t num_packets = n / output_packet_size;
-      for (; i < output_packet_size * num_packets; i += output_packet_size) {
-        const PacketReturnType src_val =
-            internal::pload<PacketReturnType>(src_buf + i);
-        const PacketReturnType tgt_val =
-            internal::ploadt<PacketReturnType, Alignment>(tgt_buf + i);
-        const PacketReturnType sum = internal::padd(src_val, tgt_val);
-        internal::pstoret<Scalar, PacketReturnType, Alignment>(tgt_buf + i,
-                                                               sum);
-      }
-      for (; i < n; ++i) {
-        tgt_buf[i] += src_buf[i];
-      }
-    }
-
-    template <int Alignment>
-    EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n,
-                                                   const Scalar* src_buf0,
-                                                   const Scalar* src_buf1,
-                                                   const Scalar* src_buf2,
-                                                   Scalar* dst_buf) {
-      using ::Eigen::internal::padd;
-      using ::Eigen::internal::pload;
-      using ::Eigen::internal::ploadt;
-      using ::Eigen::internal::pstoret;
-
-      const int output_packet_size =
-          internal::unpacket_traits<PacketReturnType>::size;
-
-      size_t i = 0;
-      const size_t num_packets = n / output_packet_size;
-      for (; i < output_packet_size * num_packets; i += output_packet_size) {
-        const auto src_val0 = pload<PacketReturnType>(src_buf0 + i);
-        const auto src_val1 = pload<PacketReturnType>(src_buf1 + i);
-        const auto src_val2 = pload<PacketReturnType>(src_buf2 + i);
-
-        const auto dst_val = ploadt<PacketReturnType, Alignment>(dst_buf + i);
-        const auto sum =
-            padd(padd(dst_val, src_val0), padd(src_val1, src_val2));
-
-        pstoret<Scalar, PacketReturnType, Alignment>(dst_buf + i, sum);
-      }
-      for (; i < n; ++i) {
-        dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i];
-      }
-    }
-
-    template <int Alignment>
-    void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) {
-      while (end_block_idx - start_block_idx > 1) {
-        Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
-        evaluator->m_device.enqueueNoNotification(
-            [this, &barrier, mid_block_idx, end_block_idx]() {
-              eval<Alignment>(barrier, mid_block_idx, end_block_idx);
-            });
-        end_block_idx = mid_block_idx;
-      }
-
-      Index block_idx = start_block_idx;
-      Index block_start = block_idx * block_size;
-      Index block_end = block_start + actualBlockSize(block_idx);
-
-      processBlock<Alignment>(block_idx, block_start, block_end);
-      barrier.Notify();
-    }
-
-    template <int Alignment>
-    void evalAsync(Index start_block_idx, Index end_block_idx) {
-      while (end_block_idx - start_block_idx > 1) {
-        Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
-        evaluator->m_device.enqueueNoNotification(
-            [this, mid_block_idx, end_block_idx]() {
-              evalAsync<Alignment>(mid_block_idx, end_block_idx);
-            });
-        end_block_idx = mid_block_idx;
-      }
-
-      Index block_idx = start_block_idx;
-
-      Index block_start = block_idx * block_size;
-      Index block_end = block_start + actualBlockSize(block_idx);
-
-      processBlock<Alignment>(block_idx, block_start, block_end);
-
-      int v = num_pending_blocks.fetch_sub(1);
-      eigen_assert(v >= 1);
-
-      if (v == 1) {
-        // Aggregate partial sums from l0 ranges.
-        aggregateL0Blocks<Alignment>();
-
-        // Apply output kernel.
-        applyOutputKernel();
-
-        // NOTE: If we call `done` callback before deleting this (context),
-        // it might deallocate Self* pointer captured by context, and we'll
-        // fail in destructor trying to deallocate temporary buffers.
-
-        // Move done call back from context before it will be destructed.
-        DoneCallback done_copy = std::move(done);
-
-        // We are confident that we are the last one who touches context.
-        delete this;
-
-        // Now safely call the done callback.
-        done_copy();
-      }
-    }
-
-    // Cost model doesn't capture well the cost associated with constructing
-    // tensor contraction mappers and computing loop bounds in gemm_pack_lhs
-    // and gemm_pack_rhs, so we specify minimum desired block size.
-    static Index blockSize(Index k, int num_threads) {
-      const auto round_up = [=](Index index) -> Index {
-        const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
-        return divup<Index>(index, kmultiple) * kmultiple;
-      };
-
-      const Index target_block_size = round_up(divup<Index>(k, num_threads));
-      const Index desired_min_block_size = 12 * packet_size;
-
-      return numext::mini<Index>(
-          k, numext::maxi<Index>(desired_min_block_size, target_block_size));
-    }
-
-    EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete;
-    void operator=(const EvalShardedByInnerDimContext&) = delete;
-  };
-
-  // ------------------------------------------------------------------------ //
-
-  // Below are the function used by evalProductImpl heuristics, trying to select
-  // optimcal parameters for parallelization algorithm.
-
   // Decide whether we want to shard m x n contraction by columns or by rows.
   static bool shardByCol(Index m, Index n, Index num_threads) {
     // Note: we are comparing both n and m against Traits::nr, it is not
@@ -1555,15 +718,304 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     return 0;
   }
 
+#else  // EIGEN_USE_SIMPLE_THREAD_POOL
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalProduct(Scalar* buffer) const {
+    if (this->m_j_size == 1) {
+      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+      return;
+    }
+
+    evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(buffer);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalGemm(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+    this->m_device.memset(buffer, 0, m * n * sizeof(Scalar));
+
+
+    const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs,
+                                                   LeftEvaluator, left_nocontract_t,
+                                                   contract_t, lhs_packet_size,
+                                                   lhs_inner_dim_contiguous,
+                                                   false, Unaligned> LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs,
+                                                   RightEvaluator, right_nocontract_t,
+                                                   contract_t, rhs_packet_size,
+                                                   rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned> RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+    // TODO: packing could be faster sometimes if we supported row major tensor mappers
+    typedef internal::gemm_pack_lhs<LhsScalar, Index, typename LhsMapper::SubMapper, Traits::mr,
+                                    Traits::LhsProgress, ColMajor> LhsPacker;
+    typedef internal::gemm_pack_rhs<RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> RhsPacker;
+
+    // TODO: replace false, false with conjugate values?
+    typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper,
+                                  Traits::mr, Traits::nr, false, false> GebpKernel;
+
+    typedef internal::packLhsArg<LhsScalar, LhsMapper, Index> packLArg;
+    typedef internal::packRhsAndKernelArg<LhsScalar, RhsScalar, RhsMapper, OutputMapper, Index> packRKArg;
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+    OutputMapper output(buffer, m);
+
+    // compute block sizes (which depend on number of threads)
+    const Index num_threads = this->m_device.numThreads();
+    internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, internal::ShardByCol> blocking(k, m, n, num_threads);
+    Index mc = blocking.mc();
+    Index nc = blocking.nc();
+    Index kc = blocking.kc();
+    eigen_assert(mc <= m);
+    eigen_assert(nc <= n);
+    eigen_assert(kc <= k);
+
+#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b))
+    const Index k_blocks = CEIL_DIV(k, kc);
+    const Index n_blocks = CEIL_DIV(n, nc);
+    const Index m_blocks = CEIL_DIV(m, mc);
+    const Index sizeA = mc * kc;
+    const Index sizeB = kc * nc;
+
+    /*    cout << "m: " << m << " n: " << n << " k: " << k << endl;
+    cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl;
+    cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl;
+    cout << "num threads: " << num_threads << endl;
+    */
+
+    // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB
+    //       aren't 16 byte aligned segfaults will happen due to SIMD instructions
+    // note: You can get away with allocating just a single blockA and offsets and meet
+    //       the alignment requirements with the assumption that
+    //       (Traits::mr * sizeof(ResScalar)) % 16 == 0
+    const Index numBlockAs = numext::mini(num_threads, m_blocks);
+    MaxSizeVector<LhsScalar *> blockAs(num_threads);
+    for (int i = 0; i < num_threads; i++) {
+      blockAs.push_back(static_cast<LhsScalar *>(this->m_device.allocate(sizeA * sizeof(LhsScalar))));
+    }
+
+    // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread
+    // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful.
+    //       Other options: (1) reuse memory when a thread finishes. con: tricky
+    //                      (2) allocate block B memory in each thread. con: overhead
+    MaxSizeVector<RhsScalar *> blockBs(n_blocks);
+    for (int i = 0; i < n_blocks; i++) {
+      blockBs.push_back(static_cast<RhsScalar *>(this->m_device.allocate(sizeB * sizeof(RhsScalar))));
+    }
+
+    // lhs_notifications starts with all null Notifications
+    MaxSizeVector<Notification*> lhs_notifications(num_threads, nullptr);
+
+    // this should really be numBlockAs * n_blocks;
+    const Index num_kernel_notifications = num_threads * n_blocks;
+    MaxSizeVector<Notification*> kernel_notifications(num_kernel_notifications,
+                                                    nullptr);
+
+    for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) {
+      const Index k_start = k_block_idx * kc;
+      // make sure we don't overshoot right edge of left matrix
+      const Index actual_kc = numext::mini(k_start + kc, k) - k_start;
+
+      for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) {
+        const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs);
+
+        for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) {
+          const Index m_start = mt_block_idx * mc;
+          const Index actual_mc = numext::mini(m_start + mc, m) - m_start;
+          eigen_assert(actual_mc > 0);
+
+          Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads;
+
+          for (int i = 0; i < n_blocks; ++i) {
+            Index notification_id = (blockAId * n_blocks + i);
+            // Wait for any current kernels using this slot to complete
+            // before using it.
+            if (kernel_notifications[notification_id]) {
+              wait_until_ready(kernel_notifications[notification_id]);
+              delete kernel_notifications[notification_id];
+            }
+            kernel_notifications[notification_id] = new Notification();
+          }
+          const packLArg arg = {
+            blockAs[blockAId], // blockA
+            lhs,        // lhs
+            m_start,    // m
+            k_start,    // k
+            actual_mc,  // mc
+            actual_kc,  // kc
+          };
+
+          // Delete any existing notification since we may be
+          // replacing it.  The algorithm should ensure that there are
+          // no existing waiters on this notification.
+          delete lhs_notifications[blockAId];
+          lhs_notifications[blockAId] =
+          this->m_device.enqueue(&Self::packLhs<packLArg, LhsPacker>, arg);
+        }
+
+        // now start kernels.
+        const Index m_base_start = m_block_idx * mc;
+        const bool need_to_pack = m_block_idx == 0;
+
+        for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) {
+          const Index n_start = n_block_idx * nc;
+          const Index actual_nc = numext::mini(n_start + nc, n) - n_start;
+
+          // first make sure the previous kernels are all done before overwriting rhs. Also wait if
+          // we're going to start new k. In both cases need_to_pack is true.
+          if (need_to_pack) {
+            for (Index i = num_blocks; i < num_threads; ++i) {
+              Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads;
+              Index future_id = (blockAId * n_blocks + n_block_idx);
+              wait_until_ready(kernel_notifications[future_id]);
+            }
+          }
+
+          packRKArg arg = {
+            &blockAs, // blockA
+            blockBs[n_block_idx], // blockB
+            rhs,          // rhs
+            output,       // output
+            m_base_start, // m
+            k_start,      // k
+            n_start,      // n
+            mc,           // mc
+            actual_kc,    // kc
+            actual_nc,    // nc
+            num_threads,
+            numBlockAs,
+            m,
+            k_block_idx,
+            m_block_idx,
+            n_block_idx, // n_block_idx
+            m_blocks, // m_blocks
+            n_blocks, // n_blocks
+            &kernel_notifications, // kernel notifications
+            &lhs_notifications,    // lhs notifications
+            need_to_pack, // need_to_pack
+          };
+
+          // We asynchronously kick off this function, which ends up
+          // notifying the appropriate kernel_notifications objects,
+          // which this thread waits on before exiting.
+          this->m_device.enqueueNoNotification(&Self::packRhsAndKernel<packRKArg, RhsPacker, GebpKernel>, arg);
+        }
+      }
+    }
+
+    // Make sure all the kernels are done.
+    for (size_t i = 0; i < kernel_notifications.size(); ++i) {
+      wait_until_ready(kernel_notifications[i]);
+      delete kernel_notifications[i];
+    }
+
+    // No need to wait for lhs notifications since they should have
+    // already been waited on.  Just clean them up.
+    for (size_t i = 0; i < lhs_notifications.size(); ++i) {
+      delete lhs_notifications[i];
+    }
+
+    // deallocate all of the memory for both A and B's
+    for (size_t i = 0; i < blockAs.size(); i++) {
+      this->m_device.deallocate(blockAs[i]);
+    }
+    for (size_t i = 0; i < blockBs.size(); i++) {
+      this->m_device.deallocate(blockBs[i]);
+    }
+
+#undef CEIL_DIV
+  }
+
+  /*
+   * Packs a LHS block of size (mc, kc) starting at lhs(m_start, k_start). No
+   * waiting happens here: evalGemm waits for the kernels still using this
+   * blockA slot to finish before it enqueues this task.
+   */
+  template <typename packLArg, typename LhsPacker>
+  static void packLhs(const packLArg arg) {
+    // perform actual packing
+    LhsPacker pack_lhs;
+    pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc);
+  }
+
+  /*
+   * Packs a RHS block of size (kc, nc) starting at (k, n) when need_to_pack
+   * is set; the waiting on kernels of the previous block is done by evalGemm
+   * before this task is enqueued.
+   * Then for each LHS block, we wait on its packing notification, call GEBP
+   * on the packed buffer (*blockAs)[blockAId] with the full packed RHS block,
+   * and signal the corresponding kernel notification.
+   * The output of this GEBP is written to output(m + mt_block_idx * mc, n).
+   */
+  template <typename packRKArg, typename RhsPacker, typename GebpKernel>
+  static void packRhsAndKernel(packRKArg arg) {
+    if (arg.need_to_pack) {
+      RhsPacker pack_rhs;
+      pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc);
+    }
+
+    GebpKernel gebp;
+    for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) {
+      const Index m_base_start = arg.m + arg.mc*mt_block_idx;
+      if (m_base_start < arg.max_m) {
+        Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads;
+        wait_until_ready((*arg.lhs_notifications)[blockAId]);
+        const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start;
+        gebp(arg.output.getSubMapper(m_base_start, arg.n),
+             (*arg.blockAs)[blockAId], arg.blockB,
+             actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0);
+
+        // Notify that the kernel is done.
+        const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx;
+        (*arg.kernel_notifications)[set_idx]->Notify();
+      }
+    }
+  }
+#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
+
   TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
                                bool shard_by_col, bool prepacked) const {
     const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size,
                                           PacketType<RhsScalar, Device>::size);
     const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
     const double kd = static_cast<double>(bk);
-    double compute_bandwidth = computeBandwidth(false, bm, bn, bk);
+    // Peak VFMA bandwidth is 0.5. However if we have not enough data for
+    // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined
+    // experimentally.
+    double computeBandwidth = bk == 1 ? 4.0 :
+          (shard_by_col ? bn : bm) < Traits::nr ||
+          (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5;
+#ifndef EIGEN_VECTORIZE_FMA
+    // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
+    // However for MULPS/ADDPS we have dependent sequence of 2 such instructions,
+    // so overall bandwidth is 1.0.
+    if (computeBandwidth == 0.5) computeBandwidth = 1.0;
+#endif
     // Computations.
-    TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size);
+    TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size);
     // Output stores.
     cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
     if (prepacked) {
@@ -1583,94 +1035,6 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       rhsCost.dropMemoryCost();
     return cost + lhsCost + rhsCost;
   }
-
-  // Decide whether we want to shard m x k x n contraction over the inner
-  // (contraction) dimension (k).
-  static bool shardByInnerDim(Index m, Index n, Index k, int num_threads,
-                              int num_threads_by_k) {
-    std::ptrdiff_t bufsize = m * n * sizeof(Scalar);
-    bool shard_by_k = false;
-    if (n == 1 ||                // If mat*vec or...
-        num_threads_by_k < 2 ||  // running single threaded or...
-        num_threads_by_k <
-            num_threads ||  // sharding by k gives less parallelism or...
-        bufsize > l3CacheSize() / num_threads_by_k ||  // need more buffer space
-        // than L3 cache or...
-        k / num_threads_by_k < 2 * Traits::nr) {  // k per thread is tiny.
-      shard_by_k = false;
-    } else if (numext::maxi(m, n) / num_threads <
-                   Traits::nr ||  // both other dimensions are tiny or...
-               // k per thread is not small and...
-               (k / num_threads_by_k > 8 * Traits::nr &&
-                // one of the outer dimensions is tiny or sharding by k offers
-                // more parallelism.
-                (numext::mini(m, n) < 2 * Traits::nr ||
-                 num_threads_by_k > num_threads))) {
-      shard_by_k = true;
-    }
-    return shard_by_k;
-  }
-
-  TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
-    // Compute cost.
-    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
-    TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size);
-    // Output stores.
-    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
-    TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m;
-    TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n;
-    // Since the inner gemm kernel is always sharded by column, the lhs
-    // load cost is negligible.
-    lhsCost.dropMemoryCost();
-    return cost + lhsCost + rhsCost;
-  }
-
-  int numThreadsInnerDim(Index m, Index n, Index k) const {
-    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
-    TensorOpCost cost = contractionCostPerInnerDim(m, n, k);
-    double total_parallel_cost =
-        TensorCostModel<ThreadPoolDevice>::totalCost(k, cost);
-    // Cost of reduction step accumulating the m*n per-thread buffers into the
-    // result.
-    double reduction_cost = TensorCostModel<ThreadPoolDevice>::totalCost(
-        m * n, TensorOpCost(2, 1, 1, true, output_packet_size));
-    int num_threads = 1;
-    double min_cost = total_parallel_cost;
-    double kPerThreadOverHead = 3000;
-    double kFixedOverHead = 100000;
-    for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) {
-      double sequential_cost =
-          kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
-      double parallel_cost = total_parallel_cost / nt + sequential_cost;
-      if (parallel_cost < min_cost) {
-        num_threads = nt;
-        min_cost = parallel_cost;
-      }
-    }
-    return num_threads;
-  }
-
-  double computeBandwidth(bool shard_by_col, Index bm, Index bn,
-                          Index bk) const {
-    // Peak VFMA bandwidth is 0.5. However if we have not enough data for
-    // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined
-    // experimentally.
-    double computeBandwidth =
-        bk == 1 ? 4.0
-                : (shard_by_col ? bn : bm) < Traits::nr ||
-                          (shard_by_col ? bm : bn) < Traits::mr
-                      ? 2.0
-                      : 0.5;
-#ifndef EIGEN_VECTORIZE_FMA
-    // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
-    // However for MULPS/ADDPS we have dependent sequence of 2 such
-    // instructions,
-    // so overall bandwidth is 1.0.
-    if (computeBandwidth == 0.5) computeBandwidth = 1.0;
-#endif
-    return computeBandwidth;
-  }
-
 };
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
index cdbafbbb1..860a6949a 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -32,7 +32,6 @@ struct traits<TensorConversionOp<TargetType, XprType> >
   static const int NumDimensions = traits<XprType>::NumDimensions;
   static const int Layout = traits<XprType>::Layout;
   enum { Flags = 0 };
-  typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType;
 };
 
 template<typename TargetType, typename XprType>
@@ -129,7 +128,6 @@ struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 2> {
       typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
       internal::scalar_cast_op<SrcType, TgtType> converter;
       EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < TgtPacketSize; ++i) {
         values[i] = converter(m_impl.coeff(index+i));
       }
@@ -165,116 +163,19 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT
     typename XprType::Nested m_xpr;
 };
 
-template <bool SameType, typename Eval, typename EvalPointerType> struct ConversionSubExprEval {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) {
+template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) {
     impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 };
 
-template <typename Eval, typename EvalPointerType> struct ConversionSubExprEval<true, Eval, EvalPointerType> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) {
+template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) {
     return impl.evalSubExprsIfNeeded(data);
   }
 };
 
-#ifdef EIGEN_USE_THREADS
-template <bool SameType, typename Eval, typename EvalPointerType,
-          typename EvalSubExprsCallback>
-struct ConversionSubExprEvalAsync {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(
-      Eval& impl, EvalPointerType, EvalSubExprsCallback done) {
-    impl.evalSubExprsIfNeededAsync(nullptr, std::move(done));
-  }
-};
-
-template <typename Eval, typename EvalPointerType,
-          typename EvalSubExprsCallback>
-struct ConversionSubExprEvalAsync<true, Eval, EvalPointerType,
-                                  EvalSubExprsCallback> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(
-      Eval& impl, EvalPointerType data, EvalSubExprsCallback done) {
-    impl.evalSubExprsIfNeededAsync(data, std::move(done));
-  }
-};
-#endif
-
-namespace internal {
-
-template <typename SrcType, typename TargetType, bool IsSameT>
-struct CoeffConv {
-  template <typename ArgType, typename Device>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-    internal::scalar_cast_op<SrcType, TargetType> converter;
-    return converter(impl.coeff(index));
-  }
-};
-
-template <typename SrcType, typename TargetType>
-struct CoeffConv<SrcType, TargetType, true> {
-  template <typename ArgType, typename Device>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-    return impl.coeff(index);
-  }
-};
-
-template <typename SrcPacket, typename TargetPacket, int LoadMode, bool ActuallyVectorize, bool IsSameT>
-struct PacketConv {
-  typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
-  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
-
-  static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
-
-  template <typename ArgType, typename Device>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-    internal::scalar_cast_op<SrcType, TargetType> converter;
-    EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
-    EIGEN_UNROLL_LOOP
-    for (int i = 0; i < PacketSize; ++i) {
-      values[i] = converter(impl.coeff(index+i));
-    }
-    TargetPacket rslt = internal::pload<TargetPacket>(values);
-    return rslt;
-  }
-};
-
-template <typename SrcPacket, typename TargetPacket, int LoadMode, bool IsSameT>
-struct PacketConv<SrcPacket, TargetPacket, LoadMode, true, IsSameT> {
-  typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
-  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
-
-  template <typename ArgType, typename Device>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-    const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
-    const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
-    PacketConverter<TensorEvaluator<ArgType, Device>, SrcPacket, TargetPacket,
-                    SrcCoeffRatio, TgtCoeffRatio> converter(impl);
-    return converter.template packet<LoadMode>(index);
-  }
-};
-
-template <typename SrcPacket, typename TargetPacket, int LoadMode>
-struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/false, /*IsSameT=*/true> {
-  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
-  static const int PacketSize = internal::unpacket_traits<TargetPacket>::size;
-
-  template <typename ArgType, typename Device>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-    EIGEN_ALIGN_MAX typename internal::remove_const<TargetType>::type values[PacketSize];
-    for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i);
-    return internal::pload<TargetPacket>(values);
-  }
-};
-
-template <typename SrcPacket, typename TargetPacket, int LoadMode>
-struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/true, /*IsSameT=*/true> {
-  template <typename ArgType, typename Device>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
-    return impl.template packet<LoadMode>(index);
-  }
-};
-
-}  // namespace internal
 
 // Eval as rvalue
 template<typename TargetType, typename ArgType, typename Device>
@@ -288,52 +189,15 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
   typedef typename internal::remove_all<typename internal::traits<ArgType>::Scalar>::type SrcType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename PacketType<SrcType, Device>::type PacketSourceType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  static const bool IsSameType = internal::is_same<TargetType, SrcType>::value;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned         = false,
-    PacketAccess      =
-    #ifndef EIGEN_USE_SYCL
-                        true,
-    #else
-                        TensorEvaluator<ArgType, Device>::PacketAccess &
-                        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
-    #endif
-    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess         = false
+    IsAligned = false,
+    PacketAccess = true,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
   };
 
-  static const int NumDims = internal::array_size<Dimensions>::value;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
-      ArgTensorBlock;
-
-  struct TensorConversionOpBlockFactory {
-    template <typename ArgXprType>
-    struct XprType {
-      typedef TensorConversionOp<TargetType, const ArgXprType> type;
-    };
-
-    template <typename ArgXprType>
-    typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const {
-      return typename XprType<ArgXprType>::type(expr);
-    }
-  };
-
-  typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory,
-                                         ArgTensorBlock>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : m_impl(op.expression(), device)
   {
@@ -341,21 +205,11 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data)
   {
-    return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data);
+    return ConversionSubExprEval<internal::is_same<TargetType, SrcType>::value, TensorEvaluator<ArgType, Device>, Scalar>::run(m_impl, data);
   }
 
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType data, EvalSubExprsCallback done) {
-    ConversionSubExprEvalAsync<IsSameType, TensorEvaluator<ArgType, Device>,
-                               EvaluatorPointerType,
-        EvalSubExprsCallback>::run(m_impl, data, std::move(done));
-  }
-#endif
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup()
   {
     m_impl.cleanup();
@@ -363,23 +217,16 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    return internal::CoeffConv<SrcType, TargetType, IsSameType>::run(m_impl,index);
+    internal::scalar_cast_op<SrcType, TargetType> converter;
+    return converter(m_impl.coeff(index));
   }
 
   template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType
-  packet(Index index) const {
-    // If we are not going to do the cast, we just need to check that base
-    // TensorEvaluator has packet access. Otherwise we also need to make sure,
-    // that we have an implementation of vectorized cast.
-    const bool Vectorizable =
-        IsSameType
-        ? TensorEvaluator<ArgType, Device>::PacketAccess
-        : TensorEvaluator<ArgType, Device>::PacketAccess &
-          internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
-
-    return internal::PacketConv<PacketSourceType, PacketReturnType, LoadMode,
-                                Vectorizable, IsSameType>::run(m_impl, index);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess &
+        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast;
+    return PacketConv<LoadMode, Vectorizable>::run(m_impl, index);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -397,30 +244,33 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device>
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    return m_impl.getResourceRequirements();
-  }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    return TensorBlock(m_impl.block(desc, scratch),
-                         TensorConversionOpBlockFactory());
-  }
+  protected:
+  template <int LoadMode, bool ActuallyVectorize>
+  struct PacketConv {
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+      internal::scalar_cast_op<SrcType, TargetType> converter;
+      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = converter(impl.coeff(index+i));
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  };
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+  template <int LoadMode>
+  struct PacketConv<LoadMode, true> {
+    static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) {
+      const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+      const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+      PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType,
+                      SrcCoeffRatio, TgtCoeffRatio> converter(impl);
+      return converter.template packet<LoadMode>(index);
+    }
+  };
 
-  /// required by sycl in order to extract the sycl accessor
-  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
-
- protected:
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 27ad9f147..abdf742c6 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -54,8 +54,8 @@ class IndexMapper {
       }
     }
 
-    array<Index, NumDims> gpuInputDimensions;
-    array<Index, NumDims> gpuOutputDimensions;
+    array<Index, NumDims> cudaInputDimensions;
+    array<Index, NumDims> cudaOutputDimensions;
     array<Index, NumDims> tmp = dimensions;
     array<Index, NumDims> ordering;
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -65,8 +65,8 @@ class IndexMapper {
       const Index index = i + offset;
       ordering[index] = indices[i];
       tmp[indices[i]] = -1;
-      gpuInputDimensions[index] = input_dims[indices[i]];
-      gpuOutputDimensions[index] = dimensions[indices[i]];
+      cudaInputDimensions[index] = input_dims[indices[i]];
+      cudaOutputDimensions[index] = dimensions[indices[i]];
     }
 
     int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
@@ -75,8 +75,8 @@ class IndexMapper {
     for (int i = 0; i < NumDims; ++i) {
       if (tmp[i] >= 0) {
         ordering[written] = i;
-        gpuInputDimensions[written] = input_dims[i];
-        gpuOutputDimensions[written] = dimensions[i];
+        cudaInputDimensions[written] = input_dims[i];
+        cudaOutputDimensions[written] = dimensions[i];
         ++written;
       }
     }
@@ -89,37 +89,37 @@ class IndexMapper {
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = 0; i < NumDims; ++i) {
         if (i > NumKernelDims) {
-          m_gpuInputStrides[i] =
-              m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
-          m_gpuOutputStrides[i] =
-              m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
+          m_cudaInputStrides[i] =
+              m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
+          m_cudaOutputStrides[i] =
+              m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
         } else {
-          m_gpuInputStrides[i] = 1;
-          m_gpuOutputStrides[i] = 1;
+          m_cudaInputStrides[i] = 1;
+          m_cudaOutputStrides[i] = 1;
         }
       }
     } else {
       for (int i = NumDims - 1; i >= 0; --i) {
-        if (static_cast<size_t>(i + 1) < offset) {
-          m_gpuInputStrides[i] =
-              m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
-          m_gpuOutputStrides[i] =
-              m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
+        if (i + 1 < offset) {
+          m_cudaInputStrides[i] =
+              m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
+          m_cudaOutputStrides[i] =
+              m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
         } else {
-          m_gpuInputStrides[i] = 1;
-          m_gpuOutputStrides[i] = 1;
+          m_cudaInputStrides[i] = 1;
+          m_cudaOutputStrides[i] = 1;
         }
       }
     }
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int d = NumDims - 1; d > NumKernelDims; --d) {
-        const Index idx = p / m_gpuInputStrides[d];
+        const Index idx = p / m_cudaInputStrides[d];
         inputIndex += idx * m_inputStrides[d];
-        p -= idx * m_gpuInputStrides[d];
+        p -= idx * m_cudaInputStrides[d];
       }
       inputIndex += p * m_inputStrides[NumKernelDims];
     } else {
@@ -128,22 +128,22 @@ class IndexMapper {
         limit = NumDims - NumKernelDims - 1;
       }
       for (int d = 0; d < limit; ++d) {
-        const Index idx = p / m_gpuInputStrides[d];
+        const Index idx = p / m_cudaInputStrides[d];
         inputIndex += idx * m_inputStrides[d];
-        p -= idx * m_gpuInputStrides[d];
+        p -= idx * m_cudaInputStrides[d];
       }
       inputIndex += p * m_inputStrides[limit];
     }
     return inputIndex;
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
     Index outputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int d = NumDims - 1; d > NumKernelDims; --d) {
-        const Index idx = p / m_gpuOutputStrides[d];
+        const Index idx = p / m_cudaOutputStrides[d];
         outputIndex += idx * m_outputStrides[d];
-        p -= idx * m_gpuOutputStrides[d];
+        p -= idx * m_cudaOutputStrides[d];
       }
       outputIndex += p * m_outputStrides[NumKernelDims];
     } else {
@@ -152,44 +152,44 @@ class IndexMapper {
         limit = NumDims - NumKernelDims - 1;
       }
       for (int d = 0; d < limit; ++d) {
-        const Index idx = p / m_gpuOutputStrides[d];
+        const Index idx = p / m_cudaOutputStrides[d];
         outputIndex += idx * m_outputStrides[d];
-        p -= idx * m_gpuOutputStrides[d];
+        p -= idx * m_cudaOutputStrides[d];
       }
       outputIndex += p * m_outputStrides[limit];
     }
     return outputIndex;
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_inputStrides[offset];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_outputStrides[offset];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
     return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
@@ -197,7 +197,7 @@ class IndexMapper {
            k * m_inputStrides[offset + 2];
   }
 
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
     const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                               ? 0
                               : NumDims - NumKernelDims;
@@ -209,8 +209,8 @@ class IndexMapper {
   static const int NumDims = internal::array_size<InputDims>::value;
   array<Index, NumDims> m_inputStrides;
   array<Index, NumDims> m_outputStrides;
-  array<Index, NumDims> m_gpuInputStrides;
-  array<Index, NumDims> m_gpuOutputStrides;
+  array<Index, NumDims> m_cudaInputStrides;
+  array<Index, NumDims> m_cudaOutputStrides;
 };
 
 
@@ -231,8 +231,6 @@ struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = traits<InputXprType>::NumDimensions;
   static const int Layout = traits<InputXprType>::Layout;
-  typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
-  typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
 
   enum {
     Flags = 0
@@ -302,24 +300,16 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef StorageMemory<Scalar, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = false,
     Layout = TensorEvaluator<InputArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
   {
@@ -475,7 +465,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
                                        PacketSize));
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
  private:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -531,11 +521,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
       m_local_kernel = false;
     } else {
       size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
-      Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
+      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
       typedef TensorEvalToOp<const KernelArgType> EvalTo;
       EvalTo evalToTmp(local, m_kernelArg);
-      const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
-      internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
+      const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);
 
       m_kernel = local;
       m_local_kernel = true;
@@ -554,14 +544,14 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   KernelArgType m_kernelArg;
   const Scalar* m_kernel;
   bool m_local_kernel;
-  const Device EIGEN_DEVICE_REF m_device;
+  const Device& m_device;
 };
 
 
 
 
 // Use an optimized implementation of the evaluation code for GPUs whenever possible.
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 
 template <int StaticKernelSize>
 struct GetKernelSize {
@@ -584,11 +574,7 @@ __global__ void EigenConvolutionKernel1D(
         indexMapper,
     const float* __restrict kernel, const int numPlanes, const int numX,
     const int maxX, const int kernelSize, float* buffer) {
-#if defined(EIGEN_HIPCC)
-  HIP_DYNAMIC_SHARED(float, s)
-#else
   extern __shared__ float s[];
-#endif
 
   const int first_x = blockIdx.x * maxX;
   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -600,18 +586,18 @@ __global__ void EigenConvolutionKernel1D(
 
   for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
     // Load inputs to shared memory
-    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = threadIdx.y * num_x_input;
     #pragma unroll
     for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
+      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
       s[i + plane_kernel_offset] = eval.coeff(tensor_index);
     }
 
     __syncthreads();
 
     // Compute the convolution
-    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
 
     #pragma unroll
     for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
@@ -621,7 +607,7 @@ __global__ void EigenConvolutionKernel1D(
       for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
         result += s[k + kernel_offset] * kernel[k];
       }
-      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
+      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
       buffer[tensor_index] = result;
     }
     __syncthreads();
@@ -637,11 +623,7 @@ __global__ void EigenConvolutionKernel2D(
     const float* __restrict kernel, const int numPlanes, const int numX,
     const int maxX, const int numY, const int maxY, const int kernelSizeX,
     const int kernelSizeY, float* buffer) {
-#if defined(EIGEN_HIPCC)
-  HIP_DYNAMIC_SHARED(float, s)
-#else
   extern __shared__ float s[];
-#endif
 
   const int first_x = blockIdx.x * maxX;
   const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
@@ -658,7 +640,7 @@ __global__ void EigenConvolutionKernel2D(
 
   for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
 
-    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = threadIdx.z * num_y_input;
 
     // Load inputs to shared memory
@@ -667,7 +649,7 @@ __global__ void EigenConvolutionKernel2D(
       const int input_offset = num_x_input * (j + plane_kernel_offset);
       #pragma unroll
       for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-        const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
+        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
         s[i + input_offset] = eval.coeff(tensor_index);
       }
     }
@@ -675,7 +657,7 @@ __global__ void EigenConvolutionKernel2D(
     __syncthreads();
 
     // Convolution
-    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
 
     #pragma unroll
     for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -691,7 +673,7 @@ __global__ void EigenConvolutionKernel2D(
             result += s[k + input_offset] * kernel[k + kernel_offset];
           }
         }
-        const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
+        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
         buffer[tensor_index] = result;
       }
     }
@@ -709,11 +691,7 @@ __global__ void EigenConvolutionKernel3D(
     const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
     const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
     const size_t kernelSizeZ, float* buffer) {
-#if defined(EIGEN_HIPCC)
-  HIP_DYNAMIC_SHARED(float, s)
-#else
   extern __shared__ float s[];
-#endif
 
   // Load inputs to shared memory
   const int first_x = blockIdx.x * maxX;
@@ -730,13 +708,13 @@ __global__ void EigenConvolutionKernel3D(
 
   for (int p = 0; p < numPlanes; ++p) {
 
-    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
     const int plane_kernel_offset = 0;
 
     for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
       for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
         for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
-          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
+          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
           s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
         }
       }
@@ -748,7 +726,7 @@ __global__ void EigenConvolutionKernel3D(
     const int num_z_output = last_z - first_z + 1;
     const int num_y_output = last_y - first_y + 1;
     const int num_x_output = last_x - first_x + 1;
-    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
+    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);
 
     for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
       for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
@@ -761,7 +739,7 @@ __global__ void EigenConvolutionKernel3D(
               }
             }
           }
-          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
+          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
           buffer[tensor_index] = result;
         }
       }
@@ -786,19 +764,13 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
   enum {
     IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
     PacketAccess = false,
-    BlockAccess = false,
-    PreferBlockAccess = false,
     Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
-      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
+      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
   {
     EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -880,9 +852,9 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
     typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
 
     const int maxSharedMem = m_device.sharedMemPerBlock();
-    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
-    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
-    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
+    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
+    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
+    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
     const int warpSize = 32;
 
     switch (NumKernelDims) {
@@ -917,7 +889,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         }
 
         const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
-        gpu_assert(shared_mem <= maxSharedMem);
+        assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
         const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
@@ -934,15 +906,15 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
             m_inputImpl.dimensions(), kernel_dims, indices);
         switch(kernel_size) {
           case 4: {
-            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
+            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
             break;
           }
           case 7: {
-            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
+            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
             break;
           }
           default: {
-            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
+            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
           }
         }
         break;
@@ -974,7 +946,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
 
         const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
-        gpu_assert(shared_mem <= maxSharedMem);
+        assert(shared_mem <= maxSharedMem);
 
         const int num_x_blocks = ceil(numX, maxX);
         const int num_y_blocks = ceil(numY, maxY);
@@ -995,11 +967,11 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
           case 4: {
             switch (kernel_size_y) {
               case 7: {
-                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
+                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                 break;
               }
               default: {
-                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
+                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                 break;
               }
             }
@@ -1008,18 +980,18 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
           case 7: {
             switch (kernel_size_y) {
               case 4: {
-                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
+                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                 break;
               }
               default: {
-                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
+                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                 break;
               }
             }
             break;
           }
           default: {
-            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
             break;
           }
         }
@@ -1054,7 +1026,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
 
         const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
-        gpu_assert(shared_mem <= maxSharedMem);
+        assert(shared_mem <= maxSharedMem);
 
         //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
         const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
@@ -1065,7 +1037,7 @@ struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelAr
         internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
             m_inputImpl.dimensions(), kernel_dims, indices);
 
-        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
         break;
       }
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
deleted file mode 100644
index 92003c766..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
+++ /dev/null
@@ -1,544 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
-#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
-
-namespace Eigen {
-
-/** \class TensorConvolution
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Tensor convolution class.
- *
- *
- */
-
-enum class convolution_type { CONV1D, CONV2D, CONV3D };
-template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
-          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
-struct EigenConvolutionKernel;
-template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
-          typename Kernel_accessor, typename Buffer_accessor>
-struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
-                              Buffer_accessor, convolution_type::CONV1D> {
-  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
-      Local_accessor;
-  Local_accessor local_acc;
-  Evaluator device_evaluator;
-  Kernel_accessor kernel_filter;
-  Buffer_accessor buffer_acc;
-  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
-  const size_t kernelSize;
-  const cl::sycl::range<2> input_range;
-  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
-                         Buffer_accessor buffer_acc_,
-                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
-                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
-      : local_acc(local_acc_),
-        device_evaluator(device_evaluator_),
-        kernel_filter(kernel_filter_),
-        buffer_acc(buffer_acc_),
-        indexMapper(indexMapper_),
-        kernelSize(kernelSize_),
-        input_range(input_range_) {}
-
-  template <typename BooleanDim2>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
-    return (boolean_check[0] && boolean_check[1]);
-  }
-  void operator()(cl::sycl::nd_item<2> itemID) {
-    auto buffer_ptr = buffer_acc.get_pointer();
-    auto kernel_ptr = kernel_filter.get_pointer();
-    // the required row to be calculated for the for each plane in shered memory
-    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
-    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
-    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
-    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
-    /// fill the shared memory
-    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
-      const size_t local_index = i + plane_kernel_offset;
-      const size_t tensor_index =
-          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
-
-      local_acc[local_index] =
-          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
-              ? device_evaluator.coeff(tensor_index)
-              : CoeffReturnType(0);
-    }
-
-    itemID.barrier(cl::sycl::access::fence_space::local_space);
-
-    // calculate the convolution // output start x
-    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
-    if (boundary_check(itemID.get_global_id() < input_range)) {
-      CoeffReturnType result = static_cast<CoeffReturnType>(0);
-      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
-      for (size_t k = 0; k < kernelSize; ++k) {
-        result += (local_acc[k + index] * kernel_ptr[k]);
-      }
-      const size_t tensor_index =
-          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
-          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
-      buffer_ptr[tensor_index] = result;
-    }
-  }
-};
-
-template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
-          typename Kernel_accessor, typename Buffer_accessor>
-struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
-                              Buffer_accessor, convolution_type::CONV2D> {
-  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
-      Local_accessor;
-  Local_accessor local_acc;
-  Evaluator device_evaluator;
-  Kernel_accessor kernel_filter;
-  Buffer_accessor buffer_acc;
-  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
-  const cl::sycl::range<2> kernel_size;
-  const cl::sycl::range<3> input_range;
-  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
-                         Buffer_accessor buffer_acc_,
-                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
-                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
-      : local_acc(local_acc_),
-        device_evaluator(device_evaluator_),
-        kernel_filter(kernel_filter_),
-        buffer_acc(buffer_acc_),
-        indexMapper(indexMapper_),
-        kernel_size(kernel_size_),
-        input_range(input_range_) {}
-  template <typename BooleanDim3>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
-    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
-  }
-
-  void operator()(cl::sycl::nd_item<3> itemID) {
-    auto buffer_ptr = buffer_acc.get_pointer();
-    auto kernel_ptr = kernel_filter.get_pointer();
-    // the required row to be calculated for the for each plane in shered memory
-    const auto num_input = cl::sycl::range<2>{
-        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
-
-    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
-    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
-
-    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
-                                                 itemID.get_group(1) * itemID.get_local_range()[1]};
-      
-    // fill the local memory
-    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
-    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
-      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
-      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); 
-      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
-        const size_t local_index = i + local_input_offset;
-        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
-                                                             i + input_offset[0], j + input_offset[1]);
-        local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
-                                  in_range_dim1 && in_range_dim2)
-                                     ? device_evaluator.coeff(tensor_index)
-                                     : CoeffReturnType(0);
-      }
-    }
-
-    itemID.barrier(cl::sycl::access::fence_space::local_space);
-
-    // output offset start for each thread
-    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
-                                                  itemID.get_group(1) * itemID.get_local_range()[1]};
-
-    if (boundary_check(itemID.get_global_id() < input_range)) {
-      CoeffReturnType result = static_cast<CoeffReturnType>(0);
-
-      for (size_t j = 0; j < kernel_size[1]; j++) {
-        size_t kernel_offset = kernel_size[0] * j;
-        const size_t index =
-            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
-        for (size_t i = 0; i < kernel_size[0]; i++) {
-          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
-        }
-      }
-      const size_t tensor_index =
-          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
-          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
-                                                             itemID.get_local_id(1) + output_offset[1]);
-
-      buffer_ptr[tensor_index] = result;
-    }
-  }
-};
-
-template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
-          typename Kernel_accessor, typename Buffer_accessor>
-struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
-                              Buffer_accessor, convolution_type::CONV3D> {
-  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
-      Local_accessor;
-  Local_accessor local_acc;
-  Evaluator device_evaluator;
-  Kernel_accessor kernel_filter;
-  Buffer_accessor buffer_acc;
-  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
-  const cl::sycl::range<3> kernel_size;
-  const cl::sycl::range<3> input_range;
-  const size_t numP;
-
-  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
-                         Buffer_accessor buffer_acc_,
-                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
-                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
-                         const size_t numP_)
-      : local_acc(local_acc_),
-        device_evaluator(device_evaluator_),
-        kernel_filter(kernel_filter_),
-        buffer_acc(buffer_acc_),
-        indexMapper(indexMapper_),
-        kernel_size(kernel_size_),
-        input_range(input_range_),
-        numP(numP_) {}
-  template <typename BooleanDim3>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
-    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
-  }
-  void operator()(cl::sycl::nd_item<3> itemID) {
-    auto buffer_ptr = buffer_acc.get_pointer();
-    auto kernel_ptr = kernel_filter.get_pointer();
-    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
-
-    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
-
-    const auto output_offset =
-          cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
-
-    for (size_t p = 0; p < numP; p++) {
-      /// fill the shared memory
-      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
-      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
-        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
-        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
-        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
-          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
-          size_t local_index_dim1 = (num_input[0] * j)  + local_index_dim2;
-          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
-            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
-            const size_t local_index = local_index_dim1 + i;
-            const size_t tensor_index =
-                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
-                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
-            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
-          }
-        }
-      }
-      itemID.barrier(cl::sycl::access::fence_space::local_space);
-
-      // calculate the convolution
-
-      if (boundary_check(itemID.get_global_id() < input_range)) {
-        CoeffReturnType result = static_cast<CoeffReturnType>(0);
-        for (size_t k = 0; k < kernel_size[2]; k++) {
-          for (size_t j = 0; j < kernel_size[1]; j++) {
-            for (size_t i = 0; i < kernel_size[0]; i++) {
-              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
-              const size_t local_index =
-                  ((i + itemID.get_local_id(0)) +
-                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
-
-              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
-            }
-          }
-        }
-        const size_t tensor_index =
-            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
-            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
-        buffer_ptr[tensor_index] = result;
-      }
-
-      itemID.barrier(cl::sycl::access::fence_space::local_space);
-    }
-  }
-};
-
-template <typename Indices, typename InputArgType, typename KernelArgType>
-struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
-  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
-
-  static const int NumDims =
-      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
-  static const int NumKernelDims = internal::array_size<Indices>::value;
-  typedef typename XprType::Index Index;
-  typedef DSizes<Index, NumDims> Dimensions;
-  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
-  typedef const Eigen::SyclDevice Device;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
-  typedef typename InputArgType::Scalar Scalar;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
-
-  enum {
-    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
-                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
-    PacketAccess = false,
-    BlockAccess = false,
-    PreferBlockAccess = false,
-    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
-  };
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
-  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
-      : m_inputImpl(op.inputExpression(), device),
-        m_kernelArg(op.kernelExpression()),
-        m_kernelImpl(op.kernelExpression(), device),
-        m_indices(op.indices()),
-        m_buf(NULL),
-        m_kernel(NULL),
-        m_local_kernel(false),
-        m_device(device) {
-    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
-                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
-                        YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
-    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
-        m_kernelImpl.dimensions();
-
-    m_dimensions = m_inputImpl.dimensions();
-    for (int i = 0; i < NumKernelDims; ++i) {
-      const Index index = op.indices()[i];
-      const Index input_dim = input_dims[index];
-      const Index kernel_dim = kernel_dims[i];
-      const Index result_dim = input_dim - kernel_dim + 1;
-      m_dimensions[index] = result_dim;
-    }
-  }
-
-  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
-    preloadKernel();
-    m_inputImpl.evalSubExprsIfNeeded(NULL);
-    if (data) {
-      executeEval(data);
-      return false;
-    } else {
-      m_buf = (EvaluatorPointerType)m_device.get(
-          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
-      executeEval(m_buf);
-      return true;
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    m_inputImpl.cleanup();
-    if (m_buf) {
-      m_device.deallocate_temp(m_buf);
-      m_buf = NULL;
-    }
-    if (m_local_kernel) {
-      m_device.deallocate_temp(m_kernel);
-      m_local_kernel = false;
-    }
-    m_kernel = NULL;
-  }
-  /// used by sycl in order to build the sycl buffer
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
-  /// used by sycl in order to build the sycl buffer
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
-    // Don't make a local copy of the kernel unless we have to (i.e. it's an
-    // expression that needs to be evaluated)
-    typename KernelStorage::Type in_place = m_kernelImpl.data();
-    if (in_place) {
-      m_kernel = in_place;
-      m_local_kernel = false;
-    } else {
-      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
-      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
-      typedef TensorEvalToOp<const KernelArgType> EvalTo;
-      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
-      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
-      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
-      m_kernel = local;
-      m_local_kernel = true;
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
-    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
-    typedef typename InputEvaluator::Dimensions InputDims;
-    switch (NumKernelDims) {
-      case 1: {
-        const size_t numX = dimensions()[m_indices[0]];
-        const size_t numP = dimensions().TotalSize() / numX;
-        const auto input_dim = std::array<size_t, 2>{numX, numP};
-        auto global_range = cl::sycl::range<2>{};
-        auto local_range = cl::sycl::range<2>{};
-        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
-
-        m_device.parallel_for_setup(input_dim, global_range, local_range);
-        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
-        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
-        const array<Index, 1> indices{{m_indices[0]}};
-        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
-        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
-
-        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
-                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
-            ConvKernel;
-
-        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
-            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
-            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
-        break;
-      }
-
-      case 2: {
-        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
-                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
-        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
-                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
-        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
-        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
-        const size_t numP = dimensions().TotalSize() / (numX * numY);
-        auto input_dim = std::array<size_t, 3>{numX, numY, numP};
-
-        auto global_range = cl::sycl::range<3>{};
-        auto local_range = cl::sycl::range<3>{};
-
-        m_device.parallel_for_setup(input_dim, global_range, local_range);
-
-        const size_t local_memory_size =
-            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
-        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
-        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
-        const array<Index, 2> kernel_dims{
-            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
-        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
-        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
-                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
-            ConvKernel;
-        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
-            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
-            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
-        break;
-      }
-
-      case 3: {
-        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
-                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
-                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
-
-        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
-                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
-                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
-
-        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
-        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
-        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
-        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
-        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
-
-        const array<Index, 3> indices{
-            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
-        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
-                                           m_kernelImpl.dimensions()[kernel_index[1]],
-                                           m_kernelImpl.dimensions()[kernel_index[2]]}};
-
-        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
-
-        auto global_range = cl::sycl::range<3>{};
-        auto local_range = cl::sycl::range<3>{};
-
-        m_device.parallel_for_setup(input_dim, global_range, local_range);
-        auto local_memory_range = (local_range + kernel_size - 1);
-        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
-
-        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
-        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
-                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
-            ConvKernel;
-        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
-            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
-            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
-        break;
-      }
-
-      default: {
-        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
-                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
-      }
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
-    eigen_assert(m_buf != NULL);
-    eigen_assert(index < m_dimensions.TotalSize());
-    return m_buf[index];
-  }
-
-  template <int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
-    eigen_assert(m_buf != NULL);
-    eigen_assert(index < m_dimensions.TotalSize());
-    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
-    // model.
-    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
-    // We ignore the use of fused multiply-add.
-    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
-    const double firstIndex_compute_cost =
-        NumDims *
-        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
-    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
-           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
-                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
-  }
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_kernelImpl.bind(cgh);
-    m_inputImpl.bind(cgh);
-    m_buf.bind(cgh);
-    m_kernel.bind(cgh);
-  }
-
- private:
-  // No assignment (copies are needed by the kernels)
-  TensorEvaluator &operator=(const TensorEvaluator &);
-  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
-  KernelArgType m_kernelArg;
-  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
-  Indices m_indices;
-  Dimensions m_dimensions;
-  EvaluatorPointerType m_buf;
-  typename KernelStorage::Type m_kernel;
-  bool m_local_kernel;
-  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
-};  // namespace Eigen
-
-}  // end namespace Eigen
-
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index 195267ce8..83c449cf1 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -174,11 +174,8 @@ class TensorCostModel {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(
       double output_size, const TensorOpCost& cost_per_coeff, int max_threads) {
     double cost = totalCost(output_size, cost_per_coeff);
-    double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
-    // Make sure we don't invoke undefined behavior when we convert to an int.
-    threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
-    return numext::mini(max_threads,
-                        numext::maxi<int>(1, static_cast<int>(threads)));
+    int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+    return numext::mini(max_threads, numext::maxi(1, threads));
   }
 
   // taskSize assesses parallel task size.
@@ -189,13 +186,14 @@ class TensorCostModel {
     return totalCost(output_size, cost_per_coeff) / kTaskSize;
   }
 
+ private:
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(
       double output_size, const TensorOpCost& cost_per_coeff) {
     // Cost of memory fetches from L2 cache. 64 is typical cache line size.
     // 11 is L2 cache latency on Haswell.
     // We don't know whether data is in L1, L2 or L3. But we are most interested
     // in single-threaded computational time around 100us-10ms (smaller time
-    // is too small for parallelization, larger time is not interesting
+    // is too small for parallelization, larger time is not interesting
     // either because we are probably using all available threads already).
     // And for the target time range, L2 seems to be what matters. Data set
     // fitting into L1 is too small to take noticeable time. Data set fitting
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
index 476b2282a..e020d076f 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -30,13 +30,12 @@ struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = traits<XprType>::NumDimensions;
   static const int Layout = traits<XprType>::Layout;
-  typedef typename traits<XprType>::PointerType PointerType;
 };
 
 template<typename CustomUnaryFunc, typename XprType>
 struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense>
 {
-  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>EIGEN_DEVICE_REF type;
+  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType>& type;
 };
 
 template<typename CustomUnaryFunc, typename XprType>
@@ -87,25 +86,17 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
   typedef typename internal::remove_const<typename ArgType::Scalar>::type Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = false,
-    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
     BlockAccess = false,
-    PreferBlockAccess = false,
     Layout = TensorEvaluator<XprType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
       : m_op(op), m_device(device), m_result(NULL)
   {
@@ -114,21 +105,21 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
-          m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))));
+      m_result = static_cast<CoeffReturnType*>(
+          m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
       evalTo(m_result);
       return true;
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    if (m_result) {
-      m_device.deallocate_temp(m_result);
+    if (m_result != NULL) {
+      m_device.deallocate(m_result);
       m_result = NULL;
     }
   }
@@ -147,25 +138,19 @@ struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Devi
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_result.bind(cgh);
-  }
-#endif
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
 
  protected:
-  EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) {
-    TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
+    TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(
+        data, m_dimensions);
     m_op.func().eval(m_op.expression(), result, m_device);
   }
 
   Dimensions m_dimensions;
   const ArgType m_op;
-  const Device EIGEN_DEVICE_REF m_device;
-  EvaluatorPointerType m_result;
+  const Device& m_device;
+  CoeffReturnType* m_result;
 };
 
 
@@ -195,8 +180,6 @@ struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = traits<LhsXprType>::NumDimensions;
   static const int Layout = traits<LhsXprType>::Layout;
-  typedef typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
-                                typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>::type PointerType;
 };
 
 template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
@@ -259,26 +242,17 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-
-  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = false,
-    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
     BlockAccess = false,
-    PreferBlockAccess = false,
     Layout = TensorEvaluator<LhsXprType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_op(op), m_device(device), m_result(NULL)
   {
@@ -287,13 +261,12 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     if (data) {
       evalTo(data);
       return false;
     } else {
-      m_result = static_cast<EvaluatorPointerType>(m_device.get( (CoeffReturnType*)
-        m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))));
+      m_result = static_cast<Scalar *>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
       evalTo(m_result);
       return true;
     }
@@ -301,7 +274,7 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     if (m_result != NULL) {
-      m_device.deallocate_temp(m_result);
+      m_device.deallocate(m_result);
       m_result = NULL;
     }
   }
@@ -320,25 +293,18 @@ struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType,
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_result.bind(cgh);
-  }
-#endif
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; }
 
  protected:
-  EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) {
-    TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions);
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* data) {
+    TensorMap<Tensor<Scalar, NumDims, Layout> > result(data, m_dimensions);
     m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
   }
 
   Dimensions m_dimensions;
   const XprType m_op;
-  const Device EIGEN_DEVICE_REF m_device;
-  EvaluatorPointerType m_result;
+  const Device& m_device;
+  CoeffReturnType* m_result;
 };
 
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
index 804a16cc5..29e50a3b2 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -63,73 +63,6 @@ template <typename ExpressionType, typename DeviceType> class TensorDevice {
     ExpressionType& m_expression;
 };
 
-/** \class TensorAsyncDevice
- * \ingroup CXX11_Tensor_Module
- *
- * \brief Pseudo expression providing an operator = that will evaluate its
- * argument asynchronously on the specified device. Currently only
- * ThreadPoolDevice implements proper asynchronous execution, while the default
- * and GPU devices just run the expression synchronously and call m_done() on
- * completion..
- *
- * Example:
- *    auto done = []() { ... expression evaluation done ... };
- *    C.device(thread_pool_device, std::move(done)) = A + B;
- */
-
-template <typename ExpressionType, typename DeviceType, typename DoneCallback>
-class TensorAsyncDevice {
- public:
-  TensorAsyncDevice(const DeviceType& device, ExpressionType& expression,
-                    DoneCallback done)
-      : m_device(device), m_expression(expression), m_done(std::move(done)) {}
-
-  template <typename OtherDerived>
-  EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
-    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
-    typedef internal::TensorExecutor<const Assign, DeviceType> Executor;
-
-    Assign assign(m_expression, other);
-    Executor::run(assign, m_device);
-    m_done();
-
-    return *this;
-  }
-
- protected:
-  const DeviceType& m_device;
-  ExpressionType& m_expression;
-  DoneCallback m_done;
-};
-
-
-#ifdef EIGEN_USE_THREADS
-template <typename ExpressionType, typename DoneCallback>
-class TensorAsyncDevice<ExpressionType, ThreadPoolDevice, DoneCallback> {
- public:
-  TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression,
-                    DoneCallback done)
-      : m_device(device), m_expression(expression), m_done(std::move(done)) {}
-
-  template <typename OtherDerived>
-  EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
-    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
-    typedef internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice, DoneCallback> Executor;
-
-    // WARNING: After assignment 'm_done' callback will be in undefined state.
-    Assign assign(m_expression, other);
-    Executor::runAsync(assign, m_device, std::move(m_done));
-
-    return *this;
-  }
-
- protected:
-  const ThreadPoolDevice& m_device;
-  ExpressionType& m_expression;
-  DoneCallback m_done;
-};
-#endif
-
 } // end namespace Eigen
 
 #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index f77923933..4f5767bc7 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -1,6 +1,337 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#if defined(__clang__) || defined(__GNUC__)
-#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file"
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
+
+namespace Eigen {
+
+static const int kCudaScratchSize = 1024;
+
+// This defines an interface that GPUDevice can take to use
+// CUDA streams underneath.
+class StreamInterface {
+ public:
+  virtual ~StreamInterface() {}
+
+  virtual const cudaStream_t& stream() const = 0;
+  virtual const cudaDeviceProp& deviceProperties() const = 0;
+
+  // Allocate memory on the actual device where the computation will run
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+
+  // Return a scratchpad buffer of size 1k
+  virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
+};
+
+static cudaDeviceProp* m_deviceProperties;
+static bool m_devicePropInitialized = false;
+
+static void initializeDeviceProp() {
+  if (!m_devicePropInitialized) {
+    // Attempts to ensure proper behavior in the case of multiple threads
+    // calling this function simultaneously. This would be trivial to
+    // implement if we could use std::mutex, but unfortunately mutexes don't
+    // compile with nvcc, so we resort to atomics and thread fences instead.
+    // Note that if the caller uses a compiler that doesn't support c++11 we
+    // can't ensure that the initialization is thread safe.
+#if __cplusplus >= 201103L
+    static std::atomic<bool> first(true);
+    if (first.exchange(false)) {
+#else
+    static bool first = true;
+    if (first) {
+      first = false;
+#endif
+      // We're the first thread to reach this point.
+      int num_devices;
+      cudaError_t status = cudaGetDeviceCount(&num_devices);
+      if (status != cudaSuccess) {
+        std::cerr << "Failed to get the number of CUDA devices: "
+                  << cudaGetErrorString(status)
+                  << std::endl;
+        assert(status == cudaSuccess);
+      }
+      m_deviceProperties = new cudaDeviceProp[num_devices];
+      for (int i = 0; i < num_devices; ++i) {
+        status = cudaGetDeviceProperties(&m_deviceProperties[i], i);
+        if (status != cudaSuccess) {
+          std::cerr << "Failed to initialize CUDA device #"
+                    << i
+                    << ": "
+                    << cudaGetErrorString(status)
+                    << std::endl;
+          assert(status == cudaSuccess);
+        }
+      }
+
+#if __cplusplus >= 201103L
+      std::atomic_thread_fence(std::memory_order_release);
+#endif
+      m_devicePropInitialized = true;
+    } else {
+      // Wait for the other thread to initialize the properties.
+      while (!m_devicePropInitialized) {
+#if __cplusplus >= 201103L
+        std::atomic_thread_fence(std::memory_order_acquire);
+#endif
+        sleep(1);
+      }
+    }
+  }
+}
+
+static const cudaStream_t default_stream = cudaStreamDefault;
+
+class CudaStreamDevice : public StreamInterface {
+ public:
+  // Use the default stream on the current device
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
+    cudaGetDevice(&device_);
+    initializeDeviceProp();
+  }
+  // Use the default stream on the specified device
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+    initializeDeviceProp();
+  }
+  // Use the specified stream. Note that it's the
+  // caller's responsibility to ensure that the stream can run on
+  // the specified device. If no device is specified the code
+  // assumes that the stream is associated to the current gpu device.
+  CudaStreamDevice(const cudaStream_t* stream, int device = -1)
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+    if (device < 0) {
+      cudaGetDevice(&device_);
+    } else {
+      int num_devices;
+      cudaError_t err = cudaGetDeviceCount(&num_devices);
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == cudaSuccess);
+      assert(device < num_devices);
+      device_ = device;
+    }
+    initializeDeviceProp();
+  }
+
+  virtual ~CudaStreamDevice() {
+    if (scratch_) {
+      deallocate(scratch_);
+    }
+  }
+
+  const cudaStream_t& stream() const { return *stream_; }
+  const cudaDeviceProp& deviceProperties() const {
+    return m_deviceProperties[device_];
+  }
+  virtual void* allocate(size_t num_bytes) const {
+    cudaError_t err = cudaSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+    void* result;
+    err = cudaMalloc(&result, num_bytes);
+    assert(err == cudaSuccess);
+    assert(result != NULL);
+    return result;
+  }
+  virtual void deallocate(void* buffer) const {
+    cudaError_t err = cudaSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+    assert(buffer != NULL);
+    err = cudaFree(buffer);
+    assert(err == cudaSuccess);
+  }
+
+  virtual void* scratchpad() const {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == cudaSuccess);
+    }
+    return semaphore_;
+  }
+
+ private:
+  const cudaStream_t* stream_;
+  int device_;
+  mutable void* scratch_;
+  mutable unsigned int* semaphore_;
+};
+
+struct GpuDevice {
+  // The StreamInterface is not owned: the caller is
+  // responsible for its initialization and eventual destruction.
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
+    eigen_assert(stream);
+  }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+    eigen_assert(stream);
+  }
+  // TODO(bsteiner): This is an internal API, we should not expose it.
+  EIGEN_STRONG_INLINE const cudaStream_t& stream() const {
+    return stream_->stream();
+  }
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return stream_->allocate(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    stream_->deallocate(buffer);
+  }
+
+  EIGEN_STRONG_INLINE void* scratchpad() const {
+    return stream_->scratchpad();
+  }
+
+  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+    return stream_->semaphore();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef __CUDA_ARCH__
+    cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
+                                      stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+#else
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    cudaError_t err =
+        cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+  }
+
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    cudaError_t err =
+        cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef __CUDA_ARCH__
+    cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    assert(err == cudaSuccess);
+#else
+  eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE size_t numThreads() const {
+    // FIXME
+    return 32;
+  }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+    // FIXME
+    return 48*1024;
+  }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on cuda devices.
+    return firstLevelCacheSize();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#if defined(__CUDACC__) && !defined(__CUDA_ARCH__)
+    cudaError_t err = cudaStreamSynchronize(stream_->stream());
+    if (err != cudaSuccess) {
+      std::cerr << "Error detected in CUDA stream: "
+                << cudaGetErrorString(err)
+                << std::endl;
+      assert(err == cudaSuccess);
+    }
+#else
+    assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const {
+    return stream_->deviceProperties().multiProcessorCount;
+  }
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const {
+    return stream_->deviceProperties().maxThreadsPerBlock;
+  }
+  EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const {
+    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+  }
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+    return stream_->deviceProperties().sharedMemPerBlock;
+  }
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    return stream_->deviceProperties().major;
+  }
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
+    return stream_->deviceProperties().minor;
+  }
+
+  EIGEN_STRONG_INLINE int maxBlocks() const {
+    return max_blocks_;
+  }
+
+  // This function checks if the CUDA runtime recorded an error for the
+  // underlying stream device.
+  inline bool ok() const {
+#ifdef __CUDACC__
+    cudaError_t error = cudaStreamQuery(stream_->stream());
+    return (error == cudaSuccess) || (error == cudaErrorNotReady);
+#else
+    return false;
+#endif
+  }
+
+ private:
+  const StreamInterface* stream_;
+  int max_blocks_;
+};
+
+#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
+  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
+  assert(cudaGetLastError() == cudaSuccess);
+
+
+// FIXME: Should be device and kernel specific.
+#ifdef __CUDACC__
+static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) {
+#ifndef __CUDA_ARCH__
+  cudaError_t status = cudaDeviceSetSharedMemConfig(config);
+  EIGEN_UNUSED_VARIABLE(status)
+  assert(status == cudaSuccess);
+#else
+  EIGEN_UNUSED_VARIABLE(config)
+#endif
+}
 #endif
 
-#include "TensorDeviceGpu.h"
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
index 46b9d3ab2..9d141395b 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -20,12 +20,6 @@ struct DefaultDevice {
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
     internal::aligned_free(buffer);
-  }
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
-    return allocate(num_bytes);
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
-    deallocate(buffer);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
     ::memcpy(dst, src, n);
@@ -39,18 +33,11 @@ struct DefaultDevice {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
     ::memset(buffer, c, n);
   }
-  template<typename Type>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { 
-    return data;
-  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
-#if !defined(EIGEN_GPU_COMPILE_PHASE)
+#ifndef __CUDA_ARCH__
     // Running on the host CPU
     return 1;
-#elif defined(EIGEN_HIP_DEVICE_COMPILE)
-    // Running on a HIP device
-    return 64;
 #else
     // Running on a CUDA device
     return 32;
@@ -58,12 +45,9 @@ struct DefaultDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+#ifndef __CUDA_ARCH__
     // Running on the host CPU
     return l1CacheSize();
-#elif defined(EIGEN_HIP_DEVICE_COMPILE)
-    // Running on a HIP device
-    return 48*1024; // FIXME : update this number for HIP
 #else
     // Running on a CUDA device, return the amount of shared memory available.
     return 48*1024;
@@ -71,12 +55,9 @@ struct DefaultDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+#ifndef __CUDA_ARCH__
     // Running single threaded on the host CPU
     return l3CacheSize();
-#elif defined(EIGEN_HIP_DEVICE_COMPILE)
-    // Running on a HIP device
-    return firstLevelCacheSize(); // FIXME : update this number for HIP
 #else
     // Running on a CUDA device
     return firstLevelCacheSize();
@@ -84,17 +65,13 @@ struct DefaultDevice {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-#if !defined(EIGEN_GPU_COMPILE_PHASE)
+#ifndef __CUDA_ARCH__
     // Running single threaded on the host CPU
     // Should return an enum that encodes the ISA supported by the CPU
     return 1;
-#elif defined(EIGEN_HIP_DEVICE_COMPILE)
-    // Running on a HIP device
-    // return 1 as major for HIP
-    return 1;
 #else
     // Running on a CUDA device
-    return EIGEN_CUDA_ARCH / 100;
+    return __CUDA_ARCH__ / 100;
 #endif
   }
 };
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
deleted file mode 100644
index 7f3394438..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
+++ /dev/null
@@ -1,360 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
-#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
-
-// This header file container defines fo gpu* macros which will resolve to
-// their equivalent hip* or cuda* versions depending on the compiler in use
-// A separate header (included at the end of this file) will undefine all 
-#include "TensorGpuHipCudaDefines.h"
-
-namespace Eigen {
-
-static const int kGpuScratchSize = 1024;
-
-// This defines an interface that GPUDevice can take to use
-// HIP / CUDA streams underneath.
-class StreamInterface {
- public:
-  virtual ~StreamInterface() {}
-
-  virtual const gpuStream_t& stream() const = 0;
-  virtual const gpuDeviceProp_t& deviceProperties() const = 0;
-
-  // Allocate memory on the actual device where the computation will run
-  virtual void* allocate(size_t num_bytes) const = 0;
-  virtual void deallocate(void* buffer) const = 0;
-
-  // Return a scratchpad buffer of size 1k
-  virtual void* scratchpad() const = 0;
-
-  // Return a semaphore. The semaphore is initially initialized to 0, and
-  // each kernel using it is responsible for resetting to 0 upon completion
-  // to maintain the invariant that the semaphore is always equal to 0 upon
-  // each kernel start.
-  virtual unsigned int* semaphore() const = 0;
-};
-
-static gpuDeviceProp_t* m_deviceProperties;
-static bool m_devicePropInitialized = false;
-
-static void initializeDeviceProp() {
-  if (!m_devicePropInitialized) {
-    // Attempts to ensure proper behavior in the case of multiple threads
-    // calling this function simultaneously. This would be trivial to
-    // implement if we could use std::mutex, but unfortunately mutex don't
-    // compile with nvcc, so we resort to atomics and thread fences instead.
-    // Note that if the caller uses a compiler that doesn't support c++11 we
-    // can't ensure that the initialization is thread safe.
-    static std::atomic<bool> first(true);
-    if (first.exchange(false)) {
-      // We're the first thread to reach this point.
-      int num_devices;
-      gpuError_t status = gpuGetDeviceCount(&num_devices);
-      if (status != gpuSuccess) {
-        std::cerr << "Failed to get the number of GPU devices: "
-                  << gpuGetErrorString(status)
-                  << std::endl;
-        gpu_assert(status == gpuSuccess);
-      }
-      m_deviceProperties = new gpuDeviceProp_t[num_devices];
-      for (int i = 0; i < num_devices; ++i) {
-        status = gpuGetDeviceProperties(&m_deviceProperties[i], i);
-        if (status != gpuSuccess) {
-          std::cerr << "Failed to initialize GPU device #"
-                    << i
-                    << ": "
-                    << gpuGetErrorString(status)
-                    << std::endl;
-          gpu_assert(status == gpuSuccess);
-        }
-      }
-
-      std::atomic_thread_fence(std::memory_order_release);
-      m_devicePropInitialized = true;
-    } else {
-      // Wait for the other thread to inititialize the properties.
-      while (!m_devicePropInitialized) {
-        std::atomic_thread_fence(std::memory_order_acquire);
-        EIGEN_SLEEP(1000);
-      }
-    }
-  }
-}
-
-static const gpuStream_t default_stream = gpuStreamDefault;
-
-class GpuStreamDevice : public StreamInterface {
- public:
-  // Use the default stream on the current device
-  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
-    gpuGetDevice(&device_);
-    initializeDeviceProp();
-  }
-  // Use the default stream on the specified device
-  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
-    initializeDeviceProp();
-  }
-  // Use the specified stream. Note that it's the
-  // caller responsibility to ensure that the stream can run on
-  // the specified device. If no device is specified the code
-  // assumes that the stream is associated to the current gpu device.
-  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
-      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
-    if (device < 0) {
-      gpuGetDevice(&device_);
-    } else {
-      int num_devices;
-      gpuError_t err = gpuGetDeviceCount(&num_devices);
-      EIGEN_UNUSED_VARIABLE(err)
-      gpu_assert(err == gpuSuccess);
-      gpu_assert(device < num_devices);
-      device_ = device;
-    }
-    initializeDeviceProp();
-  }
-
-  virtual ~GpuStreamDevice() {
-    if (scratch_) {
-      deallocate(scratch_);
-    }
-  }
-
-  const gpuStream_t& stream() const { return *stream_; }
-  const gpuDeviceProp_t& deviceProperties() const {
-    return m_deviceProperties[device_];
-  }
-  virtual void* allocate(size_t num_bytes) const {
-    gpuError_t err = gpuSetDevice(device_);
-    EIGEN_UNUSED_VARIABLE(err)
-    gpu_assert(err == gpuSuccess);
-    void* result;
-    err = gpuMalloc(&result, num_bytes);
-    gpu_assert(err == gpuSuccess);
-    gpu_assert(result != NULL);
-    return result;
-  }
-  virtual void deallocate(void* buffer) const {
-    gpuError_t err = gpuSetDevice(device_);
-    EIGEN_UNUSED_VARIABLE(err)
-    gpu_assert(err == gpuSuccess);
-    gpu_assert(buffer != NULL);
-    err = gpuFree(buffer);
-    gpu_assert(err == gpuSuccess);
-  }
-
-  virtual void* scratchpad() const {
-    if (scratch_ == NULL) {
-      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
-    }
-    return scratch_;
-  }
-
-  virtual unsigned int* semaphore() const {
-    if (semaphore_ == NULL) {
-      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
-      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
-      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
-      EIGEN_UNUSED_VARIABLE(err)
-      gpu_assert(err == gpuSuccess);
-    }
-    return semaphore_;
-  }
-
- private:
-  const gpuStream_t* stream_;
-  int device_;
-  mutable void* scratch_;
-  mutable unsigned int* semaphore_;
-};
-
-struct GpuDevice {
-  // The StreamInterface is not owned: the caller is
-  // responsible for its initialization and eventual destruction.
-  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) {
-    eigen_assert(stream);
-  }
-  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
-    eigen_assert(stream);
-  }
-  // TODO(bsteiner): This is an internal API, we should not expose it.
-  EIGEN_STRONG_INLINE const gpuStream_t& stream() const {
-    return stream_->stream();
-  }
-
-  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-    return stream_->allocate(num_bytes);
-  }
-
-  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-    stream_->deallocate(buffer);
-  }
-
-  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
-    return stream_->allocate(num_bytes);
-  }
-
-  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
-    stream_->deallocate(buffer);
-  }
-
-  template<typename Type>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { 
-    return data;
-  }
-
-  EIGEN_STRONG_INLINE void* scratchpad() const {
-    return stream_->scratchpad();
-  }
-
-  EIGEN_STRONG_INLINE unsigned int* semaphore() const {
-    return stream_->semaphore();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-#ifndef EIGEN_GPU_COMPILE_PHASE
-    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice,
-                                      stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    gpu_assert(err == gpuSuccess);
-#else
-    EIGEN_UNUSED_VARIABLE(dst);
-    EIGEN_UNUSED_VARIABLE(src);
-    EIGEN_UNUSED_VARIABLE(n);
-    eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
-  }
-
-  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
-    gpuError_t err =
-        gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    gpu_assert(err == gpuSuccess);
-  }
-
-  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
-    gpuError_t err =
-        gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    gpu_assert(err == gpuSuccess);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
-#ifndef EIGEN_GPU_COMPILE_PHASE
-    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
-    EIGEN_UNUSED_VARIABLE(err)
-    gpu_assert(err == gpuSuccess);
-#else
-  eigen_assert(false && "The default device should be used instead to generate kernel code");
-#endif
-  }
-
-  EIGEN_STRONG_INLINE size_t numThreads() const {
-    // FIXME
-    return 32;
-  }
-
-  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
-    // FIXME
-    return 48*1024;
-  }
-
-  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-    // We won't try to take advantage of the l2 cache for the time being, and
-    // there is no l3 cache on hip/cuda devices.
-    return firstLevelCacheSize();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
-#ifndef EIGEN_GPU_COMPILE_PHASE
-    gpuError_t err = gpuStreamSynchronize(stream_->stream());
-    if (err != gpuSuccess) {
-      std::cerr << "Error detected in GPU stream: "
-                << gpuGetErrorString(err)
-                << std::endl;
-      gpu_assert(err == gpuSuccess);
-    }
-#else
-    gpu_assert(false && "The default device should be used instead to generate kernel code");
-#endif
-  }
-
-  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const {
-    return stream_->deviceProperties().multiProcessorCount;
-  }
-  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const {
-    return stream_->deviceProperties().maxThreadsPerBlock;
-  }
-  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
-    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
-  }
-  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
-    return stream_->deviceProperties().sharedMemPerBlock;
-  }
-  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-    return stream_->deviceProperties().major;
-  }
-  EIGEN_STRONG_INLINE int minorDeviceVersion() const {
-    return stream_->deviceProperties().minor;
-  }
-
-  EIGEN_STRONG_INLINE int maxBlocks() const {
-    return max_blocks_;
-  }
-
-  // This function checks if the GPU runtime recorded an error for the
-  // underlying stream device.
-  inline bool ok() const {
-#ifdef EIGEN_GPUCC
-    gpuError_t error = gpuStreamQuery(stream_->stream());
-    return (error == gpuSuccess) || (error == gpuErrorNotReady);
-#else
-    return false;
-#endif
-  }
-
- private:
-  const StreamInterface* stream_;
-  int max_blocks_;
-};
-
-#if defined(EIGEN_HIPCC)
-
-#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
-  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
-  gpu_assert(hipGetLastError() == hipSuccess);
-
-#else
- 
-#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)             \
-  (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__);   \
-  gpu_assert(cudaGetLastError() == cudaSuccess);
-
-#endif
- 
-// FIXME: Should be device and kernel specific.
-#ifdef EIGEN_GPUCC
-static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
-#ifndef EIGEN_GPU_COMPILE_PHASE
-  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
-  EIGEN_UNUSED_VARIABLE(status)
-  gpu_assert(status == gpuSuccess);
-#else
-  EIGEN_UNUSED_VARIABLE(config)
-#endif
-}
-#endif
-
-}  // end namespace Eigen
-
-// undefine all the gpu* macros we defined at the beginning of the file
-#include "TensorGpuHipCudaUndefines.h"
-
-#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
index df591c21d..7c039890e 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -14,1035 +14,109 @@
 
 #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
 #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
-#include <unordered_set>
 
 namespace Eigen {
-
-namespace TensorSycl {
-namespace internal {
-
-/// Cache all the device information needed
-struct SyclDeviceInfo {
-  SyclDeviceInfo(cl::sycl::queue queue)
-      : local_mem_type(
-            queue.get_device()
-                .template get_info<cl::sycl::info::device::local_mem_type>()),
-        max_work_item_sizes(
-            queue.get_device()
-                .template get_info<
-                    cl::sycl::info::device::max_work_item_sizes>()),
-        max_mem_alloc_size(
-            queue.get_device()
-                .template get_info<
-                    cl::sycl::info::device::max_mem_alloc_size>()),
-        max_compute_units(queue.get_device()
-                              .template get_info<
-                                  cl::sycl::info::device::max_compute_units>()),
-        max_work_group_size(
-            queue.get_device()
-                .template get_info<
-                    cl::sycl::info::device::max_work_group_size>()),
-        local_mem_size(
-            queue.get_device()
-                .template get_info<cl::sycl::info::device::local_mem_size>()),
-        platform_name(queue.get_device()
-                          .get_platform()
-                          .template get_info<cl::sycl::info::platform::name>()),
-        device_name(queue.get_device()
-                        .template get_info<cl::sycl::info::device::name>()),
-        device_vendor(
-            queue.get_device()
-                .template get_info<cl::sycl::info::device::vendor>()) {}
-
-  cl::sycl::info::local_mem_type local_mem_type;
-  cl::sycl::id<3> max_work_item_sizes;
-  unsigned long max_mem_alloc_size;
-  unsigned long max_compute_units;
-  unsigned long max_work_group_size;
-  size_t local_mem_size;
-  std::string platform_name;
-  std::string device_name;
-  std::string device_vendor;
-};
-
-}  // end namespace internal
-}  // end namespace TensorSycl
-
-typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t;
-// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
-// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
-// TensorFlow via the Eigen SYCL Backend.
-EIGEN_STRONG_INLINE auto get_sycl_supported_devices()
-    -> decltype(cl::sycl::device::get_devices()) {
-#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
-  return {cl::sycl::device(cl::sycl::default_selector())};
-#else
-  std::vector<cl::sycl::device> supported_devices;
-  auto platform_list = cl::sycl::platform::get_platforms();
-  for (const auto &platform : platform_list) {
-    auto device_list = platform.get_devices();
-    auto platform_name =
-        platform.template get_info<cl::sycl::info::platform::name>();
-    std::transform(platform_name.begin(), platform_name.end(),
-                   platform_name.begin(), ::tolower);
-    for (const auto &device : device_list) {
-      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
-      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
-      bool unsupported_condition =
-          (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
-           vendor.find("apu") == std::string::npos) ||
-          (platform_name.find("experimental") != std::string::npos) ||
-          device.is_host();
-      if (!unsupported_condition) {
-        supported_devices.push_back(device);
-      }
-    }
-  }
-  return supported_devices;
-#endif
-}
-
-class QueueInterface {
- public:
-  /// Creating device by using cl::sycl::selector or cl::sycl::device.
-  template <typename DeviceOrSelector>
-  explicit QueueInterface(
-      const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
-      unsigned num_threads = std::thread::hardware_concurrency())
-      : m_queue(dev_or_sel, handler),
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
-        m_prog(m_queue.get_context(), get_sycl_supported_devices()),
-#endif
-        m_thread_pool(num_threads),
-        m_device_info(m_queue) {
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
-    m_prog.build_with_kernel_type<DeviceOrSelector>();
-    auto f = [&](cl::sycl::handler &cgh) {
-      cgh.single_task<DeviceOrSelector>(m_prog.get_kernel<DeviceOrSelector>(),
-                                        [=]() {})
-    };
-    EIGEN_SYCL_TRY_CATCH(m_queue.submit(f));
-#endif
-  }
-
-  template <typename DeviceOrSelector>
-  explicit QueueInterface(
-      const DeviceOrSelector &dev_or_sel,
-      unsigned num_threads = std::thread::hardware_concurrency())
-      : QueueInterface(dev_or_sel,
-                       [this](cl::sycl::exception_list l) {
-                         this->exception_caught_ = this->sycl_async_handler(l);
-                       },
-                       num_threads) {}
-
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
-  EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; }
-#endif
-
-  /// Attach an existing buffer to the pointer map, Eigen will not reuse it
-  EIGEN_STRONG_INLINE void *attach_buffer(
-      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-    return static_cast<void *>(pMapper.add_pointer(buf));
-  }
-
-  /// Detach previously attached buffer
-  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-    TensorSycl::internal::SYCLfree<false>(p, pMapper);
-  }
-
-  /// Allocating device pointer. This pointer is actually an 8 bytes host
-  /// pointer used as key to access the sycl device buffer. The reason is that
-  /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode
-  /// expressions. So we create a key pointer to be used in Eigen expression
-  /// construction. When we convert the Eigen construction into the sycl
-  /// construction we use this pointer as a key in our buffer_map and we make
-  /// sure that we dedicate only one buffer only for this pointer. The device
-  /// pointer would be deleted by calling deallocate function.
-  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
-#if EIGEN_MAX_ALIGN_BYTES > 0
-    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
-    if (align > 0) {
-      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
-    }
-#endif
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
-  }
-
-  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
-#if EIGEN_MAX_ALIGN_BYTES > 0
-    size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES;
-    if (align > 0) {
-      num_bytes += EIGEN_MAX_ALIGN_BYTES - align;
-    }
-#endif
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
-    if (scratch_buffers.empty()) {
-      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
-      ;
-    } else {
-      for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) {
-        auto buff = pMapper.get_buffer(*it);
-        if (buff.get_size() >= num_bytes) {
-          auto ptr = *it;
-          scratch_buffers.erase(it);
-          return ptr;
-        } else {
-          ++it;
-        }
-      }
-      return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
-    }
-#else
-    return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper);
-#endif
-  }
-  template <typename data_t>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
-      cl::sycl::access::mode::read_write, data_t>
-  get(data_t *data) const {
-    return get_range_accessor<cl::sycl::access::mode::read_write, data_t>(data);
-  }
-  template <typename data_t>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
-      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
-                                        data_t>
-          data) const {
-    return static_cast<data_t *>(data.get_virtual_pointer());
-  }
-
-  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const {
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
-    scratch_buffers.insert(p);
-#else
-    TensorSycl::internal::SYCLfree(p, pMapper);
-#endif
-  }
-  template <cl::sycl::access::mode AcMd, typename T>
-  EIGEN_STRONG_INLINE void deallocate_temp(
-      const TensorSycl::internal::RangeAccess<AcMd, T> &p) const {
-    deallocate_temp(p.get_virtual_pointer());
-  }
-
-  /// This is used to deallocate the device pointer. p is used as a key inside
-  /// the map to find the device buffer and delete it.
-  EIGEN_STRONG_INLINE void deallocate(void *p) const {
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-    TensorSycl::internal::SYCLfree(p, pMapper);
-  }
-
-  EIGEN_STRONG_INLINE void deallocate_all() const {
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-    TensorSycl::internal::SYCLfreeAll(pMapper);
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
-    scratch_buffers.clear();
-#endif
-  }
-
-  /// The memcpyHostToDevice is used to copy the data from host to device
-  /// The destination pointer could be deleted before the copy happend which is
-  /// why a callback function is needed. By default if none is provided, the
-  /// function is blocking.
-  EIGEN_STRONG_INLINE void memcpyHostToDevice(
-      void *dst, const void *src, size_t n,
-      std::function<void()> callback) const {
-    static const auto write_mode = cl::sycl::access::mode::discard_write;
-    static const auto global_access = cl::sycl::access::target::global_buffer;
-    typedef cl::sycl::accessor<buffer_scalar_t, 1, write_mode, global_access>
-        write_accessor;
-    if (n == 0) {
-      if (callback) callback();
-      return;
-    }
-    n /= sizeof(buffer_scalar_t);
-    auto f = [&](cl::sycl::handler &cgh) {
-      write_accessor dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
-      buffer_scalar_t const *ptr = static_cast<buffer_scalar_t const *>(src);
-      auto non_deleter = [](buffer_scalar_t const *) {};
-      std::shared_ptr<const buffer_scalar_t> s_ptr(ptr, non_deleter);
-      cgh.copy(s_ptr, dst_acc);
-    };
-    cl::sycl::event e;
-    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
-    synchronize_and_callback(e, callback);
-  }
-
-  /// The memcpyDeviceToHost is used to copy the data from device to host.
-  /// The source pointer could be deleted before the copy happend which is
-  /// why a callback function is needed. By default if none is provided, the
-  /// function is blocking.
-  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
-      void *dst, const void *src, size_t n,
-      std::function<void()> callback) const {
-    static const auto read_mode = cl::sycl::access::mode::read;
-    static const auto global_access = cl::sycl::access::target::global_buffer;
-    typedef cl::sycl::accessor<buffer_scalar_t, 1, read_mode, global_access>
-        read_accessor;
-    if (n == 0) {
-      if (callback) callback();
-      return;
-    }
-    n /= sizeof(buffer_scalar_t);
-    auto f = [&](cl::sycl::handler &cgh) {
-      read_accessor src_acc = get_range_accessor<read_mode>(cgh, src, n);
-      buffer_scalar_t *ptr = static_cast<buffer_scalar_t *>(dst);
-      auto non_deleter = [](buffer_scalar_t *) {};
-      std::shared_ptr<buffer_scalar_t> s_ptr(ptr, non_deleter);
-      cgh.copy(src_acc, s_ptr);
-    };
-    cl::sycl::event e;
-    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
-    synchronize_and_callback(e, callback);
-  }
-
-  /// The memcpy function.
-  /// No callback is required here as both arguments are on the device
-  /// and SYCL can handle the dependency.
-  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
-    static const auto read_mode = cl::sycl::access::mode::read;
-    static const auto write_mode = cl::sycl::access::mode::discard_write;
-    if (n == 0) {
-      return;
-    }
-    n /= sizeof(buffer_scalar_t);
-    auto f = [&](cl::sycl::handler &cgh) {
-      auto src_acc = get_range_accessor<read_mode>(cgh, src, n);
-      auto dst_acc = get_range_accessor<write_mode>(cgh, dst, n);
-      cgh.copy(src_acc, dst_acc);
-    };
-    cl::sycl::event e;
-    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
-    async_synchronize(e);
-  }
-
-  /// the memset function.
-  /// No callback is required here as both arguments are on the device
-  /// and SYCL can handle the dependency.
-  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
-    static const auto write_mode = cl::sycl::access::mode::discard_write;
-    if (n == 0) {
-      return;
-    }
-    n /= sizeof(buffer_scalar_t);
-    auto f = [&](cl::sycl::handler &cgh) {
-      auto dst_acc = get_range_accessor<write_mode>(cgh, data, n);
-      // The cast to uint8_t is here to match the behaviour of the standard
-      // memset. The cast to buffer_scalar_t is needed to match the type of the
-      // accessor (in case buffer_scalar_t is not uint8_t)
-      cgh.fill(dst_acc, static_cast<buffer_scalar_t>(static_cast<uint8_t>(c)));
-    };
-    cl::sycl::event e;
-    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f));
-    async_synchronize(e);
-  }
-
-  /// Get a range accessor to the virtual pointer's device memory. This range
-  /// accessor will allow access to the memory from the pointer to the end of
-  /// the buffer.
-  ///
-  /// NOTE: Inside a kernel the range accessor will always be indexed from the
-  /// start of the buffer, so the offset in the accessor is only used by
-  /// methods like handler::copy and will not be available inside a kernel.
-  template <cl::sycl::access::mode AcMd, typename T>
-  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
-  get_range_accessor(const void *ptr) const {
-    static const auto global_access = cl::sycl::access::target::global_buffer;
-    static const auto is_place_holder = cl::sycl::access::placeholder::true_t;
-    typedef TensorSycl::internal::RangeAccess<AcMd, T> ret_type;
-    typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t;
-
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-
-    auto original_buffer = pMapper.get_buffer(ptr);
-    const ptrdiff_t offset = pMapper.get_offset(ptr);
-    const ptrdiff_t typed_offset = offset / sizeof(T);
-    eigen_assert(typed_offset >= 0);
-    const auto typed_size = original_buffer.get_size() / sizeof(T);
-    auto buffer = original_buffer.template reinterpret<
-        typename Eigen::internal::remove_const<T>::type>(
-        cl::sycl::range<1>(typed_size));
-    const ptrdiff_t size = buffer.get_count() - typed_offset;
-    eigen_assert(size >= 0);
-    typedef cl::sycl::accessor<typename Eigen::internal::remove_const<T>::type,
-                               1, AcMd, global_access, is_place_holder>
-        placeholder_accessor_t;
-    const auto start_ptr = static_cast<internal_ptr_t>(ptr) - offset;
-    return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size),
-                                           cl::sycl::id<1>(typed_offset)),
-                    static_cast<size_t>(typed_offset),
-                    reinterpret_cast<std::intptr_t>(start_ptr));
-  }
-
-  /// Get a range accessor to the virtual pointer's device memory with a
-  /// specified size.
-  template <cl::sycl::access::mode AcMd, typename Index>
-  EIGEN_STRONG_INLINE cl::sycl::accessor<
-      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
-  get_range_accessor(cl::sycl::handler &cgh, const void *ptr,
-                     const Index n_bytes) const {
-    static const auto global_access = cl::sycl::access::target::global_buffer;
-    eigen_assert(n_bytes >= 0);
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-    auto buffer = pMapper.get_buffer(ptr);
-    const ptrdiff_t offset = pMapper.get_offset(ptr);
-    eigen_assert(offset >= 0);
-    eigen_assert(offset + n_bytes <= buffer.get_size());
-    return buffer.template get_access<AcMd, global_access>(
-        cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset));
-  }
-
-  /// Creation of sycl accessor for a buffer. This function first tries to find
-  /// the buffer in the buffer_map. If found it gets the accessor from it, if
-  /// not, the function then adds an entry by creating a sycl buffer for that
-  /// particular pointer.
-  template <cl::sycl::access::mode AcMd>
-  EIGEN_STRONG_INLINE cl::sycl::accessor<
-      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
-  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-    return pMapper.get_buffer(ptr)
-        .template get_access<AcMd, cl::sycl::access::target::global_buffer>(
-            cgh);
-  }
-
-  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
-      const void *ptr) const {
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-    return pMapper.get_buffer(ptr);
-  }
-
-  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
-    std::lock_guard<std::mutex> lock(pmapper_mutex_);
-    return pMapper.get_offset(ptr);
-  }
-
-  template <typename OutScalar, typename sycl_kernel, typename Lhs,
-            typename Rhs, typename OutPtr, typename Range, typename Index,
-            typename... T>
-  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs,
-                                                  const Rhs &rhs, OutPtr outptr,
-                                                  Range thread_range,
-                                                  Index scratchSize,
-                                                  T... var) const {
-    auto kernel_functor = [=](cl::sycl::handler &cgh) {
-      // binding the placeholder accessors to a commandgroup handler
-      lhs.bind(cgh);
-      rhs.bind(cgh);
-      outptr.bind(cgh);
-      typedef cl::sycl::accessor<OutScalar, 1,
-                                 cl::sycl::access::mode::read_write,
-                                 cl::sycl::access::target::local>
-          LocalAccessor;
-
-      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
-      cgh.parallel_for(
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
-          program().template get_kernel<sycl_kernel>(),
-#endif
-          thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
-    };
-    cl::sycl::event e;
-    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
-    async_synchronize(e);
-  }
-
-  template <typename OutScalar, typename sycl_kernel, typename InPtr,
-            typename OutPtr, typename Range, typename Index, typename... T>
-  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr,
-                                                 OutPtr &outptr,
-                                                 Range thread_range,
-                                                 Index scratchSize,
-                                                 T... var) const {
-    auto kernel_functor = [=](cl::sycl::handler &cgh) {
-      // binding the placeholder accessors to a commandgroup handler
-      inptr.bind(cgh);
-      outptr.bind(cgh);
-      typedef cl::sycl::accessor<OutScalar, 1,
-                                 cl::sycl::access::mode::read_write,
-                                 cl::sycl::access::target::local>
-          LocalAccessor;
-
-      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
-      cgh.parallel_for(
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
-          program().template get_kernel<sycl_kernel>(),
-#endif
-          thread_range, sycl_kernel(scratch, inptr, outptr, var...));
-    };
-    cl::sycl::event e;
-    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
-    async_synchronize(e);
-  }
-
-    template <typename OutScalar, typename sycl_kernel, typename InPtr,
-           typename Range, typename Index, typename... T>
-  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr,
-                                                 Range thread_range,
-                                                 Index scratchSize,
-                                                 T... var) const {
-    auto kernel_functor = [=](cl::sycl::handler &cgh) {
-      // binding the placeholder accessors to a commandgroup handler
-      inptr.bind(cgh);
-      typedef cl::sycl::accessor<OutScalar, 1,
-                                 cl::sycl::access::mode::read_write,
-                                 cl::sycl::access::target::local>
-          LocalAccessor;
-
-      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
-      cgh.parallel_for(
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
-          program().template get_kernel<sycl_kernel>(),
-#endif
-          thread_range, sycl_kernel(scratch, inptr, var...));
-    };
-    cl::sycl::event e;
-    EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor));
-    async_synchronize(e);
-  }
-
-
-  EIGEN_STRONG_INLINE void synchronize() const {
-#ifdef EIGEN_EXCEPTIONS
-    m_queue.wait_and_throw();
-#else
-    m_queue.wait();
-#endif
-  }
-
-
-  EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const {
-    set_latest_event(e);
-#ifndef EIGEN_SYCL_ASYNC_EXECUTION
-    synchronize();
-#endif
-  }
-
-  template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
-                                              Index &rng, Index &GRange) const {
-    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
-    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
-                                           EIGEN_SYCL_LOCAL_THREAD_DIM1),
-                        static_cast<Index>(tileSize));
-    rng = n;
-    if (rng == 0) rng = static_cast<Index>(1);
-    GRange = rng;
-    if (tileSize > GRange)
-      tileSize = GRange;
-    else if (GRange > tileSize) {
-      Index xMode = static_cast<Index>(GRange % tileSize);
-      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
-    }
-  }
-
-  /// This is used to prepare the number of threads and also the number of
-  /// threads per block for sycl kernels
-  template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(
-      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
-      cl::sycl::range<2> &local_range) const {
-    std::array<Index, 2> input_range = input_dim;
-    Index max_workgroup_Size =
-        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
-    max_workgroup_Size =
-        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
-                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
-                 static_cast<Index>(max_workgroup_Size));
-    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-    local_range[1] =
-        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
-    input_range[1] = input_dim[1];
-    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
-    global_range[1] = input_range[1];
-    if (local_range[1] > global_range[1])
-      local_range[1] = global_range[1];
-    else if (global_range[1] > local_range[1]) {
-      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
-      if (xMode != 0)
-        global_range[1] += static_cast<Index>(local_range[1] - xMode);
-    }
-    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
-    input_range[0] = input_dim[0];
-    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
-    global_range[0] = input_range[0];
-    if (local_range[0] > global_range[0])
-      local_range[0] = global_range[0];
-    else if (global_range[0] > local_range[0]) {
-      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
-      if (xMode != 0)
-        global_range[0] += static_cast<Index>(local_range[0] - xMode);
-    }
-  }
-
-  /// This is used to prepare the number of threads and also the number of
-  /// threads per block for sycl kernels
-  template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(
-      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
-      cl::sycl::range<3> &local_range) const {
-    std::array<Index, 3> input_range = input_dim;
-    Index max_workgroup_Size =
-        static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
-    max_workgroup_Size =
-        std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 *
-                                    EIGEN_SYCL_LOCAL_THREAD_DIM1),
-                 static_cast<Index>(max_workgroup_Size));
-    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
-    local_range[2] =
-        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
-    input_range[2] = input_dim[2];
-    if (input_range[2] == 0) input_range[1] = static_cast<Index>(1);
-    global_range[2] = input_range[2];
-    if (local_range[2] > global_range[2])
-      local_range[2] = global_range[2];
-    else if (global_range[2] > local_range[2]) {
-      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
-      if (xMode != 0)
-        global_range[2] += static_cast<Index>(local_range[2] - xMode);
-    }
-    pow_of_2 = static_cast<Index>(
-        std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
-    local_range[1] =
-        static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
-    input_range[1] = input_dim[1];
-    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
-    global_range[1] = input_range[1];
-    if (local_range[1] > global_range[1])
-      local_range[1] = global_range[1];
-    else if (global_range[1] > local_range[1]) {
-      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
-      if (xMode != 0)
-        global_range[1] += static_cast<Index>(local_range[1] - xMode);
-    }
-    local_range[0] = static_cast<Index>(max_workgroup_Size /
-                                        (local_range[1] * local_range[2]));
-    input_range[0] = input_dim[0];
-    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
-    global_range[0] = input_range[0];
-    if (local_range[0] > global_range[0])
-      local_range[0] = global_range[0];
-    else if (global_range[0] > local_range[0]) {
-      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
-      if (xMode != 0)
-        global_range[0] += static_cast<Index>(local_range[0] - xMode);
-    }
-  }
-
-  EIGEN_STRONG_INLINE bool has_local_memory() const {
-#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
-    return false;
-#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
-    return true;
-#else
-    return m_device_info.local_mem_type ==
-           cl::sycl::info::local_mem_type::local;
-#endif
-  }
-
-  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const {
-    return m_device_info.max_mem_alloc_size;
-  }
-
-  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
-    return m_device_info.max_compute_units;
-  }
-
-  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
-    return m_device_info.max_work_group_size;
-  }
-
-  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
-    return m_device_info.max_work_item_sizes;
-  }
-
-  /// No need for sycl it should act the same as CPU version
-  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
-
-  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
-    // OpenCL doesnot have such concept
-    return 2;
-  }
-
-  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
-    return m_device_info.local_mem_size;
-  }
-
-  // This function returns the nearest power of 2 Work-group size which is <=
-  // maximum device workgroup size.
-  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
-    return getPowerOfTwo(m_device_info.max_work_group_size, false);
-  }
-
-  EIGEN_STRONG_INLINE std::string getPlatformName() const {
-    return m_device_info.platform_name;
-  }
-
-  EIGEN_STRONG_INLINE std::string getDeviceName() const {
-    return m_device_info.device_name;
-  }
-
-  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
-    return m_device_info.device_vendor;
-  }
-
-  // This function returns the nearest power of 2
-  // if roundup is true returns result>=wgsize
-  // else it return result <= wgsize
-  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
-    if (roundUp) --wGSize;
-    wGSize |= (wGSize >> 1);
-    wGSize |= (wGSize >> 2);
-    wGSize |= (wGSize >> 4);
-    wGSize |= (wGSize >> 8);
-    wGSize |= (wGSize >> 16);
-#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
-    wGSize |= (wGSize >> 32);
-#endif
-    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
-  }
-
-  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
-
-  // This function checks if the runtime recorded an error for the
-  // underlying stream device.
-  EIGEN_STRONG_INLINE bool ok() const {
-    if (!exception_caught_) {
-      synchronize();
-    }
-    return !exception_caught_;
-  }
-
-  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
-#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
-    std::lock_guard<std::mutex> lock(event_mutex_);
-    return latest_events_[std::this_thread::get_id()];
-#else
-    eigen_assert(false);
-    return cl::sycl::event();
-#endif
-  }
-
-  // destructor
-  ~QueueInterface() {
-    pMapper.clear();
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
-    scratch_buffers.clear();
-#endif
-  }
-
- protected:
-  EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const {
-#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
-    std::lock_guard<std::mutex> lock(event_mutex_);
-    latest_events_[std::this_thread::get_id()] = e;
-#else
-    EIGEN_UNUSED_VARIABLE(e);
-#endif
-  }
-
-  void synchronize_and_callback(cl::sycl::event e,
-                                const std::function<void()> &callback) const {
-    set_latest_event(e);
-    if (callback) {
-      auto callback_ = [=]() {
-#ifdef EIGEN_EXCEPTIONS
-        cl::sycl::event(e).wait_and_throw();
-#else
-        cl::sycl::event(e).wait();
-#endif
-        callback();
-      };
-      m_thread_pool.Schedule(std::move(callback_));
-    } else {
-#ifdef EIGEN_EXCEPTIONS
-      m_queue.wait_and_throw();
-#else
-      m_queue.wait();
-#endif
-    }
-  }
-
-  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
-    bool exception_caught = false;
-    for (const auto &e : exceptions) {
-      if (e) {
-        exception_caught = true;
-        EIGEN_THROW_X(e);
-      }
-    }
-    return exception_caught;
-  }
-
-  /// class members:
-  bool exception_caught_ = false;
-
-  mutable std::mutex pmapper_mutex_;
-
-#ifdef EIGEN_SYCL_STORE_LATEST_EVENT
-  mutable std::mutex event_mutex_;
-  mutable std::unordered_map<std::thread::id, cl::sycl::event> latest_events_;
-#endif
-
-  /// std::map is the container used to make sure that we create only one buffer
-  /// per pointer. The lifespan of the buffer now depends on the lifespan of
-  /// SyclDevice. If a non-read-only pointer is needed to be accessed on the
-  /// host we should manually deallocate it.
-  mutable TensorSycl::internal::PointerMapper pMapper;
-#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS
-  mutable std::unordered_set<void *> scratch_buffers;
-#endif
+struct SyclDevice {
+  /// class members
   /// sycl queue
   mutable cl::sycl::queue m_queue;
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
-  mutable cl::sycl::program m_prog;
+  /// std::map is the container used to make sure that we create only one buffer
+  /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice.
+  /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it.
+  mutable std::map<const void *, std::shared_ptr<void>> buffer_map;
+  /// creating device by using selector
+  template<typename dev_Selector> SyclDevice(dev_Selector s)
+  :
+#ifdef EIGEN_EXCEPTIONS
+  m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) {
+    for (const auto& e : l) {
+      try {
+        std::rethrow_exception(e);
+      } catch (cl::sycl::exception e) {
+          std::cout << e.what() << std::endl;
+        }
+    }
+  }))
+#else
+  m_queue(cl::sycl::queue(s))
 #endif
+  {}
+  // destructor
+  ~SyclDevice() { deallocate_all(); }
 
-  /// The thread pool is used to wait on events and call callbacks
-  /// asynchronously
-  mutable Eigen::ThreadPool m_thread_pool;
-
-  const TensorSycl::internal::SyclDeviceInfo m_device_info;
-};
-
-struct SyclDeviceBase {
-  /// QueueInterface is not owned. it is the caller's responsibility to destroy
-  /// it
-  const QueueInterface *m_queue_stream;
-  explicit SyclDeviceBase(const QueueInterface *queue_stream)
-      : m_queue_stream(queue_stream) {}
-  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const {
-    return m_queue_stream;
+  template <typename T> void deallocate(T *p) const {
+    auto it = buffer_map.find(p);
+    if (it != buffer_map.end()) {
+      buffer_map.erase(it);
+      internal::aligned_free(p);
+    }
   }
-};
-
-// Here is a sycl device struct which accept the sycl queue interface
-// as an input
-struct SyclDevice : public SyclDeviceBase {
-  explicit SyclDevice(const QueueInterface *queue_stream)
-      : SyclDeviceBase(queue_stream) {}
-
-  // this is the accessor used to construct the evaluator
-  template <cl::sycl::access::mode AcMd, typename T>
-  EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<AcMd, T>
-  get_range_accessor(const void *ptr) const {
-    return queue_stream()->template get_range_accessor<AcMd, T>(ptr);
+  void deallocate_all() const {
+    std::map<const void *, std::shared_ptr<void>>::iterator it=buffer_map.begin();
+    while (it!=buffer_map.end()) {
+      auto p=it->first;
+      buffer_map.erase(it);
+      internal::aligned_free(const_cast<void*>(p));
+      it=buffer_map.begin();
+    }
+    buffer_map.clear();
   }
 
-  // get sycl accessor
-  template <cl::sycl::access::mode AcMd>
-  EIGEN_STRONG_INLINE cl::sycl::accessor<
-      buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer>
-  get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const {
-    return queue_stream()->template get_sycl_accessor<AcMd>(cgh, ptr);
+  /// creation of sycl accessor for a buffer. This function first tries to find
+  /// the buffer in the buffer_map. If found it gets the accessor from it, if not,
+  ///the function then adds an entry by creating a sycl buffer for that particular pointer.
+  template <cl::sycl::access::mode AcMd, typename T> inline cl::sycl::accessor<T, 1, AcMd, cl::sycl::access::target::global_buffer>
+  get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const {
+    return (get_sycl_buffer<T>(num_bytes, ptr)->template get_access<AcMd, cl::sycl::access::target::global_buffer>(cgh));
   }
 
-  /// Accessing the created sycl device buffer for the device pointer
-  EIGEN_STRONG_INLINE cl::sycl::buffer<buffer_scalar_t, 1> get_sycl_buffer(
-      const void *ptr) const {
-    return queue_stream()->get_sycl_buffer(ptr);
+  template<typename T> inline  std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const {
+    using Type = cl::sycl::buffer<T, 1>;
+    std::pair<std::map<const void *, std::shared_ptr<void>>::iterator,bool> ret = buffer_map.insert(std::pair<const void *, std::shared_ptr<void>>(ptr, std::shared_ptr<void>(new Type(cl::sycl::range<1>(num_bytes)),
+      [](void *dataMem) { delete static_cast<Type*>(dataMem); })));
+    (static_cast<Type*>(buffer_map.at(ptr).get()))->set_final_data(nullptr);
+    return ret;
   }
 
-  /// This is used to prepare the number of threads and also the number of
-  /// threads per block for sycl kernels
-  template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize,
-                                              Index &rng, Index &GRange) const {
-    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
+  template <typename T> inline cl::sycl::buffer<T, 1>* get_sycl_buffer(size_t num_bytes,const T * ptr) const {
+    return static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(ptr, num_bytes).first->second.get());
   }
 
-  /// This is used to prepare the number of threads and also the number of
-  /// threads per block for sycl kernels
-  template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(
-      const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
-      cl::sycl::range<2> &local_range) const {
-    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
-  }
-
-  /// This is used to prepare the number of threads and also the number of
-  /// threads per block for sycl kernels
-  template <typename Index>
-  EIGEN_STRONG_INLINE void parallel_for_setup(
-      const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
-      cl::sycl::range<3> &local_range) const {
-    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
-  }
-
-  /// allocate device memory
-  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
-    return queue_stream()->allocate(num_bytes);
-  }
-
-  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
-    return queue_stream()->allocate_temp(num_bytes);
-  }
-
-  /// deallocate device memory
-  EIGEN_STRONG_INLINE void deallocate(void *p) const {
-    queue_stream()->deallocate(p);
-  }
-
-  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const {
-    queue_stream()->deallocate_temp(buffer);
-  }
-  template <cl::sycl::access::mode AcMd, typename T>
-  EIGEN_STRONG_INLINE void deallocate_temp(
-      const TensorSycl::internal::RangeAccess<AcMd, T> &buffer) const {
-    queue_stream()->deallocate_temp(buffer);
-  }
-  EIGEN_STRONG_INLINE void deallocate_all() const {
-    queue_stream()->deallocate_all();
-  }
-
-  template <typename data_t>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess<
-      cl::sycl::access::mode::read_write, data_t>
-  get(data_t *data) const {
-    return queue_stream()->get(data);
-  }
-  template <typename data_t>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(
-      TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write,
-                                        data_t>
-          data) const {
-    return queue_stream()->get(data);
-  }
-
-  /// attach existing buffer
-  EIGEN_STRONG_INLINE void *attach_buffer(
-      cl::sycl::buffer<buffer_scalar_t, 1> &buf) const {
-    return queue_stream()->attach_buffer(buf);
-  }
-  /// detach buffer
-  EIGEN_STRONG_INLINE void detach_buffer(void *p) const {
-    queue_stream()->detach_buffer(p);
-  }
-  EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const {
-    return queue_stream()->get_offset(ptr);
+  /// Allocates memory on the cpu. The requested size is ignored (the parameter is unnamed); a fixed 8-byte block is always returned.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const {
+    return internal::aligned_malloc(8);  // NOTE(review): presumably the pointer only serves as a key into buffer_map -- confirm
   }
 
   // some runtime conditions that can be applied here
-  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+  bool isDeviceSuitable() const { return true; }
 
-  /// memcpyHostToDevice
-  template <typename Index>
-  EIGEN_STRONG_INLINE void memcpyHostToDevice(
-      Index *dst, const Index *src, size_t n,
-      std::function<void()> callback = {}) const {
-    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
-  }
-  /// memcpyDeviceToHost
-  template <typename Index>
-  EIGEN_STRONG_INLINE void memcpyDeviceToHost(
-      void *dst, const Index *src, size_t n,
-      std::function<void()> callback = {}) const {
-    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
-  }
-  /// the memcpy function
-  template <typename Index>
-  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
-    queue_stream()->memcpy(dst, src, n);
-  }
-  /// the memset function
-  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
-    queue_stream()->memset(data, c, n);
-  }
-  /// returning the sycl queue
-  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const {
-    return queue_stream()->sycl_queue();
-  }
-#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS
-  EIGEN_STRONG_INLINE cl::sycl::program &program() const {
-    return queue_stream()->program();
-  }
-#endif
-
-  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
-
-  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
-    // We won't try to take advantage of the l2 cache for the time being, and
-    // there is no l3 cache on sycl devices.
-    return firstLevelCacheSize();
-  }
-  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
-    return queue_stream()->getNumSyclMultiProcessors();
-  }
-  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const {
-    return queue_stream()->maxSyclThreadsPerBlock();
-  }
-  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const {
-    return queue_stream()->maxWorkItemSizes();
-  }
-  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
-    // OpenCL doesnot have such concept
-    return queue_stream()->maxSyclThreadsPerMultiProcessor();
-  }
-  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const {
-    return queue_stream()->sharedMemPerBlock();
-  }
-  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
-    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+    ::memcpy(dst, src, n);
   }
 
-  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
-    return queue_stream()->getPowerOfTwo(val, roundUp);
+  template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const {
+    auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access<cl::sycl::access::mode::discard_write, cl::sycl::access::target::host_buffer>();
+    memcpy(host_acc.get_pointer(), src, n);
   }
-  /// No need for sycl it should act the same as CPU version
-  EIGEN_STRONG_INLINE int majorDeviceVersion() const {
-    return queue_stream()->majorDeviceVersion();
+ /// Copies n bytes from the SYCL buffer registered in buffer_map for `src` into `dst`; asserts when `src` has no buffer. With the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon.
+  template<typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const {
+    auto it = buffer_map.find(src);
+    if (it != buffer_map.end()) {
+      auto host_acc= (static_cast<cl::sycl::buffer<T, 1>*>(it->second.get()))-> template get_access<cl::sycl::access::mode::read, cl::sycl::access::target::host_buffer>();
+      memcpy(dst,host_acc.get_pointer(),  n);
+    } else{
+      eigen_assert(false && "no device memory found. The memory might be destroyed before creation");  // `false &&` is required: a bare string literal is always true, so the assertion could never fire
+    }
   }
 
-  EIGEN_STRONG_INLINE void synchronize() const {
-    queue_stream()->synchronize();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const {
+    ::memset(buffer, c, n);
   }
-  EIGEN_STRONG_INLINE void async_synchronize(
-      cl::sycl::event e = cl::sycl::event()) const {
-    queue_stream()->async_synchronize(e);
-  }
-  EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const {
-    return queue_stream()->get_latest_event();
-  }
-
-  // This function checks if the runtime recorded an error for the
-  // underlying stream device.
-  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
-
-  EIGEN_STRONG_INLINE bool has_local_memory() const {
-    return queue_stream()->has_local_memory();
-  }
-  EIGEN_STRONG_INLINE long max_buffer_size() const {
-    return queue_stream()->max_buffer_size();
-  }
-  EIGEN_STRONG_INLINE std::string getPlatformName() const {
-    return queue_stream()->getPlatformName();
-  }
-  EIGEN_STRONG_INLINE std::string getDeviceName() const {
-    return queue_stream()->getDeviceName();
-  }
-  EIGEN_STRONG_INLINE std::string getDeviceVendor() const {
-    return queue_stream()->getDeviceVendor();
-  }
-  template <typename OutScalar, typename KernelType, typename... T>
-  EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const {
-    queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(
-        var...);
-  }
-  template <typename OutScalar, typename KernelType, typename... T>
-  EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const {
-    queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(
-        var...);
-  }
-
-  template <typename OutScalar, typename KernelType, typename... T>
-  EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const {
-    queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(
-        var...);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+  return 1;
   }
 };
+
 }  // end namespace Eigen
 
 #endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index e524b535a..a5e084a24 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -12,6 +12,67 @@
 
 namespace Eigen {
 
+// Use the SimpleThreadPool by default. We'll switch to the new non blocking
+// thread pool later.
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
+template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
+typedef NonBlockingThreadPool ThreadPool;
+#else
+template <typename Env> using ThreadPoolTempl = SimpleThreadPoolTempl<Env>;
+typedef SimpleThreadPool ThreadPool;
+#endif
+
+
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+class Barrier {
+ public:
+  Barrier(unsigned int count) : state_(count << 1), notified_(false) {  // the remaining count is stored shifted left by one; bit 0 of state_ is the waiter flag
+    eigen_assert(((count << 1) >> 1) == count);  // count must survive the shift, i.e. its top bit must be clear
+  }
+  ~Barrier() {
+    eigen_plain_assert((state_>>1) == 0);  // the remaining count must have reached zero before destruction
+  }
+
+  void Notify() {  // lowers the remaining count by one; wakes waiters when it reaches zero and a waiter is registered
+    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;  // subtracting 2 lowers the count by one; v is the new state
+    if (v != 1) {
+      eigen_assert(((v + 2) & ~1) != 0);  // the count before this call must have been non-zero (no surplus Notify)
+      return;  // either count has not dropped to 0, or waiter is not waiting
+    }
+    std::unique_lock<std::mutex> l(mu_);
+    eigen_assert(!notified_);
+    notified_ = true;
+    cv_.notify_all();
+  }
+
+  void Wait() {  // returns once the remaining count has reached zero
+    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);  // sets the waiter flag; v is the previous state
+    if ((v >> 1) == 0) return;  // count was already zero: nothing to wait for
+    std::unique_lock<std::mutex> l(mu_);
+    while (!notified_) {  // re-check the flag after every wakeup
+      cv_.wait(l);
+    }
+  }
+
+ private:
+  std::mutex mu_;  // held while notified_ is written in Notify() and read in Wait()
+  std::condition_variable cv_;  // signalled by Notify() after notified_ is set
+  std::atomic<unsigned int> state_;  // low bit is waiter flag
+  bool notified_;  // set by the Notify() call that sees count zero with the waiter flag set
+};
+
+
+// Notification is an object that allows a user to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+  Notification() : Barrier(1) {};
+};
+
+
 // Runs an arbitrary function and then calls Notify() on the passed in
 // Notification.
 template <typename Function, typename... Args> struct FunctionWrapperWithNotification
@@ -41,75 +102,22 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
   }
 }
 
-// An abstract interface to a device specific memory allocator.
-class Allocator {
- public:
-  virtual ~Allocator() {}
-  virtual void* allocate(size_t num_bytes) const = 0;
-  virtual void deallocate(void* buffer) const = 0;
-};
 
 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
   // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr)
-      : pool_(pool), num_threads_(num_cores), allocator_(allocator) { }
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
-    return allocator_ ? allocator_->allocate(num_bytes)
-        : internal::aligned_malloc(num_bytes);
+    return internal::aligned_malloc(num_bytes);
   }
 
   EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
-    if (allocator_) {
-      allocator_->deallocate(buffer);
-    } else {
-      internal::aligned_free(buffer);
-    }
-  }
-
-    EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const {
-    return allocate(num_bytes);
-  }
-
-  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const {
-    deallocate(buffer);
-  }
-
-  template<typename Type>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
-    return data;
+    internal::aligned_free(buffer);
   }
 
   EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
-#ifdef __ANDROID__
     ::memcpy(dst, src, n);
-#else
-    // TODO(rmlarsen): Align blocks on cache lines.
-    // We have observed that going beyond 4 threads usually just wastes
-    // CPU cycles due to the threads competing for memory bandwidth, so we
-    // statically schedule at most 4 block copies here.
-    const size_t kMinBlockSize = 32768;
-    const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
-    if (n <= kMinBlockSize || num_threads < 2) {
-      ::memcpy(dst, src, n);
-    } else {
-      const char* src_ptr = static_cast<const char*>(src);
-      char* dst_ptr = static_cast<char*>(dst);
-      const size_t blocksize = (n + (num_threads - 1)) / num_threads;
-      Barrier barrier(static_cast<int>(num_threads - 1));
-      // Launch the last 3 blocks on worker threads.
-      for (size_t i = 1; i < num_threads; ++i) {
-        enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] {
-          ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize,
-                   numext::mini(blocksize, n - (i * blocksize)));
-        });
-      }
-      // Launch the first block on the main thread.
-      ::memcpy(dst_ptr, src_ptr, blocksize);
-      barrier.Wait();
-    }
-#endif
   }
   EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
     memcpy(dst, src, n);
@@ -126,12 +134,6 @@ struct ThreadPoolDevice {
     return num_threads_;
   }
 
-  // Number of theads available in the underlying thread pool. This number can
-  // be different from the value returned by numThreads().
-  EIGEN_STRONG_INLINE int numThreadsInPool() const {
-    return pool_->NumThreads();
-  }
-
   EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
     return l1CacheSize();
   }
@@ -147,31 +149,23 @@ struct ThreadPoolDevice {
   }
 
   template <class Function, class... Args>
-  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f,
-                                            Args&&... args) const {
+  EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
     Notification* n = new Notification();
-    pool_->Schedule(
-        std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n,
-                  std::move(f), args...));
+    pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
     return n;
   }
 
   template <class Function, class... Args>
-  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f,
+  EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b,
+                                                Function&& f,
                                                 Args&&... args) const {
-    pool_->Schedule(
-        std::bind(&FunctionWrapperWithBarrier<Function, Args...>::run, b,
-                  std::move(f), args...));
+    pool_->Schedule(std::bind(
+        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
   }
 
   template <class Function, class... Args>
-  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f,
-                                                 Args&&... args) const {
-    if (sizeof...(args) > 0) {
-      pool_->Schedule(std::bind(std::move(f), args...));
-    } else {
-      pool_->Schedule(std::move(f));
-    }
+  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
+    pool_->Schedule(std::bind(f, args...));
   }
 
   // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
@@ -180,189 +174,44 @@ struct ThreadPoolDevice {
     return pool_->CurrentThreadId();
   }
 
-  // WARNING: This function is synchronous and will block the calling thread.
-  //
-  // Synchronous parallelFor executes f with [0, n) arguments in parallel and
-  // waits for completion. F accepts a half-open interval [first, last). Block
-  // size is chosen based on the iteration cost and resulting parallel
+  // parallelFor executes f with [0, n) arguments in parallel and waits for
+  // completion. F accepts a half-open interval [first, last).
+  // Block size is chosen based on the iteration cost and resulting parallel
   // efficiency. If block_align is not nullptr, it is called to round up the
   // block size.
   void parallelFor(Index n, const TensorOpCost& cost,
                    std::function<Index(Index)> block_align,
                    std::function<void(Index, Index)> f) const {
-    if (EIGEN_PREDICT_FALSE(n <= 0)){
-      return;
-    // Compute small problems directly in the caller thread.
-    } else if (n == 1 || numThreads() == 1 ||
-               CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
-      f(0, n);
-      return;
-    }
-
-    // Compute block size and total count of blocks.
-    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
-
-    // Recursively divide size into halves until we reach block_size.
-    // Division code rounds mid to block_size, so we are guaranteed to get
-    // block_count leaves that do actual computations.
-    Barrier barrier(static_cast<unsigned int>(block.count));
-    std::function<void(Index, Index)> handleRange;
-    handleRange = [=, &handleRange, &barrier, &f](Index firstIdx,
-                                                  Index lastIdx) {
-      while (lastIdx - firstIdx > block.size) {
-        // Split into halves and schedule the second half on a different thread.
-        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
-        pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); });
-        lastIdx = midIdx;
-      }
-      // Single block or less, execute directly.
-      f(firstIdx, lastIdx);
-      barrier.Notify();
-    };
-
-    if (block.count <= numThreads()) {
-      // Avoid a thread hop by running the root of the tree and one block on the
-      // main thread.
-      handleRange(0, n);
-    } else {
-      // Execute the root in the thread pool to avoid running work on more than
-      // numThreads() threads.
-      pool_->Schedule([=, &handleRange]() { handleRange(0, n); });
-    }
-
-    barrier.Wait();
-  }
-
-  // Convenience wrapper for parallelFor that does not align blocks.
-  void parallelFor(Index n, const TensorOpCost& cost,
-                   std::function<void(Index, Index)> f) const {
-    parallelFor(n, cost, nullptr, std::move(f));
-  }
-
-  // WARNING: This function is asynchronous and will not block the calling thread.
-  //
-  // Asynchronous parallelFor executes f with [0, n) arguments in parallel
-  // without waiting for completion. When the last block finished, it will call
-  // 'done' callback. F accepts a half-open interval [first, last). Block size
-  // is chosen based on the iteration cost and resulting parallel efficiency. If
-  // block_align is not nullptr, it is called to round up the block size.
-  void parallelForAsync(Index n, const TensorOpCost& cost,
-                        std::function<Index(Index)> block_align,
-                        std::function<void(Index, Index)> f,
-                        std::function<void()> done) const {
-    // Compute small problems directly in the caller thread.
+    typedef TensorCostModel<ThreadPoolDevice> CostModel;
     if (n <= 1 || numThreads() == 1 ||
         CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
       f(0, n);
-      done();
       return;
     }
 
-    // Compute block size and total count of blocks.
-    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+    // Calculate block size based on (1) the iteration cost and (2) parallel
+    // efficiency. We want blocks to be not too small to mitigate
+    // parallelization overheads; not too large to mitigate tail
+    // effect and potential load imbalance and we also want number
+    // of blocks to be evenly dividable across threads.
 
-    ParallelForAsyncContext* const ctx =
-        new ParallelForAsyncContext(block.count, std::move(f), std::move(done));
-
-    // Recursively divide size into halves until we reach block_size.
-    // Division code rounds mid to block_size, so we are guaranteed to get
-    // block_count leaves that do actual computations.
-    ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
-      while (lastIdx - firstIdx > block.size) {
-        // Split into halves and schedule the second half on a different thread.
-        const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size;
-        pool_->Schedule(
-            [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
-        lastIdx = midIdx;
-      }
-
-      // Single block or less, execute directly.
-      ctx->f(firstIdx, lastIdx);
-
-      // Delete async context if it was the last block.
-      if (ctx->count.fetch_sub(1) == 1) delete ctx;
-    };
-
-    if (block.count <= numThreads()) {
-      // Avoid a thread hop by running the root of the tree and one block on the
-      // main thread.
-      ctx->handle_range(0, n);
-    } else {
-      // Execute the root in the thread pool to avoid running work on more than
-      // numThreads() threads.
-      pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); });
-    }
-  }
-
-  // Convenience wrapper for parallelForAsync that does not align blocks.
-  void parallelForAsync(Index n, const TensorOpCost& cost,
-                        std::function<void(Index, Index)> f,
-                        std::function<void()> done) const {
-    parallelForAsync(n, cost, nullptr, std::move(f), std::move(done));
-  }
-
-  // Thread pool accessor.
-  ThreadPoolInterface* getPool() const { return pool_; }
-
-  // Allocator accessor.
-  Allocator* allocator() const { return allocator_; }
-
- private:
-  typedef TensorCostModel<ThreadPoolDevice> CostModel;
-
-  // For parallelForAsync we must keep passed in closures on the heap, and
-  // delete them only after `done` callback finished.
-  struct ParallelForAsyncContext {
-    ParallelForAsyncContext(Index block_count,
-                            std::function<void(Index, Index)> block_f,
-                            std::function<void()> done_callback)
-        : count(block_count),
-          f(std::move(block_f)),
-          done(std::move(done_callback)) {}
-    ~ParallelForAsyncContext() { done(); }
-
-    std::atomic<Index> count;
-    std::function<void(Index, Index)> f;
-    std::function<void()> done;
-
-    std::function<void(Index, Index)> handle_range;
-  };
-
-  struct ParallelForBlock {
-    Index size;   // block size
-    Index count;  // number of blocks
-  };
-
-  // Calculates block size based on (1) the iteration cost and (2) parallel
-  // efficiency. We want blocks to be not too small to mitigate parallelization
-  // overheads; not too large to mitigate tail effect and potential load
-  // imbalance and we also want number of blocks to be evenly dividable across
-  // threads.
-  ParallelForBlock CalculateParallelForBlock(
-      const Index n, const TensorOpCost& cost,
-      std::function<Index(Index)> block_align) const {
-    const double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+    double block_size_f = 1.0 / CostModel::taskSize(1, cost);
     const Index max_oversharding_factor = 4;
     Index block_size = numext::mini(
-        n, numext::maxi<Index>(
-               divup<Index>(n, max_oversharding_factor * numThreads()),
-               block_size_f));
+        n, numext::maxi<Index>(divup<Index>(n, max_oversharding_factor * numThreads()),
+                               block_size_f));
     const Index max_block_size = numext::mini(n, 2 * block_size);
-
     if (block_align) {
       Index new_block_size = block_align(block_size);
       eigen_assert(new_block_size >= block_size);
       block_size = numext::mini(n, new_block_size);
     }
-
     Index block_count = divup(n, block_size);
-
     // Calculate parallel efficiency as fraction of total CPU time used for
     // computations:
     double max_efficiency =
         static_cast<double>(block_count) /
         (divup<int>(block_count, numThreads()) * numThreads());
-
     // Now try to increase block size up to max_block_size as long as it
     // doesn't decrease parallel efficiency.
     for (Index prev_block_count = block_count;
@@ -395,12 +244,36 @@ struct ThreadPoolDevice {
       }
     }
 
-    return {block_size, block_count};
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    Barrier barrier(static_cast<unsigned int>(block_count));
+    std::function<void(Index, Index)> handleRange;
+    handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
+      if (last - first <= block_size) {
+        // Single block or less, execute directly.
+        f(first, last);
+        barrier.Notify();
+        return;
+      }
+      // Split into halves and submit to the pool.
+      Index mid = first + divup((last - first) / 2, block_size) * block_size;
+      pool_->Schedule([=, &handleRange]() { handleRange(mid, last); });
+      pool_->Schedule([=, &handleRange]() { handleRange(first, mid); });
+    };
+    handleRange(0, n);
+    barrier.Wait();
   }
 
+  // Convenience wrapper for parallelFor that does not align blocks.
+  void parallelFor(Index n, const TensorOpCost& cost,
+                   std::function<void(Index, Index)> f) const {
+    parallelFor(n, cost, nullptr, std::move(f));
+  }
+
+ private:
   ThreadPoolInterface* pool_;
   int num_threads_;
-  Allocator* allocator_;
 };
 
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
index 132458a20..451940de3 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -32,16 +32,16 @@ namespace Eigen {
 // Boilerplate code
 namespace internal {
 
-template<std::ptrdiff_t n, typename Dimension> struct dget {
-  static const std::ptrdiff_t value = get<n, Dimension>::value;
+template<std::size_t n, typename Dimension> struct dget {
+  static const std::size_t value = get<n, Dimension>::value;
 };
 
 
-template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
+template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
 struct fixed_size_tensor_index_linearization_helper
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
+  static inline Index run(array<Index, NumIndices> const& indices,
                           const Dimensions& dimensions)
   {
     return array_get<RowMajor ? n - 1 : (NumIndices - n)>(indices) +
@@ -50,21 +50,21 @@ struct fixed_size_tensor_index_linearization_helper
   }
 };
 
-template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
+template<typename Index, std::size_t NumIndices, bool RowMajor>
 struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const&, const Dimensions&)
+  static inline Index run(array<Index, NumIndices> const&, const Dimensions&)
   {
     return 0;
   }
 };
 
-template<typename Index, std::ptrdiff_t n>
+template<typename Index, std::size_t n>
 struct fixed_size_tensor_index_extraction_helper
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Index run(const Index index,
+  static inline Index run(const Index index,
                           const Dimensions& dimensions)
   {
     const Index mult = (index == n-1) ? 1 : 0;
@@ -77,7 +77,7 @@ template<typename Index>
 struct fixed_size_tensor_index_extraction_helper<Index, 0>
 {
   template <typename Dimensions> EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Index run(const Index,
+  static inline Index run(const Index,
                           const Dimensions&)
   {
     return 0;
@@ -90,11 +90,9 @@ struct fixed_size_tensor_index_extraction_helper<Index, 0>
 // Fixed size
 #ifndef EIGEN_EMULATE_CXX11_META_H
 template <typename std::ptrdiff_t... Indices>
-struct Sizes {
+struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> {
   typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
-  const Base t = Base();
   static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
-  static const ptrdiff_t count = Base::count;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const {
     return Base::count;
@@ -121,17 +119,17 @@ struct Sizes {
     return *this;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const {
-    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const {
+    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, *this);
   }
 
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, t);
+  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *static_cast<const Base*>(this));
   }
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
-    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, t);
+  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *static_cast<const Base*>(this));
   }
 };
 
@@ -144,25 +142,25 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indi
 
 #else
 
-template <std::ptrdiff_t n>
+template <std::size_t n>
 struct non_zero_size {
-  typedef internal::type2val<std::ptrdiff_t, n> type;
+  typedef internal::type2val<std::size_t, n> type;
 };
 template <>
 struct non_zero_size<0> {
   typedef internal::null_type type;
 };
 
-template <std::ptrdiff_t V1=0, std::ptrdiff_t V2=0, std::ptrdiff_t V3=0, std::ptrdiff_t V4=0, std::ptrdiff_t V5=0> struct Sizes {
+template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0, std::size_t V5=0> struct Sizes {
   typedef typename internal::make_type_list<typename non_zero_size<V1>::type, typename non_zero_size<V2>::type, typename non_zero_size<V3>::type, typename non_zero_size<V4>::type, typename non_zero_size<V5>::type >::type Base;
-  static const std::ptrdiff_t count = Base::count;
-  static const std::ptrdiff_t total_size = internal::arg_prod<Base>::value;
+  static const size_t count = Base::count;
+  static const std::size_t total_size = internal::arg_prod<Base>::value;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
     return count;
   }
 
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() {
     return internal::arg_prod<Base>::value;
   }
 
@@ -178,7 +176,7 @@ template <std::ptrdiff_t V1=0, std::ptrdiff_t V2=0, std::ptrdiff_t V3=0, std::pt
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { }
-  explicit Sizes(std::initializer_list<std::ptrdiff_t>) {
+  explicit Sizes(std::initializer_list<std::size_t>) {
     // todo: add assertion
   }
 #else
@@ -213,18 +211,18 @@ template <std::ptrdiff_t V1=0, std::ptrdiff_t V2=0, std::ptrdiff_t V3=0, std::pt
   }
 
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+  size_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
     return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(indices, *reinterpret_cast<const Base*>(this));
   }
   template <typename DenseIndex> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+  size_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
     return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(indices, *reinterpret_cast<const Base*>(this));
   }
 };
 
 namespace internal {
-template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
+template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes<V1, V2, V3, V4, V5>&) {
   return Sizes<V1, V2, V3, V4, V5>::total_size;
 }
 }
@@ -233,7 +231,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<V1,
 
 // Boilerplate
 namespace internal {
-template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
+template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
 struct tensor_index_linearization_helper
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -245,7 +243,7 @@ struct tensor_index_linearization_helper
   }
 };
 
-template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
+template<typename Index, std::size_t NumIndices, bool RowMajor>
 struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -264,7 +262,7 @@ struct DSizes : array<DenseIndex, NumDims> {
   typedef array<DenseIndex, NumDims> Base;
   static const int count = NumDims;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const {
     return NumDims;
   }
 
@@ -284,57 +282,6 @@ struct DSizes : array<DenseIndex, NumDims> {
     (*this)[0] = i0;
   }
 
-  EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) {
-    for (int i = 0 ; i < NumDims; ++i) {
-      (*this)[i] = a[i];
-    }
-  }
-
-  // Enable DSizes index type promotion only if we are promoting to the
-  // larger type, e.g. allow to promote dimensions of type int to long.
-  template<typename OtherIndex>
-  EIGEN_DEVICE_FUNC
-  explicit DSizes(const array<OtherIndex, NumDims>& other,
-                  // Default template parameters require c++11.
-                  typename internal::enable_if<
-                     internal::is_same<
-                         DenseIndex,
-                         typename internal::promote_index_type<
-                             DenseIndex,
-                             OtherIndex
-                         >::type
-                     >::value, void*>::type = 0) {
-    for (int i = 0; i < NumDims; ++i) {
-      (*this)[i] = static_cast<DenseIndex>(other[i]);
-    }
-  }
-
-#ifdef EIGEN_HAS_INDEX_LIST
-  template <typename FirstType, typename... OtherTypes>
-  EIGEN_DEVICE_FUNC
-  explicit DSizes(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
-    for (int i = 0; i < dimensions.count; ++i) {
-      (*this)[i] = dimensions[i];
-    }
-  }
-#endif
-
-#ifndef EIGEN_EMULATE_CXX11_META_H
-  template <typename std::ptrdiff_t... Indices>
-  EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
-    for (int i = 0 ; i < NumDims; ++i) {
-      (*this)[i] = a[i];
-    }
-  }
-#else
-  template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5>
-  EIGEN_DEVICE_FUNC DSizes(const Sizes<V1, V2, V3, V4, V5>& a) {
-    for (int i = 0 ; i < NumDims; ++i) {
-      (*this)[i] = a[i];
-    }
-  }
-#endif
-
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   template<typename... IndexTypes> EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) {
@@ -383,21 +330,12 @@ struct DSizes : array<DenseIndex, NumDims> {
   }
 };
 
-template <typename IndexType, int NumDims>
-std::ostream& operator<<(std::ostream& os,
-                         const DSizes<IndexType, NumDims>& dims) {
-  os << "[";
-  for (int i = 0; i < NumDims; ++i) {
-    if (i > 0) os << ", ";
-    os << dims[i];
-  }
-  os << "]";
-  return os;
-}
+
+
 
 // Boilerplate
 namespace internal {
-template<typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
+template<typename Index, std::size_t NumIndices, std::size_t n, bool RowMajor>
 struct tensor_vsize_index_linearization_helper
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -409,7 +347,7 @@ struct tensor_vsize_index_linearization_helper
   }
 };
 
-template<typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
+template<typename Index, std::size_t NumIndices, bool RowMajor>
 struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
@@ -424,10 +362,10 @@ struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor>
 namespace internal {
 
 template <typename DenseIndex, int NumDims> struct array_size<const DSizes<DenseIndex, NumDims> > {
-  static const ptrdiff_t value = NumDims;
+  static const size_t value = NumDims;
 };
 template <typename DenseIndex, int NumDims> struct array_size<DSizes<DenseIndex, NumDims> > {
-  static const ptrdiff_t value = NumDims;
+  static const size_t value = NumDims;
 };
 #ifndef EIGEN_EMULATE_CXX11_META_H
 template <typename std::ptrdiff_t... Indices> struct array_size<const Sizes<Indices...> > {
@@ -437,42 +375,42 @@ template <typename std::ptrdiff_t... Indices> struct array_size<Sizes<Indices...
 static const std::ptrdiff_t value = Sizes<Indices...>::count;
 };
 template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
-  return get<n, internal::numeric_list<std::ptrdiff_t, Indices...> >::value;
+  return get<n, internal::numeric_list<std::size_t, Indices...> >::value;
 }
 template <std::ptrdiff_t n> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) {
   eigen_assert(false && "should never be called");
   return -1;
 }
 #else
-template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
-  static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
+template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<const Sizes<V1,V2,V3,V4,V5> > {
+  static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
 };
-template <std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
-  static const ptrdiff_t value = Sizes<V1,V2,V3,V4,V5>::count;
+template <std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> struct array_size<Sizes<V1,V2,V3,V4,V5> > {
+  static const size_t value = Sizes<V1,V2,V3,V4,V5>::count;
 };
-template <std::ptrdiff_t n, std::ptrdiff_t V1, std::ptrdiff_t V2, std::ptrdiff_t V3, std::ptrdiff_t V4, std::ptrdiff_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
+template <std::size_t n, std::size_t V1, std::size_t V2, std::size_t V3, std::size_t V4, std::size_t V5> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes<V1,V2,V3,V4,V5>&) {
   return get<n, typename Sizes<V1,V2,V3,V4,V5>::Base>::value;
 }
 
 #endif
 
 
-template <typename Dims1, typename Dims2, ptrdiff_t n, ptrdiff_t m>
+template <typename Dims1, typename Dims2, size_t n, size_t m>
 struct sizes_match_below_dim {
-  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
+  static EIGEN_DEVICE_FUNC  inline bool run(Dims1&, Dims2&) {
     return false;
   }
 };
-template <typename Dims1, typename Dims2, ptrdiff_t n>
+template <typename Dims1, typename Dims2, size_t n>
 struct sizes_match_below_dim<Dims1, Dims2, n, n> {
-  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
+  static EIGEN_DEVICE_FUNC  inline bool run(Dims1& dims1, Dims2& dims2) {
     return (array_get<n-1>(dims1) == array_get<n-1>(dims2)) &
         sizes_match_below_dim<Dims1, Dims2, n-1, n-1>::run(dims1, dims2);
   }
 };
 template <typename Dims1, typename Dims2>
 struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
-  static EIGEN_DEVICE_FUNC  EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) {
+  static EIGEN_DEVICE_FUNC  inline bool run(Dims1&, Dims2&) {
     return true;
   }
 };
@@ -481,7 +419,7 @@ struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
 
 
 template <typename Dims1, typename Dims2>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) {
+EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) {
   return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value, internal::array_size<Dims2>::value>::run(dims1, dims2);
 }
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 4689b0230..06987132b 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -32,7 +32,6 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename MakePointer_<Scalar>::Type PointerType;
 
   enum {
     Flags = 0
@@ -42,8 +41,6 @@ struct traits<TensorEvalToOp<XprType, MakePointer_> >
     // Intermediate typedef to workaround MSVC issue.
     typedef MakePointer_<T> MakePointerT;
     typedef typename MakePointerT::Type Type;
-
-
   };
 };
 
@@ -76,8 +73,6 @@ class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>,
   typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
 
-  static const int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
       : m_xpr(expr), m_buffer(buffer) {}
 
@@ -103,60 +98,38 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
   typedef typename XprType::Index Index;
   typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
   enum {
-    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess       = true,
-    PreferBlockAccess = false,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = true
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = true
   };
 
-  static const int NumDims = internal::traits<ArgType>::NumDimensions;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
-      ArgTensorBlock;
-
-  typedef internal::TensorBlockAssignment<
-      CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index>
-      TensorBlockAssignment;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){}
-
+      : m_impl(op.expression(), device), m_device(device),
+          m_buffer(op.buffer()), m_op(op), m_expression(op.expression())
+  { }
 
+  // Used for accessor extraction in SYCL Managed TensorMap:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const {
+    return m_op;
+  }
+  
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() {
   }
 
-
+  typedef typename internal::traits<const TensorEvalToOp<ArgType, MakePointer_> >::template MakePointer<CoeffReturnType>::Type DevicePointer;
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) {
     EIGEN_UNUSED_VARIABLE(scalar);
     eigen_assert(scalar == NULL);
     return m_impl.evalSubExprsIfNeeded(m_buffer);
   }
 
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType scalar, EvalSubExprsCallback done) {
-    EIGEN_UNUSED_VARIABLE(scalar);
-    eigen_assert(scalar == NULL);
-    m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done));
-  }
-#endif
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) {
     m_buffer[i] = m_impl.coeff(i);
   }
@@ -164,33 +137,6 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
     internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(m_buffer + i, m_impl.template packet<TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned>(i));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    return m_impl.getResourceRequirements();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(
-      TensorBlockDesc& desc, TensorBlockScratch& scratch) {
-    // Add `m_buffer` as destination buffer to the block descriptor.
-    desc.template AddDestinationBuffer<Layout>(
-        /*dst_base=*/m_buffer + desc.offset(),
-        /*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()));
-
-    ArgTensorBlock block =
-        m_impl.block(desc, scratch, /*root_of_expr_ast=*/true);
-
-    // If block was evaluated into a destination buffer, there is no need to do
-    // an assignment.
-    if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
-      TensorBlockAssignment::Run(
-          TensorBlockAssignment::target(
-              desc.dimensions(), internal::strides<Layout>(m_impl.dimensions()),
-              m_buffer, desc.offset()),
-          block.expr());
-    }
-    block.cleanup();
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
@@ -213,20 +159,19 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device>
         TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
+  EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; }
   ArgType expression() const { return m_expression; }
-  #ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-    m_buffer.bind(cgh);
-  }
-  #endif
 
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  /// added for sycl in order to construct the buffer from the sycl device
+  const Device& device() const{return m_device;}
 
  private:
   TensorEvaluator<ArgType, Device> m_impl;
-  EvaluatorPointerType m_buffer;
+  const Device& m_device;
+  DevicePointer m_buffer;
+  const XprType& m_op;
   const ArgType m_expression;
 };
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index d4532b72c..834ce07df 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -32,72 +32,44 @@ struct TensorEvaluator
   typedef typename Derived::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
-  typedef Derived XprType;
-  static const int PacketSize =  PacketType<CoeffReturnType, Device>::size;
-  typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType;
-  typedef StorageMemory<Scalar, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
 
   // NumDimensions is -1 for variable dim tensors
   static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
                                internal::traits<Derived>::NumDimensions : 0;
 
   enum {
-    IsAligned          = Derived::IsAligned,
-    PacketAccess       = (PacketType<CoeffReturnType, Device>::size > 1),
-    BlockAccess        = internal::is_arithmetic<typename internal::remove_const<Scalar>::type>::value,
-    PreferBlockAccess  = false,
-    Layout             = Derived::Layout,
-    CoordAccess        = NumCoords > 0,
-    RawAccess          = true
+    IsAligned = Derived::IsAligned,
+    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    Layout = Derived::Layout,
+    CoordAccess = NumCoords > 0,
+    RawAccess = true
   };
 
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
-                                                     Layout, Index>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
-      : m_data(device.get((const_cast<TensorPointerType>(m.data())))),
-        m_dims(m.dimensions()),
-        m_device(device)
+      : m_data(const_cast<typename internal::traits<Derived>::template MakePointer<Scalar>::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m)
   { }
 
-
+  // Used for accessor extraction in SYCL Managed TensorMap:
+  const Derived& derived() const { return m_impl; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
-    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && dest) {
-      m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) {
+    if (dest) {
+      m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize());
       return false;
     }
     return true;
   }
 
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType dest, EvalSubExprsCallback done) {
-    // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation.
-    done(evalSubExprsIfNeeded(dest));
-  }
-#endif  // EIGEN_USE_THREADS
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
-    eigen_assert(m_data != NULL);
+    eigen_assert(m_data);
     return m_data[index];
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) {
-    eigen_assert(m_data != NULL);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    eigen_assert(m_data);
     return m_data[index];
   }
 
@@ -107,18 +79,6 @@ struct TensorEvaluator
     return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
   }
 
-  // Return a packet starting at `index` where `umask` specifies which elements
-  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
-  // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
-  // float element will be loaded, otherwise 0 will be loaded.
-  // Function has been templatized to enable Sfinae.
-  template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
-  partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
-  {
-    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
-  }
-
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
@@ -126,7 +86,7 @@ struct TensorEvaluator
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
-    eigen_assert(m_data != NULL);
+    eigen_assert(m_data);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return m_data[m_dims.IndexOfColMajor(coords)];
     } else {
@@ -134,9 +94,8 @@ struct TensorEvaluator
     }
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType&
-  coeffRef(const array<DenseIndex, NumCoords>& coords) {
-    eigen_assert(m_data != NULL);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<DenseIndex, NumCoords>& coords) {
+    eigen_assert(m_data);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       return m_data[m_dims.IndexOfColMajor(coords)];
     } else {
@@ -146,50 +105,19 @@ struct TensorEvaluator
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        PacketType<CoeffReturnType, Device>::size);
+                        internal::unpacket_traits<PacketReturnType>::size);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    return internal::TensorBlockResourceRequirements::any();
-  }
+  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<Scalar>::Type data() const { return m_data; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    assert(m_data != NULL);
-    return TensorBlock::materialize(m_data, m_dims, desc, scratch);
-  }
+  /// required by sycl in order to construct sycl buffer from raw pointer
+  const Device& device() const{return m_device;}
 
-  template<typename TensorBlock>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
-      const TensorBlockDesc& desc, const TensorBlock& block) {
-    assert(m_data != NULL);
-
-    typedef typename TensorBlock::XprType TensorBlockExpr;
-    typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr,
-                                            Index>
-        TensorBlockAssign;
-
-    TensorBlockAssign::Run(
-        TensorBlockAssign::target(desc.dimensions(),
-                                  internal::strides<Layout>(m_dims), m_data,
-                                  desc.offset()),
-        block.expr());
-  }
-
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_data.bind(cgh);
-  }
-#endif
  protected:
-  EvaluatorPointerType m_data;
+  typename internal::traits<Derived>::template MakePointer<Scalar>::Type m_data;
   Dimensions m_dims;
-  const Device EIGEN_DEVICE_REF m_device;
+  const Device& m_device;
+  const Derived& m_impl;
 };
 
 namespace {
@@ -198,7 +126,7 @@ T loadConstant(const T* address) {
   return *address;
 }
 // Use the texture cache on CUDA devices whenever possible
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350
 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 float loadConstant(const float* address) {
   return __ldg(address);
@@ -212,13 +140,6 @@ Eigen::half loadConstant(const Eigen::half* address) {
   return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
 }
 #endif
-#ifdef EIGEN_USE_SYCL
-// overload of load constant should be implemented here based on range access
-template <cl::sycl::access::mode AcMd, typename T>
-T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess<AcMd, T> &address) {
-  return *address;
-}
-#endif
 }
 
 
@@ -231,64 +152,40 @@ struct TensorEvaluator<const Derived, Device>
   typedef typename Derived::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
-  typedef const Derived XprType;
-  typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType;
-  typedef StorageMemory<const Scalar, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
 
   // NumDimensions is -1 for variable dim tensors
   static const int NumCoords = internal::traits<Derived>::NumDimensions > 0 ?
                                internal::traits<Derived>::NumDimensions : 0;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
 
   enum {
-    IsAligned         = Derived::IsAligned,
-    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
-    BlockAccess       = internal::is_arithmetic<ScalarNoConst>::value,
-    PreferBlockAccess = false,
-    Layout            = Derived::Layout,
-    CoordAccess       = NumCoords > 0,
-    RawAccess         = true
+    IsAligned = Derived::IsAligned,
+    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    Layout = Derived::Layout,
+    CoordAccess = NumCoords > 0,
+    RawAccess = true
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords,
-                                                     Layout, Index>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
+  // Used for accessor extraction in SYCL Managed TensorMap:
+  const Derived& derived() const { return m_impl; }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
-      : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device)
+      : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m)
   { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data) {
-      m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
+      m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar));
       return false;
     }
     return true;
   }
 
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType dest, EvalSubExprsCallback done) {
-    // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation.
-    done(evalSubExprsIfNeeded(dest));
-  }
-#endif  // EIGEN_USE_THREADS
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
-    eigen_assert(m_data != NULL);
+    eigen_assert(m_data);
     return loadConstant(m_data+index);
   }
 
@@ -298,20 +195,8 @@ struct TensorEvaluator<const Derived, Device>
     return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
   }
 
-  // Return a packet starting at `index` where `umask` specifies which elements
-  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
-  // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
-  // float element will be loaded, otherwise 0 will be loaded.
-  // Function has been templatized to enable Sfinae.
-  template <typename PacketReturnTypeT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  typename internal::enable_if<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>::type
-  partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const
-  {
-    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
-    eigen_assert(m_data != NULL);
+    eigen_assert(m_data);
     const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
                         : m_dims.IndexOfRowMajor(coords);
     return loadConstant(m_data+index);
@@ -319,32 +204,19 @@ struct TensorEvaluator<const Derived, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        PacketType<CoeffReturnType, Device>::size);
+                        internal::unpacket_traits<PacketReturnType>::size);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    return internal::TensorBlockResourceRequirements::any();
-  }
+  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::template MakePointer<const Scalar>::Type data() const { return m_data; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    assert(m_data != NULL);
-    return TensorBlock::materialize(m_data, m_dims, desc, scratch);
-  }
+  /// added for sycl in order to construct the buffer from the sycl device
+  const Device& device() const{return m_device;}
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_data.bind(cgh);
-  }
-#endif
  protected:
-  EvaluatorPointerType m_data;
+  typename internal::traits<Derived>::template MakePointer<const Scalar>::Type m_data;
   Dimensions m_dims;
-  const Device EIGEN_DEVICE_REF m_device;
+  const Device& m_device;
+  const Derived& m_impl;
 };
 
 
@@ -357,6 +229,14 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 {
   typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
 
+  enum {
+    IsAligned = true,
+    PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
   EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
       : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
@@ -366,41 +246,12 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-
-  enum {
-    IsAligned = true,
-    PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess
-    #ifdef EIGEN_USE_SYCL
-    &&  (PacketType<CoeffReturnType, Device>::size >1)
-    #endif
-    ,
-    BlockAccess = false,
-    PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,  // to be implemented
-    RawAccess = false
-  };
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
-
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType, EvalSubExprsCallback done) {
-    done(true);
-  }
-#endif  // EIGEN_USE_THREADS
-
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { }
 
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
@@ -417,17 +268,16 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized,
-                        PacketType<CoeffReturnType, Device>::size);
+                        internal::unpacket_traits<PacketReturnType>::size);
   }
 
-  EIGEN_DEVICE_FUNC  EvaluatorPointerType data() const { return NULL; }
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_argImpl; }
+  /// required by sycl in order to extract the accessor
+  NullaryOp functor() const { return m_functor; }
 
-#ifdef EIGEN_USE_SYCL
-   // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_argImpl.bind(cgh);
-  }
-#endif
 
  private:
   const NullaryOp m_functor;
@@ -445,59 +295,31 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
   typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
 
   enum {
-    IsAligned          = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess       = TensorEvaluator<ArgType, Device>::PacketAccess &
-                         internal::functor_traits<UnaryOp>::PacketAccess,
-    BlockAccess        = TensorEvaluator<ArgType, Device>::BlockAccess,
-    PreferBlockAccess  = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
-    Layout             = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess        = false,  // to be implemented
-    RawAccess          = false
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess & internal::functor_traits<UnaryOp>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-    : m_device(device),
-      m_functor(op.functor()),
+    : m_functor(op.functor()),
       m_argImpl(op.nestedExpression(), device)
   { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-  static const int NumDims = internal::array_size<Dimensions>::value;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
-      ArgTensorBlock;
-
-  typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     m_argImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType, EvalSubExprsCallback done) {
-    m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
-  }
-#endif  // EIGEN_USE_THREADS
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_argImpl.cleanup();
   }
@@ -519,31 +341,15 @@ struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device>
         TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    static const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
-    return m_argImpl.getResourceRequirements().addCostPerCoeff(
-        {0, 0, functor_cost / PacketSize});
-  }
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    return TensorBlock(m_argImpl.block(desc, scratch), m_functor);
-  }
-
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{
-    m_argImpl.bind(cgh);
-  }
-#endif
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ArgType, Device> & impl() const { return m_argImpl; }
+  /// added for sycl in order to construct the buffer from sycl device
+  UnaryOp functor() const { return m_functor; }
 
 
  private:
-  const Device EIGEN_DEVICE_REF m_device;
   const UnaryOp m_functor;
   TensorEvaluator<ArgType, Device> m_argImpl;
 };
@@ -557,23 +363,16 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
   typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
 
   enum {
-    IsAligned         = TensorEvaluator<LeftArgType, Device>::IsAligned &
-                        TensorEvaluator<RightArgType, Device>::IsAligned,
-    PacketAccess      = TensorEvaluator<LeftArgType, Device>::PacketAccess &
-                        TensorEvaluator<RightArgType, Device>::PacketAccess &
-                        internal::functor_traits<BinaryOp>::PacketAccess,
-    BlockAccess       = TensorEvaluator<LeftArgType, Device>::BlockAccess &
-                        TensorEvaluator<RightArgType, Device>::BlockAccess,
-    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess |
-                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
-    Layout            = TensorEvaluator<LeftArgType, Device>::Layout,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = false
+    IsAligned = TensorEvaluator<LeftArgType, Device>::IsAligned & TensorEvaluator<RightArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<LeftArgType, Device>::PacketAccess & TensorEvaluator<RightArgType, Device>::PacketAccess &
+                   internal::functor_traits<BinaryOp>::PacketAccess,
+    Layout = TensorEvaluator<LeftArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-    : m_device(device),
-      m_functor(op.functor()),
+    : m_functor(op.functor()),
       m_leftImpl(op.lhsExpression(), device),
       m_rightImpl(op.rhsExpression(), device)
   {
@@ -585,27 +384,8 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
   typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-
-  static const int NumDims = internal::array_size<
-      typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlock
-      LeftTensorBlock;
-  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock
-      RightTensorBlock;
-
-  typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock,
-                                           RightTensorBlock>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
@@ -613,24 +393,11 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
     return m_leftImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
     m_leftImpl.evalSubExprsIfNeeded(NULL);
     m_rightImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType, EvalSubExprsCallback done) {
-    // TODO(ezhulenev): Evaluate two expression in parallel?
-    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
-      m_rightImpl.evalSubExprsIfNeededAsync(nullptr,
-                                            [done](bool) { done(true); });
-    });
-  }
-#endif  // EIGEN_USE_THREADS
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_leftImpl.cleanup();
     m_rightImpl.cleanup();
@@ -654,34 +421,15 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg
            TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    static const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
-    return internal::TensorBlockResourceRequirements::merge(
-               m_leftImpl.getResourceRequirements(),
-               m_rightImpl.getResourceRequirements())
-        .addCostPerCoeff({0, 0, functor_cost / PacketSize});
-  }
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<LeftArgType, Device>& left_impl() const { return m_leftImpl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<RightArgType, Device>& right_impl() const { return m_rightImpl; }
+  /// required by sycl in order to extract the accessor
+  BinaryOp functor() const { return m_functor; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    desc.DropDestinationBuffer();
-    return TensorBlock(m_leftImpl.block(desc, scratch),
-                         m_rightImpl.block(desc, scratch), m_functor);
-  }
-
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-  #ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_leftImpl.bind(cgh);
-    m_rightImpl.bind(cgh);
-  }
-  #endif
  private:
-  const Device EIGEN_DEVICE_REF m_device;
   const BinaryOp m_functor;
   TensorEvaluator<LeftArgType, Device> m_leftImpl;
   TensorEvaluator<RightArgType, Device> m_rightImpl;
@@ -696,17 +444,11 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
 
   enum {
     IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
-    PacketAccess      = TensorEvaluator<Arg1Type, Device>::PacketAccess &&
-                        TensorEvaluator<Arg2Type, Device>::PacketAccess &&
-                        TensorEvaluator<Arg3Type, Device>::PacketAccess &&
-                        internal::functor_traits<TernaryOp>::PacketAccess,
-    BlockAccess       = false,
-    PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess ||
-                        TensorEvaluator<Arg2Type, Device>::PreferBlockAccess ||
-                        TensorEvaluator<Arg3Type, Device>::PreferBlockAccess,
-    Layout            = TensorEvaluator<Arg1Type, Device>::Layout,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = false
+    PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
+                   internal::functor_traits<TernaryOp>::PacketAccess,
+    Layout = TensorEvaluator<Arg1Type, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
@@ -737,14 +479,8 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
   typedef typename XprType::Scalar Scalar;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
   typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
@@ -752,7 +488,7 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
     return m_arg1Impl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
     m_arg1Impl.evalSubExprsIfNeeded(NULL);
     m_arg2Impl.evalSubExprsIfNeeded(NULL);
     m_arg3Impl.evalSubExprsIfNeeded(NULL);
@@ -785,16 +521,14 @@ struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type,
            TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
 
-#ifdef EIGEN_USE_SYCL
-   // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_arg1Impl.bind(cgh);
-    m_arg2Impl.bind(cgh);
-    m_arg3Impl.bind(cgh);
-  }
-#endif
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<Arg1Type, Device> & arg1Impl() const { return m_arg1Impl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<Arg2Type, Device>& arg2Impl() const { return m_arg2Impl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<Arg3Type, Device>& arg3Impl() const { return m_arg3Impl; }
 
  private:
   const TernaryOp m_functor;
@@ -813,20 +547,12 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   typedef typename XprType::Scalar Scalar;
 
   enum {
-    IsAligned         = TensorEvaluator<ThenArgType, Device>::IsAligned &
-                        TensorEvaluator<ElseArgType, Device>::IsAligned,
-    PacketAccess      = TensorEvaluator<ThenArgType, Device>::PacketAccess &
-                        TensorEvaluator<ElseArgType, Device>::PacketAccess &
-                        PacketType<Scalar, Device>::HasBlend,
-    BlockAccess       = TensorEvaluator<IfArgType, Device>::BlockAccess &&
-                        TensorEvaluator<ThenArgType, Device>::BlockAccess &&
-                        TensorEvaluator<ElseArgType, Device>::BlockAccess,
-    PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess ||
-                        TensorEvaluator<ThenArgType, Device>::PreferBlockAccess ||
-                        TensorEvaluator<ElseArgType, Device>::PreferBlockAccess,
-    Layout            = TensorEvaluator<IfArgType, Device>::Layout,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = false
+    IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ThenArgType, Device>::PacketAccess & TensorEvaluator<ElseArgType, Device>::PacketAccess &
+                   internal::packet_traits<Scalar>::HasBlend,
+    Layout = TensorEvaluator<IfArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
   };
 
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
@@ -843,42 +569,8 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   typedef typename XprType::Index Index;
   typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
   typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-
-  static const int NumDims = internal::array_size<Dimensions>::value;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-    typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlock
-      IfArgTensorBlock;
-  typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlock
-      ThenArgTensorBlock;
-  typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlock
-      ElseArgTensorBlock;
-
-  struct TensorSelectOpBlockFactory {
-    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
-    struct XprType {
-      typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type;
-    };
-
-    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
-    typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr(
-        const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const {
-      return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr);
-    }
-  };
-
-  typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory,
-                                           IfArgTensorBlock, ThenArgTensorBlock,
-                                           ElseArgTensorBlock>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
   {
@@ -886,25 +578,12 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
     return m_condImpl.dimensions();
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
     m_condImpl.evalSubExprsIfNeeded(NULL);
     m_thenImpl.evalSubExprsIfNeeded(NULL);
     m_elseImpl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType, EvalSubExprsCallback done) {
-    m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) {
-      m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) {
-        m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); });
-      });
-    });
-  }
-#endif  // EIGEN_USE_THREADS
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_condImpl.cleanup();
     m_thenImpl.cleanup();
@@ -918,15 +597,13 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const
   {
-     internal::Selector<PacketSize> select;
-     EIGEN_UNROLL_LOOP
-     for (Index i = 0; i < PacketSize; ++i) {
-       select.select[i] = m_condImpl.coeff(index+i);
-     }
-     return internal::pblend(select,
-                             m_thenImpl.template packet<LoadMode>(index),
-                             m_elseImpl.template packet<LoadMode>(index));
-
+    internal::Selector<PacketSize> select;
+    for (Index i = 0; i < PacketSize; ++i) {
+      select.select[i] = m_condImpl.coeff(index+i);
+    }
+    return internal::pblend(select,
+                            m_thenImpl.template packet<LoadMode>(index),
+                            m_elseImpl.template packet<LoadMode>(index));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -936,42 +613,14 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
         .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    auto then_req = m_thenImpl.getResourceRequirements();
-    auto else_req = m_elseImpl.getResourceRequirements();
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ThenArgType, Device>& then_impl() const { return m_thenImpl; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ElseArgType, Device>& else_impl() const { return m_elseImpl; }
 
-    auto merged_req =
-        internal::TensorBlockResourceRequirements::merge(then_req, else_req);
-    merged_req.cost_per_coeff =
-        then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff);
-
-    return internal::TensorBlockResourceRequirements::merge(
-        m_condImpl.getResourceRequirements(), merged_req);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    // It's unsafe to pass destination buffer to underlying expressions, because
-    // output might be aliased with one of the inputs.
-    desc.DropDestinationBuffer();
-
-    return TensorBlock(
-        m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch),
-        m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory());
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_condImpl.bind(cgh);
-    m_thenImpl.bind(cgh);
-    m_elseImpl.bind(cgh);
-  }
-#endif
  private:
   TensorEvaluator<IfArgType, Device> m_condImpl;
   TensorEvaluator<ThenArgType, Device> m_thenImpl;
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index c52fb77dc..f01d77c0a 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -12,94 +12,31 @@
 
 namespace Eigen {
 
-/**
- * \class TensorExecutor
- * \ingroup CXX11_Tensor_Module
- *
- * \brief The tensor executor class.
- *
- * This class is responsible for launch the evaluation of the expression on
- * the specified computing device.
- *
- * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
- *                      instructions)
- * @tparam Tiling       can use block based tensor evaluation
- *                      (see TensorBlock.h)
- */
+/** \class TensorExecutor
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief The tensor executor class.
+  *
+  * This class is responsible for launch the evaluation of the expression on
+  * the specified computing device.
+  */
 namespace internal {
 
-/**
- * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely
- * expensive. If expression has at least one broadcast op in it, and it supports
- * block based evaluation, we always prefer it, even for the small tensors. For
- * all other tileable ops, block evaluation overhead for small tensors (fits
- * into L1) is too large, and we fallback on vectorized evaluation.
- */
-
-// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
-
-template<typename Expression>
-struct ExpressionHasTensorBroadcastingOp {
-  enum { value = false };
-};
-
-template<typename LhsXprType, typename RhsXprType>
-struct ExpressionHasTensorBroadcastingOp<
-    const TensorAssignOp<LhsXprType, RhsXprType> > {
-  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
-};
-
-template<typename UnaryOp, typename XprType>
-struct ExpressionHasTensorBroadcastingOp<
-    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
-  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
-};
-
-template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
-struct ExpressionHasTensorBroadcastingOp<
-    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
-  enum {
-    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
-        ExpressionHasTensorBroadcastingOp<RhsXprType>::value
-  };
-};
-
-template<typename Broadcast, typename XprType>
-struct ExpressionHasTensorBroadcastingOp<
-    const TensorBroadcastingOp<Broadcast, XprType> > {
-  enum { value = true };
-};
-
-// -------------------------------------------------------------------------- //
-
-/**
- * Default strategy: the expression is evaluated sequentially with a single cpu
- * thread, without vectorization and block evaluation.
- */
-template <typename Expression, typename Device, bool Vectorizable,
-          TiledEvaluation Tiling>
-class TensorExecutor {
+// Default strategy: the expression is evaluated with a single cpu thread.
+template<typename Expression, typename Device, bool Vectorizable>
+class TensorExecutor
+{
  public:
-  typedef typename Expression::Index StorageIndex;
-
-  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
-  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
-  // violation. If this template is instantiated with a non-default device, it
-  // means that this header file was included without defining
-  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
-  static_assert(std::is_same<Device, DefaultDevice>::value,
-                "Default executor instantiated with non-default device. "
-                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
-                "EIGEN_USE_SYCL before including Eigen headers.");
-
+  typedef typename Expression::Index Index;
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                                      const Device& device = Device()) {
+  static inline void run(const Expression& expr, const Device& device = Device())
+  {
     TensorEvaluator<Expression, Device> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign) {
-      const StorageIndex size = array_prod(evaluator.dimensions());
-      for (StorageIndex i = 0; i < size; ++i) {
+    if (needs_assign)
+    {
+      const Index size = array_prod(evaluator.dimensions());
+      for (Index i = 0; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -107,48 +44,35 @@ class TensorExecutor {
   }
 };
 
-/**
- * Default async execution strategy is not implemented. Currently it's only
- * available for ThreadPoolDevice (see definition below).
- */
-template <typename Expression, typename Device, typename DoneCallback,
-          bool Vectorizable, TiledEvaluation Tiling>
-class TensorAsyncExecutor {};
 
-/**
- * Process all the data with a single cpu thread, using vectorized instructions.
- */
-template <typename Expression>
-class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
-                     /*Tiling=*/TiledEvaluation::Off> {
+template<typename Expression>
+class TensorExecutor<Expression, DefaultDevice, true>
+{
  public:
-  typedef typename Expression::Index StorageIndex;
-
+  typedef typename Expression::Index Index;
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(
-      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
+  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
+  {
     TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
     const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign) {
-      const StorageIndex size = array_prod(evaluator.dimensions());
-      const int PacketSize = unpacket_traits<typename TensorEvaluator<
-          Expression, DefaultDevice>::PacketReturnType>::size;
-
-      // Give compiler a strong possibility to unroll the loop. But don't insist
-      // on unrolling, because if the function is expensive compiler should not
+    if (needs_assign)
+    {
+      const Index size = array_prod(evaluator.dimensions());
+      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+      // Give the compiler a strong hint to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
       // unroll the loop at the expense of inlining.
-      const StorageIndex UnrolledSize =
-          (size / (4 * PacketSize)) * 4 * PacketSize;
-      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
-        for (StorageIndex j = 0; j < 4; j++) {
+      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
+        for (Index j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
-      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+      const Index VectorizedSize = (size / PacketSize) * PacketSize;
+      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
         evaluator.evalPacket(i);
       }
-      for (StorageIndex i = VectorizedSize; i < size; ++i) {
+      for (Index i = VectorizedSize; i < size; ++i) {
         evaluator.evalScalar(i);
       }
     }
@@ -156,162 +80,55 @@ class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
   }
 };
 
-/**
- * Process all the data with a single cpu thread, using blocks of data. By
- * sizing a block to fit L1 cache we get better cache performance.
- */
-template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, DefaultDevice, Vectorizable,
-                     /*Tiling=*/TiledEvaluation::On> {
- public:
-  typedef typename traits<Expression>::Scalar Scalar;
-  typedef typename remove_const<Scalar>::type ScalarNoConst;
 
-  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
-  typedef typename traits<Expression>::Index StorageIndex;
 
-  static const int NumDims = traits<Expression>::NumDimensions;
-
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                         const DefaultDevice& device = DefaultDevice()) {
-    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
-        TensorBlockMapper;
-
-    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
-        TensorBlockDesc;
-    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
-        TensorBlockScratch;
-
-    Evaluator evaluator(expr, device);
-
-    // TODO(ezhulenev): Do not use tiling for small tensors?
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-
-    if (needs_assign) {
-      // Query expression tree for desired block size/shape.
-      const TensorBlockResourceRequirements requirements =
-          evaluator.getResourceRequirements();
-
-      const TensorBlockMapper block_mapper(
-          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
-          requirements);
-
-      // Share scratch memory allocator between all blocks.
-      TensorBlockScratch scratch(device);
-
-      const StorageIndex total_block_count = block_mapper.blockCount();
-      for (StorageIndex i = 0; i < total_block_count; ++i) {
-        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
-        evaluator.evalBlock(desc, scratch);
-        scratch.reset();
-      }
-    }
-    evaluator.cleanup();
-  }
-};
-
-/**
- * Multicore strategy: the index space is partitioned and each partition is
- * executed on a single core.
- *
- * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread
- *     pool, and will block the caller thread until all tasks are finished.
- *
- * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to
- *     the ThreadPoolDevice managed thread pool, and will return immediately.
- *     It will call 'done' callback after all tasks are finished.
- */
+// Multicore strategy: the index space is partitioned and each partition is executed on a single core
 #ifdef EIGEN_USE_THREADS
-
-template <typename TensorBlockMapper>
-struct TensorExecutorTilingContext {
-  TensorExecutorTilingContext() = default;
-  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
-                              const TensorOpCost& b_cost, size_t b_aligned_size)
-      : block_mapper(b_mapper),
-        cost(b_cost),
-        aligned_blocksize(b_aligned_size) {}
-
-  TensorBlockMapper block_mapper;  // navigate through blocks
-  TensorOpCost cost;               // cost of computing a single block
-  size_t aligned_blocksize;        // block size after memory alignment
-};
-
-// Computes a block evaluation parameters, and allocates temporary memory buffer
-// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
-template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
-TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
-    const Evaluator& evaluator) {
-  // Query expression tree for desired block size/shape.
-  TensorBlockResourceRequirements requirements =
-      evaluator.getResourceRequirements();
-
-  // Update target block size based on cost model.
-  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
-      1, requirements.cost_per_coeff);
-  requirements.size = static_cast<size_t>(1.0 / taskSize);
-
-  TensorBlockMapper block_mapper(
-      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
-      requirements);
-
-  size_t block_size = block_mapper.blockTotalSize();
-  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
-  const size_t aligned_blocksize =
-      align *
-      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
-
-  return {block_mapper, requirements.cost_per_coeff * block_size,
-          aligned_blocksize};
-}
-
-template <typename Evaluator, typename StorageIndex, bool Vectorizable>
+template <typename Evaluator, typename Index, bool Vectorizable>
 struct EvalRange {
-  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
-                  const StorageIndex lastIdx) {
+  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
     Evaluator evaluator = *evaluator_in;
-    eigen_assert(lastIdx >= firstIdx);
-    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
+    eigen_assert(last >= first);
+    for (Index i = first; i < last; ++i) {
       evaluator.evalScalar(i);
     }
   }
 
-  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
+  static Index alignBlockSize(Index size) {
+    return size;
+  }
 };
 
-template <typename Evaluator, typename StorageIndex>
-struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
-  static const int PacketSize =
-      unpacket_traits<typename Evaluator::PacketReturnType>::size;
+template <typename Evaluator, typename Index>
+struct EvalRange<Evaluator, Index, true> {
+  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
 
-  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
-                  const StorageIndex lastIdx) {
+  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
     Evaluator evaluator = *evaluator_in;
-    eigen_assert(lastIdx >= firstIdx);
-    StorageIndex i = firstIdx;
-    if (lastIdx - firstIdx >= PacketSize) {
-      eigen_assert(firstIdx % PacketSize == 0);
-      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
-      // Give compiler a strong possibility to unroll the loop. But don't insist
-      // on unrolling, because if the function is expensive compiler should not
+    eigen_assert(last >= first);
+    Index i = first;
+    if (last - first >= PacketSize) {
+      eigen_assert(first % PacketSize == 0);
+      Index last_chunk_offset = last - 4 * PacketSize;
+      // Give the compiler a strong hint to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive the compiler should not
       // unroll the loop at the expense of inlining.
-      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
-        for (StorageIndex j = 0; j < 4; j++) {
+      for (; i <= last_chunk_offset; i += 4*PacketSize) {
+        for (Index j = 0; j < 4; j++) {
           evaluator.evalPacket(i + j * PacketSize);
         }
       }
-      last_chunk_offset = lastIdx - PacketSize;
+      last_chunk_offset = last - PacketSize;
       for (; i <= last_chunk_offset; i += PacketSize) {
         evaluator.evalPacket(i);
       }
     }
-    for (; i < lastIdx; ++i) {
+    for (; i < last; ++i) {
       evaluator.evalScalar(i);
     }
   }
 
-  static StorageIndex alignBlockSize(StorageIndex size) {
+  static Index alignBlockSize(Index size) {
     // Align block size to packet size and account for unrolling in run above.
     if (size >= 16 * PacketSize) {
       return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
@@ -321,376 +138,144 @@ struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
   }
 };
 
-template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
- public:
-  typedef typename Expression::Index StorageIndex;
-
-  static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                         const ThreadPoolDevice& device) {
-    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
-
-    Evaluator evaluator(expr, device);
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
-    if (needs_assign) {
-      const StorageIndex size = array_prod(evaluator.dimensions());
-      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
-                         EvalRange::alignBlockSize,
-                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
-                           EvalRange::run(&evaluator, firstIdx, lastIdx);
-                         });
-    }
-    evaluator.cleanup();
-  }
-};
-
 template <typename Expression, bool Vectorizable>
-class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
-                     /*Tiling=*/TiledEvaluation::On> {
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
  public:
-  typedef typename traits<Expression>::Index IndexType;
-  typedef typename traits<Expression>::Scalar Scalar;
-  typedef typename remove_const<Scalar>::type ScalarNoConst;
-
-  static const int NumDims = traits<Expression>::NumDimensions;
-
-  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
-  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
-
-  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
-      TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
-      TensorBlockScratch;
-
-  static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                                      const ThreadPoolDevice& device) {
+  typedef typename Expression::Index Index;
+  static inline void run(const Expression& expr, const ThreadPoolDevice& device)
+  {
+    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
     Evaluator evaluator(expr, device);
-
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
-    if (needs_assign) {
-      const TilingContext tiling =
-          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
-                                                   Vectorizable>(evaluator);
-
-      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
-                                                       IndexType lastBlockIdx) {
-        TensorBlockScratch scratch(device);
-
-        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
-             ++block_idx) {
-          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
-          evaluator.evalBlock(desc, scratch);
-          scratch.reset();
-        }
-      };
-
-      // Evaluate small expressions directly as a single block.
-      if (tiling.block_mapper.blockCount() == 1) {
-        TensorBlockScratch scratch(device);
-        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
-        evaluator.evalBlock(desc, scratch);
-      } else {
-        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
-                           eval_block);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign)
+    {
+      const Index size = array_prod(evaluator.dimensions());
+#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
+      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
+                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
+                         [&evaluator](Index first, Index last) {
+                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
+                         });
+#else
+      size_t num_threads = device.numThreads();
+      if (num_threads > 1) {
+        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+            size, evaluator.costPerCoeff(Vectorizable), num_threads);
       }
+      if (num_threads == 1) {
+        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
+      } else {
+        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
+        Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
+        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
+        const Index numblocks = size / blocksize;
+
+        Barrier barrier(numblocks);
+        for (int i = 0; i < numblocks; ++i) {
+          device.enqueue_with_barrier(
+              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
+              &evaluator, i * blocksize, (i + 1) * blocksize);
+        }
+        if (numblocks * blocksize < size) {
+          EvalRange<Evaluator, Index, Vectorizable>::run(
+              &evaluator, numblocks * blocksize, size);
+        }
+        barrier.Wait();
+      }
+#endif  // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
     }
     evaluator.cleanup();
   }
 };
-
-template <typename Expression, typename DoneCallback, bool Vectorizable,
-          TiledEvaluation Tiling>
-class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
-                          Vectorizable, Tiling> {
- public:
-  typedef typename Expression::Index StorageIndex;
-  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-
-  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
-                                           const ThreadPoolDevice& device,
-                                           DoneCallback done) {
-    TensorAsyncExecutorContext* const ctx =
-        new TensorAsyncExecutorContext(expr, device, std::move(done));
-
-    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
-      if (!need_assign) {
-        delete ctx;
-        return;
-      }
-
-      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
-      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
-      device.parallelForAsync(
-          size, ctx->evaluator.costPerCoeff(Vectorizable),
-          EvalRange::alignBlockSize,
-          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
-            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
-          },
-          [ctx]() { delete ctx; });
-    };
-
-    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
-  }
-
- private:
-  struct TensorAsyncExecutorContext {
-    TensorAsyncExecutorContext(const Expression& expr,
-                               const ThreadPoolDevice& thread_pool,
-                               DoneCallback done)
-        : evaluator(expr, thread_pool), on_done(std::move(done)) {}
-
-    ~TensorAsyncExecutorContext() {
-      evaluator.cleanup();
-      on_done();
-    }
-
-    Evaluator evaluator;
-
-   private:
-    DoneCallback on_done;
-  };
-};
-
-template <typename Expression, typename DoneCallback, bool Vectorizable>
-class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
-                          Vectorizable, /*Tileable*/ TiledEvaluation::On> {
- public:
-  typedef typename traits<Expression>::Index IndexType;
-  typedef typename traits<Expression>::Scalar Scalar;
-  typedef typename remove_const<Scalar>::type ScalarNoConst;
-
-  static const int NumDims = traits<Expression>::NumDimensions;
-
-  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
-  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
-  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
-
-  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
-      TensorBlockScratch;
-
-  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
-                                           const ThreadPoolDevice& device,
-                                           DoneCallback done) {
-
-    TensorAsyncExecutorContext* const ctx =
-        new TensorAsyncExecutorContext(expr, device, std::move(done));
-
-    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
-      if (!need_assign) {
-        delete ctx;
-        return;
-      }
-
-      ctx->tiling = internal::GetTensorExecutorTilingContext<
-          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
-
-      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
-        TensorBlockScratch scratch(ctx->device);
-
-        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
-             ++block_idx) {
-          TensorBlockDesc desc =
-              ctx->tiling.block_mapper.blockDescriptor(block_idx);
-          ctx->evaluator.evalBlock(desc, scratch);
-          scratch.reset();
-        }
-      };
-
-      // Evaluate small expressions directly as a single block.
-      if (ctx->tiling.block_mapper.blockCount() == 1) {
-        TensorBlockScratch scratch(ctx->device);
-        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
-        ctx->evaluator.evalBlock(desc, scratch);
-        delete ctx;
-      } else {
-        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
-                                     ctx->tiling.cost, eval_block,
-                                     [ctx]() { delete ctx; });
-      }
-    };
-
-    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
-  }
-
- private:
-  struct TensorAsyncExecutorContext {
-    TensorAsyncExecutorContext(const Expression& expr,
-                               const ThreadPoolDevice& thread_pool,
-                               DoneCallback done)
-        : device(thread_pool),
-          evaluator(expr, thread_pool),
-          on_done(std::move(done)) {}
-
-    ~TensorAsyncExecutorContext() {
-      evaluator.cleanup();
-      on_done();
-    }
-
-    const ThreadPoolDevice& device;
-    Evaluator evaluator;
-    TilingContext tiling;
-
-   private:
-    DoneCallback on_done;
-  };
-};
-
 #endif  // EIGEN_USE_THREADS
 
+
 // GPU: the evaluation of the expression is offloaded to a GPU.
 #if defined(EIGEN_USE_GPU)
 
-template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
-class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, GpuDevice, Vectorizable> {
  public:
-  typedef typename Expression::Index StorageIndex;
+  typedef typename Expression::Index Index;
   static void run(const Expression& expr, const GpuDevice& device);
 };
 
-#if defined(EIGEN_GPUCC)
-template <typename Evaluator, typename StorageIndex, bool Vectorizable>
+
+#if defined(__CUDACC__)
+template <typename Evaluator, typename Index, bool Vectorizable>
 struct EigenMetaKernelEval {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
-    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
+  static __device__ EIGEN_ALWAYS_INLINE
+  void run(Evaluator& eval, Index first, Index last, Index step_size) {
+    for (Index i = first; i < last; i += step_size) {
       eval.evalScalar(i);
     }
   }
 };
 
-template <typename Evaluator, typename StorageIndex>
-struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
-    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
-    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
-    const StorageIndex vectorized_step_size = step_size * PacketSize;
+template <typename Evaluator, typename Index>
+struct EigenMetaKernelEval<Evaluator, Index, true> {
+  static __device__ EIGEN_ALWAYS_INLINE
+  void run(Evaluator& eval, Index first, Index last, Index step_size) {
+    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const Index vectorized_size = (last / PacketSize) * PacketSize;
+    const Index vectorized_step_size = step_size * PacketSize;
 
     // Use the vector path
-    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
+    for (Index i = first * PacketSize; i < vectorized_size;
          i += vectorized_step_size) {
       eval.evalPacket(i);
     }
-    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
+    for (Index i = vectorized_size + first; i < last; i += step_size) {
       eval.evalScalar(i);
     }
   }
 };
 
-template <typename Evaluator, typename StorageIndex>
+template <typename Evaluator, typename Index>
 __global__ void
 __launch_bounds__(1024)
-EigenMetaKernel(Evaluator eval, StorageIndex size) {
+EigenMetaKernel(Evaluator eval, Index size) {
 
-  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
-  const StorageIndex step_size = blockDim.x * gridDim.x;
+  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index step_size = blockDim.x * gridDim.x;
 
   const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
-  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
+  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
 }
 
 /*static*/
-template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
-EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
+template <typename Expression, bool Vectorizable>
+inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
     const Expression& expr, const GpuDevice& device) {
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
-  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign) {
-
-    const int block_size = device.maxGpuThreadsPerBlock();
-    const int max_blocks = device.getNumGpuMultiProcessors() *
-                           device.maxGpuThreadsPerMultiProcessor() / block_size;
-    const StorageIndex size = array_prod(evaluator.dimensions());
+    const int block_size = device.maxCudaThreadsPerBlock();
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;
+    const Index size = array_prod(evaluator.dimensions());
     // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
     const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
 
-    LAUNCH_GPU_KERNEL(
-        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
+    LAUNCH_CUDA_KERNEL(
+        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
         num_blocks, block_size, 0, device, evaluator, size);
   }
   evaluator.cleanup();
 }
 
-#endif  // EIGEN_GPUCC
+#endif  // __CUDACC__
 #endif  // EIGEN_USE_GPU
 
 // SYCL Executor policy
 #ifdef EIGEN_USE_SYCL
 
-template <typename Evaluator>
-struct ExecExprFunctorKernel {
-  typedef typename Evaluator::Index Index;
-  Evaluator evaluator;
-  const Index range;
-  template <typename Scratch>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
-      const Scratch, Evaluator evaluator_, const Index range_)
-      : evaluator(evaluator_), range(range_) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
-      cl::sycl::nd_item<1> itemID) {
-    compute(itemID);
-  }
-  template <bool is_vec = Evaluator::PacketAccess>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
-  compute(const cl::sycl::nd_item<1>& itemID) {
-    Index gId = static_cast<Index>(itemID.get_global_linear_id());
-    Index total_threads = itemID.get_global_range(0);
-
-    for (Index i = gId; i < range; i += total_threads) {
-      evaluator.evalScalar(i);
-    }
-  }
-  template <bool is_vec = Evaluator::PacketAccess>
-  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
-  compute(const cl::sycl::nd_item<1>& itemID) {
-    const Index vectorizedRange =
-        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
-    Index gId = static_cast<Index>(itemID.get_global_linear_id());
-    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
-    const Index start = Evaluator::PacketSize * gId;
-    for (Index i = start; i < vectorizedRange; i += step) {
-      evaluator.evalPacket(i);
-    }
-    gId += vectorizedRange;
-    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
-      evaluator.evalScalar(i);
-    }
-  }
-};
-
-template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
-class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
- public:
-  typedef typename Expression::Index Index;
-  static EIGEN_STRONG_INLINE void run(const Expression& expr,
-                                      const Eigen::SyclDevice& dev) {
-    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
-    Evaluator evaluator(expr, dev);
-    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
-    if (needs_assign) {
-      Index range, GRange, tileSize;
-      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
-      total_size = (total_size == 0) ? 1 : total_size;
-      const int PacketSize =
-          Eigen::PacketType<typename Evaluator::CoeffReturnType,
-                            Eigen::SyclDevice>::size;
-      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
-      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
-      range = total_size;
-
-      dev.template nullary_kernel_launcher<
-          typename Evaluator::CoeffReturnType,
-          ExecExprFunctorKernel<Evaluator> >(
-          evaluator,
-          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
-                                cl::sycl::range<1>(tileSize)),
-          Index(1), range);
-    }
-    evaluator.cleanup();
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, SyclDevice, Vectorizable> {
+public:
+  static inline void run(const Expression &expr, const SyclDevice &device) {
+    // call TensorSYCL module
+    TensorSycl::run(expr, device);
   }
 };
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index c9bccfc66..85dfc7a69 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -38,7 +38,7 @@ struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> >
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
+
   enum {
     Flags = 0
   };
@@ -89,10 +89,6 @@ struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> >
   typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename TypeConversion<Scalar, 
-                                  typename XprTraits::PointerType
-                                  >::type 
-                                  PointerType;
 };
 
 template<typename UnaryOp, typename XprType>
@@ -165,12 +161,7 @@ struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >
   typedef typename remove_reference<RhsNested>::type _RhsNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename TypeConversion<Scalar,
-                                  typename conditional<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
-                                                      typename traits<LhsXprType>::PointerType,
-                                                      typename traits<RhsXprType>::PointerType>::type
-                                  >::type 
-                                  PointerType;
+
   enum {
     Flags = 0
   };
@@ -247,12 +238,7 @@ struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprT
   typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename TypeConversion<Scalar,
-                                  typename conditional<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
-                                                      typename traits<Arg2XprType>::PointerType,
-                                                      typename traits<Arg3XprType>::PointerType>::type
-                                  >::type 
-                                  PointerType;
+
   enum {
     Flags = 0
   };
@@ -328,9 +314,6 @@ struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
   typedef typename ElseXprType::Nested ElseNested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename conditional<Pointer_type_promotion<typename ThenXprType::Scalar, Scalar>::val,
-                               typename traits<ThenXprType>::PointerType,
-                               typename traits<ElseXprType>::PointerType>::type PointerType;
 };
 
 template<typename IfXprType, typename ThenXprType, typename ElseXprType>
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index c62bc5fa9..08eb5595a 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -10,6 +10,10 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H
 #define EIGEN_CXX11_TENSOR_TENSOR_FFT_H
 
+// This code requires the ability to initialize arrays of constant
+// values directly inside a class.
+#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900
+
 namespace Eigen {
 
 /** \class TensorFFT
@@ -67,7 +71,6 @@ struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename traits<XprType>::PointerType PointerType;
 };
 
 template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
@@ -127,23 +130,16 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
   typedef OutputScalar CoeffReturnType;
   typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-    typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
     PacketAccess = true,
     BlockAccess = false,
-    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     for (int i = 0; i < NumDims; ++i) {
@@ -169,13 +165,13 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
     return m_dimensions;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
     if (data) {
       evalToBuf(data);
       return false;
     } else {
-      m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size)));
+      m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size);
       evalToBuf(m_data);
       return true;
     }
@@ -204,16 +200,11 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_data.bind(cgh);
-  }
-#endif
+  EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; }
+
 
  private:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) {
     const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
     ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);
 
@@ -239,32 +230,20 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
         //   t_n = exp(sqrt(-1) * pi * n^2 / line_len)
         // for n = 0, 1,..., line_len-1.
         // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2
-
-        // The recurrence is correct in exact arithmetic, but causes
-        // numerical issues for large transforms, especially in
-        // single-precision floating point.
-        //
-        // pos_j_base_powered[0] = ComplexScalar(1, 0);
-        // if (line_len > 1) {
-        //   const ComplexScalar pos_j_base = ComplexScalar(
-        //       numext::cos(M_PI / line_len), numext::sin(M_PI / line_len));
-        //   pos_j_base_powered[1] = pos_j_base;
-        //   if (line_len > 2) {
-        //     const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
-        //     for (int i = 2; i < line_len + 1; ++i) {
-        //       pos_j_base_powered[i] = pos_j_base_powered[i - 1] *
-        //           pos_j_base_powered[i - 1] /
-        //           pos_j_base_powered[i - 2] *
-        //           pos_j_base_sq;
-        //     }
-        //   }
-        // }
-        // TODO(rmlarsen): Find a way to use Eigen's vectorized sin
-        // and cosine functions here.
-        for (int j = 0; j < line_len + 1; ++j) {
-          double arg = ((EIGEN_PI * j) * j) / line_len;
-          std::complex<double> tmp(numext::cos(arg), numext::sin(arg));
-          pos_j_base_powered[j] = static_cast<ComplexScalar>(tmp);
+        pos_j_base_powered[0] = ComplexScalar(1, 0);
+        if (line_len > 1) {
+          const RealScalar pi_over_len(EIGEN_PI / line_len);
+          const ComplexScalar pos_j_base = ComplexScalar(
+	       std::cos(pi_over_len), std::sin(pi_over_len));
+          pos_j_base_powered[1] = pos_j_base;
+          if (line_len > 2) {
+            const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
+            for (int j = 2; j < line_len + 1; ++j) {
+              pos_j_base_powered[j] = pos_j_base_powered[j - 1] *
+                                      pos_j_base_powered[j - 1] /
+                                      pos_j_base_powered[j - 2] * pos_j_base_sq;
+            }
+          }
         }
       }
 
@@ -274,7 +253,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
         // get data into line_buf
         const Index stride = m_strides[dim];
         if (stride == 1) {
-          m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
+          memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           for (int j = 0; j < line_len; ++j, offset += stride) {
@@ -282,7 +261,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
           }
         }
 
-        // process the line
+        // process the line
         if (is_power_of_two) {
           processDataLineCooleyTukey(line_buf, line_len, log_len);
         }
@@ -292,7 +271,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
         // write back
         if (FFTDir == FFT_FORWARD && stride == 1) {
-          m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
+          memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar));
         } else {
           Index offset = base_offset;
           const ComplexScalar div_factor =  ComplexScalar(1.0 / line_len, 0);
@@ -583,12 +562,12 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
  protected:
   Index m_size;
-  const FFT EIGEN_DEVICE_REF m_fft;
+  const FFT& m_fft;
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
   TensorEvaluator<ArgType, Device> m_impl;
-  EvaluatorPointerType m_data;
-  const Device EIGEN_DEVICE_REF m_device;
+  CoeffReturnType* m_data;
+  const Device& m_device;
 
   // This will support a maximum FFT size of 2^32 for each dimension
   // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2;
@@ -666,4 +645,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D
 
 }  // end namespace Eigen
 
+#endif  // __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900
+
+
 #endif  // EIGEN_CXX11_TENSOR_TENSOR_FFT_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
index a5be54bcd..fcee5f60d 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -20,7 +20,7 @@ namespace Eigen {
   * The fixed sized equivalent of
   * Eigen::Tensor<float, 3> t(3, 5, 7);
   * is
-  * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t;
+  * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t;
   */
 
 template<typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
@@ -40,18 +40,11 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_,
 
     enum {
       IsAligned = bool(EIGEN_MAX_ALIGN_BYTES>0),
-      PacketAccess = (internal::packet_traits<Scalar>::size > 1),
-      BlockAccess = false,
-      PreferBlockAccess = false,
       Layout = Options_ & RowMajor ? RowMajor : ColMajor,
       CoordAccess = true,
       RawAccess = true
     };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   typedef Dimensions_ Dimensions;
   static const std::size_t NumIndices = Dimensions::count;
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 14020aa68..8bece4e65 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -12,16 +12,9 @@
 
 namespace Eigen {
 
-/** \class TensorForcedEval
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor reshaping class.
-  *
-  *
-  */
 namespace internal {
-template<typename XprType>
-struct traits<TensorForcedEvalOp<XprType> >
+template<typename XprType, template <class> class MakePointer_>
+struct traits<TensorForcedEvalOp<XprType, MakePointer_> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
   typedef typename XprType::Scalar Scalar;
@@ -32,31 +25,50 @@ struct traits<TensorForcedEvalOp<XprType> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 
   enum {
     Flags = 0
   };
+  template <class T> struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+  };
 };
 
-template<typename XprType>
-struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense>
+template<typename XprType, template <class> class MakePointer_>
+struct eval<TensorForcedEvalOp<XprType, MakePointer_>, Eigen::Dense>
 {
-  typedef const TensorForcedEvalOp<XprType>& type;
+  typedef const TensorForcedEvalOp<XprType, MakePointer_>& type;
 };
 
-template<typename XprType>
-struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType> >::type>
+template<typename XprType, template <class> class MakePointer_>
+struct nested<TensorForcedEvalOp<XprType, MakePointer_>, 1, typename eval<TensorForcedEvalOp<XprType, MakePointer_> >::type>
 {
-  typedef TensorForcedEvalOp<XprType> type;
+  typedef TensorForcedEvalOp<XprType, MakePointer_> type;
 };
 
 }  // end namespace internal
 
 
 
-template<typename XprType>
-class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors>
+// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_)
+
+/** \class TensorForcedEvalOp
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief Forces a tensor expression to be evaluated into a temporary buffer.
+  *
+  *
+  */
+/// The `template <class> class MakePointer_` parameter converts the host pointer type to the device pointer type.
+/// It exists because our device compiler does not allow `T*`.
+/// To reuse the same Evaluator functions, that type has to be converted to our own pointer type.
+/// This is done through the `MakePointer_` class; by default, `MakePointer_<T>::Type` is `T*`.
+/// Because the template parameter defaults to `MakePointer`, existing code is unaffected:
+/// the pointer type remains `T*` unless a different `MakePointer_` is supplied.
+template<typename XprType, template <class> class MakePointer_>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType, MakePointer_>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
@@ -77,116 +89,49 @@ class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOn
     typename XprType::Nested m_xpr;
 };
 
-namespace internal {
-template <typename Device, typename CoeffReturnType>
-struct non_integral_type_placement_new{
-  template <typename StorageType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) {
-   // Initialize non-trivially constructible types.
-    if (!internal::is_arithmetic<CoeffReturnType>::value) {
-      for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType();
-    }
-}
-};
 
-// SYCL does not support non-integral types 
-// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices 
-// no matching function for call to 'operator new'
-template <typename CoeffReturnType>
-struct non_integral_type_placement_new<Eigen::SyclDevice, CoeffReturnType> {
-  template <typename StorageType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) {
-}
-};
-} // end namespace internal
-
-template<typename ArgType_, typename Device>
-struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
+template<typename ArgType, typename Device, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType, MakePointer_>, Device>
 {
-  typedef const typename internal::remove_all<ArgType_>::type ArgType;
-  typedef TensorForcedEvalOp<ArgType> XprType;
+  typedef TensorForcedEvalOp<ArgType, MakePointer_> XprType;
   typedef typename ArgType::Scalar Scalar;
   typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
   typedef typename XprType::Index Index;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned         = true,
-    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
-    BlockAccess       = internal::is_arithmetic<CoeffReturnType>::value,
-    PreferBlockAccess = false,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess         = true
+    IsAligned = true,
+    PacketAccess = (PacketSize > 1),
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = true
   };
 
-  static const int NumDims = internal::traits<ArgType>::NumDimensions;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
-                                                     Layout, Index>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_op(op.expression()),
-      m_device(device), m_buffer(NULL)
+	/// m_op is used for sycl
+      : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL)
   { }
 
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
-  #if !defined(EIGEN_HIPCC)
-  EIGEN_DEVICE_FUNC
-  #endif
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
     const Index numValues =  internal::array_prod(m_impl.dimensions());
-    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType)));
-
-   internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer);
-
+    m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
+    // Should initialize the memory in case we're dealing with non POD types.
+    if (NumTraits<CoeffReturnType>::RequireInitialization) {
+      for (Index i = 0; i < numValues; ++i) {
+        new(m_buffer+i) CoeffReturnType();
+      }
+    }
     typedef TensorEvalToOp< const typename internal::remove_const<ArgType>::type > EvalTo;
-    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
-
-    internal::TensorExecutor<
-        const EvalTo, typename internal::remove_const<Device>::type,
-        /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
-        /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
-        run(evalToTmp, m_device);
-
+    EvalTo evalToTmp(m_buffer, m_op);
+    const bool PacketAccess = internal::IsVectorizable<Device, const ArgType>::value;
+    internal::TensorExecutor<const EvalTo, typename internal::remove_const<Device>::type, PacketAccess>::run(evalToTmp, m_device);
     return true;
   }
-
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType, EvalSubExprsCallback done) {
-    const Index numValues = internal::array_prod(m_impl.dimensions());
-    m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(
-        numValues * sizeof(CoeffReturnType)));
-    typedef TensorEvalToOp<const typename internal::remove_const<ArgType>::type>
-        EvalTo;
-    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
-
-    auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); },
-                             std::move(done));
-    internal::TensorAsyncExecutor<
-        const EvalTo, typename internal::remove_const<Device>::type,
-        decltype(on_done),
-        /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
-        /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::
-        runAsync(evalToTmp, m_device, std::move(on_done));
-  }
-#endif
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    m_device.deallocate_temp(m_buffer);
+    m_device.deallocate(m_buffer);
     m_buffer = NULL;
   }
 
@@ -201,37 +146,21 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device>
     return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    return internal::TensorBlockResourceRequirements::any();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    assert(m_buffer != NULL);
-    return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  EvaluatorPointerType data() const { return m_buffer; }
+  EIGEN_DEVICE_FUNC typename MakePointer<Scalar>::Type data() const { return m_buffer; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_buffer.bind(cgh);
-    m_impl.bind(cgh);
-  }
-#endif
+  /// required by sycl in order to extract the sycl accessor
+  const TensorEvaluator<ArgType, Device>& impl() { return m_impl; }
+  /// used by sycl in order to build the sycl buffer
+  const Device& device() const{return m_device;}
  private:
   TensorEvaluator<ArgType, Device> m_impl;
   const ArgType m_op;
-  const Device EIGEN_DEVICE_REF m_device;
-  EvaluatorPointerType m_buffer;
+  const Device& m_device;
+  typename MakePointer<CoeffReturnType>::Type m_buffer;
 };
 
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index 246ebe44e..52b803d7f 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -12,7 +12,7 @@
 
 namespace Eigen {
 
-// MakePointer class is used as a container of the address space of the pointer
+// MakePointer class is used as a container of the address space of the pointer
 // on the host and on the device. From the host side it generates the T* pointer
 // and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to
 // T* m_data on the host. It is always called on the device.
@@ -20,35 +20,8 @@ namespace Eigen {
 // map_allocator.
 template<typename T> struct MakePointer {
   typedef T* Type;
-  typedef const T* ConstType;
 };
 
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) {
-  return const_cast<T*>(data);
-}
-
-// The StorageMemory class is a container of the device specific pointer
-// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression
-// is a device-agnostic type and need MakePointer class for type conversion,
-// the TensorEvaluator class can be specialized for a device, hence it is possible
-// to construct different types of temproray storage memory in TensorEvaluator
-// for different devices by specializing the following StorageMemory class.
-template<typename T, typename device> struct StorageMemory: MakePointer <T> {};
-
-namespace internal{
-template<typename A, typename B> struct Pointer_type_promotion {
-  static const bool val=false;
-};
-template<typename A> struct Pointer_type_promotion<A, A> {
-  static const bool val = true;
-};
-template<typename A, typename B> struct TypeConversion {
-  typedef A* type;
-};
-}
-
-
 template<typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer> class TensorMap;
 template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
 template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
@@ -64,7 +37,7 @@ template<typename Op, typename Dims, typename XprType, template <class> class Ma
 template<typename XprType> class TensorIndexTupleOp;
 template<typename ReduceOp, typename Dims, typename XprType> class TensorTupleReducerOp;
 template<typename Axis, typename LeftXprType, typename RightXprType> class TensorConcatenationOp;
-template<typename Dimensions, typename LeftXprType, typename RightXprType, typename OutputKernelType> class TensorContractionOp;
+template<typename Dimensions, typename LeftXprType, typename RightXprType> class TensorContractionOp;
 template<typename TargetType, typename XprType> class TensorConversionOp;
 template<typename Dimensions, typename InputXprType, typename KernelXprType> class TensorConvolutionOp;
 template<typename FFT, typename XprType, int FFTDataType, int FFTDirection> class TensorFFTOp;
@@ -85,50 +58,21 @@ template<typename Strides, typename XprType> class TensorInflationOp;
 template<typename Generator, typename XprType> class TensorGeneratorOp;
 template<typename LeftXprType, typename RightXprType> class TensorAssignOp;
 template<typename Op, typename XprType> class TensorScanOp;
-template<typename Dims, typename XprType> class TensorTraceOp;
 
 template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp;
 template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp;
 
 template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorEvalToOp;
-template<typename XprType> class TensorForcedEvalOp;
+template<typename XprType, template <class> class MakePointer_ = MakePointer> class TensorForcedEvalOp;
 
 template<typename ExpressionType, typename DeviceType> class TensorDevice;
-template<typename ExpressionType, typename DeviceType, typename DoneCallback> class TensorAsyncDevice;
 template<typename Derived, typename Device> struct TensorEvaluator;
 
-struct NoOpOutputKernel;
-
 struct DefaultDevice;
 struct ThreadPoolDevice;
 struct GpuDevice;
 struct SyclDevice;
 
-#ifdef EIGEN_USE_SYCL
-
-template <typename T> struct MakeSYCLPointer {
-  typedef Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T> Type;
-};
-
-template <typename T>
-EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>&
-constCast(const Eigen::TensorSycl::internal::RangeAccess<cl::sycl::access::mode::read_write, T>& data) {
-  return data;
-}
-
-template <typename T>
-struct StorageMemory<T, SyclDevice> : MakeSYCLPointer<T> {};
-template <typename T>
-struct StorageMemory<T, const SyclDevice> : StorageMemory<T, SyclDevice> {};
-
-namespace TensorSycl {
-namespace internal{
-template <typename Evaluator, typename Op> class GenericNondeterministicReducer;
-}
-}
-#endif
-
-
 enum FFTResultType {
   RealPart = 0,
   ImagPart = 1,
@@ -154,36 +98,10 @@ struct IsVectorizable<GpuDevice, Expression> {
                             TensorEvaluator<Expression, GpuDevice>::IsAligned;
 };
 
-// Tiled evaluation strategy.
-enum TiledEvaluation {
-  Off = 0,    // tiled evaluation is not supported
-  On = 1,     // still work in progress (see TensorBlock.h)
-};
-
-template <typename Device, typename Expression>
-struct IsTileable {
-  // Check that block evaluation is supported and it's a preferred option (at
-  // least one sub-expression has much faster block evaluation, e.g.
-  // broadcasting).
-  static const bool BlockAccess =
-      TensorEvaluator<Expression, Device>::BlockAccess &&
-      TensorEvaluator<Expression, Device>::PreferBlockAccess;
-
-  static const TiledEvaluation value =
-      BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off;
-};
-
 template <typename Expression, typename Device,
-          bool Vectorizable      = IsVectorizable<Device, Expression>::value,
-          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
+          bool Vectorizable = IsVectorizable<Device, Expression>::value>
 class TensorExecutor;
 
-template <typename Expression, typename Device, typename DoneCallback,
-          bool Vectorizable = IsVectorizable<Device, Expression>::value,
-          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
-class TensorAsyncExecutor;
-
-
 }  // end namespace internal
 
 }  // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 2edc45f1a..d73f6dc68 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -20,7 +20,7 @@ namespace internal {
 template <typename Scalar>
 struct scalar_mod_op {
   EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; }
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; }
   const Scalar m_divisor;
 };
 template <typename Scalar>
@@ -33,8 +33,8 @@ struct functor_traits<scalar_mod_op<Scalar> >
  */
 template <typename Scalar>
 struct scalar_mod2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op);
+  EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; }
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod2_op<Scalar> >
@@ -42,7 +42,7 @@ struct functor_traits<scalar_mod2_op<Scalar> >
 
 template <typename Scalar>
 struct scalar_fmod_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op)
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op);
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar
   operator()(const Scalar& a, const Scalar& b) const {
     return numext::fmod(a, b);
@@ -54,19 +54,50 @@ struct functor_traits<scalar_fmod_op<Scalar> > {
          PacketAccess = false };
 };
 
+
+/** \internal
+  * \brief Template functor to compute the sigmoid of a scalar
+  * \sa class CwiseUnaryOp, ArrayBase::sigmoid()
+  */
+template <typename T>
+struct scalar_sigmoid_op {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+    const T one = T(1);
+    return one / (one + numext::exp(-x));
+  }
+
+  template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+  Packet packetOp(const Packet& x) const {
+    const Packet one = pset1<Packet>(T(1));
+    return pdiv(one, padd(one, pexp(pnegate(x))));
+  }
+};
+
+template <typename T>
+struct functor_traits<scalar_sigmoid_op<T> > {
+  enum {
+    Cost = NumTraits<T>::AddCost * 2 + NumTraits<T>::MulCost * 6,
+    PacketAccess = packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
+                   packet_traits<T>::HasNegate && packet_traits<T>::HasExp
+  };
+};
+
+
 template<typename Reducer, typename Device>
 struct reducer_traits {
   enum {
     Cost = 1,
-    PacketAccess = false,
-    IsStateful = false,
-    IsExactlyAssociative = true
+    PacketAccess = false
   };
 };
 
 // Standard reduction functors
 template <typename T> struct SumReducer
 {
+  static const bool PacketAccess = packet_traits<T>::HasAdd;
+  static const bool IsStateful = false;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     internal::scalar_sum_op<T> sum_op;
     *accum = sum_op(*accum, t);
@@ -102,14 +133,16 @@ template <typename T, typename Device>
 struct reducer_traits<SumReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasAdd,
-    IsStateful = false,
-    IsExactlyAssociative = NumTraits<T>::IsInteger
+    PacketAccess = PacketType<T, Device>::HasAdd
   };
 };
 
+
 template <typename T> struct MeanReducer
 {
+  static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
+  static const bool IsStateful = true;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   MeanReducer() : scalarCount_(0), packetCount_(0) { }
 
@@ -133,20 +166,16 @@ template <typename T> struct MeanReducer
     return pset1<Packet>(initialize());
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
-    internal::scalar_quotient_op<T> quotient_op;
-    return quotient_op(accum, T(scalarCount_));
+    return accum / scalarCount_;
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
-    return pdiv(vaccum, pset1<Packet>(T(packetCount_)));
+    return pdiv(vaccum, pset1<Packet>(packetCount_));
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
     internal::scalar_sum_op<T> sum_op;
-    internal::scalar_quotient_op<T> quotient_op;
-    return quotient_op(
-        sum_op(saccum, predux(vaccum)),
-        T(scalarCount_ + packetCount_ * unpacket_traits<Packet>::size));
+    return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits<Packet>::size);
   }
 
   protected:
@@ -158,10 +187,7 @@ template <typename T, typename Device>
 struct reducer_traits<MeanReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasAdd &&
-                   PacketType<T, Device>::HasDiv && !NumTraits<T>::IsInteger,
-    IsStateful = true,
-    IsExactlyAssociative = NumTraits<T>::IsInteger
+    PacketAccess = PacketType<T, Device>::HasAdd
   };
 };
 
@@ -194,6 +220,9 @@ struct MinMaxBottomValue<T, false, false> {
 
 template <typename T> struct MaxReducer
 {
+  static const bool PacketAccess = packet_traits<T>::HasMax;
+  static const bool IsStateful = false;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     if (t > *accum) { *accum = t; }
   }
@@ -225,15 +254,16 @@ template <typename T, typename Device>
 struct reducer_traits<MaxReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasMax,
-    IsStateful = false,
-    IsExactlyAssociative = true
+    PacketAccess = PacketType<T, Device>::HasMax
   };
 };
 
 
 template <typename T> struct MinReducer
 {
+  static const bool PacketAccess = packet_traits<T>::HasMin;
+  static const bool IsStateful = false;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     if (t < *accum) { *accum = t; }
   }
@@ -265,15 +295,16 @@ template <typename T, typename Device>
 struct reducer_traits<MinReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = PacketType<T, Device>::HasMin,
-    IsStateful = false,
-    IsExactlyAssociative = true
+    PacketAccess = PacketType<T, Device>::HasMin
   };
 };
 
 
 template <typename T> struct ProdReducer
 {
+  static const bool PacketAccess = packet_traits<T>::HasMul;
+  static const bool IsStateful = false;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     internal::scalar_product_op<T> prod_op;
     (*accum) = prod_op(*accum, t);
@@ -309,15 +340,16 @@ template <typename T, typename Device>
 struct reducer_traits<ProdReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::MulCost,
-    PacketAccess = PacketType<T, Device>::HasMul,
-    IsStateful = false,
-    IsExactlyAssociative = true
+    PacketAccess = PacketType<T, Device>::HasMul
   };
 };
 
 
 struct AndReducer
 {
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
     *accum = *accum && t;
   }
@@ -333,14 +365,15 @@ template <typename Device>
 struct reducer_traits<AndReducer, Device> {
   enum {
     Cost = 1,
-    PacketAccess = false,
-    IsStateful = false,
-    IsExactlyAssociative = true
+    PacketAccess = false
   };
 };
 
 
 struct OrReducer {
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const {
     *accum = *accum || t;
   }
@@ -356,9 +389,7 @@ template <typename Device>
 struct reducer_traits<OrReducer, Device> {
   enum {
     Cost = 1,
-    PacketAccess = false,
-    IsStateful = false,
-    IsExactlyAssociative = true
+    PacketAccess = false
   };
 };
 
@@ -366,6 +397,9 @@ struct reducer_traits<OrReducer, Device> {
 // Argmin/Argmax reducers
 template <typename T> struct ArgMaxTupleReducer
 {
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
     if (t.second > accum->second) { *accum = t; }
   }
@@ -381,15 +415,16 @@ template <typename T, typename Device>
 struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = false,
-    IsStateful = false,
-    IsExactlyAssociative = true
+    PacketAccess = false
   };
 };
 
 
 template <typename T> struct ArgMinTupleReducer
 {
+  static const bool PacketAccess = false;
+  static const bool IsStateful = false;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const {
     if (t.second < accum->second) { *accum = t; }
   }
@@ -405,9 +440,7 @@ template <typename T, typename Device>
 struct reducer_traits<ArgMinTupleReducer<T>, Device> {
   enum {
     Cost = NumTraits<T>::AddCost,
-    PacketAccess = false,
-    IsStateful = false,
-    IsExactlyAssociative = true
+    PacketAccess = false
   };
 };
 
@@ -421,7 +454,6 @@ class GaussianGenerator {
                                       const array<T, NumDims>& std_devs)
       : m_means(means)
   {
-    EIGEN_UNROLL_LOOP
     for (size_t i = 0; i < NumDims; ++i) {
       m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
     }
@@ -429,7 +461,6 @@ class GaussianGenerator {
 
   EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
     T tmp = T(0);
-    EIGEN_UNROLL_LOOP
     for (size_t i = 0; i < NumDims; ++i) {
       T offset = coordinates[i] - m_means[i];
       tmp += offset * offset / m_two_sigmas[i];
@@ -452,25 +483,6 @@ struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
   };
 };
 
-template <typename Scalar>
-struct scalar_clamp_op {
-  EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {}
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
-  operator()(const Scalar& x) const {
-    return numext::mini(numext::maxi(x, m_min), m_max);
-  }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
-  packetOp(const Packet& x) const {
-    return internal::pmin(internal::pmax(x, pset1<Packet>(m_min)), pset1<Packet>(m_max));
-  }
-  const Scalar m_min;
-  const Scalar m_max;
-};
-template<typename Scalar>
-struct functor_traits<scalar_clamp_op<Scalar> >
-{ enum { Cost = 2 * NumTraits<Scalar>::AddCost, PacketAccess = (packet_traits<Scalar>::HasMin && packet_traits<Scalar>::HasMax)}; };
-
 } // end namespace internal
 } // end namespace Eigen
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
index b1ff1d8b1..e27753b19 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -31,7 +31,6 @@ struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Generator, typename XprType>
@@ -88,55 +87,37 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
   enum {
-    IsAligned         = false,
-    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
-    BlockAccess       = true,
-    PreferBlockAccess = true,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = false
+    IsAligned = false,
+    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
   };
 
-  typedef internal::TensorIntDivisor<Index> IndexDivisor;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
-                                                     Layout, Index>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      :  m_device(device), m_generator(op.generator())
+      : m_generator(op.generator())
   {
-    TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
-    m_dimensions = argImpl.dimensions();
+    TensorEvaluator<ArgType, Device> impl(op.expression(), device);
+    m_dimensions = impl.dimensions();
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       m_strides[0] = 1;
-      EIGEN_UNROLL_LOOP
       for (int i = 1; i < NumDims; ++i) {
         m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
-        if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
       }
     } else {
       m_strides[NumDims - 1] = 1;
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i >= 0; --i) {
         m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
-        if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
       }
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     return true;
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
@@ -152,7 +133,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < dimensions().TotalSize());
 
@@ -164,97 +145,6 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
     return rslt;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    const size_t target_size = m_device.firstLevelCacheSize();
-    // TODO(ezhulenev): Generator should have a cost.
-    return internal::TensorBlockResourceRequirements::skewed<Scalar>(
-        target_size);
-  }
-
-  struct BlockIteratorState {
-    Index stride;
-    Index span;
-    Index size;
-    Index count;
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    static const bool is_col_major =
-        static_cast<int>(Layout) == static_cast<int>(ColMajor);
-
-    // Compute spatial coordinates for the first block element.
-    array<Index, NumDims> coords;
-    extract_coordinates(desc.offset(), coords);
-    array<Index, NumDims> initial_coords = coords;
-
-    // Offset in the output block buffer.
-    Index offset = 0;
-
-    // Initialize output block iterator state. Dimension in this array are
-    // always in inner_most -> outer_most order (col major layout).
-    array<BlockIteratorState, NumDims> it;
-    for (int i = 0; i < NumDims; ++i) {
-      const int dim = is_col_major ? i : NumDims - 1 - i;
-      it[i].size = desc.dimension(dim);
-      it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride);
-      it[i].span = it[i].stride * (it[i].size - 1);
-      it[i].count = 0;
-    }
-    eigen_assert(it[0].stride == 1);
-
-    // Prepare storage for the materialized generator result.
-    const typename TensorBlock::Storage block_storage =
-        TensorBlock::prepareStorage(desc, scratch);
-
-    CoeffReturnType* block_buffer = block_storage.data();
-
-    static const int packet_size = PacketType<CoeffReturnType, Device>::size;
-
-    static const int inner_dim = is_col_major ? 0 : NumDims - 1;
-    const Index inner_dim_size = it[0].size;
-    const Index inner_dim_vectorized = inner_dim_size - packet_size;
-
-    while (it[NumDims - 1].count < it[NumDims - 1].size) {
-      Index i = 0;
-      // Generate data for the vectorized part of the inner-most dimension.
-      for (; i <= inner_dim_vectorized; i += packet_size) {
-        for (Index j = 0; j < packet_size; ++j) {
-          array<Index, NumDims> j_coords = coords;  // Break loop dependence.
-          j_coords[inner_dim] += j;
-          *(block_buffer + offset + i + j) = m_generator(j_coords);
-        }
-        coords[inner_dim] += packet_size;
-      }
-      // Finalize non-vectorized part of the inner-most dimension.
-      for (; i < inner_dim_size; ++i) {
-        *(block_buffer + offset + i) = m_generator(coords);
-        coords[inner_dim]++;
-      }
-      coords[inner_dim] = initial_coords[inner_dim];
-
-      // For the 1d tensor we need to generate only one inner-most dimension.
-      if (NumDims == 1) break;
-
-      // Update offset.
-      for (i = 1; i < NumDims; ++i) {
-        if (++it[i].count < it[i].size) {
-          offset += it[i].stride;
-          coords[is_col_major ? i : NumDims - 1 - i]++;
-          break;
-        }
-        if (i != NumDims - 1) it[i].count = 0;
-        coords[is_col_major ? i : NumDims - 1 - i] =
-            initial_coords[is_col_major ? i : NumDims - 1 - i];
-        offset -= it[i].span;
-      }
-    }
-
-    return block_storage.AsTensorMaterializedBlock();
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
   costPerCoeff(bool) const {
     // TODO(rmlarsen): This is just a placeholder. Define interface to make
@@ -263,26 +153,21 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
                                   TensorOpCost::MulCost<Scalar>());
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType  data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {}
-#endif
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = index / m_fast_strides[i];
+        const Index idx = index / m_strides[i];
         index -= idx * m_strides[i];
         coords[i] = idx;
       }
       coords[0] = index;
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = index / m_fast_strides[i];
+        const Index idx = index / m_strides[i];
         index -= idx * m_strides[i];
         coords[i] = idx;
       }
@@ -290,10 +175,8 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device>
     }
   }
 
-  const Device EIGEN_DEVICE_REF m_device;
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
-  array<IndexDivisor, NumDims> m_fast_strides;
   Generator m_generator;
 };
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
deleted file mode 100644
index f32ce27e9..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
+++ /dev/null
@@ -1,93 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
-#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
-
-// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
-// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU,  but
-// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
-// When compiling such files, gcc will end up trying to pick up the CUDA headers by 
-// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
-// This will obsviously not work when trying to compile tensorflow on a system with no CUDA
-// To work around this issue for HIP systems (and leave the default behaviour intact), the
-// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and 
-// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
-// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well
-
-#if defined(EIGEN_USE_HIP)
-
-#define gpuStream_t hipStream_t
-#define gpuDeviceProp_t hipDeviceProp_t
-#define gpuError_t hipError_t
-#define gpuSuccess hipSuccess
-#define gpuErrorNotReady hipErrorNotReady
-#define gpuGetDeviceCount hipGetDeviceCount
-#define gpuGetErrorString hipGetErrorString
-#define gpuGetDeviceProperties hipGetDeviceProperties
-#define gpuStreamDefault hipStreamDefault
-#define gpuGetDevice hipGetDevice
-#define gpuSetDevice hipSetDevice
-#define gpuMalloc hipMalloc
-#define gpuFree hipFree
-#define gpuMemsetAsync hipMemsetAsync
-#define gpuMemcpyAsync hipMemcpyAsync
-#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
-#define gpuStreamQuery hipStreamQuery
-#define gpuSharedMemConfig hipSharedMemConfig
-#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
-#define gpuStreamSynchronize hipStreamSynchronize
-#define gpuDeviceSynchronize hipDeviceSynchronize
-#define gpuMemcpy hipMemcpy
-
-#else
-
-#define gpuStream_t cudaStream_t
-#define gpuDeviceProp_t cudaDeviceProp
-#define gpuError_t cudaError_t
-#define gpuSuccess cudaSuccess
-#define gpuErrorNotReady cudaErrorNotReady
-#define gpuGetDeviceCount cudaGetDeviceCount
-#define gpuGetErrorString cudaGetErrorString
-#define gpuGetDeviceProperties cudaGetDeviceProperties
-#define gpuStreamDefault cudaStreamDefault
-#define gpuGetDevice cudaGetDevice
-#define gpuSetDevice cudaSetDevice
-#define gpuMalloc cudaMalloc
-#define gpuFree cudaFree
-#define gpuMemsetAsync cudaMemsetAsync
-#define gpuMemcpyAsync cudaMemcpyAsync
-#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
-#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
-#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
-#define gpuStreamQuery cudaStreamQuery
-#define gpuSharedMemConfig cudaSharedMemConfig
-#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
-#define gpuStreamSynchronize cudaStreamSynchronize
-#define gpuDeviceSynchronize cudaDeviceSynchronize
-#define gpuMemcpy cudaMemcpy
-
-#endif
-
-// gpu_assert can be overridden
-#ifndef gpu_assert
-
-#if defined(EIGEN_HIP_DEVICE_COMPILE)
-// HIPCC do not support the use of assert on the GPU side.
-#define gpu_assert(COND)
-#else
-#define gpu_assert(COND) assert(COND)
-#endif
-
-#endif // gpu_assert
-
-#endif  // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
deleted file mode 100644
index db394bcbb..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h
+++ /dev/null
@@ -1,40 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H)
-
-#undef gpuStream_t
-#undef gpuDeviceProp_t 
-#undef gpuError_t
-#undef gpuSuccess
-#undef gpuErrorNotReady
-#undef gpuGetDeviceCount
-#undef gpuGetErrorString
-#undef gpuGetDeviceProperties
-#undef gpuStreamDefault
-#undef gpuGetDevice
-#undef gpuSetDevice
-#undef gpuMalloc
-#undef gpuFree
-#undef gpuMemsetAsync
-#undef gpuMemcpyAsync
-#undef gpuMemcpyDeviceToDevice
-#undef gpuMemcpyDeviceToHost
-#undef gpuMemcpyHostToDevice
-#undef gpuStreamQuery
-#undef gpuSharedMemConfig
-#undef gpuDeviceSetSharedMemConfig
-#undef gpuStreamSynchronize
-#undef gpuDeviceSynchronize
-#undef gpuMemcpy
-
-#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
-
-#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
index 49d1004f3..566856ed2 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -27,7 +27,6 @@ namespace Eigen {
   * patch_cols, and 1 for all the additional dimensions.
   */
 namespace internal {
-
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>
 struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
 {
@@ -39,7 +38,6 @@ struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -54,66 +52,6 @@ struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1, typename eval<TensorIm
   typedef TensorImagePatchOp<Rows, Cols, XprType> type;
 };
 
-template <typename Self, bool Vectorizable>
-struct ImagePatchCopyOp {
-  typedef typename Self::Index Index;
-  typedef typename Self::Scalar Scalar;
-  typedef typename Self::Impl Impl;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Self& self, const Index num_coeff_to_copy, const Index dst_index,
-      Scalar* dst_data, const Index src_index) {
-    const Impl& impl = self.impl();
-    for (Index i = 0; i < num_coeff_to_copy; ++i) {
-      dst_data[dst_index + i] = impl.coeff(src_index + i);
-    }
-  }
-};
-
-template <typename Self>
-struct ImagePatchCopyOp<Self, true> {
-  typedef typename Self::Index Index;
-  typedef typename Self::Scalar Scalar;
-  typedef typename Self::Impl Impl;
-  typedef typename packet_traits<Scalar>::type Packet;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Self& self, const Index num_coeff_to_copy, const Index dst_index,
-      Scalar* dst_data, const Index src_index) {
-    const Impl& impl = self.impl();
-    const Index packet_size = internal::unpacket_traits<Packet>::size;
-    const Index vectorized_size =
-        (num_coeff_to_copy / packet_size) * packet_size;
-    for (Index i = 0; i < vectorized_size; i += packet_size) {
-      Packet p = impl.template packet<Unaligned>(src_index + i);
-      internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p);
-    }
-    for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
-      dst_data[dst_index + i] = impl.coeff(src_index + i);
-    }
-  }
-};
-
-template <typename Self>
-struct ImagePatchPaddingOp {
-  typedef typename Self::Index Index;
-  typedef typename Self::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type Packet;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
-      const Index num_coeff_to_pad, const Scalar padding_value,
-      const Index dst_index, Scalar* dst_data) {
-    const Index packet_size = internal::unpacket_traits<Packet>::size;
-    const Packet padded_packet = internal::pset1<Packet>(padding_value);
-    const Index vectorized_size =
-        (num_coeff_to_pad / packet_size) * packet_size;
-    for (Index i = 0; i < vectorized_size; i += packet_size) {
-      internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i,
-                                                   padded_packet);
-    }
-    for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) {
-      dst_data[dst_index + i] = padding_value;
-    }
-  }
-};
-
 }  // end namespace internal
 
 template<DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -132,12 +70,12 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT
                                                            DenseIndex in_row_strides, DenseIndex in_col_strides,
                                                            DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
                                                            PaddingType padding_type, Scalar padding_value)
-                                                           : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-                                                           m_row_strides(row_strides), m_col_strides(col_strides),
-                                                           m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-                                                           m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-                                                           m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
-                                                           m_padding_type(padding_type), m_padding_value(padding_value) {}
+      : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+        m_row_strides(row_strides), m_col_strides(col_strides),
+        m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+        m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(false), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+        m_padding_type(padding_type), m_padding_value(padding_value) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows, DenseIndex patch_cols,
                                                            DenseIndex row_strides, DenseIndex col_strides,
@@ -146,14 +84,13 @@ class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprT
                                                            DenseIndex padding_top, DenseIndex padding_bottom,
                                                            DenseIndex padding_left, DenseIndex padding_right,
                                                            Scalar padding_value)
-                                                           : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-                                                           m_row_strides(row_strides), m_col_strides(col_strides),
-                                                           m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-                                                           m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-                                                           m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
-                                                           m_padding_left(padding_left), m_padding_right(padding_right),
-                                                           m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
-
+      : m_xpr(expr), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+        m_row_strides(row_strides), m_col_strides(col_strides),
+        m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+        m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(true), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+        m_padding_left(padding_left), m_padding_right(padding_right),
+        m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
 
     EIGEN_DEVICE_FUNC
     DenseIndex patch_rows() const { return m_patch_rows; }
@@ -224,26 +161,18 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   typedef TensorEvaluator<ArgType, Device> Impl;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned         = false,
-    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess       = false,
-    PreferBlockAccess = true,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess       = false,
-    RawAccess         = false
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device)
-      : m_device(device), m_impl(op.expression(), device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device)
   {
     EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -309,15 +238,9 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
           // Calculate the padding
           m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2;
           m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2;
-          // The padding size calculation for PADDING_SAME has been updated to
-          // be consistent with how TensorFlow extracts its paddings.
-          m_rowPaddingTop = numext::maxi<Index>(0, m_rowPaddingTop);
-          m_colPaddingLeft = numext::maxi<Index>(0, m_colPaddingLeft);
           break;
         default:
           eigen_assert(false && "unexpected padding");
-          m_outputCols=0; // silence the uninitialised warning;
-          m_outputRows=0; //// silence the uninitialised warning;
       }
     }
     eigen_assert(m_outputRows > 0);
@@ -389,19 +312,11 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
 
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType, EvalSubExprsCallback done) {
-    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
-  }
-#endif  // EIGEN_USE_THREADS
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
@@ -503,16 +418,9 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
     return packetWithPossibleZero(index);
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
   Index rowPaddingTop() const { return m_rowPaddingTop; }
   Index colPaddingLeft() const { return m_colPaddingLeft; }
@@ -541,7 +449,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -593,7 +500,6 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device>
 
   Scalar m_paddingValue;
 
-  const Device EIGEN_DEVICE_REF m_device;
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
index 0e9133c49..3209fecd3 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -37,36 +37,36 @@ namespace Eigen {
   * \sa Tensor
   */
 
-template <Index n>
+template <DenseIndex n>
 struct type2index {
-  static const Index value = n;
-  EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; }
-  EIGEN_DEVICE_FUNC void set(Index val) {
+  static const DenseIndex value = n;
+  EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; }
+  EIGEN_DEVICE_FUNC void set(DenseIndex val) {
     eigen_assert(val == n);
   }
 };
 
 // This can be used with IndexPairList to get compile-time constant pairs,
 // such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
-template <Index f, Index s>
+template <DenseIndex f, DenseIndex s>
 struct type2indexpair {
-  static const Index first = f;
-  static const Index second = s;
+  static const DenseIndex first = f;
+  static const DenseIndex second = s;
 
-  constexpr EIGEN_DEVICE_FUNC operator IndexPair<Index>() const {
-    return IndexPair<Index>(f, s);
+  constexpr EIGEN_DEVICE_FUNC operator IndexPair<DenseIndex>() const {
+    return IndexPair<DenseIndex>(f, s);
   }
 
-  EIGEN_DEVICE_FUNC void set(const IndexPair<Index>& val) {
+  EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) {
     eigen_assert(val.first == f);
     eigen_assert(val.second == s);
   }
 };
 
 
-template<Index n> struct NumTraits<type2index<n> >
+template<DenseIndex n> struct NumTraits<type2index<n> >
 {
-  typedef Index Real;
+  typedef DenseIndex Real;
   enum {
     IsComplex = 0,
     RequireInitialization = false,
@@ -75,28 +75,28 @@ template<Index n> struct NumTraits<type2index<n> >
     MulCost = 1
   };
 
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real epsilon() { return 0; }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real highest() { return n; }
-  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real lowest() { return n; }
+  EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; }
+  EIGEN_DEVICE_FUNC static inline Real highest() { return n; }
+  EIGEN_DEVICE_FUNC static inline Real lowest() { return n; }
 };
 
 namespace internal {
 template <typename T>
-EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) {
-  val = internal::convert_index<T>(new_val);
+EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) {
+  val = new_val;
 }
-template <Index n>
-EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, Index new_val) {
+template <DenseIndex n>
+EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, DenseIndex new_val) {
   val.set(new_val);
 }
 
 template <typename T>
-EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<Index> new_val) {
+EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<DenseIndex> new_val) {
   val = new_val;
 }
-template <Index f, Index s>
-EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<Index> new_val) {
+template <DenseIndex f, DenseIndex s>
+EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<DenseIndex> new_val) {
   val.set(new_val);
 }
 
@@ -106,36 +106,36 @@ struct is_compile_time_constant {
   static constexpr bool value = false;
 };
 
-template <Index idx>
+template <DenseIndex idx>
 struct is_compile_time_constant<type2index<idx> > {
   static constexpr bool value = true;
 };
-template <Index idx>
+template <DenseIndex idx>
 struct is_compile_time_constant<const type2index<idx> > {
   static constexpr bool value = true;
 };
-template <Index idx>
+template <DenseIndex idx>
 struct is_compile_time_constant<type2index<idx>& > {
   static constexpr bool value = true;
 };
-template <Index idx>
+template <DenseIndex idx>
 struct is_compile_time_constant<const type2index<idx>& > {
   static constexpr bool value = true;
 };
 
-template <Index f, Index s>
+template <DenseIndex f, DenseIndex s>
 struct is_compile_time_constant<type2indexpair<f, s> > {
   static constexpr bool value = true;
 };
-template <Index f, Index s>
+template <DenseIndex f, DenseIndex s>
 struct is_compile_time_constant<const type2indexpair<f, s> > {
   static constexpr bool value = true;
 };
-template <Index f, Index s>
+template <DenseIndex f, DenseIndex s>
 struct is_compile_time_constant<type2indexpair<f, s>& > {
   static constexpr bool value = true;
 };
-template <Index f, Index s>
+template <DenseIndex f, DenseIndex s>
 struct is_compile_time_constant<const type2indexpair<f, s>& > {
   static constexpr bool value = true;
 };
@@ -228,15 +228,15 @@ template <typename T, typename... O>
 
 
 
-template <Index Idx, typename ValueT>
+template <DenseIndex Idx, typename ValueT>
 struct tuple_coeff {
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple<T...>& t) {
     //    return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
     return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t));
   }
   template <typename... T>
-  EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT& value) {
+  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT& value) {
     if (i == Idx) {
       update_value(array_get<Idx>(t), value);
     } else {
@@ -245,7 +245,7 @@ struct tuple_coeff {
   }
 
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>& t) {
     return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
         tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t);
   }
@@ -268,18 +268,18 @@ struct tuple_coeff {
 template <typename ValueT>
 struct tuple_coeff<0, ValueT> {
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple<T...>& t) {
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple<T...>& t) {
     //  eigen_assert (i == 0);  // gcc fails to compile assertions in constexpr
     return array_get<0>(t)/* * (i == 0)*/;
   }
   template <typename... T>
-  EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT value) {
+  EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT value) {
     eigen_assert (i == 0);
     update_value(array_get<0>(t), value);
   }
   template <typename... T>
-  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>&) {
-    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value && (i == 0);
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>&) {
+    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value & (i == 0);
   }
 
   template <typename... T>
@@ -298,43 +298,32 @@ struct tuple_coeff<0, ValueT> {
 
 template<typename FirstType, typename... OtherTypes>
 struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[] (const Index i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
   }
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::get(i, *this);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this);
   }
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::set(i, *this, value);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::set(i, *this, value);
   }
 
   EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
   EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple<FirstType, OtherTypes...>(first, other...) { }
   EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
 
-  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
   }
   EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_known_statically(*this);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_known_statically(*this);
   }
 
   EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::values_up_to_statically_known_to_increase(*this);
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this);
   }
 };
 
-template <typename FirstType, typename... OtherTypes>
-std::ostream& operator<<(std::ostream& os,
-                         const IndexList<FirstType, OtherTypes...>& dims) {
-  os << "[";
-  for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) {
-    if (i > 0) os << ", ";
-    os << dims[i];
-  }
-  os << "]";
-  return os;
-}
 
 template<typename FirstType, typename... OtherTypes>
 constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
@@ -344,28 +333,26 @@ constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, Ot
 
 template<typename FirstType, typename... OtherTypes>
 struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<Index> operator[] (const Index i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<Index>>::get(i, *this);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<DenseIndex> operator[] (const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<DenseIndex>>::get(i, *this);
   }
-  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair<Index> value) {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<Index> >::set(i, *this, value);
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair<DenseIndex> value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<DenseIndex> >::set(i, *this, value);
   }
 
   EIGEN_DEVICE_FUNC  constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { }
   EIGEN_DEVICE_FUNC  constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { }
 
-  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
-    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, Index>::value_known_statically(i, *this);
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this);
   }
 };
 
 namespace internal {
 
-template<typename FirstType, typename... OtherTypes>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
-  Index result = 1;
-  EIGEN_UNROLL_LOOP
-  for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
+template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+  size_t result = 1;
+  for (int i = 0; i < array_size<IndexList<FirstType, OtherTypes...> >::value; ++i) {
     result *= sizes[i];
   }
   return result;
@@ -385,30 +372,30 @@ template<typename FirstType, typename... OtherTypes> struct array_size<const Ind
   static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value;
 };
 
-template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList<FirstType, OtherTypes...>& a) {
+template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) {
   return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
 }
-template<Index N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList<FirstType, OtherTypes...>& a) {
+template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList<FirstType, OtherTypes...>& a) {
   return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
 }
 
 template <typename T>
 struct index_known_statically_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_known_statically_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_known_statically_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
   }
 };
@@ -460,14 +447,14 @@ template <typename FirstType, typename... OtherTypes>
 
 template <typename Tx>
 struct index_statically_eq_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) == value);
   }
@@ -475,7 +462,7 @@ struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) == value);
   }
@@ -484,14 +471,14 @@ struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename T>
 struct index_statically_ne_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) != value);
   }
@@ -499,7 +486,7 @@ struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) != value);
   }
@@ -508,14 +495,14 @@ struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename T>
 struct index_statically_gt_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) > value);
   }
@@ -523,7 +510,7 @@ struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) > value);
   }
@@ -533,14 +520,14 @@ struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename T>
 struct index_statically_lt_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) < value);
   }
@@ -548,7 +535,7 @@ struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...> > {
 
 template <typename FirstType, typename... OtherTypes>
 struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexList<FirstType, OtherTypes...>().get(i) < value);
   }
@@ -558,14 +545,14 @@ struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > {
 
 template <typename Tx>
 struct index_pair_first_statically_eq_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
   }
@@ -573,7 +560,7 @@ struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes..
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
   }
@@ -583,14 +570,14 @@ struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherT
 
 template <typename Tx>
 struct index_pair_second_statically_eq_impl {
-  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
   }
@@ -598,7 +585,7 @@ struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes.
 
 template <typename FirstType, typename... OtherTypes>
 struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > {
-  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) {
     return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &
         (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
   }
@@ -615,7 +602,7 @@ namespace internal {
 
 template <typename T>
 struct index_known_statically_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) {
     return false;
   }
 };
@@ -636,42 +623,42 @@ struct indices_statically_known_to_increase_impl {
 
 template <typename T>
 struct index_statically_eq_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_ne_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_gt_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename T>
 struct index_statically_lt_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename Tx>
 struct index_pair_first_statically_eq_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
 
 template <typename Tx>
 struct index_pair_second_statically_eq_impl {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) {
     return false;
   }
 };
@@ -687,7 +674,7 @@ struct index_pair_second_statically_eq_impl {
 namespace Eigen {
 namespace internal {
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) {
   return index_known_statically_impl<T>::run(i);
 }
 
@@ -702,32 +689,32 @@ static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increa
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) {
   return index_statically_eq_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) {
   return index_statically_ne_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) {
   return index_statically_gt_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) {
   return index_statically_lt_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) {
   return index_pair_first_statically_eq_impl<T>::run(i, value);
 }
 
 template <typename T>
-static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) {
+static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) {
   return index_pair_second_statically_eq_impl<T>::run(i, value);
 }
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
index 7dadec7fb..f391fb9ee 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -31,7 +31,6 @@ struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Strides, typename XprType>
@@ -85,24 +84,17 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
-    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_strides(op.strides())
   {
@@ -137,7 +129,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -152,7 +144,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
     eigen_assert(index < dimensions().TotalSize());
     *inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         if (idx != idx / m_fastStrides[i] * m_strides[i]) {
@@ -167,7 +158,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
       *inputIndex += index / m_strides[0];
       return true;
     } else {
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i];
         if (idx != idx / m_fastStrides[i] * m_strides[i]) {
@@ -203,7 +193,6 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
     eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -224,14 +213,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device>
                         compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
  protected:
   Dimensions m_dimensions;
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
index 26a3818f3..33edc49e3 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
@@ -32,7 +32,7 @@ struct Initializer {
                   Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
                   const InitList& vals) {
     int i = 0;
-    for (const auto& v : vals) {
+    for (auto v : vals) {
       (*indices)[traits<Derived>::NumDimensions - N] = i++;
       Initializer<Derived, N - 1>::run(tensor, indices, v);
     }
@@ -48,7 +48,7 @@ struct Initializer<Derived, 1> {
                   const InitList& vals) {
     int i = 0;
     // There is likely a faster way to do that than iterating.
-    for (const auto& v : vals) {
+    for (auto v : vals) {
       (*indices)[traits<Derived>::NumDimensions - 1] = i++;
       tensor.coeffRef(*indices) = v;
     }
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
index 6d5cce4aa..ede3939c2 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -21,7 +21,7 @@ namespace Eigen {
   * \brief Fast integer division by a constant.
   *
   * See the paper from Granlund and Montgomery for explanation.
-  *   (at https://doi.org/10.1145/773473.178249)
+  *   (at http://dx.doi.org/10.1145/773473.178249)
   *
   * \sa Tensor
   */
@@ -35,10 +35,8 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val)
   {
-#ifdef EIGEN_GPU_COMPILE_PHASE
+#ifdef __CUDA_ARCH__
     return __clz(val);
-#elif defined(SYCL_DEVICE_ONLY)
-    return cl::sycl::clz(val);
 #elif EIGEN_COMP_MSVC
     unsigned long index;
     _BitScanReverse(&index, val);
@@ -53,10 +51,8 @@ namespace {
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
   typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val)
   {
-#ifdef EIGEN_GPU_COMPILE_PHASE
+#ifdef __CUDA_ARCH__
     return __clzll(val);
-#elif defined(SYCL_DEVICE_ONLY)
-    return static_cast<int>(cl::sycl::clz(val));
 #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
     unsigned long index;
     _BitScanReverse64(&index, val);
@@ -90,10 +86,8 @@ namespace {
 
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(__CUDA_ARCH__)
     return __umulhi(a, b);
-#elif defined(SYCL_DEVICE_ONLY)
-    return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
 #else
     return (static_cast<uint64_t>(a) * b) >> 32;
 #endif
@@ -101,11 +95,9 @@ namespace {
 
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+#if defined(__CUDA_ARCH__)
     return __umul64hi(a, b);
-#elif defined(SYCL_DEVICE_ONLY)
-    return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
-#elif EIGEN_HAS_BUILTIN_INT128
+#elif defined(__SIZEOF_INT128__)
     __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
     return static_cast<uint64_t>(v >> 64);
 #else
@@ -124,7 +116,7 @@ namespace {
   template <typename T>
   struct DividerHelper<64, T> {
     static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
-#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
       return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
 #else
       const uint64_t shift = 1ULL << log_div;
@@ -167,7 +159,7 @@ struct TensorIntDivisor {
     shift2 = log_div > 1 ? log_div-1 : 0;
   }
 
-  // Must have 0 <= numerator. On platforms that don't support the __uint128_t
+  // Must have 0 <= numerator. On platforms that dont support the __uint128_t
   // type numerator should also be less than 2^32-1.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
     eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest()/2);
@@ -203,10 +195,8 @@ class TensorIntDivisor<int32_t, true> {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
-#ifdef EIGEN_GPU_COMPILE_PHASE
+#ifdef __CUDA_ARCH__
     return (__umulhi(magic, n) >> shift);
-#elif defined(SYCL_DEVICE_ONLY)
-    return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift);
 #else
     uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
     return (static_cast<uint32_t>(v >> 32) >> shift);
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
index 05fa80e59..cd0109ef4 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -46,7 +46,6 @@ struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = traits<XprType>::NumDimensions;
   static const int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename XprType>
@@ -119,17 +118,11 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
     CoordAccess = false,  // to be implemented
     RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
@@ -138,22 +131,13 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
     }
   }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
-
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     return m_impl.evalSubExprsIfNeeded(data);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
@@ -175,9 +159,7 @@ struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device>
     return m_impl.costPerCoeff(vectorized);
   }
 
-  EIGEN_DEVICE_FUNC typename Storage::Type data() const {
-    return constCast(m_impl.data());
-  }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
@@ -198,20 +180,14 @@ template<typename ArgType, typename Device>
   enum {
     IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = (static_cast<int>(TensorEvaluator<ArgType, Device>::Layout) == static_cast<int>(ColMajor)) ? RowMajor : ColMajor,
     CoordAccess = false  // to be implemented
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
   { }
-  
+
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
index af9e5db70..ee0078bbc 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -27,7 +27,7 @@
  */
 
 // SFINAE requires variadic templates
-#if !defined(EIGEN_GPUCC)
+#ifndef __CUDACC__
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   // SFINAE doesn't work for gcc <= 4.7
   #ifdef EIGEN_COMP_GNUC
@@ -51,43 +51,4 @@
 #endif
 
 
-#if EIGEN_OS_WIN || EIGEN_OS_WIN64
-#define EIGEN_SLEEP(n) Sleep(n)
-#elif EIGEN_OS_GNULINUX
-#define EIGEN_SLEEP(n) usleep(n * 1000);
-#else
-#define EIGEN_SLEEP(n) sleep(std::max<unsigned>(1, n/1000))
-#endif
-
-// Define a macro to use a reference on the host but a value on the device
-#if defined(SYCL_DEVICE_ONLY)
-  #define EIGEN_DEVICE_REF
-#else
-  #define EIGEN_DEVICE_REF &
-#endif
-
-// Define a macro for catching SYCL exceptions if exceptions are enabled
-#define EIGEN_SYCL_TRY_CATCH(X) \
-  do { \
-    EIGEN_TRY {X;} \
-    EIGEN_CATCH(const cl::sycl::exception& e) { \
-      EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \
-                                       std::string(__FILE__) + ":" + \
-                                       std::to_string(__LINE__) + "\n" + \
-                                       e.what())); \
-    } \
-  } while (false)
-
-// Define a macro if local memory flags are unset or one of them is set
-// Setting both flags is the same as unsetting them
-#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \
-     (defined(EIGEN_SYCL_LOCAL_MEM) &&  defined(EIGEN_SYCL_NO_LOCAL_MEM))
-  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
-  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
-#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
-  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
-#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
-  #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
-#endif
-
 #endif
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
index 172a6bab8..e4fc86a40 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -31,38 +31,20 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
   public:
     typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
     typedef typename PlainObjectType::Base Base;
-  #ifdef EIGEN_USE_SYCL
-    typedef  typename Eigen::internal::remove_reference<typename Eigen::internal::nested<Self>::type>::type Nested;
-  #else
-     typedef typename Eigen::internal::nested<Self>::type Nested;
-  #endif
-   typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+    typedef typename Eigen::internal::nested<Self>::type Nested;
+    typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
     typedef typename internal::traits<PlainObjectType>::Index Index;
     typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
     typedef typename NumTraits<Scalar>::Real RealScalar;
     typedef typename Base::CoeffReturnType CoeffReturnType;
 
+  /*    typedef typename internal::conditional<
+                         bool(internal::is_lvalue<PlainObjectType>::value),
+                         Scalar *,
+                         const Scalar *>::type
+                     PointerType;*/
     typedef typename MakePointer_<Scalar>::Type PointerType;
-    typedef typename MakePointer_<Scalar>::ConstType PointerConstType;
-
-    // WARN: PointerType still can be a pointer to const (const Scalar*), for
-    // example in TensorMap<Tensor<const Scalar, ...>> expression. This type of
-    // expression should be illegal, but adding this restriction is not possible
-    // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488).
-    typedef typename internal::conditional<
-        bool(internal::is_lvalue<PlainObjectType>::value),
-        PointerType,      // use simple pointer in lvalue expressions
-        PointerConstType  // use const pointer in rvalue expressions
-        >::type StoragePointerType;
-
-    // If TensorMap was constructed over rvalue expression (e.g. const Tensor),
-    // we should return a reference to const from operator() (and others), even
-    // if TensorMap itself is not const.
-    typedef typename internal::conditional<
-        bool(internal::is_lvalue<PlainObjectType>::value),
-        Scalar&,
-        const Scalar&
-        >::type StorageRefType;
+    typedef PointerType PointerArgType;
 
     static const int Options = Options_;
 
@@ -77,47 +59,47 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     };
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) {
       // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
       EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) {
       EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) {
       EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) {
       EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
+    EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) {
       EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE)
     }
 #endif
 
-   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array<Index, NumIndices>& dimensions)
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array<Index, NumIndices>& dimensions)
       : m_data(dataPtr), m_dimensions(dimensions)
     { }
 
     template <typename Dimensions>
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions)
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions)
       : m_data(dataPtr), m_dimensions(dimensions)
     { }
 
@@ -134,12 +116,12 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; }
+    EIGEN_STRONG_INLINE PointerType data() { return m_data; }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; }
+    EIGEN_STRONG_INLINE const PointerType data() const { return m_data; }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices) const
+    EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const
     {
       //      eigen_assert(checkIndexRange(indices));
       if (PlainObjectType::Options&RowMajor) {
@@ -152,14 +134,14 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()() const
+    EIGEN_STRONG_INLINE const Scalar& operator()() const
     {
       EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
       return m_data[0];
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const
     {
       eigen_internal_assert(index >= 0 && index < size());
       return m_data[index];
@@ -167,10 +149,9 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const
     {
       EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
-      eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
       if (PlainObjectType::Options&RowMajor) {
         const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
         return m_data[index];
@@ -181,7 +162,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i1 + i0 * m_dimensions[1];
@@ -192,7 +173,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const
     {
       if (PlainObjectType::Options&RowMajor) {
          const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
@@ -203,7 +184,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
@@ -214,7 +195,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
+    EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
@@ -227,7 +208,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
 #endif
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices)
+    EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices)
     {
       //      eigen_assert(checkIndexRange(indices));
       if (PlainObjectType::Options&RowMajor) {
@@ -240,14 +221,14 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()()
+    EIGEN_STRONG_INLINE Scalar& operator()()
     {
       EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
       return m_data[0];
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index index)
+    EIGEN_STRONG_INLINE Scalar& operator()(Index index)
     {
       eigen_internal_assert(index >= 0 && index < size());
       return m_data[index];
@@ -255,10 +236,9 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
     template<typename... IndexTypes> EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
+    EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices)
     {
       static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
-       eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
       const std::size_t NumDims = sizeof...(otherIndices) + 2;
       if (PlainObjectType::Options&RowMajor) {
         const Index index = m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
@@ -270,7 +250,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 #else
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1)
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1)
     {
        if (PlainObjectType::Options&RowMajor) {
          const Index index = i1 + i0 * m_dimensions[1];
@@ -281,7 +261,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2)
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2)
     {
        if (PlainObjectType::Options&RowMajor) {
          const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0);
@@ -292,7 +272,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3)
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3)
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0));
@@ -303,7 +283,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
       }
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
+    EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4)
     {
       if (PlainObjectType::Options&RowMajor) {
         const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)));
@@ -334,7 +314,7 @@ template<typename PlainObjectType, int Options_, template <class> class MakePoin
     }
 
   private:
-    StoragePointerType m_data;
+    typename MakePointer_<Scalar>::Type m_data;
     Dimensions m_dimensions;
 };
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index a3a750f21..615559d44 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -52,13 +52,11 @@ struct PacketType : internal::packet_traits<Scalar> {
 };
 
 // For CUDA packet types when using a GpuDevice
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16)
-
-typedef ulonglong2 Packet4h2;
-template<>
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16)
+template <>
 struct PacketType<half, GpuDevice> {
-  typedef Packet4h2 type;
-  static const int size = 8;
+  typedef half2 type;
+  static const int size = 2;
   enum {
     HasAdd    = 1,
     HasSub    = 1,
@@ -77,7 +75,6 @@ struct PacketType<half, GpuDevice> {
     HasSqrt   = 1,
     HasRsqrt  = 1,
     HasExp    = 1,
-    HasExpm1  = 0,
     HasLog    = 1,
     HasLog1p  = 0,
     HasLog10  = 0,
@@ -87,57 +84,9 @@ struct PacketType<half, GpuDevice> {
 #endif
 
 #if defined(EIGEN_USE_SYCL)
-
-namespace TensorSycl {
-namespace internal {
-
-template <typename Index, Index A, Index B> struct PlusOp {
-  static constexpr Index Value = A + B;
-};
-
-template <typename Index, Index A, Index B> struct DivOp {
-  static constexpr Index Value = A / B;
-};
-
-template <typename Index, Index start, Index end, Index step,
-          template <class Indx, Indx...> class StepOp>
-struct static_for {
-  template <typename UnaryOperator>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) {
-    op(start);
-    static_for<Index, StepOp<Index, start, step>::Value, end, step,
-               StepOp>::loop(op);
-  }
-};
-template <typename Index, Index end, Index step,
-          template <class Indx, Indx...> class StepOp>
-struct static_for<Index, end, end, step, StepOp> {
-  template <typename UnaryOperator>
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {}
-};
-
-template <typename OutScalar, typename Device, bool Vectorizable>
-struct Vectorise {
-  static const int PacketSize = 1;
-  typedef OutScalar PacketReturnType;
-};
-
-template <typename OutScalar, typename Device>
-struct Vectorise<OutScalar, Device, true> {
-  static const int PacketSize = Eigen::PacketType<OutScalar, Device>::size;
-  typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType;
-};
-
-static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) {
-  return ((((x) + (y)-1) / (y)) * (y));
-}
-
-} // namespace internal
-} // namespace TensorSycl
-
-template <>
-  struct PacketType<half, SyclDevice> {
-  typedef half type;
+template <typename T>
+  struct PacketType<T, SyclDevice> {
+  typedef T type;
   static const int size = 1;
   enum {
     HasAdd    = 0,
@@ -154,59 +103,9 @@ template <>
     HasBlend  = 0
   };
 };
-template <typename Scalar>
-struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits {
-  typedef Scalar type;
-  typedef Scalar half;
-  enum {
-    Vectorizable = 0,
-    size = 1,
-    AlignedOnScalar = 0,
-    HasHalfPacket = 0
-  };
-  enum {
-    HasAdd    = 0,
-    HasSub    = 0,
-    HasMul    = 0,
-    HasNegate = 0,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasConj   = 0,
-    HasSetLinear = 0
-  };
-
-};
-
-template <typename Scalar>
-struct PacketType<Scalar, const SyclDevice> : PacketType<Scalar, SyclDevice>{};
-
-#ifndef EIGEN_DONT_VECTORIZE_SYCL
-#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\
-template<> struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> \
-{\
-  typedef typename internal::packet_traits<Type>::type type;\
-  typedef typename internal::packet_traits<Type>::half half;\
-};
-
-
-PACKET_TYPE(const, float, 1, 4, SyclDevice)
-PACKET_TYPE(, float, 1, 4, SyclDevice)
-PACKET_TYPE(const, float, 1, 4, const SyclDevice)
-PACKET_TYPE(, float, 1, 4, const SyclDevice)
-
-PACKET_TYPE(const, double, 0, 2, SyclDevice)
-PACKET_TYPE(, double, 0, 2, SyclDevice)
-PACKET_TYPE(const, double, 0, 2, const SyclDevice)
-PACKET_TYPE(, double, 0, 2, const SyclDevice)
-#undef PACKET_TYPE
-
-template<> struct PacketType<half, const SyclDevice>: PacketType<half, SyclDevice>{};
-template<> struct PacketType<const half, const SyclDevice>: PacketType<half, SyclDevice>{};
-#endif
 #endif
 
+
 // Tuple mimics std::pair but works on e.g. nvcc.
 template <typename U, typename V> struct Tuple {
  public:
@@ -224,9 +123,7 @@ template <typename U, typename V> struct Tuple {
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   Tuple& operator= (const Tuple& rhs) {
-  #ifndef SYCL_DEVICE_ONLY
     if (&rhs == this) return *this;
-  #endif
     first = rhs.first;
     second = rhs.second;
     return *this;
@@ -271,12 +168,12 @@ template <typename Idx> struct IndexPair {
 #ifdef EIGEN_HAS_SFINAE
 namespace internal {
 
-  template<typename IndexType, typename Index, Index... Is>
+  template<typename IndexType, Index... Is>
   EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   array<Index, sizeof...(Is)> customIndices2Array(IndexType& idx, numeric_list<Index, Is...>) {
     return { idx[Is]... };
   }
-  template<typename IndexType, typename Index>
+  template<typename IndexType>
   EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
     return array<Index, 0>();
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
index f107d1b19..d34f1e328 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -31,13 +31,12 @@ struct traits<TensorReshapingOp<NewDimensions, XprType> > : public traits<XprTyp
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<NewDimensions>::value;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename NewDimensions, typename XprType>
 struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense>
 {
-  typedef const TensorReshapingOp<NewDimensions, XprType>EIGEN_DEVICE_REF type;
+  typedef const TensorReshapingOp<NewDimensions, XprType>& type;
 };
 
 template<typename NewDimensions, typename XprType>
@@ -102,62 +101,14 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
   typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
   typedef NewDimensions Dimensions;
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-  typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
-
-  static const int NumOutputDims = internal::array_size<Dimensions>::value;
-  static const int NumInputDims  = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
-
-  enum ReshapingKind {
-    // We do not use layout information to determine reshaping kind.
-    // Depending on the layout `N` can be inner or outer dimension.
-    OneByN = 0,  // expr.reshape(1, N)
-    NByOne = 1,  // expr.reshape(N, 1)
-    Runtime = 2  // Reshape dimensions are dynamic (specified at runtime).
-  };
-
-  // clang-format off
-  static const ReshapingKind kind =
-#if defined(EIGEN_HAS_INDEX_LIST)
-        (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/0, /*value=*/1)) ? OneByN
-      : (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/1, /*value=*/1)) ? NByOne
-      : Runtime;
-#else
-        Runtime;
-#endif
-  // clang-format on
-
   enum {
-    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
-    // For trivial reshapes with raw access to underlying data we will provide
-    // zero overhead block access.
-    // TODO(ezhulenev): Consider adding block access without raw access?
-    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess &&
-                        NumInputDims > 0 && NumOutputDims > 0,
-    PreferBlockAccess = false,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = TensorEvaluator<ArgType, Device>::RawAccess
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef
-      typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims,
-                                                 Layout, Index>
-          TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_dimensions(op.dimensions())
   {
@@ -166,17 +117,14 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
   }
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType data, EvalSubExprsCallback done) {
-    m_impl.evalSubExprsIfNeededAsync(data, std::move(done));
-  }
-#endif
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     return m_impl.evalSubExprsIfNeeded(data);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
@@ -198,53 +146,10 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
     return m_impl.costPerCoeff(vectorized);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    return internal::TensorBlockResourceRequirements::any();
-  }
-
-  // required in block(OutputTensorBlock* output_block) const
-  // For C++03 compatibility this must be defined outside the method
-  struct BlockIteratorState {
-    Index stride;
-    Index span;
-    Index size;
-    Index count;
-  };
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    eigen_assert(m_impl.data() != NULL);
-    eigen_assert((kind == Runtime) ||
-                 (kind == OneByN && desc.dimensions()[0] == 1) ||
-                 (kind == NByOne && desc.dimensions()[1] == 1));
-
-    if (kind == OneByN || kind == NByOne) {
-      // We can guarantee at compile time that block is just a contiguous slice
-      // of the underlying expression memory buffer.
-      return TensorBlock(internal::TensorBlockKind::kView,
-                           m_impl.data() + desc.offset(), desc.dimensions());
-    } else {
-      // This will do additional runtime checks, and in the end it might be also
-      // a view, or it might be a block materialized in the temporary buffer.
-      return TensorBlock::materialize(m_impl.data(), m_dimensions, desc,
-                                        scratch);
-    }
-  }
-
-  EIGEN_DEVICE_FUNC typename Storage::Type data() const {
-    return constCast(m_impl.data());
-  }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); }
 
   EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
-  #ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-  #endif
  protected:
   TensorEvaluator<ArgType, Device> m_impl;
   NewDimensions m_dimensions;
@@ -262,13 +167,11 @@ template<typename NewDimensions, typename ArgType, typename Device>
   typedef NewDimensions Dimensions;
 
   enum {
-    IsAligned         = TensorEvaluator<ArgType, Device>::IsAligned,
-    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
-    PreferBlockAccess = false,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = TensorEvaluator<ArgType, Device>::RawAccess
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
   };
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
@@ -280,38 +183,15 @@ template<typename NewDimensions, typename ArgType, typename Device>
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<TensorEvaluator::NumOutputDims, Index>
-      TensorBlockDesc;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
     return this->m_impl.coeffRef(index);
   }
-
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
     this->m_impl.template writePacket<StoreMode>(index, x);
   }
-
-  template <typename TensorBlock>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
-      const TensorBlockDesc& desc, const TensorBlock& block) {
-    assert(this->m_impl.data() != NULL);
-
-    typedef typename TensorBlock::XprType TensorBlockExpr;
-    typedef internal::TensorBlockAssignment<
-        Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index>
-        TensorBlockAssign;
-
-    TensorBlockAssign::Run(
-        TensorBlockAssign::target(desc.dimensions(),
-                                  internal::strides<Layout>(this->dimensions()),
-                                  this->m_impl.data(), desc.offset()),
-        block.expr());
-  }
 };
 
 
@@ -334,13 +214,12 @@ struct traits<TensorSlicingOp<StartIndices, Sizes, XprType> > : public traits<Xp
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<StartIndices>::value;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename StartIndices, typename Sizes, typename XprType>
 struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense>
 {
-  typedef const TensorSlicingOp<StartIndices, Sizes, XprType>EIGEN_DEVICE_REF type;
+  typedef const TensorSlicingOp<StartIndices, Sizes, XprType>& type;
 };
 
 template<typename StartIndices, typename Sizes, typename XprType>
@@ -404,12 +283,9 @@ class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, X
 
 // Fixme: figure out the exact threshold
 namespace {
-template <typename Index, typename Device, bool BlockAccess> struct MemcpyTriggerForSlicing {
+template <typename Index, typename Device> struct MemcpyTriggerForSlicing {
   EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { }
-  EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const {
-    const bool prefer_block_evaluation = BlockAccess && total > 32*1024;
-    return !prefer_block_evaluation && contiguous > threshold_;
-  }
+  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; }
 
  private:
   Index threshold_;
@@ -418,21 +294,11 @@ template <typename Index, typename Device, bool BlockAccess> struct MemcpyTrigge
 // It is very expensive to start the memcpy kernel on GPU: we therefore only
 // use it for large copies.
 #ifdef EIGEN_USE_GPU
-template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess>  {
+template <typename Index> struct MemcpyTriggerForSlicing<Index, GpuDevice>  {
   EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { }
-  EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
+  EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; }
 };
 #endif
-
-// It is very expensive to start the memcpy kernel on GPU: we therefore only
-// use it for large copies.
-#ifdef EIGEN_USE_SYCL
-template <typename Index, bool BlockAccess> struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess>  {
-  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { }
-  EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; }
-};
-#endif
-
 }
 
 // Eval as rvalue
@@ -442,60 +308,23 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;
 
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef Sizes Dimensions;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef StorageMemory<typename internal::remove_const<CoeffReturnType>::type, Device> ConstCastStorage;
-  typedef typename Storage::Type EvaluatorPointerType;
-
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
     // slice offsets and sizes.
-    IsAligned         = false,
-    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess &&
-                        // FIXME: Temporary workaround for bug in slicing of bool tensors.
-                        !internal::is_same<typename internal::remove_const<Scalar>::type, bool>::value,
-    PreferBlockAccess = true,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess       = false,
-    RawAccess         = false
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
   };
 
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  // Tensor slicing does not change the block type.
-  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices())
   {
-    for (Index i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+    for (std::size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
       eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
     }
 
-    m_is_identity = true;
-    for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
-      eigen_assert(m_impl.dimensions()[i] >=
-                   op.sizes()[i] + op.startIndices()[i]);
-      if (m_impl.dimensions()[i] != op.sizes()[i] ||
-          op.startIndices()[i] != 0) {
-        m_is_identity = false;
-      }
-    }
-
-    // No strides for scalars.
-    if (NumDims == 0) return;
-
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Sizes& output_dims = op.sizes();
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -525,12 +354,18 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     }
   }
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) {
     m_impl.evalSubExprsIfNeeded(NULL);
-    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization
-        && data && m_impl.data()) {
+    if (!NumTraits<typename internal::remove_const<Scalar>::type>::RequireInitialization && data && m_impl.data()) {
       Index contiguous_values = 1;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
         for (int i = 0; i < NumDims; ++i) {
@@ -548,12 +383,12 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
         }
       }
       // Use memcpy if it's going to be faster than using the regular evaluation.
-      const MemcpyTriggerForSlicing<Index, Device, BlockAccess> trigger(m_device);
-      if (trigger(internal::array_prod(dimensions()), contiguous_values)) {
-        EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
-        for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
+      const MemcpyTriggerForSlicing<Index, Device> trigger(m_device);
+      if (trigger(contiguous_values)) {
+        Scalar* src = (Scalar*)m_impl.data();
+        for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
           Index offset = srcCoeff(i);
-          m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar));
+          m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar));
         }
         return false;
       }
@@ -561,42 +396,25 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     return true;
   }
 
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType data, EvalSubExprsCallback done) {
-    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
-  }
-#endif  // EIGEN_USE_THREADS
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    if (m_is_identity) {
-      return m_impl.coeff(index);
-    } else {
-      return m_impl.coeff(srcCoeff(index));
-    }
+    return m_impl.coeff(srcCoeff(index));
   }
 
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
     eigen_assert(index+packetSize-1 < internal::array_prod(dimensions()));
 
-    if (m_is_identity) {
-      return m_impl.template packet<LoadMode>(index);
-    }
-
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / m_fastOutputStrides[i];
         const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -608,7 +426,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       inputIndices[0] += (indices[0] + m_offsets[0]);
       inputIndices[1] += (indices[1] + m_offsets[0]);
     } else {
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / m_fastOutputStrides[i];
         const Index idx1 = indices[1] / m_fastOutputStrides[i];
@@ -628,7 +445,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[packetSize-1] = m_impl.coeff(inputIndices[1]);
-      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -638,28 +454,12 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    const size_t target_size = m_device.lastLevelCacheSize();
-    return internal::TensorBlockResourceRequirements::merge(
-        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
-        m_impl.getResourceRequirements());
-  }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset()));
-    TensorBlock block = m_impl.block(arg_desc, scratch);
-    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
-    return block;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
-    typename Storage::Type result = constCast(m_impl.data());
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
+    Scalar* result = m_impl.data();
     if (result) {
       Index offset = 0;
       if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -693,19 +493,12 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
     }
     return NULL;
   }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
 
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -713,7 +506,6 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
       }
       inputIndex += (index + m_offsets[0]);
     } else {
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
@@ -728,9 +520,8 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device EIGEN_DEVICE_REF m_device;
+  const Device& m_device;
   Dimensions m_dimensions;
-  bool m_is_identity;
   const StartIndices m_offsets;
 };
 
@@ -744,55 +535,36 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
   typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
   static const int NumDims = internal::array_size<Sizes>::value;
 
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+    : Base(op, device)
+    { }
+
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef Sizes Dimensions;
 
-  enum {
-    IsAligned         = false,
-    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess       = TensorEvaluator<ArgType, Device>::BlockAccess,
-    PreferBlockAccess = true,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess       = false,
-    RawAccess         = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
-  };
-
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-  //===--------------------------------------------------------------------===//
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-    : Base(op, device)
-    { }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    if (this->m_is_identity) {
-      return this->m_impl.coeffRef(index);
-    } else {
-      return this->m_impl.coeffRef(this->srcCoeff(index));
-    }
+    return this->m_impl.coeffRef(this->srcCoeff(index));
   }
 
   template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void writePacket(Index index, const PacketReturnType& x)
   {
-    if (this->m_is_identity) {
-      this->m_impl.template writePacket<StoreMode>(index, x);
-      return;
-    }
-
-    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    const int packetSize = internal::unpacket_traits<PacketReturnType>::size;
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + packetSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
         const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -804,7 +576,6 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       inputIndices[0] += (indices[0] + this->m_offsets[0]);
       inputIndices[1] += (indices[1] + this->m_offsets[0]);
     } else {
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
         const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
@@ -824,21 +595,15 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
       internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1];
-      EIGEN_UNROLL_LOOP
       for (int i = 1; i < packetSize-1; ++i) {
         this->coeffRef(index+i) = values[i];
       }
     }
   }
-
-  template<typename TensorBlock>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
-      const TensorBlockDesc& desc, const TensorBlock& block) {
-    TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset()));
-    this->m_impl.writeBlock(arg_desc, block);
-  }
 };
 
+
+
 namespace internal {
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
 struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > : public traits<XprType>
@@ -851,13 +616,12 @@ struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprTyp
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = array_size<StartIndices>::value;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
 struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense>
 {
-  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>EIGEN_DEVICE_REF type;
+  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type;
 };
 
 template<typename StartIndices, typename StopIndices, typename Strides, typename XprType>
@@ -930,13 +694,6 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
 {
   typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
   static const int NumDims = internal::array_size<Strides>::value;
-  typedef typename XprType::Index Index;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-  typedef Strides Dimensions;
 
   enum {
     // Alignment can't be guaranteed at compile time since it depends on the
@@ -944,60 +701,43 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
     IsAligned = false,
     PacketAccess = false,
     BlockAccess = false,
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device),
-        m_device(device),
-        m_strides(op.strides())
+      : m_impl(op.expression(), device), m_device(device), m_strides(op.strides())
   {
     // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
-    DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
-    for (ptrdiff_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+    DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped;
+    for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
       eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
-      if (m_strides[i] > 0) {
-        startIndicesClamped[i] =
-            clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
-        stopIndicesClamped[i] =
-            clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
-      } else {
-        /* implies m_strides[i] < 0 by assert */
-        startIndicesClamped[i] =
-            clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
-        stopIndicesClamped[i] =
-            clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
+      if(m_strides[i]>0){
+        startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
+        stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
+      }else{
+        /* implies m_strides[i]<0 by assert */
+        startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
+        stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
       }
       m_startIndices[i] = startIndicesClamped[i];
     }
 
-    typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
-    const InputDimensions& input_dims = m_impl.dimensions();
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
 
     // check for degenerate intervals and compute output tensor shape
-    bool degenerate = false;
-    m_is_identity = true;
-    for (int i = 0; i < NumDims; i++) {
+    bool degenerate = false;;
+    for(int i = 0; i < NumDims; i++){
       Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
-      if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) {
+      if(interval == 0 || ((interval<0) != (m_strides[i]<0))){
         m_dimensions[i] = 0;
         degenerate = true;
-      } else {
-        m_dimensions[i] =
-            (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0);
+      }else{
+        m_dimensions[i] = interval / m_strides[i]
+                          + (interval % m_strides[i] != 0 ? 1 : 0);
         eigen_assert(m_dimensions[i] >= 0);
       }
-      if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) {
-        m_is_identity = false;
-      }
     }
-
     Strides output_dims = m_dimensions;
 
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -1034,12 +774,22 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
         m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]);
       }
     }
+    m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1),
+                                          device.lastLevelCacheSize() /
+                                          sizeof(Scalar));
   }
 
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Strides Dimensions;
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -1050,39 +800,28 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    if (m_is_identity) {
-      return m_impl.coeff(index);
-    } else {
-      return m_impl.coeff(srcCoeff(index));
-    }
+    return m_impl.coeff(srcCoeff(index));
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims);
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const {
     return NULL;
   }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
+
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i >= 0; --i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
         index -= idx * m_outputStrides[i];
       }
     } else {
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims; ++i) {
         const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i] + m_offsets[i];
@@ -1092,24 +831,20 @@ struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices,
     return inputIndex;
   }
 
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
-#ifndef SYCL_DEVICE_ONLY
+  static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
     return numext::maxi(min, numext::mini(max,value));
-#else
-    return cl::sycl::clamp(value, min, max);
-#endif
   }
 
   array<Index, NumDims> m_outputStrides;
   array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
-  bool m_is_identity;
   TensorEvaluator<ArgType, Device> m_impl;
-  const Device EIGEN_DEVICE_REF m_device;
+  const Device& m_device;
   DSizes<Index, NumDims> m_startIndices; // clamped startIndices
   DSizes<Index, NumDims> m_dimensions;
   DSizes<Index, NumDims> m_offsets; // offset in a flattened shape
   const Strides m_strides;
+  std::size_t m_block_total_size_max;
 };
 
 // Eval as lvalue
@@ -1125,33 +860,25 @@ struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Stride
     IsAligned = false,
     PacketAccess = false,
     BlockAccess = false,
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
     : Base(op, device)
     { }
 
   typedef typename XprType::Index Index;
   typedef typename XprType::Scalar Scalar;
+  typedef typename internal::remove_const<Scalar>::type ScalarNonConst;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef Strides Dimensions;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
   {
-    if (this->m_is_identity) {
-      return this->m_impl.coeffRef(index);
-    } else {
-      return this->m_impl.coeffRef(this->srcCoeff(index));
-    }
+    return this->m_impl.coeffRef(this->srcCoeff(index));
   }
 };
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
index 561666c6f..647bcf108 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -31,7 +31,6 @@ struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprT
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename PaddingDimensions, typename XprType>
@@ -91,33 +90,18 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned         = true,
-    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
-    PreferBlockAccess = true,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess       = true,
-    RawAccess         = false
+    IsAligned = true,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = true,
+    RawAccess = false
   };
 
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
-                                                     Layout, Index>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device)
+      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value())
   {
     // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
     // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
@@ -151,19 +135,10 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType, EvalSubExprsCallback done) {
-    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
-  }
-#endif  // EIGEN_USE_THREADS
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
@@ -173,7 +148,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
     eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         if (isPaddingAtIndexForDim(idx, i)) {
@@ -187,7 +161,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
       inputIndex += (index - m_padding[0].first);
     } else {
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i+1];
         if (isPaddingAtIndexForDim(idx, i)) {
@@ -216,298 +189,18 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     TensorOpCost cost = m_impl.costPerCoeff(vectorized);
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims; ++i)
         updateCostPerDimension(cost, i, i == 0);
     } else {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i >= 0; --i)
         updateCostPerDimension(cost, i, i == NumDims - 1);
     }
     return cost;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    const size_t target_size = m_device.lastLevelCacheSize();
-    return internal::TensorBlockResourceRequirements::merge(
-        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size),
-        m_impl.getResourceRequirements());
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    // If one of the dimensions is zero, return empty block view.
-    if (desc.size() == 0) {
-      return TensorBlock(internal::TensorBlockKind::kView, NULL,
-                           desc.dimensions());
-    }
-
-    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
-    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
-
-    Index offset = desc.offset();
-
-    // Compute offsets in the output tensor corresponding to the desc.offset().
-    DSizes<Index, NumDims> output_offsets;
-    for (int i = NumDims - 1; i > 0; --i) {
-      const int dim = IsColMajor ? i : NumDims - i - 1;
-      const int stride_dim = IsColMajor ? dim : dim + 1;
-      output_offsets[dim] = offset / m_outputStrides[stride_dim];
-      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
-    }
-    output_offsets[inner_dim_idx] = offset;
-
-    // Offsets in the input corresponding to output offsets.
-    DSizes<Index, NumDims> input_offsets = output_offsets;
-    for (int i = 0; i < NumDims; ++i) {
-      const int dim = IsColMajor ? i : NumDims - i - 1;
-      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
-    }
-
-    // Compute offset in the input buffer (at this point it might be illegal and
-    // point outside of the input buffer, because we don't check for negative
-    // offsets, it will be autocorrected in the block iteration loop below).
-    Index input_offset = 0;
-    for (int i = 0; i < NumDims; ++i) {
-      const int dim = IsColMajor ? i : NumDims - i - 1;
-      input_offset += input_offsets[dim] * m_inputStrides[dim];
-    }
-
-    // Destination buffer and scratch buffer both indexed from 0 and have the
-    // same dimensions as the requested block (for destination buffer this
-    // property is guaranteed by `desc.destination()`).
-    Index output_offset = 0;
-    const DSizes<Index, NumDims> output_strides =
-        internal::strides<Layout>(desc.dimensions());
-
-    // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1`
-    // dimensions, skipping innermost dimension. In theory it should be possible
-    // to squeeze matching innermost dimensions, however in practice that did
-    // not show any improvements in benchmarks. Also in practice first outer
-    // dimension usually has padding, and will prevent squeezing.
-
-    // Initialize output block iterator state. Dimension in this array are
-    // always in inner_most -> outer_most order (col major layout).
-    array<BlockIteratorState, NumDims - 1> it;
-    for (int i = 0; i < NumDims - 1; ++i) {
-      const int dim = IsColMajor ? i + 1 : NumDims - i - 2;
-      it[i].count = 0;
-      it[i].size = desc.dimension(dim);
-
-      it[i].input_stride = m_inputStrides[dim];
-      it[i].input_span = it[i].input_stride * (it[i].size - 1);
-
-      it[i].output_stride = output_strides[dim];
-      it[i].output_span = it[i].output_stride * (it[i].size - 1);
-    }
-
-    const Index input_inner_dim_size =
-        static_cast<Index>(m_impl.dimensions()[inner_dim_idx]);
-
-    // Total output size.
-    const Index output_size = desc.size();
-
-    // We will fill inner dimension of this size in the output. It might be
-    // larger than the inner dimension in the input, so we might have to pad
-    // before/after we copy values from the input inner dimension.
-    const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
-
-    // How many values to fill with padding BEFORE reading from the input inner
-    // dimension.
-    const Index output_inner_pad_before_size =
-        input_offsets[inner_dim_idx] < 0
-            ? numext::mini(numext::abs(input_offsets[inner_dim_idx]),
-                           output_inner_dim_size)
-            : 0;
-
-    // How many values we can actually copy from the input inner dimension.
-    const Index output_inner_copy_size = numext::mini(
-        // Want to copy from input.
-        (output_inner_dim_size - output_inner_pad_before_size),
-        // Can copy from input.
-        numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] +
-                                             output_inner_pad_before_size),
-                     Index(0)));
-
-    eigen_assert(output_inner_copy_size >= 0);
-
-    // How many values to fill with padding AFTER reading from the input inner
-    // dimension.
-    const Index output_inner_pad_after_size =
-        (output_inner_dim_size - output_inner_copy_size -
-         output_inner_pad_before_size);
-
-    // Sanity check, sum of all sizes must be equal to the output size.
-    eigen_assert(output_inner_dim_size ==
-                 (output_inner_pad_before_size + output_inner_copy_size +
-                  output_inner_pad_after_size));
-
-    // Keep track of current coordinates and padding in the output.
-    DSizes<Index, NumDims> output_coord = output_offsets;
-    DSizes<Index, NumDims> output_padded;
-    for (int i = 0; i < NumDims; ++i) {
-      const int dim = IsColMajor ? i : NumDims - i - 1;
-      output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
-    }
-
-    typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
-
-    // Prepare storage for the materialized padding result.
-    const typename TensorBlock::Storage block_storage =
-        TensorBlock::prepareStorage(desc, scratch);
-
-    // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a
-    // single logical inner dimension.
-
-    // When possible we squeeze writes for the innermost (only if non-padded)
-    // dimension with the first padded dimension. This allows to reduce the
-    // number of calls to LinCopy and better utilize vector instructions.
-    const bool squeeze_writes =
-        NumDims > 1 &&
-        // inner dimension is not padded
-        (input_inner_dim_size == m_dimensions[inner_dim_idx]) &&
-        // and equal to the block inner dimension
-        (input_inner_dim_size == output_inner_dim_size);
-
-    const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1;
-
-    // Maximum coordinate on a squeeze dimension that we can write to.
-    const Index squeeze_max_coord =
-        squeeze_writes ? numext::mini(
-                             // max non-padded element in the input
-                             static_cast<Index>(m_dimensions[squeeze_dim] -
-                                                m_padding[squeeze_dim].second),
-                             // max element in the output buffer
-                             static_cast<Index>(output_offsets[squeeze_dim] +
-                                                desc.dimension(squeeze_dim)))
-                       : static_cast<Index>(0);
-
-    // Iterate copying data from `m_impl.data()` to the output buffer.
-    for (Index size = 0; size < output_size;) {
-      // Detect if we are in the padded region (exclude innermost dimension).
-      bool is_padded = false;
-      for (int j = 1; j < NumDims; ++j) {
-        const int dim = IsColMajor ? j : NumDims - j - 1;
-        is_padded = output_padded[dim];
-        if (is_padded) break;
-      }
-
-      if (is_padded) {
-        // Fill single innermost dimension with padding value.
-        size += output_inner_dim_size;
-
-        LinCopy::template Run<LinCopy::Kind::FillLinear>(
-            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
-            typename LinCopy::Src(0, 0, &m_paddingValue),
-            output_inner_dim_size);
-
-
-      } else if (squeeze_writes) {
-        // Squeeze multiple reads from innermost dimensions.
-        const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim];
-        size += output_inner_dim_size * squeeze_num;
-
-        // Copy `squeeze_num` inner dimensions from input to output.
-        LinCopy::template Run<LinCopy::Kind::Linear>(
-            typename LinCopy::Dst(output_offset, 1, block_storage.data()),
-            typename LinCopy::Src(input_offset, 1, m_impl.data()),
-            output_inner_dim_size * squeeze_num);
-
-        // Update iteration state for only `squeeze_num - 1` processed inner
-        // dimensions, because we have another iteration state update at the end
-        // of the loop that will update iteration state for the last inner
-        // processed dimension.
-        it[0].count += (squeeze_num - 1);
-        input_offset += it[0].input_stride * (squeeze_num - 1);
-        output_offset += it[0].output_stride * (squeeze_num - 1);
-        output_coord[squeeze_dim] += (squeeze_num - 1);
-
-      } else {
-        // Single read from innermost dimension.
-        size += output_inner_dim_size;
-
-        {  // Fill with padding before copying from input inner dimension.
-          const Index out = output_offset;
-
-          LinCopy::template Run<LinCopy::Kind::FillLinear>(
-              typename LinCopy::Dst(out, 1, block_storage.data()),
-              typename LinCopy::Src(0, 0, &m_paddingValue),
-              output_inner_pad_before_size);
-        }
-
-        {  // Copy data from input inner dimension.
-          const Index out = output_offset + output_inner_pad_before_size;
-          const Index in = input_offset + output_inner_pad_before_size;
-
-          eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);
-
-          LinCopy::template Run<LinCopy::Kind::Linear>(
-              typename LinCopy::Dst(out, 1, block_storage.data()),
-              typename LinCopy::Src(in, 1, m_impl.data()),
-              output_inner_copy_size);
-        }
-
-        {  // Fill with padding after copying from input inner dimension.
-          const Index out = output_offset + output_inner_pad_before_size +
-                            output_inner_copy_size;
-
-          LinCopy::template Run<LinCopy::Kind::FillLinear>(
-              typename LinCopy::Dst(out, 1, block_storage.data()),
-              typename LinCopy::Src(0, 0, &m_paddingValue),
-              output_inner_pad_after_size);
-        }
-      }
-
-      for (int j = 0; j < NumDims - 1; ++j) {
-        const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
-
-        if (++it[j].count < it[j].size) {
-          input_offset += it[j].input_stride;
-          output_offset += it[j].output_stride;
-          output_coord[dim] += 1;
-          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
-          break;
-        }
-        it[j].count = 0;
-        input_offset -= it[j].input_span;
-        output_offset -= it[j].output_span;
-        output_coord[dim] -= it[j].size - 1;
-        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
-      }
-    }
-
-    return block_storage.AsTensorMaterializedBlock();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
  private:
-  struct BlockIteratorState {
-    BlockIteratorState()
-        : count(0),
-          size(0),
-          input_stride(0),
-          input_span(0),
-          output_stride(0),
-          output_span(0) {}
-
-    Index count;
-    Index size;
-    Index input_stride;
-    Index input_span;
-    Index output_stride;
-    Index output_span;
-  };
-
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(
       Index index, int dim_index) const {
 #if defined(EIGEN_HAS_INDEX_LIST)
@@ -569,23 +262,22 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     const Index initialIndex = index;
     Index inputIndex = 0;
-    EIGEN_UNROLL_LOOP
     for (int i = NumDims - 1; i > 0; --i) {
-      const Index firstIdx = index;
-      const Index lastIdx = index + PacketSize - 1;
+      const Index first = index;
+      const Index last = index + PacketSize - 1;
       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
       const Index lastPaddedRight = m_outputStrides[i+1];
 
-      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
         // all the coefficient are between the 2 padding zones.
         const Index idx = index / m_outputStrides[i];
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -597,21 +289,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
     }
 
-    const Index lastIdx = index + PacketSize - 1;
-    const Index firstIdx = index;
+    const Index last = index + PacketSize - 1;
+    const Index first = index;
     const Index lastPaddedLeft = m_padding[0].first;
     const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
     const Index lastPaddedRight = m_outputStrides[1];
 
-    if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
+    if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+    else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+    else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
       // all the coefficient are between the 2 padding zones.
       inputIndex += (index - m_padding[0].first);
       return m_impl.template packet<Unaligned>(inputIndex);
@@ -627,23 +319,23 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
 
     const Index initialIndex = index;
     Index inputIndex = 0;
-    EIGEN_UNROLL_LOOP
+
     for (int i = 0; i < NumDims - 1; ++i) {
-      const Index firstIdx = index;
-      const Index lastIdx = index + PacketSize - 1;
+      const Index first = index;
+      const Index last = index + PacketSize - 1;
       const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1];
       const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1];
       const Index lastPaddedRight = m_outputStrides[i];
 
-      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
+      if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+      else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) {
         // all the coefficient are in the padding zone.
         return internal::pset1<PacketReturnType>(m_paddingValue);
       }
-      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+      else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
         // all the coefficient are between the 2 padding zones.
         const Index idx = index / m_outputStrides[i+1];
         inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
@@ -655,21 +347,21 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
       }
     }
 
-    const Index lastIdx = index + PacketSize - 1;
-    const Index firstIdx = index;
+    const Index last = index + PacketSize - 1;
+    const Index first = index;
     const Index lastPaddedLeft = m_padding[NumDims-1].first;
     const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second);
     const Index lastPaddedRight = m_outputStrides[NumDims-1];
 
-    if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) {
+    if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+    else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) {
       // all the coefficient are in the padding zone.
       return internal::pset1<PacketReturnType>(m_paddingValue);
     }
-    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+    else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) {
       // all the coefficient are between the 2 padding zones.
       inputIndex += (index - m_padding[NumDims-1].first);
       return m_impl.template packet<Unaligned>(inputIndex);
@@ -681,7 +373,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -696,8 +387,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device
   PaddingDimensions m_padding;
 
   Scalar m_paddingValue;
-
-  const Device EIGEN_DEVICE_REF m_device;
 };
 
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
index 64a436e50..886a254f6 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -31,7 +31,6 @@ struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename PatchDim, typename XprType>
@@ -88,25 +87,17 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
 
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
  };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
@@ -152,7 +143,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -170,7 +161,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
     Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i > 0; --i) {
         const Index patchIdx = patchIndex / m_patchStrides[i];
         patchIndex -= patchIdx * m_patchStrides[i];
@@ -179,7 +169,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
         inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
       }
     } else {
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 2; ++i) {
         const Index patchIdx = patchIndex / m_patchStrides[i];
         patchIndex -= patchIdx * m_patchStrides[i];
@@ -207,7 +196,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
 
     Index inputIndices[2] = {0, 0};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 2; i > 0; --i) {
         const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
                                    patchIndices[1] / m_patchStrides[i]};
@@ -223,7 +211,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
         inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
       }
     } else {
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 2; ++i) {
         const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i],
                                    patchIndices[1] / m_patchStrides[i]};
@@ -250,7 +237,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
       EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
-      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -267,14 +253,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
            TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh); 
-  }
-#endif
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
  protected:
   Dimensions m_dimensions;
@@ -283,7 +262,6 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device>
   array<Index, NumDims-1> m_patchStrides;
 
   TensorEvaluator<ArgType, Device> m_impl;
-
 };
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
index 445248163..1655a813e 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -2,7 +2,6 @@
 // for linear algebra.
 //
 // Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -17,10 +16,10 @@ namespace internal {
 namespace {
 
 EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
-#if defined(EIGEN_GPU_COMPILE_PHASE)
+#ifdef __CUDA_ARCH__
   // We don't support 3d kernels since we currently only use 1 and
   // 2d kernels.
-  gpu_assert(threadIdx.z == 0);
+  assert(threadIdx.z == 0);
   return clock64() +
       blockIdx.x * blockDim.x + threadIdx.x +
       gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
@@ -45,15 +44,6 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
   uint64_t rnd = ::random() ^ mach_absolute_time();
   return rnd;
 
-#elif defined __native_client__
-  // Same approach as for win32, except using clock_gettime
-  timespec ts;
-  clock_gettime(CLOCK_REALTIME, &ts);
-  int rnd1 = ::rand();
-  int rnd2 = ::rand();
-  uint64_t rnd = (rnd1 | rnd2 << 16) ^ ts.tv_nsec;
-  return rnd;
-
 #else
   // Augment the current time with pseudo random number generation
   // to ensure that we get different seeds if we try to generate seeds
@@ -65,11 +55,11 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() {
 #endif
 }
 
-static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) {
   // TODO: Unify with the implementation in the non blocking thread pool.
   uint64_t current = *state;
   // Update the internal state
-  *state = current * 6364136223846793005ULL + (stream << 1 | 1);
+  *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
   // Generate the random output (using the PCG-XSH-RS scheme)
   return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
 }
@@ -83,17 +73,17 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t
 
 
 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
-  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+T RandomToTypeUniform(uint64_t* state) {
+  unsigned rnd = PCG_XSH_RS_generator(state);
   return static_cast<T>(rnd);
 }
 
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
+Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) {
   Eigen::half result;
   // Generate 10 random bits for the mantissa
-  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  unsigned rnd = PCG_XSH_RS_generator(state);
   result.x = static_cast<uint16_t>(rnd & 0x3ffu);
   // Set the exponent
   result.x |= (static_cast<uint16_t>(15) << 10);
@@ -103,14 +93,14 @@ Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
 
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
+float RandomToTypeUniform<float>(uint64_t* state) {
   typedef union {
     uint32_t raw;
     float fp;
   } internal;
   internal result;
   // Generate 23 random bits for the mantissa mantissa
-  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  const unsigned rnd = PCG_XSH_RS_generator(state);
   result.raw = rnd & 0x7fffffu;
   // Set the exponent
   result.raw |= (static_cast<uint32_t>(127) << 23);
@@ -119,7 +109,7 @@ float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
+double RandomToTypeUniform<double>(uint64_t* state) {
   typedef union {
     uint64_t raw;
     double dp;
@@ -128,9 +118,9 @@ double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
   result.raw = 0;
   // Generate 52 random bits for the mantissa
   // First generate the upper 20 bits
-  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
+  unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu;
   // The generate the lower 32 bits
-  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
+  unsigned rnd2 = PCG_XSH_RS_generator(state);
   result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
   // Set the exponent
   result.raw |= (static_cast<uint64_t>(1023) << 52);
@@ -139,14 +129,14 @@ double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state, uint64_t stream) {
-  return std::complex<float>(RandomToTypeUniform<float>(state, stream),
-                             RandomToTypeUniform<float>(state, stream));
+std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) {
+  return std::complex<float>(RandomToTypeUniform<float>(state),
+                             RandomToTypeUniform<float>(state));
 }
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state, uint64_t stream) {
-  return std::complex<double>(RandomToTypeUniform<double>(state, stream),
-                              RandomToTypeUniform<double>(state, stream));
+std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) {
+  return std::complex<double>(RandomToTypeUniform<double>(state),
+                              RandomToTypeUniform<double>(state));
 }
 
 template <typename T> class UniformRandomGenerator {
@@ -157,42 +147,17 @@ template <typename T> class UniformRandomGenerator {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
-    #ifdef EIGEN_USE_SYCL
-    // In SYCL it is not possible to build PCG_XSH_RS_state in one step. 
-    // Therefor, we need two step to initializate the m_state.
-    // IN SYCL, the constructor of the functor is s called on the CPU
-    // and we get the clock seed here from the CPU. However, This seed is 
-    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
-    // and only  available on the Operator() function (which is called on the GPU).
-    // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread 
-    // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds  
-    // the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction 
-    // similar to CUDA Therefore, the thread Id injection is not available at this stage. 
-    //However when the operator() is called the thread ID will be avilable. So inside the opeator, 
-    // we add the thrreadID, BlockId,... (which is equivalent of i) 
-    //to the seed and construct the unique m_state per thead similar to cuda.  
-    m_exec_once =false;
-   #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(
       const UniformRandomGenerator& other) {
     m_state = other.m_state;
-    #ifdef EIGEN_USE_SYCL
-     m_exec_once =other.m_exec_once;
-    #endif
   }
 
   template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
-    #ifdef EIGEN_USE_SYCL
-      if(!m_exec_once) {
-      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
-      // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
-       m_state += (i * 6364136223846793005ULL);
-       m_exec_once =true;
-      }
-    #endif
-    T result = RandomToTypeUniform<T>(&m_state, i);
+    uint64_t local_state = m_state + i;
+    T result = RandomToTypeUniform<T>(&local_state);
+    m_state = local_state;
     return result;
   }
 
@@ -200,25 +165,16 @@ template <typename T> class UniformRandomGenerator {
   Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
-      #ifdef EIGEN_USE_SYCL
-      if(!m_exec_once) {
-      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
-       m_state += (i * 6364136223846793005ULL);
-       m_exec_once =true;
-      }
-    #endif
-    EIGEN_UNROLL_LOOP
+    uint64_t local_state = m_state + i;
     for (int j = 0; j < packetSize; ++j) {
-      values[j] = RandomToTypeUniform<T>(&m_state, i);
+      values[j] = RandomToTypeUniform<T>(&local_state);
     }
+    m_state = local_state;
     return internal::pload<Packet>(values);
   }
 
  private:
   mutable uint64_t m_state;
-  #ifdef EIGEN_USE_SYCL
-  mutable bool m_exec_once;
-  #endif
 };
 
 template <typename Scalar>
@@ -234,14 +190,14 @@ struct functor_traits<UniformRandomGenerator<Scalar> > {
 
 
 template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
+T RandomToTypeNormal(uint64_t* state) {
   // Use the ratio of uniform method to generate numbers following a normal
   // distribution. See for example Numerical Recipes chapter 7.3.9 for the
   // details.
   T u, v, q;
   do {
-    u = RandomToTypeUniform<T>(state, stream);
-    v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
+    u = RandomToTypeUniform<T>(state);
+    v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5));
     const T x = u - T(0.449871);
     const T y = numext::abs(v) + T(0.386595);
     q = x*x + y * (T(0.196)*y - T(0.25472)*x);
@@ -252,14 +208,14 @@ T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
 }
 
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state, uint64_t stream) {
-  return std::complex<float>(RandomToTypeNormal<float>(state, stream),
-                             RandomToTypeNormal<float>(state, stream));
+std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) {
+  return std::complex<float>(RandomToTypeNormal<float>(state),
+                             RandomToTypeNormal<float>(state));
 }
 template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state, uint64_t stream) {
-  return std::complex<double>(RandomToTypeNormal<double>(state, stream),
-                              RandomToTypeNormal<double>(state, stream));
+std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) {
+  return std::complex<double>(RandomToTypeNormal<double>(state),
+                              RandomToTypeNormal<double>(state));
 }
 
 
@@ -270,38 +226,17 @@ template <typename T> class NormalRandomGenerator {
   // Uses the given "seed" if non-zero, otherwise uses a random seed.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
     m_state = PCG_XSH_RS_state(seed);
-    #ifdef EIGEN_USE_SYCL
-    // In SYCL it is not possible to build PCG_XSH_RS_state in one step. 
-    // Therefor, we need two steps to initializate the m_state.
-    // IN SYCL, the constructor of the functor is s called on the CPU
-    // and we get the clock seed here from the CPU. However, This seed is 
-    //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
-    // and only  available on the Operator() function (which is called on the GPU).
-    // Therefore, the thread Id injection is not available at this stage. However when the operator() 
-    //is called the thread ID will be avilable. So inside the opeator, 
-    // we add the thrreadID, BlockId,... (which is equivalent of i) 
-    //to the seed and construct the unique m_state per thead similar to cuda.  
-    m_exec_once =false;
-   #endif
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(
       const NormalRandomGenerator& other) {
     m_state = other.m_state;
-#ifdef EIGEN_USE_SYCL
-    m_exec_once=other.m_exec_once;
-#endif
   }
 
  template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   T operator()(Index i) const {
-    #ifdef EIGEN_USE_SYCL
-    if(!m_exec_once) {
-      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
-      m_state += (i * 6364136223846793005ULL);
-      m_exec_once =true;
-    }
-    #endif
-    T result = RandomToTypeNormal<T>(&m_state, i);
+    uint64_t local_state = m_state + i;
+    T result = RandomToTypeNormal<T>(&local_state);
+    m_state = local_state;
     return result;
   }
 
@@ -309,25 +244,16 @@ template <typename T> class NormalRandomGenerator {
   Packet packetOp(Index i) const {
     const int packetSize = internal::unpacket_traits<Packet>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
-    #ifdef EIGEN_USE_SYCL
-    if(!m_exec_once) {
-      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
-      m_state += (i * 6364136223846793005ULL);
-      m_exec_once =true;
-    }
-    #endif
-    EIGEN_UNROLL_LOOP
+    uint64_t local_state = m_state + i;
     for (int j = 0; j < packetSize; ++j) {
-      values[j] = RandomToTypeNormal<T>(&m_state, i);
+      values[j] = RandomToTypeNormal<T>(&local_state);
     }
+    m_state = local_state;
     return internal::pload<Packet>(values);
   }
 
  private:
   mutable uint64_t m_state;
-   #ifdef EIGEN_USE_SYCL
-  mutable bool m_exec_once;
-  #endif
 };
 
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 8332a9ae0..41d0d0022 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -11,20 +11,8 @@
 #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
 
-// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
-// so we'll use a macro to make clang happy.
-#ifndef KERNEL_FRIEND
-#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
-#define KERNEL_FRIEND friend __global__
-#else
-#define KERNEL_FRIEND friend
-#endif
-#endif
-
-
 namespace Eigen {
 
-
 /** \class TensorReduction
   * \ingroup CXX11_Tensor_Module
   *
@@ -44,7 +32,6 @@ namespace internal {
   typedef typename XprType::Nested Nested;
   static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 
   template <class T> struct MakePointer {
     // Intermediate typedef to workaround MSVC issue.
@@ -165,9 +152,7 @@ struct GenericDimReducer<-1, Self, Op> {
   }
 };
 
-template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
-          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful &&
-                                   !Self::ReducerTraits::IsExactlyAssociative)>
+template <typename Self, typename Op, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
 struct InnerMostDimReducer {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
     typename Self::CoeffReturnType accum = reducer.initialize();
@@ -179,88 +164,23 @@ struct InnerMostDimReducer {
 };
 
 template <typename Self, typename Op>
-struct InnerMostDimReducer<Self, Op, true, false> {
+struct InnerMostDimReducer<Self, Op, true> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
-    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    const int packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
     const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
-    typename Self::PacketReturnType paccum = reducer.template initializePacket<typename Self::PacketReturnType>();
+    typename Self::PacketReturnType p = reducer.template initializePacket<typename Self::PacketReturnType>();
     for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
-      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
+      reducer.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &p);
     }
     typename Self::CoeffReturnType accum = reducer.initialize();
     for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) {
       reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
     }
-    return reducer.finalizeBoth(accum, paccum);
+    return reducer.finalizeBoth(accum, p);
   }
 };
 
-#if !defined(EIGEN_HIPCC) 
-static const int kLeafSize = 1024;
-
-template <typename Self, typename Op>
-struct InnerMostDimReducer<Self, Op, false, true> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
-  reduce(const Self& self, typename Self::Index firstIndex,
-         typename Self::Index numValuesToReduce, Op& reducer) {
-    typename Self::CoeffReturnType accum = reducer.initialize();
-    if (numValuesToReduce > kLeafSize) {
-      const typename Self::Index half = numValuesToReduce / 2;
-      reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
-      reducer.reduce(
-          reduce(self, firstIndex + half, numValuesToReduce - half, reducer),
-          &accum);
-    } else {
-      for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
-        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
-      }
-    }
-    return reducer.finalize(accum);
-  }
-};
-
-template <typename Self, typename Op>
-struct InnerMostDimReducer<Self, Op, true, true> {
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType
-  reduce(const Self& self, typename Self::Index firstIndex,
-         typename Self::Index numValuesToReduce, Op& reducer) {
-    const typename Self::Index packetSize =
-        internal::unpacket_traits<typename Self::PacketReturnType>::size;
-    typename Self::CoeffReturnType accum = reducer.initialize();
-    if (numValuesToReduce > packetSize * kLeafSize) {
-      // Make sure the split point is aligned on a packet boundary.
-      const typename Self::Index split =
-          packetSize *
-          divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)),
-                packetSize);
-      const typename Self::Index num_left =
-          numext::mini(split - firstIndex, numValuesToReduce);
-      reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
-      if (num_left < numValuesToReduce) {
-        reducer.reduce(
-            reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
-      }
-      return reducer.finalize(accum);
-    } else {
-      const typename Self::Index VectorizedSize =
-          (numValuesToReduce / packetSize) * packetSize;
-      typename Self::PacketReturnType paccum =
-          reducer.template initializePacket<typename Self::PacketReturnType>();
-      for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) {
-        reducer.reducePacket(
-            self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum);
-      }
-      for (typename Self::Index j = VectorizedSize; j < numValuesToReduce;
-           ++j) {
-        reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
-      }
-      return reducer.finalizeBoth(accum, paccum);
-    }
-  }
-};
-#endif
- 
-template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
+template <int DimIndex, typename Self, typename Op, bool vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
 struct InnerMostDimPreserver {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) {
     eigen_assert(false && "should never be called");
@@ -295,11 +215,11 @@ struct InnerMostDimPreserver<-1, Self, Op, true> {
 };
 
 // Default full reducer
-template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
+template <typename Self, typename Op, typename Device, bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
 struct FullReducer {
   static const bool HasOptimizedImplementation = false;
 
-  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) {
+  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) {
     const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
     *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
   }
@@ -309,7 +229,7 @@ struct FullReducer {
 #ifdef EIGEN_USE_THREADS
 // Multithreaded full reducers
 template <typename Self, typename Op,
-          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
+          bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)>
 struct FullReducerShard {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex,
                   typename Self::Index numValuesToReduce, Op& reducer,
@@ -322,8 +242,8 @@ struct FullReducerShard {
 // Multithreaded full reducer
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
-  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
-  static const Index PacketSize =
+  static const bool HasOptimizedImplementation = !Op::IsStateful;
+  static const int PacketSize =
       unpacket_traits<typename Self::PacketReturnType>::size;
 
   // launch one reducer per thread and accumulate the result.
@@ -400,58 +320,29 @@ struct OuterReducer {
   }
 };
 
-#ifdef EIGEN_USE_SYCL
-// Default Generic reducer
-template <typename Self, typename Op, typename Device>
-struct GenericReducer {
-  static const bool HasOptimizedImplementation = false;
 
-  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) {
-    eigen_assert(false && "Not implemented");
-    return true;
-  }
-};
-#endif
-
-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
-template <int B, int N, typename S, typename R, typename I_>
-__global__ void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+template <int B, int N, typename S, typename R, typename I>
+__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
 
 
-#if defined(EIGEN_HAS_GPU_FP16)
-template <typename S, typename R, typename I_>
-__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
-template <int B, int N, typename S, typename R, typename I_>
-__global__ void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
-template <int NPT, typename S, typename R, typename I_>
-__global__ void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename S, typename R, typename I>
+__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+template <int B, int N, typename S, typename R, typename I>
+__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+template <int NPT, typename S, typename R, typename I>
+__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*);
 
 #endif
 
-template <int NPT, typename S, typename R, typename I_>
-__global__ void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+template <int NPT, typename S, typename R, typename I>
+__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 
-template <int NPT, typename S, typename R, typename I_>
-__global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+template <int NPT, typename S, typename R, typename I>
+__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
 
-/**
- * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
- * This allows the reduction to have a different type for the accumulator than the input data type.
- * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input
- * with the accumulator and the other for reducing two accumulators.
- * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for
- * some properties of the input.
- */
-template <typename Op, typename CoeffReturnType>
-struct ReductionReturnType {
-#if defined(EIGEN_USE_SYCL)
-  typedef typename remove_const<decltype(std::declval<Op>().initialize())>::type type;
-#else
-  typedef typename remove_const<CoeffReturnType>::type type;
-#endif
-};
-
 }  // end namespace internal
 
 
@@ -485,15 +376,11 @@ class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType,
     const Op m_reducer;
 };
 
-template<typename ArgType, typename Device>
-struct TensorReductionEvaluatorBase;
 
 // Eval as rvalue
 template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
 {
-  typedef internal::reducer_traits<Op, Device> ReducerTraits;
-  typedef Dims ReducedDims;
   typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
   typedef typename XprType::Index Index;
   typedef ArgType ChildType;
@@ -503,42 +390,26 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
   static const int NumOutputDims = NumInputDims - NumReducedDims;
   typedef typename internal::conditional<NumOutputDims==0, Sizes<>, DSizes<Index, NumOutputDims> >::type Dimensions;
   typedef typename XprType::Scalar Scalar;
-  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
+  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
   static const bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
-  typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename internal::remove_const<typename XprType::CoeffReturnType>::type CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const Index PacketSize = PacketType<CoeffReturnType, Device>::size;
-
-  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-
-    // Subset of strides of the input tensor for the non-reduced dimensions.
-  // Indexed by output dimensions.
-  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = false,
-    PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = true,
+    PacketAccess = Self::InputPacketAccess && Op::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   static const bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
   static const bool RunningFullReduction = (NumOutputDims==0);
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
-      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims())
   {
     EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
     EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
@@ -563,13 +434,11 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
         m_outputStrides[0] = 1;
         for (int i = 1; i < NumOutputDims; ++i) {
           m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
-          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
         }
       } else {
-        m_outputStrides[NumOutputDims - 1] = 1;
+        m_outputStrides.back() = 1;
         for (int i = NumOutputDims - 2; i >= 0; --i) {
           m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
-          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
         }
       }
     }
@@ -597,7 +466,6 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
           ++reduceIndex;
         } else {
           m_preservedStrides[outputIndex] = input_strides[i];
-          m_output_to_input_dim_map[outputIndex] = i;
           ++outputIndex;
         }
       }
@@ -607,26 +475,13 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
     if (NumOutputDims == 0) {
       m_preservedStrides[0] = internal::array_prod(input_dims);
     }
-
-    m_numValuesToReduce =
-        NumOutputDims == 0
-            ? internal::array_prod(input_dims)
-            : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
-                  ? m_preservedStrides[0]
-                  : m_preservedStrides[NumOutputDims - 1];
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_STRONG_INLINE
-#if !defined(EIGEN_HIPCC)
-  // Marking this as EIGEN_DEVICE_FUNC for HIPCC requires also doing the same
-  // for all the functions being called within here, which then leads to
-  // proliferation of EIGEN_DEVICE_FUNC markings, one of which will eventually
-  // result in an NVCC error
-  EIGEN_DEVICE_FUNC
-#endif
-  bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_<CoeffReturnType>::Type data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+
     // Use the FullReducer if possible.
     if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction &&
         internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
@@ -634,7 +489,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
          !RunningOnGPU))) {
       bool need_assign = false;
       if (!data) {
-        m_result = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
+        m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType)));
         data = m_result;
         need_assign = true;
       }
@@ -642,9 +497,20 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
       internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
       return need_assign;
     }
+    else if(RunningOnSycl){
+      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+      if (!data) {
+        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+        m_result = data;
+      }
+      Op reducer(m_reducer);
+      internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
+      return (m_result != NULL);
+    }
 
     // Attempt to use an optimized reduction.
-    else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
+    else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) {
       bool reducing_inner_dims = true;
       for (int i = 0; i < NumReducedDims; ++i) {
         if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
@@ -658,8 +524,8 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
         if (!data) {
-          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) {
-            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
+            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
             m_result = data;
           }
           else {
@@ -667,10 +533,9 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
           }
         }
         Op reducer(m_reducer);
-        // For SYCL this if always return false
         if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
           if (m_result) {
-            m_device.deallocate_temp(m_result);
+            m_device.deallocate(m_result);
             m_result = NULL;
           }
           return true;
@@ -692,8 +557,8 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
         if (!data) {
-          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) {
-            data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
+            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
             m_result = data;
           }
           else {
@@ -701,10 +566,9 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
           }
         }
         Op reducer(m_reducer);
-        // For SYCL this if always return false
         if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
           if (m_result) {
-            m_device.deallocate_temp(m_result);
+            m_device.deallocate(m_result);
             m_result = NULL;
           }
           return true;
@@ -712,64 +576,21 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
           return (m_result != NULL);
         }
       }
-      #if defined(EIGEN_USE_SYCL)
-      // If there is no Optimised version for SYCL, the reduction expression 
-      // must break into two subexpression and use the SYCL generic Reducer on the device.
-      if(RunningOnSycl) {
-         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
-         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-         if (!data) {
-           data = static_cast<EvaluatorPointerType>(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
-           m_result = data;
-         }
-         Op reducer(m_reducer);
-         internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve);
-         return (m_result != NULL);
-       }
-      #endif
     }
     return true;
   }
 
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_STRONG_INLINE
-#if !defined(EIGEN_HIPCC)
-      EIGEN_DEVICE_FUNC
-#endif
-      void
-      evalSubExprsIfNeededAsync(EvaluatorPointerType data,
-                                EvalSubExprsCallback done) {
-    m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) {
-      done(evalSubExprsIfNeededCommon(data));
-    });
-  }
-#endif
-
-  EIGEN_STRONG_INLINE
-#if !defined(EIGEN_HIPCC)
-  // Marking this as EIGEN_DEVICE_FUNC for HIPCC requires also doing the same
-  // for all the functions being called within here, which then leads to
-  // proliferation of EIGEN_DEVICE_FUNC markings, one of which will eventually
-  // result in an NVCC error
-  EIGEN_DEVICE_FUNC
-#endif
-  bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
-    m_impl.evalSubExprsIfNeeded(NULL);
-    return evalSubExprsIfNeededCommon(data);
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
     if (m_result) {
-      m_device.deallocate_temp(m_result);
+      m_device.deallocate(m_result);
       m_result = NULL;
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    if (( RunningFullReduction || RunningOnGPU) && m_result ) {
+    if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) {
       return *(m_result + index);
     }
     Op reducer(m_reducer);
@@ -841,52 +662,37 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
     }
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
-  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
-  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-    m_result.bind(cgh);
-  }
-#endif
+  EIGEN_DEVICE_FUNC typename MakePointer_<Scalar>::Type data() const { return m_result; }
+  /// required by sycl in order to extract the accessor
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  /// added for sycl in order to construct the buffer from the sycl device
+  const Device& device() const{return m_device;}
+  /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel
+  const Dims& xprDims() const {return m_xpr_dims;}
+
 
   private:
   template <int, typename, typename> friend struct internal::GenericDimReducer;
-  template <typename, typename, bool, bool> friend struct internal::InnerMostDimReducer;
+  template <typename, typename, bool> friend struct internal::InnerMostDimReducer;
   template <int, typename, typename, bool> friend struct internal::InnerMostDimPreserver;
   template <typename S, typename O, typename D, bool V> friend struct internal::FullReducer;
 #ifdef EIGEN_USE_THREADS
   template <typename S, typename O, bool V> friend struct internal::FullReducerShard;
 #endif
-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
-  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
-#if defined(EIGEN_HAS_GPU_FP16)
-  template <typename S, typename R, typename I_> KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
-  template <int B, int N, typename S, typename R, typename I_> KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
-  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*);
+#ifdef EIGEN_HAS_CUDA_FP16
+  template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*);
+  template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*);
+  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*);
 #endif
-  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+  template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 
-  template <int NPT, typename S, typename R, typename I_> KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+  template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*);
 #endif
 
-#if defined(EIGEN_USE_SYCL)
- template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer;
- // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer
- template <typename, typename, typename> friend struct internal::GenericReducer;
-#endif
-
-
   template <typename S, typename O, typename D> friend struct internal::InnerReducer;
 
-  struct BlockIteratorState {
-    Index input_dim;
-    Index output_size;
-    Index output_count;
-  };
-
   // Returns the Index in the input tensor of the first value that needs to be
   // used to compute the reduction at output index "index".
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
@@ -935,12 +741,10 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
   Dimensions m_dimensions;
   // Precomputed strides for the output tensor.
   array<Index, NumOutputDims> m_outputStrides;
-  array<internal::TensorIntDivisor<Index>, NumOutputDims> m_fastOutputStrides;
+  // Subset of strides of the input tensor for the non-reduced dimensions.
+  // Indexed by output dimensions.
+  static const int NumPreservedStrides = max_n_1<NumOutputDims>::size;
   array<Index, NumPreservedStrides> m_preservedStrides;
-  // Map from output to input dimension index.
-  array<Index, NumOutputDims> m_output_to_input_dim_map;
-  // How many values go into each reduction
-  Index m_numValuesToReduce;
 
   // Subset of strides of the input tensor for the reduced dimensions.
   // Indexed by reduced dimensions.
@@ -956,7 +760,7 @@ struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, M
   Op m_reducer;
 
   // For full reductions
-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
   static const bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
   static const bool RunningOnSycl = false;
 #elif defined(EIGEN_USE_SYCL)
@@ -966,36 +770,10 @@ static const bool RunningOnGPU = false;
   static const bool RunningOnGPU = false;
   static const bool RunningOnSycl = false;
 #endif
-  EvaluatorPointerType m_result;
+  typename MakePointer_<CoeffReturnType>::Type m_result;
 
-  const Device EIGEN_DEVICE_REF m_device;
-};
-
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
-struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
-: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
-  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){}
-};
-
-
-template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
-struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
-: public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
-
-  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> Base;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){}
-  // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
-  //Therefore the coeff function should be overridden by for SYCL kernel
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
-    return *(this->data() + index);
-  }
-  // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel
-  //Therefore the packet function should be overridden by for SYCL kernel
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
-    return internal::pload<typename Base::PacketReturnType>(this->data() + index);
-  }
+  const Device& m_device;
+  const Dims& m_xpr_dims;
 };
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 68780cd3c..65638b6a8 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -1,6 +1,750 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#if defined(__clang__) || defined(__GNUC__)
-#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorReductionGpu.h file"
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
+
+namespace Eigen {
+namespace internal {
+
+
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple cuda threads to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another cuda thread
+// updated the content of the output address it will try again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if __CUDA_ARCH__ >= 300
+  if (sizeof(T) == 4)
+  {
+    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+    unsigned int newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned int readback;
+    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else if (sizeof(T) == 8) {
+    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+    unsigned long long newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned long long readback;
+    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  }
+  else {
+    assert(0 && "Wordsize not supported");
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <template <typename T> class R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
+  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+  unsigned int newval = oldval;
+  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+  if (newval == oldval) {
+    return;
+  }
+  unsigned int readback;
+  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+    oldval = readback;
+    newval = oldval;
+    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+  }
+}
 #endif
 
-#include "TensorReductionGpu.h"
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if __CUDA_ARCH__ >= 300
+  atomicAdd(output, accum);
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+
+template <typename CoeffType, typename Index>
+__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+    output[i] = val;
+  }
+}
+
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
+#if __CUDA_ARCH__ >= 300
+  // Initialize the output value
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
+  }
+  else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExchCustom(output, reducer.initialize());
+        __threadfence();
+        atomicExch(semaphore, 2u);
+      }
+      else {
+        // Wait for the first block to initialize the output value.
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val;
+        do {
+          val = atomicCAS(semaphore, 2u, 2u);
+        }
+        while (val < 2u);
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
+  typename Self::CoeffReturnType accum = reducer.initialize();
+  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
+  for (Index i = 0; i < max_iter; i+=BlockSize) {
+    const Index index = first_index + i;
+    eigen_assert(index < num_coeffs);
+    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+    reducer.reduce(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(output, accum, reducer);
+  }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    // Let the last block reset the semaphore
+    atomicInc(semaphore, gridDim.x + 1);
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) {
+  eigen_assert(blockDim.x == 1);
+  eigen_assert(gridDim.x == 1);
+  if (num_coeffs % 2 != 0) {
+    half last = input.m_impl.coeff(num_coeffs-1);
+    *scratch = __halves2half2(last, reducer.initialize());
+  } else {
+    *scratch = reducer.template initializePacket<half2>();
+  }
+}
+
+template <typename Self,
+          typename Reducer, typename Index>
+__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index num_packets = num_coeffs / 2;
+  for (Index i = thread_id; i < num_packets; i += num_threads) {
+    ((half2*)output)[i] = reducer.template initializePacket<half2>();
+  }
+
+  if (thread_id == 0 && num_coeffs % 2 != 0) {
+    output[num_coeffs-1] = reducer.initialize();
+  }
+}
+
+template <int BlockSize, int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+                                    half* output, half2* scratch) {
+  eigen_assert(NumPerThread % 2 == 0);
+
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x;
+
+  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+  if (gridDim.x == 1 && first_index == 0) {
+    if (num_coeffs % 2 != 0) {
+      half last = input.m_impl.coeff(num_coeffs-1);
+      *scratch = __halves2half2(last, reducer.initialize());
+    } else {
+      *scratch = reducer.template initializePacket<half2>();
+    }
+    __syncthreads();
+  }
+
+  half2 accum = reducer.template initializePacket<half2>();
+  const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2);
+  for (Index i = 0; i < max_iter; i += BlockSize) {
+    const Index index = first_index + 2*i;
+    eigen_assert(index + 1 < num_coeffs);
+    half2 val = input.m_impl.template packet<Unaligned>(index);
+    reducer.reducePacket(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize/2; offset > 0; offset /= 2) {
+    reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum);
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(scratch, accum, reducer);
+  }
+
+  __syncthreads();
+
+  if (gridDim.x == 1 && first_index == 0) {
+    half tmp = __low2half(*scratch);
+    reducer.reduce(__high2half(*scratch), &tmp);
+    *output = tmp;
+  }
+}
+
+template <typename Op>  // Final step of the multi-block half-float full reduction: collapses the half2 scratch value into one half.
+__global__ void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half2* scratch) {  // reducer is taken by value: a reference to the launcher's host-side object is not usable on the device
+  eigen_assert(threadIdx.x == 0);  // the launcher starts this kernel with launch dimensions 1, 1, so the only thread has index 0
+  half tmp = __low2half(*scratch);  // start from the low half of the packed accumulator
+  reducer.reduce(__high2half(*scratch), &tmp);  // fold the high half into it
+  *output = tmp;  // write the scalar result
+}
+
+#endif
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+    assert(false && "Should only be called on doubles, floats and half floats");
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+    typedef typename Self::CoeffReturnType Scalar;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+
+    unsigned int* semaphore = NULL;
+    if (num_blocks > 1) {
+      semaphore = device.semaphore();
+    }
+
+    LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
+  }
+};
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+    assert(false && "Should not be called since there is no packet accessor");
+  }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
+    half2* scratch = static_cast<half2*>(device.scratchpad());
+
+    if (num_blocks > 1) {
+      // We initialize the output and the scratchpad outside the reduction kernel when we can't be sure that there
+      // won't be race conditions between multiple thread blocks.
+      LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
+    }
+
+    LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+    if (num_blocks > 1) {
+      LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
+                         1, 1, 0, device, reducer, output, scratch);
+    }
+  }
+};
+#endif
+
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't support exotic types such as complex well,
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats
+#ifdef EIGEN_HAS_CUDA_FP16
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
+#else
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&
+                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+  template <typename OutputType>
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return;
+    }
+
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+  }
+};
+
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                         typename Self::CoeffReturnType* output) {
+#if __CUDA_ARCH__ >= 300
+  typedef typename Self::CoeffReturnType Type;
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = i / input_col_blocks;
+
+    if (row < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+      Type reduced_val = reducer.initialize();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+        if (last_col >= num_coeffs_to_reduce) {
+          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            reducer.reduce(val, &reduced_val);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k);
+            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+      }
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        atomicReduce(&(output[row]), reduced_val, reducer);
+      }
+    }
+  }
+#else
+  assert(0 && "Shouldn't be called on unsupported device");
+#endif
+}
+
+#ifdef EIGEN_HAS_CUDA_FP16
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                              half* output) {
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+  eigen_assert(unroll_times % 2 == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    Index i = 2*thread_id;
+    for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
+      half* loc = output + i;
+      *((half2*)loc) = reducer.template initializePacket<half2>();
+    }
+    if (i < num_preserved_coeffs) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = 2 * (i / input_col_blocks);
+
+    if (row + 1 < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x);
+
+      half2 reduced_val1 = reducer.template initializePacket<half2>();
+      half2 reduced_val2 = reducer.template initializePacket<half2>();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2;
+        if (last_col >= num_coeffs_to_reduce) {
+          Index col = col_begin + blockDim.x * j;
+          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
+            const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val1, &reduced_val1);
+            const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val2, &reduced_val2);
+          }
+          if (col < num_coeffs_to_reduce) {
+            // Peel;
+            const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            const half2 val1 = __halves2half2(last1, reducer.initialize());
+            reducer.reducePacket(val1, &reduced_val1);
+            const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col);
+            const half2 val2 = __halves2half2(last2, reducer.initialize());
+            reducer.reducePacket(val2, &reduced_val2);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k) * 2;
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1);
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
+        reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2);
+      }
+
+      half val1 =  __low2half(reduced_val1);
+      reducer.reduce(__high2half(reduced_val1), &val1);
+      half val2 =  __low2half(reduced_val2);
+      reducer.reduce(__high2half(reduced_val2), &val2);
+      half2 val = __halves2half2(val1, val2);
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        half* loc = output + row;
+        atomicReduce((half2*)loc, val, reducer);
+      }
+    }
+  }
+}
+
+#endif
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {  // Primary template: fallback selected when none of the specializations below matches.
+  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
+    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");  // reaching this overload is treated as a programming error
+    return true;  // no kernel is launched; NOTE(review): true presumably tells the caller to use its generic path - confirm at the call site
+  }
+};
+
+// Specialization for float and double: launches InnerReductionKernel, preceded by ReductionInitKernel when several blocks are used.
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+  Self, Op, OutputType, PacketAccess,
+  typename internal::enable_if<
+    internal::is_same<float, OutputType>::value ||
+    internal::is_same<double, OutputType>::value,
+  void>::type> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;  // total number of input coefficients
+    const int block_size = 256;  // block size handed to LAUNCH_CUDA_KERNEL for the reduction kernel
+    const int num_per_thread = 128;  // becomes the kernel's NumPerThread template argument
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);  // block count derived from the input size
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;  // upper bound from the limits the device reports
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // With more than one thread block the outputs are initialized by a separate kernel launched
+      // first, because the blocks could otherwise race on the initial output values.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);  // shadows the outer dyn_blocks: sized for the init kernel
+      const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / 1024;  // shadows the outer max_blocks
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);  // shadows the outer num_blocks within this scope only
+      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
+                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_preserved_vals, output);
+    }
+
+    LAUNCH_CUDA_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;  // the reduction was dispatched to the kernel above
+  }
+};
+
+#ifdef EIGEN_HAS_CUDA_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {  // Eigen::half output with PacketAccess == false
+  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+    assert(false && "Should not be called since there is no packet accessor");  // the half-float kernel reads its input through packet<>()
+    return true;  // no kernel is launched in this specialization
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {  // Eigen::half output with packet access: launches the half2-based kernel
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    if (num_preserved_vals % 2 != 0) {
+      // Not supported yet (the kernel stores outputs in half2 pairs), revert to the slower code path
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;  // total number of input coefficients
+    const int block_size = /*256*/128;  // block size handed to LAUNCH_CUDA_KERNEL for the reduction kernel
+    const int num_per_thread = /*128*/64;  // becomes the kernel's NumPerThread template argument
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);  // block count derived from the input size
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;  // upper bound from the limits the device reports
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // With more than one thread block the outputs are initialized by a separate kernel launched
+      // first, because the blocks could otherwise race on the initial output values.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);  // NOTE(review): the three locals declared in this scope are never used
+      const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / 1024;
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+      LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
+                         1, 1, 0, device, reducer, self, num_preserved_vals, output);  // launched with 1, 1 rather than num_blocks, 1024
+    }
+
+    LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;  // the reduction was dispatched to the kernel above
+  }
+};
+#endif
+
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {  // GpuDevice specialization: screens the request, then forwards to InnerReductionLauncher
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats, doubles and (only when EIGEN_HAS_CUDA_FP16 is defined) half floats.
+#ifdef EIGEN_HAS_CUDA_FP16
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&  // stateful reducers are excluded
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));  // half additionally needs packet access
+#else
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&  // stateful reducers are excluded
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif
+
+  template <typename OutputType>
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());  // computed from the input's dimensions; only used for the emptiness check
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return true;
+    }
+    // Reductions over at most 128 coefficients are left to the usual code, which is faster for them.
+    if (num_coeffs_to_reduce <= 128) {
+      return true;
+    }
+
+    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);  // launcher chosen by OutputType and the reducer's PacketAccess trait
+  }
+};
+
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                     typename Self::CoeffReturnType* output) {  // output holds one value per preserved coefficient (indexed by input_col below)
+  const Index num_threads = blockDim.x * gridDim.x;  // total threads in the launch
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;  // global index of this thread
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {  // single-block launch: the kernel initializes the outputs itself
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();  // all threads of the block finish initializing before any of them accumulates
+  }
+
+  // Do the reduction: every work item i covers one column and a chunk of up to NumPerThread consecutive rows.
+  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);  // columns times row chunks
+  for (Index i = thread_id; i < max_iter; i += num_threads) {  // each thread advances by the total thread count
+    const Index input_col = i % num_preserved_coeffs;  // column, which is also the output slot
+    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;  // first row of this chunk
+    typename Self::CoeffReturnType reduced_val = reducer.initialize();  // per-chunk partial result
+    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);  // clamp the last chunk to the end of the reduced dimension
+    for (Index j = input_row; j < max_row; j++) {
+      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);  // rows are num_preserved_coeffs elements apart
+      reducer.reduce(val, &reduced_val);
+    }
+    atomicReduce(&(output[input_col]), reduced_val, reducer);  // merge the partial result; other threads may target the same slot
+  }
+}
+
+
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, GpuDevice> {  // GpuDevice specialization: launches OuterReductionKernel for float outputs
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats. NOTE(review): the trait below also accepts double, yet only a float* overload of run() launches a kernel.
+  static const bool HasOptimizedImplementation = !Op::IsStateful &&  // stateful reducers are excluded
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
+  template <typename Device, typename OutputType>
+  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {  // catch-all overload: taken whenever the float*/GpuDevice overload below does not match
+    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+    return true;  // no kernel is launched by this overload
+  }
+
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    // Reductions over at most 32 coefficients are left to the usual code, which is faster for them.
+    if (num_coeffs_to_reduce <= 32) {
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;  // total number of input coefficients
+    const int block_size = 256;  // block size handed to LAUNCH_CUDA_KERNEL for the reduction kernel
+    const int num_per_thread = 16;  // becomes the kernel's NumPerThread template argument
+    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);  // block count derived from the input size
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;  // upper bound from the limits the device reports
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // With more than one thread block the outputs are initialized by a separate kernel launched first;
+      // only a single-block launch lets the reduction kernel initialize them itself without a race.
+      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);  // shadows the outer dyn_blocks: sized for the init kernel
+      const int max_blocks = device.getNumCudaMultiProcessors() *
+                             device.maxCudaThreadsPerMultiProcessor() / 1024;  // shadows the outer max_blocks
+      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);  // shadows the outer num_blocks within this scope only
+      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+                         num_blocks, 1024, 0, device, reducer.initialize(),
+                         num_preserved_vals, output);
+    }
+
+    LAUNCH_CUDA_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
+                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;  // the reduction was dispatched to the kernel above
+  }
+};
+
+#endif
+
+
+} // end namespace internal
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
deleted file mode 100644
index 36df03d62..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ /dev/null
@@ -1,967 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
-#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
-
-namespace Eigen {
-namespace internal {
-
-
-#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
-// Full reducers for GPU, don't vectorize for now
-
-// Reducer function that enables multiple gpu thread to safely accumulate at the same
-// output address. It basically reads the current value of the output variable, and
-// attempts to update it with the new value. If in the meantime another gpu thread
-// updated the content of the output address it will try again.
-template <typename T, typename R>
-__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
-  if (sizeof(T) == 4)
-  {
-    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
-    unsigned int newval = oldval;
-    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-    unsigned int readback;
-    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
-      oldval = readback;
-      newval = oldval;
-      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-      if (newval == oldval) {
-        return;
-      }
-    }
-  }
-  else if (sizeof(T) == 8) {
-    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
-    unsigned long long newval = oldval;
-    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-    unsigned long long readback;
-    while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) {
-      oldval = readback;
-      newval = oldval;
-      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
-      if (newval == oldval) {
-        return;
-      }
-    }
-  }
-  else {
-    gpu_assert(0 && "Wordsize not supported");
-  }
-#else // EIGEN_CUDA_ARCH >= 300
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
-
-// We extend atomicExch to support extra data types
-template <typename Type>
-__device__ inline Type atomicExchCustom(Type* address, Type val) {
-  return atomicExch(address, val);
-}
-
-template <>
-__device__ inline double atomicExchCustom(double* address, double val) {
-  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
-  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
-}
-
-#ifdef EIGEN_HAS_GPU_FP16
-template <template <typename T> class R>
-__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
-  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
-  unsigned int newval = oldval;
-  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
-  if (newval == oldval) {
-    return;
-  }
-  unsigned int readback;
-  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
-    oldval = readback;
-    newval = oldval;
-    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
-    if (newval == oldval) {
-      return;
-    }
-  }
-}
-// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations
-template <template <typename T> class R>
-__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum,
-                                    R<half>& reducer) {
-  half2* houtput=reinterpret_cast<half2*>(output);
-  half2* haccum=reinterpret_cast<half2*>(&accum);
-  for(int i=0;i<4;++i){
-    atomicReduce(houtput+i,*(haccum+i),reducer);
-  }
-}
-#endif  // EIGEN_HAS_GPU_FP16
-
-template <>
-__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
-  atomicAdd(output, accum);
-#else // EIGEN_CUDA_ARCH >= 300
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
-
-
-template <typename CoeffType, typename Index>
-__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index num_threads = blockDim.x * gridDim.x;
-  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-    output[i] = val;
-  }
-}
-
-
-template <int BlockSize, int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
-                                    typename Self::CoeffReturnType* output, unsigned int* semaphore) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
-  // Initialize the output value
-  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
-  if (gridDim.x == 1) {
-    if (first_index == 0) {
-      *output = reducer.initialize();
-    }
-  }
-  else {
-    if (threadIdx.x == 0) {
-      unsigned int block = atomicCAS(semaphore, 0u, 1u);
-      if (block == 0) {
-        // We're the first block to run, initialize the output value
-        atomicExchCustom(output, reducer.initialize());
-        __threadfence();
-        atomicExch(semaphore, 2u);
-      }
-      else {
-        // Wait for the first block to initialize the output value.
-        // Use atomicCAS here to ensure that the reads aren't cached
-        unsigned int val;
-        do {
-          val = atomicCAS(semaphore, 2u, 2u);
-        }
-        while (val < 2u);
-      }
-    }
-  }
-
-  __syncthreads();
-
-  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
-
-  typename Self::CoeffReturnType accum = reducer.initialize();
-  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize);
-  for (Index i = 0; i < max_iter; i+=BlockSize) {
-    const Index index = first_index + i;
-    eigen_assert(index < num_coeffs);
-    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
-    reducer.reduce(val, &accum);
-  }
-
-#pragma unroll
-  for (int offset = warpSize/2; offset > 0; offset /= 2) {
-  #if defined(EIGEN_HIPCC)
-    // use std::is_floating_point to determine the type of reduced_val 
-    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 
-    // and list the float and int versions of __shfl_down as the candidate functions. 
-    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
-      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
-    } else {
-      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
-    }
-  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
-  #else
-    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
-  #endif
-  }
-
-  if ((threadIdx.x & (warpSize - 1)) == 0) {
-    atomicReduce(output, accum, reducer);
-  }
-
-  if (gridDim.x > 1 && threadIdx.x == 0) {
-    // Let the last block reset the semaphore
-    atomicInc(semaphore, gridDim.x + 1);
-#if defined(EIGEN_HIPCC)
-    __threadfence_system();
-#endif
-  }
-#else // EIGEN_CUDA_ARCH >= 300
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
-
-
-#ifdef EIGEN_HAS_GPU_FP16
-template <typename Self,
-          typename Reducer, typename Index>
-__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
-                                                      packet_traits<Eigen::half>::type* scratch) {
-  eigen_assert(blockDim.x == 1);
-  eigen_assert(gridDim.x == 1);
-  typedef packet_traits<Eigen::half>::type packet_type;
-  Index packet_remainder =
-      num_coeffs % Index(unpacket_traits<packet_type>::size);
-  if (packet_remainder != 0) {
-    half2* h2scratch = reinterpret_cast<half2*>(scratch);
-    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
-      *h2scratch =
-          __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1));
-      h2scratch++;
-    }
-    if ((num_coeffs & 1) != 0) {
-      half lastCoeff = input.m_impl.coeff(num_coeffs - 1);
-      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
-    }
-  } else {
-    *scratch = reducer.template initializePacket<packet_type>();
-  }
-}
-
-template <typename Self,
-          typename Reducer, typename Index>
-__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  const Index num_threads = blockDim.x * gridDim.x;
-  typedef typename packet_traits<Eigen::half>::type PacketType;
-
-  const Index num_packets =
-      num_coeffs / Index(unpacket_traits<PacketType>::size);
-  PacketType* p_output = reinterpret_cast<PacketType*>(output);
-  for (Index i = thread_id; i < num_packets; i += num_threads) {
-    p_output[i] = reducer.template initializePacket<PacketType>();
-  }
-  Index packet_remainder =
-      num_coeffs % Index(unpacket_traits<PacketType>::size);
-  if (thread_id < packet_remainder) {
-    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
-  }
-}
-
-template <int BlockSize, int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
-                                    half* output, packet_traits<Eigen::half>::type* scratch) {
-  typedef typename packet_traits<Eigen::half>::type PacketType;
-  const int packet_width = unpacket_traits<PacketType>::size;
-  eigen_assert(NumPerThread % packet_width == 0);
-  const Index first_index =
-      blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
-
-  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
-
-  if (gridDim.x == 1) {
-    if (first_index == 0) {
-      int rem = num_coeffs % packet_width;
-      if (rem != 0) {
-        half2* p_scratch = reinterpret_cast<half2*>(scratch);
-        *scratch = reducer.template initializePacket<PacketType>();
-        for (int i = 0; i < rem / 2; i++) {
-          *p_scratch = __halves2half2(
-              input.m_impl.coeff(num_coeffs - packet_width + 2 * i),
-              input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1));
-          p_scratch++;
-        }
-        if ((num_coeffs & 1) != 0) {
-          half last = input.m_impl.coeff(num_coeffs - 1);
-          *p_scratch = __halves2half2(last, reducer.initialize());
-        }
-      } else {
-        *scratch = reducer.template initializePacket<PacketType>();
-      }
-    }
-    __syncthreads();
-  }
-
-  PacketType accum = reducer.template initializePacket<PacketType>();
-  const Index max_iter =
-      numext::mini<Index>((num_coeffs - first_index) / packet_width,
-                          NumPerThread * BlockSize / packet_width);
-  for (Index i = 0; i < max_iter; i += BlockSize) {
-    const Index index = first_index + packet_width * i;
-    eigen_assert(index + packet_width < num_coeffs);
-    PacketType val = input.m_impl.template packet<Unaligned>(index);
-    reducer.reducePacket(val, &accum);
-  }
-
-#pragma unroll
-  for (int offset = warpSize/2; offset > 0; offset /= 2) {
-  #if defined(EIGEN_HIPCC)
-    PacketType r1;
-    half2* hr = reinterpret_cast<half2*>(&r1);
-    half2* hacc = reinterpret_cast<half2*>(&accum);
-    for (int i = 0; i < packet_width / 2; i++) {
-      // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
-      union { int i; half2 h; } wka_in, wka_out;
-      wka_in.h = hacc[i];
-      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
-      hr[i] = wka_out.h;
-    }
-    reducer.reducePacket(r1, &accum);
-  #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-    PacketType r1;
-    half2* hr = reinterpret_cast<half2*>(&r1);
-    half2* hacc = reinterpret_cast<half2*>(&accum);
-    for (int i = 0; i < packet_width / 2; i++) {
-      hr[i] = __shfl_down(hacc[i], offset, warpSize);
-    }
-    reducer.reducePacket(r1, &accum);
-  #else
-    PacketType r1;
-    half2* hr = reinterpret_cast<half2*>(&r1);
-    half2* hacc = reinterpret_cast<half2*>(&accum);
-    for (int i = 0; i < packet_width / 2; i++) {
-      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
-    }
-    reducer.reducePacket(r1, &accum);
-
-  #endif
-  }
-
-  if ((threadIdx.x & (warpSize - 1)) == 0) {
-    atomicReduce(scratch, accum, reducer);
-  }
-
-  __syncthreads();
-  half2* rv1 = reinterpret_cast<half2*>(scratch);
-  if (packet_width > 2) {
-    reducer.reducePacket(rv1[2], rv1);
-    reducer.reducePacket(rv1[3], rv1 + 1);
-    reducer.reducePacket(rv1[1], rv1);
-  }
-  if (gridDim.x == 1) {
-    if (first_index == 0) {
-      half tmp = __low2half(*rv1);
-      reducer.reduce(__high2half(*rv1), &tmp);
-      *output = tmp;
-    }
-  }
-}
-
-template <typename Op>
-__global__ void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
-  eigen_assert(threadIdx.x == 1);
-  half2* pscratch = reinterpret_cast<half2*>(scratch);
-  half tmp = __float2half(0.f);
-  typedef packet_traits<Eigen::half>::type packet_type;
-  for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
-    reducer.reduce(__low2half(*pscratch), &tmp);
-    reducer.reduce(__high2half(*pscratch), &tmp);
-    pscratch++;
-  }
-  *output = tmp;
-}
-
-#endif // EIGEN_HAS_GPU_FP16
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct FullReductionLauncher {
-  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
-    gpu_assert(false && "Should only be called on doubles, floats and half floats");
-  }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct FullReductionLauncher<
-    Self, Op, OutputType, PacketAccess,
-    typename internal::enable_if<
-      internal::is_same<float, OutputType>::value ||
-      internal::is_same<double, OutputType>::value,
-    void>::type> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
-
-    typedef typename Self::Index Index;
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-
-    unsigned int* semaphore = NULL;
-    if (num_blocks > 1) {
-      semaphore = device.semaphore();
-    }
-
-    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore);
-  }
-};
-
-#ifdef EIGEN_HAS_GPU_FP16
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, false> {
-  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
-    gpu_assert(false && "Should not be called since there is no packet accessor");
-  }
-};
-
-template <typename Self, typename Op>
-struct FullReductionLauncher<Self, Op, Eigen::half, true> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) {
-    typedef typename Self::Index Index;
-    typedef typename packet_traits<Eigen::half>::type PacketType;
-
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    PacketType* scratch = static_cast<PacketType*>(device.scratchpad());
-    // half2* scratch = static_cast<half2*>(device.scratchpad());
-
-    if (num_blocks > 1) {
-      // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>),
-                         1, 1, 0, device, reducer, self, num_coeffs, scratch);
-    }
-
-    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch);
-
-    if (num_blocks > 1) {
-      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>),
-                         1, 1, 0, device, reducer, output, scratch);
-    }
-  }
-};
-#endif // EIGEN_HAS_GPU_FP16
-
-
-template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple cases
-  // of doubles, floats and half floats
-#ifdef EIGEN_HAS_GPU_FP16
-  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       internal::is_same<typename Self::CoeffReturnType, double>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else // EIGEN_HAS_GPU_FP16
-  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
-                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif // EIGEN_HAS_GPU_FP16
-
-  template <typename OutputType>
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return;
-    }
-
-    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
-  }
-};
-
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                         typename Self::CoeffReturnType* output) {
-#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
-  typedef typename Self::CoeffReturnType Type;
-  eigen_assert(blockDim.y == 1);
-  eigen_assert(blockDim.z == 1);
-  eigen_assert(gridDim.y == 1);
-  eigen_assert(gridDim.z == 1);
-
-  const int unroll_times = 16;
-  eigen_assert(NumPerThread % unroll_times == 0);
-
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
-  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
-
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
-    const Index row = i / input_col_blocks;
-
-    if (row < num_preserved_coeffs) {
-      const Index col_block = i % input_col_blocks;
-      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
-
-      Type reduced_val = reducer.initialize();
-
-      for (Index j = 0; j < NumPerThread; j += unroll_times) {
-        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
-        if (last_col >= num_coeffs_to_reduce) {
-          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
-            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
-            reducer.reduce(val, &reduced_val);
-          }
-          break;
-        } else {
-          // Faster version of the loop with no branches after unrolling.
-#pragma unroll
-          for (int k = 0; k < unroll_times; ++k) {
-            const Index col = col_begin + blockDim.x * (j + k);
-            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
-          }
-        }
-      }
-
-#pragma unroll
-      for (int offset = warpSize/2; offset > 0; offset /= 2) {
-      #if defined(EIGEN_HIPCC)
-        // use std::is_floating_point to determine the type of reduced_val 
-       // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error 
-       // and list the float and int versions of __shfl_down as the candidate functions. 
-        if (std::is_floating_point<Type>::value) {
-          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
-        } else {
-          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
-        }
-      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
-      #else
-        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
-      #endif
-      }
-
-      if ((threadIdx.x & (warpSize - 1)) == 0) {
-        atomicReduce(&(output[row]), reduced_val, reducer);
-      }
-    }
-  }
-#else // EIGEN_CUDA_ARCH >= 300
-  gpu_assert(0 && "Shouldn't be called on unsupported device");
-#endif // EIGEN_CUDA_ARCH >= 300
-}
-
-#ifdef EIGEN_HAS_GPU_FP16
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                              half* output) {
-  eigen_assert(blockDim.y == 1);
-  eigen_assert(blockDim.z == 1);
-  eigen_assert(gridDim.y == 1);
-  eigen_assert(gridDim.z == 1);
-
-  typedef typename packet_traits<Eigen::half>::type PacketType;
-  const int packet_width = unpacket_traits<PacketType>::size;
-  const int unroll_times = 16 / packet_width;
-  eigen_assert(NumPerThread % unroll_times == 0);
-  eigen_assert(unroll_times % 2 == 0);
-
-  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
-  const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2);
-
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    Index i = packet_width * thread_id;
-    for (; i + packet_width <= num_preserved_coeffs;
-         i += packet_width * num_threads) {
-      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
-      *poutput = reducer.template initializePacket<PacketType>();
-    }
-    if (i < num_preserved_coeffs) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
-    const Index row = 2 * (i / input_col_blocks);  // everybody takes 2 rows
-
-    if (row + 1 < num_preserved_coeffs) {
-      const Index col_block = i % input_col_blocks;
-      const Index col_begin =
-          packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
-
-      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
-      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
-
-      for (Index j = 0; j < NumPerThread; j += unroll_times) {
-        const Index last_col =
-            col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
-        if (last_col >= num_coeffs_to_reduce) {
-          Index col = col_begin + blockDim.x * j;
-          for (; col + packet_width <= num_coeffs_to_reduce;
-               col += blockDim.x) {
-            const PacketType val1 = input.m_impl.template packet<Unaligned>(
-                row * num_coeffs_to_reduce + col);
-            reducer.reducePacket(val1, &reduced_val1);
-            const PacketType val2 = input.m_impl.template packet<Unaligned>(
-                (row + 1) * num_coeffs_to_reduce + col);
-            reducer.reducePacket(val2, &reduced_val2);
-          }
-          if (col < num_coeffs_to_reduce) {
-            PacketType r1 = reducer.template initializePacket<PacketType>();
-            PacketType r2 = reducer.template initializePacket<PacketType>();
-            half2* hr1 = reinterpret_cast<half2*>(&r1);
-            half2* hr2 = reinterpret_cast<half2*>(&r2);
-            while (col + 1 < num_coeffs_to_reduce) {
-              *hr1 = __halves2half2(
-                  input.m_impl.coeff(row * num_coeffs_to_reduce + col),
-                  input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
-              *hr2 = __halves2half2(
-                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
-                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col +
-                                     1));
-              hr1++;
-              hr2++;
-              col += 2;
-            }
-            if (col < num_coeffs_to_reduce) {
-              // Peel;
-              const half last1 =
-                  input.m_impl.coeff(row * num_coeffs_to_reduce + col);
-              *hr1 = __halves2half2(last1, reducer.initialize());
-              const half last2 =
-                  input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
-              *hr2 = __halves2half2(last2, reducer.initialize());
-            }
-            reducer.reducePacket(r1, &reduced_val1);
-            reducer.reducePacket(r2, &reduced_val2);
-          }
-          break;
-        } else {
-          // Faster version of the loop with no branches after unrolling.
-#pragma unroll
-          for (int k = 0; k < unroll_times; ++k) {
-            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
-            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
-                                     row * num_coeffs_to_reduce + col),
-                                 &reduced_val1);
-            reducer.reducePacket(input.m_impl.template packet<Unaligned>(
-                                     (row + 1) * num_coeffs_to_reduce + col),
-                                 &reduced_val2);
-          }
-        }
-      }
-
-#pragma unroll
-      for (int offset = warpSize/2; offset > 0; offset /= 2) {
-      #if defined(EIGEN_HIPCC)
-        PacketType r1;
-        PacketType r2;
-        half2* hr1 = reinterpret_cast<half2*>(&r1);
-        half2* hr2 = reinterpret_cast<half2*>(&r2);
-        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
-        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
-        for (int i = 0; i < packet_width / 2; i++) {
-	  // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
-	  union { int i; half2 h; } wka_in1, wka_out1;
-	  wka_in1.h = rv1[i];
-	  wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
-	  hr1[i] = wka_out1.h;
-
-	  union { int i; half2 h; } wka_in2, wka_out2;
-	  wka_in2.h = rv2[i];
-	  wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
-	  hr2[i] = wka_out2.h;
-        }
-        reducer.reducePacket(r1, &reduced_val1);
-        reducer.reducePacket(r2, &reduced_val2);
-      #elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
-        PacketType r1;
-        PacketType r2;
-        half2* hr1 = reinterpret_cast<half2*>(&r1);
-        half2* hr2 = reinterpret_cast<half2*>(&r2);
-        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
-        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
-        for (int i = 0; i < packet_width / 2; i++) {
-          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
-          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
-        }
-        reducer.reducePacket(r1, &reduced_val1);
-        reducer.reducePacket(r2, &reduced_val2);
-      #else
-        PacketType r1;
-        PacketType r2;
-        half2* hr1 = reinterpret_cast<half2*>(&r1);
-        half2* hr2 = reinterpret_cast<half2*>(&r2);
-        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
-        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
-        for (int i = 0; i < packet_width / 2; i++) {
-          hr1[i] =
-              __shfl_down_sync(0xFFFFFFFF, rr1[i], (unsigned)offset, warpSize);
-          hr2[i] =
-              __shfl_down_sync(0xFFFFFFFF, rr2[i], (unsigned)offset, warpSize);
-        }
-        reducer.reducePacket(r1, &reduced_val1);
-        reducer.reducePacket(r2, &reduced_val2);
-
-      #endif
-      }
-      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
-      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
-      half2 val;
-      if (packet_width > 2) {
-        reducer.reducePacket(rv1[2], rv1);
-        reducer.reducePacket(rv1[3], rv1 + 1);
-        reducer.reducePacket(rv1[1], rv1);
-        reducer.reducePacket(rv2[2], rv2);
-        reducer.reducePacket(rv2[3], rv2 + 1);
-        reducer.reducePacket(rv2[1], rv2);
-      }
-      half val1 = __low2half(*rv1);
-      reducer.reduce(__high2half(*rv1), &val1);
-      half val2 = __low2half(*rv2);
-      reducer.reduce(__high2half(*rv2), &val2);
-      val = __halves2half2(val1, val2);
-      if ((threadIdx.x & (warpSize - 1)) == 0) {
-        half* loc = output + row;
-        atomicReduce((half2*)loc, val, reducer);
-      }
-    }
-  }
-}
-
-#endif // EIGEN_HAS_GPU_FP16
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
-struct InnerReductionLauncher {
-  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
-    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
-    return true;
-  }
-};
-
-// Specialization for float and double
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
-struct InnerReductionLauncher<
-  Self, Op, OutputType, PacketAccess,
-  typename internal::enable_if<
-    internal::is_same<float, OutputType>::value ||
-    internal::is_same<double, OutputType>::value,
-  void>::type> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = 256;
-    const int num_per_thread = 128;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumGpuMultiProcessors() *
-                           device.maxGpuThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumGpuMultiProcessors() *
-                           device.maxGpuThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
-                         num_preserved_vals, output);
-    }
-
-    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-
-#ifdef EIGEN_HAS_GPU_FP16
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
-  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
-    gpu_assert(false && "Should not be called since there is no packet accessor");
-    return true;
-  }
-};
-
-template <typename Self, typename Op>
-struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    if (num_preserved_vals % 2 != 0) {
-      // Not supported yet, revert to the slower code path
-      return true;
-    }
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = /*256*/128;
-    const int num_per_thread = /*128*/64;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumGpuMultiProcessors() *
-                           device.maxGpuThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs outside the reduction kernel when we can't be sure that there
-      // won't be a race conditions between multiple thread blocks.
-      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>),
-                         1, 1, 0, device, reducer, self, num_preserved_vals, output);
-    }
-
-    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-#endif // EIGEN_HAS_GPU_FP16
-
-
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, GpuDevice> {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats and half floats.
-#ifdef EIGEN_HAS_GPU_FP16
-  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
-      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       internal::is_same<typename Self::CoeffReturnType, double>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
-#else // EIGEN_HAS_GPU_FP16
-  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
-                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
-#endif // EIGEN_HAS_GPU_FP16
-
-  template <typename OutputType>
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
-    const Index num_coeffs = array_prod(self.m_impl.dimensions());
-    // Don't crash when we're called with an input tensor of size 0.
-    if (num_coeffs == 0) {
-      return true;
-    }
-    // It's faster to use the usual code.
-    if (num_coeffs_to_reduce <= 128) {
-      return true;
-    }
-
-    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
-  }
-};
-
-template <int NumPerThread, typename Self,
-          typename Reducer, typename Index>
-__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
-                                     typename Self::CoeffReturnType* output) {
-  const Index num_threads = blockDim.x * gridDim.x;
-  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  // Initialize the output values if they weren't initialized by the ReductionInitKernel
-  if (gridDim.x == 1) {
-    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
-      output[i] = reducer.initialize();
-    }
-    __syncthreads();
-  }
-
-  // Do the reduction.
-  const Index max_iter = num_preserved_coeffs * divup<Index>(num_coeffs_to_reduce, NumPerThread);
-  for (Index i = thread_id; i < max_iter; i += num_threads) {
-    const Index input_col = i % num_preserved_coeffs;
-    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
-    typename Self::CoeffReturnType reduced_val = reducer.initialize();
-    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
-    for (Index j = input_row; j < max_row; j++) {
-      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
-      reducer.reduce(val, &reduced_val);
-    }
-    atomicReduce(&(output[input_col]), reduced_val, reducer);
-  }
-}
-
-
-template <typename Self, typename Op>
-struct OuterReducer<Self, Op, GpuDevice> {
-  // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats.
-  static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful &&
-                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
-  template <typename Device, typename OutputType>
-  static
-    #if !defined(EIGEN_HIPCC)
-    // FIXME :  leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
-    //          (in the cxx11_tensor_reduction_gpu test)
-    //
-    // terminate called after throwing an instance of 'std::runtime_error'
-    //   what():  No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
-    //
-    // don't know why this happens (and why is it a runtime error instead of a compile time error)
-    //
-    // this will be fixed by HIP PR#457
-    EIGEN_DEVICE_FUNC
-    #endif
-    bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
-    return true;
-  }
-
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    typedef typename Self::Index Index;
-
-    // It's faster to use the usual code.
-    if (num_coeffs_to_reduce <= 32) {
-      return true;
-    }
-
-    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
-    const int block_size = 256;
-    const int num_per_thread = 16;
-    const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread);
-    const int max_blocks = device.getNumGpuMultiProcessors() *
-                           device.maxGpuThreadsPerMultiProcessor() / block_size;
-    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-
-    if (num_blocks > 1) {
-      // We initialize the outputs in the reduction kernel itself when we don't have to worry
-      // about race conditions between multiple thread blocks.
-      const int dyn_blocks = divup<int>(num_preserved_vals, 1024);
-      const int max_blocks = device.getNumGpuMultiProcessors() *
-                             device.maxGpuThreadsPerMultiProcessor() / 1024;
-      const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>),
-                         num_blocks, 1024, 0, device, reducer.initialize(),
-                         num_preserved_vals, output);
-    }
-
-    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>),
-                       num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
-
-    return false;
-  }
-};
-
-#endif // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
-
-
-} // end namespace internal
-} // end namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
index 387c3edf4..3daecb045 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -11,576 +11,232 @@
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 /*****************************************************************
- * TensorReductionSycl.h
+ * TensorSyclPlaceHolderExpr.h
  *
  * \brief:
- *  This is the specialization of the reduction operation. Two phase reduction approach 
- * is used since the GPU does not have Global Synchronization for global memory among 
- * different work-group/thread block. To solve the problem, we need to create two kernels 
- * to reduce the data, where the first kernel reduce the data locally and each local 
- * workgroup/thread-block save the input data into global memory. In the second phase (global reduction)
- * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element. 
- * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
- * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
+ *  This is the specialisation of the placeholder expression based on the
+ * operation type
  *
- *****************************************************************/
+*****************************************************************/
 
 #ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
 #define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+
 namespace Eigen {
-namespace TensorSycl {
 namespace internal {
 
-template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
-struct OpDefiner {
-  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
-  typedef Op type;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
+template<typename CoeffReturnType, typename KernelName> struct syclGenericBufferReducer{
+template<typename BufferTOut, typename BufferTIn>
+static void run(BufferTOut* bufOut, BufferTIn& bufI, const Eigen::SyclDevice& dev, size_t length, size_t local){
+  do {
+          auto f = [length, local, bufOut, &bufI](cl::sycl::handler& h) mutable {
+            cl::sycl::nd_range<1> r{cl::sycl::range<1>{std::max(length, local)},
+                                    cl::sycl::range<1>{std::min(length, local)}};
+            /* Two accessors are used: one to the buffer that is being reduced,
+             * and a second to local memory, used to store intermediate data. */
+            auto aI =
+                bufI.template get_access<cl::sycl::access::mode::read_write>(h);
+            auto aOut =
+                bufOut->template get_access<cl::sycl::access::mode::discard_write>(h);
+            cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write,
+                               cl::sycl::access::target::local>
+                scratch(cl::sycl::range<1>(local), h);
+
+            /* The parallel_for invocation chosen is the variant with an nd_item
+             * parameter, since the code requires barriers for correctness. */
+            h.parallel_for<KernelName>(
+                r, [aOut, aI, scratch, local, length](cl::sycl::nd_item<1> id) {
+                  size_t globalid = id.get_global(0);
+                  size_t localid = id.get_local(0);
+                  /* All threads collectively read from global memory into local.
+                   * The barrier ensures all threads' IO is resolved before
+                   * execution continues (strictly speaking, all threads within
+                   * a single work-group - there is no co-ordination between
+                   * work-groups, only work-items). */
+                  if (globalid < length) {
+                    scratch[localid] = aI[globalid];
+                  }
+                  id.barrier(cl::sycl::access::fence_space::local_space);
+
+                  /* Apply the reduction operation between the current local
+                   * id and the one on the other half of the vector. */
+                  if (globalid < length) {
+                    int min = (length < local) ? length : local;
+                    for (size_t offset = min / 2; offset > 0; offset /= 2) {
+                      if (localid < offset) {
+                        scratch[localid] += scratch[localid + offset];
+                      }
+                      id.barrier(cl::sycl::access::fence_space::local_space);
+                    }
+                    /* The final result will be stored in local id 0. */
+                    if (localid == 0) {
+                      aI[id.get_group(0)] = scratch[localid];
+                      if((length<=local) && globalid ==0){
+                        aOut[globalid]=scratch[localid];
+                      }
+                    }
+                  }
+                });
+          };
+            dev.m_queue.submit(f);
+            dev.m_queue.throw_asynchronous();
+
+          /* At this point, you could queue::wait_and_throw() to ensure that
+           * errors are caught quickly. However, this would likely impact
+           * performance negatively. */
+          length = length / local;
+
+        } while (length > 1);
+
+
+
+}
 
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
-                                                                            const Index &) {
-    return accumulator;
-  }
 };
 
-template <typename CoeffReturnType, typename Index>
-struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
-  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
-    return type();
-  }
-
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
-                                                                           const Index &scale) {
-    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
-    return quotient_op(accumulator, CoeffReturnType(scale));
-  }
-};
-
-template <typename CoeffReturnType, typename Index>
-struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
-  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
-  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
-    return type();
-  }
-
-  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
-                                                                            const Index &scale) {
-    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
-  }
-};
-
-template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
-          Index local_range>
-struct SecondStepFullReducer {
-  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
-      LocalAccessor;
-  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
-  typedef typename OpDef::type Op;
-  LocalAccessor scratch;
-  InputAccessor aI;
-  OutputAccessor outAcc;
-  Op op;
-  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
-      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
-
-  void operator()(cl::sycl::nd_item<1> itemID) {
-    // Our empirical research shows that the best performance will be achieved
-    // when there is only one element per thread to reduce in the second step.
-    // in this step the second step reduction time is almost negligible.
-    // Hence, in the second step of reduction the input size is fixed to the
-    // local size, thus, there is only one element read per thread. The
-    // algorithm must be changed if the number of reduce per thread in the
-    // second step is greater than 1. Otherwise, the result will be wrong.
-    const Index localid = itemID.get_local_id(0);
-    auto aInPtr = aI.get_pointer() + localid;
-    auto aOutPtr = outAcc.get_pointer();
-    CoeffReturnType *scratchptr = scratch.get_pointer();
-    CoeffReturnType accumulator = *aInPtr;
-
-    scratchptr[localid] = op.finalize(accumulator);
-#pragma unroll 8
-    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
-      itemID.barrier(cl::sycl::access::fence_space::local_space);
-      if (localid < offset) {
-        op.reduce(scratchptr[localid + offset], &accumulator);
-        scratchptr[localid] = op.finalize(accumulator);
-      }
-    }
-    if (localid == 0) *aOutPtr = op.finalize(accumulator);
-  }
-};
-
-// Full reduction first phase. In this version the vectorization is true and the reduction accept 
-// any generic reducerOp  e.g( max, min, sum, mean, iamax, iamin, etc ). 
-template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
-class FullReductionKernelFunctor {
- public:
-  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
-  typedef typename Evaluator::Index Index;
-  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
-                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
-      OpDef;
-
-  typedef typename OpDef::type Op;
-  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
-  typedef typename Evaluator::PacketReturnType PacketReturnType;
-  typedef
-      typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
-                                              PacketReturnType, CoeffReturnType>::type OutType;
-  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
-      LocalAccessor;
-  LocalAccessor scratch;
-  Evaluator evaluator;
-  EvaluatorPointerType final_output;
-  Index rng;
-  Op op;
-
-  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
-                             Index rng_, OpType op_)
-      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
-
-  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
-
-  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
-      const cl::sycl::nd_item<1> &itemID) {
-    auto output_ptr = final_output.get_pointer();
-    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
-    Index globalid = itemID.get_global_id(0);
-    Index localid = itemID.get_local_id(0);
-    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
-    Index start = Evaluator::PacketSize * globalid;
-    // vectorizable parts
-    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
-#pragma unroll(8 / Evaluator::PacketSize)
-    for (Index i = start; i < VectorizedRange; i += step) {
-      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
-    }
-    globalid += VectorizedRange;
-    // non vectorizable parts
-    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
-      op.template reducePacket<PacketReturnType>(
-          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
-              evaluator.impl().coeff(i), op.initialize()),
-          &packetAccumulator);
-    }
-    scratch[localid] = packetAccumulator =
-        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
-    // reduction parts // Local size is always power of 2
-    EIGEN_UNROLL_LOOP
-    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
-      itemID.barrier(cl::sycl::access::fence_space::local_space);
-      if (localid < offset) {
-        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
-        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
-      }
-    }
-    if (localid == 0) {
-      output_ptr[itemID.get_group(0)] =
-          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
-    }
-  }
-
-  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
-      const cl::sycl::nd_item<1> &itemID) {
-    auto output_ptr = final_output.get_pointer();
-    Index globalid = itemID.get_global_id(0);
-    Index localid = itemID.get_local_id(0);
-    // vectorizable parts
-    CoeffReturnType accumulator = op.initialize();
-    // non vectorizable parts
-    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
-      op.reduce(evaluator.impl().coeff(i), &accumulator);
-    }
-    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
-
-    // reduction parts. the local size is always power of 2
-    EIGEN_UNROLL_LOOP
-    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
-      itemID.barrier(cl::sycl::access::fence_space::local_space);
-      if (localid < offset) {
-        op.reduce(scratch[localid + offset], &accumulator);
-        scratch[localid] = op.finalize(accumulator);
-      }
-    }
-    if (localid == 0) {
-      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
-    }
-  }
-};
-
-template <typename Evaluator, typename OpType>
-class GenericNondeterministicReducer {
- public:
-  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
-  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
-  typedef typename Evaluator::Index Index;
-  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
-  typedef typename OpDef::type Op;
-  template <typename Scratch>
-  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
-                       Index range_, Index num_values_to_reduce_)
-      : evaluator(evaluator_),
-        output_accessor(output_accessor_),
-        functor(OpDef::get_op(functor_)),
-        range(range_),
-        num_values_to_reduce(num_values_to_reduce_) {}
-
-  void operator()(cl::sycl::nd_item<1> itemID) {
-    auto output_accessor_ptr = output_accessor.get_pointer();
-    /// const cast added as a naive solution to solve the qualifier drop error
-    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
-    if (globalid < range) {
-      CoeffReturnType accum = functor.initialize();
-      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
-          evaluator, evaluator.firstInput(globalid), functor, &accum);
-      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
-    }
-  }
-
- private:
-  Evaluator evaluator;
-  EvaluatorPointerType output_accessor;
-  Op functor;
-  Index range;
-  Index num_values_to_reduce;
-};
-
-enum class reduction_dim { inner_most, outer_most };
-// default is preserver
-template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
-struct PartialReductionKernel {
-  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
-  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
-  typedef typename Evaluator::Index Index;
-  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
-  typedef typename OpDef::type Op;
-  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
-      ScratchAcc;
-  ScratchAcc scratch;
-  Evaluator evaluator;
-  EvaluatorPointerType output_accessor;
-  Op op;
-  const Index preserve_elements_num_groups;
-  const Index reduce_elements_num_groups;
-  const Index num_coeffs_to_preserve;
-  const Index num_coeffs_to_reduce;
-
-  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
-                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
-                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
-      : scratch(scratch_),
-        evaluator(evaluator_),
-        output_accessor(output_accessor_),
-        op(OpDef::get_op(op_)),
-        preserve_elements_num_groups(preserve_elements_num_groups_),
-        reduce_elements_num_groups(reduce_elements_num_groups_),
-        num_coeffs_to_preserve(num_coeffs_to_preserve_),
-        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
-                                                                 CoeffReturnType &accumulator) {
-    if (globalPId >= num_coeffs_to_preserve) {
-      return;
-    }
-    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
-                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
-    Index localOffset = globalRId;
-
-    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
-    const Index per_thread_global_stride =
-        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
-#pragma unroll 8
-    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
-      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
-      localOffset += per_thread_local_stride;
-      global_offset += per_thread_global_stride;
-    }
-  }
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
-    const Index linearLocalThreadId = itemID.get_local_id(0);
-    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
-                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
-    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
-                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
-    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
-                                                           : itemID.get_group(0) / reduce_elements_num_groups;
-    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
-                                                           : itemID.get_group(0) % reduce_elements_num_groups;
-
-    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
-    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
-    auto scratchPtr = scratch.get_pointer().get();
-    auto outPtr =
-        output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
-    CoeffReturnType accumulator = op.initialize();
-
-    element_wise_reduce(globalRId, globalPId, accumulator);
-
-    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
-    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
-        accumulator;
-    if (rt == reduction_dim::inner_most) {
-      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
-      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
-      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
-    }
-
-    /* Apply the reduction operation between the current local
-     * id and the one on the other half of the vector. */
-    auto out_scratch_ptr =
-        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
-    itemID.barrier(cl::sycl::access::fence_space::local_space);
-    if (rt == reduction_dim::inner_most) {
-      accumulator = *out_scratch_ptr;
-    }
-    // The Local LocalThreadSizeR is always power of 2
-    EIGEN_UNROLL_LOOP
-    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
-      if (rLocalThreadId < offset) {
-        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
-        // The result has already been divided for mean reducer in the
-        // previous reduction so no need to divide furthermore
-        *out_scratch_ptr = op.finalize(accumulator);
-      }
-      /* All threads collectively read from global memory into local.
-       * The barrier ensures all threads' IO is resolved before
-       * execution continues (strictly speaking, all threads within
-       * a single work-group - there is no co-ordination between
-       * work-groups, only work-items). */
-      itemID.barrier(cl::sycl::access::fence_space::local_space);
-    }
-
-    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
-      outPtr[globalPId] = op.finalize(accumulator);
-    }
-  }
-};
-
-template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
-struct SecondStepPartialReduction {
-  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
-  typedef typename OpDef::type Op;
-  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
-      ScratchAccessor;
-  InputAccessor input_accessor;
-  OutputAccessor output_accessor;
-  Op op;
-  const Index num_coeffs_to_preserve;
-  const Index num_coeffs_to_reduce;
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
-                                                                   OutputAccessor output_accessor_, OpType op_,
-                                                                   const Index num_coeffs_to_preserve_,
-                                                                   const Index num_coeffs_to_reduce_)
-      : input_accessor(input_accessor_),
-        output_accessor(output_accessor_),
-        op(OpDef::get_op(op_)),
-        num_coeffs_to_preserve(num_coeffs_to_preserve_),
-        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
-    const Index globalId = itemID.get_global_id(0);
-
-    if (globalId >= num_coeffs_to_preserve) return;
-
-    auto in_ptr = input_accessor.get_pointer() + globalId;
-
-    OutScalar accumulator = op.initialize();
-// num_coeffs_to_reduce is not bigger that 256
-#pragma unroll 8
-    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
-      op.reduce(*in_ptr, &accumulator);
-      in_ptr += num_coeffs_to_preserve;
-    }
-    output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
-  }
-};  // namespace internal
-
-template <typename Index, Index LTP, Index LTR, bool BC_>
-struct ReductionPannel {
-  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
-  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
-  static EIGEN_CONSTEXPR bool BC = BC_;
-};
-
-template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
-struct PartialReducerLauncher {
-  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
-  typedef typename Self::CoeffReturnType CoeffReturnType;
-  typedef typename Self::Storage Storage;
-  typedef typename Self::Index Index;
-  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
-      PannelParameters;
-
-  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
-
-  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
-                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
-    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
-
-    // getPowerOfTwo makes sure local range is power of 2 and <=
-    // maxSyclThreadPerBlock this will help us to avoid extra check on the
-    // kernel
-    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
-                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
-                  "The Local thread size must be a power of 2 for the reduction "
-                  "operation");
-
-    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
-    // In this step, we force the code not to be more than 2-step reduction:
-    // Our empirical research shows that if each thread reduces at least 64
-    // elemnts individually, we get better performance. However, this can change
-    // on different platforms. In this step we force the code not to be
-    // morthan step reduction: Our empirical research shows that for inner_most
-    // dim reducer, it is better to have 8 group in a reduce dimension for sizes
-    // > 1024 to achieve the best performance.
-    const Index reductionPerThread = 64;
-    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
-    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
-    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
-    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
-    const Index globalRange = pNumGroups * rNumGroups * localRange;
-
-    EIGEN_CONSTEXPR Index scratchSize =
-        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
-    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
-    if (rNumGroups > 1) {
-      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
-          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
-      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
-      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
-          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
-          num_coeffs_to_reduce);
-
-      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
-          SecondStepPartialReductionKernel;
-
-      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
-          temp_accessor, output,
-          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
-          reducer, num_coeffs_to_preserve, rNumGroups);
-
-      self.device().deallocate_temp(temp_pointer);
-    } else {
-      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
-          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
-          num_coeffs_to_reduce);
-    }
-    return false;
-  }
-};
-}  // namespace internal
-}  // namespace TensorSycl
-
-namespace internal {
-
+/// For now let's start with a full reducer.
+/// Self is useless here because in expression construction we are going to treat reduction as a leaf node.
+/// We want to take the reduction child, build a device expression from it and apply the full reducer function on it. FullReducer applies the
+/// reduction operation on the child of the reduction. Once it is done, the reduction is an empty shell and can be thrown away and treated as
+/// a leaf node.
 template <typename Self, typename Op, bool Vectorizable>
-struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
+struct FullReducer<Self, Op, const Eigen::SyclDevice, Vectorizable> {
+
   typedef typename Self::CoeffReturnType CoeffReturnType;
-  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
-  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
-  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
-  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
-    typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
-    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
-                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
-                  "The Local thread size must be a power of 2 for the reduction "
-                  "operation");
-    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  static const bool HasOptimizedImplementation = false;
 
-    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
-    // In this step we force the code not to be more than 2-step reduction:
-    // Our empirical research shows that if each thread reduces at least 512
-    // elemnts individually, we get better performance.
-    const Index reductionPerThread = 2048;
-    // const Index num_work_group =
-    Index reductionGroup = dev.getPowerOfTwo(
-        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
-    const Index num_work_group = std::min(reductionGroup, local_range);
-    // 1
-    // ? local_range
-    // : 1);
-    const Index global_range = num_work_group * local_range;
+  static void run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output) {
+    typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
+    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+    auto functors = TensorSycl::internal::extractFunctors(self.impl());
+    int red_factor =256; /// initial reduction factor. If the input size is less than red_factor we only create one thread.
+    size_t inputSize =self.impl().dimensions().TotalSize();
+    size_t rng = inputSize/red_factor; // the initial number of threads is the input size divided by red_factor
+    size_t remaining = inputSize% red_factor;
+    if(rng ==0) {
+      red_factor=1;
+    };
+    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+    size_t GRange=std::max((size_t )1, rng);
 
-    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
-    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
-    if (num_work_group > 1) {
-      CoeffReturnType *temp_pointer =
-          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
-      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
-      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
-                                                                      local_range, inputSize, reducer);
+    // round the global range up to a power of 2 for the reduction
+    GRange--;
+    GRange |= GRange >> 1;
+    GRange |= GRange >> 2;
+    GRange |= GRange >> 4;
+    GRange |= GRange >> 8;
+    GRange |= GRange >> 16;
+#if __x86_64__ || __ppc64__ || _WIN64
+    GRange |= GRange >> 32;
+#endif
+    GRange++;
+    size_t  outTileSize = tileSize;
+    /// if GRange is smaller than the tile size, shrink the tile size to GRange; in this case a single work-group covers the whole range.
+    if (GRange < outTileSize) outTileSize=GRange;
+    // getting final out buffer at the moment the created buffer is true because there is no need for assign
+    auto out_buffer =dev.template get_sycl_buffer<typename Eigen::internal::remove_all<CoeffReturnType>::type>(self.dimensions().TotalSize(), output);
+    /// creating the shared memory for calculating reduction.
+    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
+    /// recursively apply reduction on it in order to reduce the whole.
+    auto temp_global_buffer =cl::sycl::buffer<CoeffReturnType, 1>(cl::sycl::range<1>(GRange));
+    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+    Dims dims= self.xprDims();
+    Op functor = reducer;
+    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+      // create a tuple of accessors from Evaluator
+      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+      auto tmp_global_accessor = temp_global_buffer. template get_access<cl::sycl::access::mode::read_write, cl::sycl::access::target::global_buffer>(cgh);
 
-      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
-                                                          EvaluatorPointerType, Index, local_range>
-          GenericRKernel;
-      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
-          tmp_global_accessor, data,
-          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)), num_work_group,
-          reducer);
+      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(outTileSize)), [=](cl::sycl::nd_item<1> itemID) {
+        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+        /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviours:
+        /// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leaf node to pass the
+        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator, the former is created here.
+        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+        /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
+        /// the device_evaluator is detectable and recognisable on the device.
+        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+        /// const cast added as a naive solution to solve the qualifier drop error
+        auto globalid=itemID.get_global_linear_id();
 
-      dev.deallocate_temp(temp_pointer);
-    } else {
-      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
-                                                                      reducer);
+        if(globalid<rng)
+          tmp_global_accessor.get_pointer()[globalid]=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*globalid, red_factor, const_cast<Op&>(functor));
+        else
+          tmp_global_accessor.get_pointer()[globalid]=static_cast<CoeffReturnType>(0);
+
+        if(remaining!=0 && globalid==0 )
+          // this will add the rest of the input buffer when the input size is not divisible by red_factor.
+          tmp_global_accessor.get_pointer()[globalid]+=InnerMostDimReducer<decltype(device_self_evaluator), Op, false>::reduce(device_self_evaluator, red_factor*(rng), remaining, const_cast<Op&>(functor));
+      });
+    });
+  dev.m_queue.throw_asynchronous();
+
+/// This is used to recursively reduce the temporary buffer down to a single element.
+  syclGenericBufferReducer<CoeffReturnType,HostExpr>::run(out_buffer, temp_global_buffer,dev, GRange,  outTileSize);
+  }
+
+};
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, const Eigen::SyclDevice> {
+
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  static const bool HasOptimizedImplementation = false;
+
+  static bool run(const Self& self, Op& reducer, const Eigen::SyclDevice& dev, CoeffReturnType* output, typename Self::Index , typename Self::Index num_coeffs_to_preserve) {
+    typedef const typename Self::ChildType HostExpr; /// this is the child of reduction
+    typedef  typename TensorSycl::internal::createPlaceHolderExpression<HostExpr>::Type PlaceHolderExpr;
+    auto functors = TensorSycl::internal::extractFunctors(self.impl());
+
+    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+
+    size_t GRange=num_coeffs_to_preserve;
+    if (tileSize>GRange) tileSize=GRange;
+    else if(GRange>tileSize){
+      size_t xMode = GRange % tileSize;
+      if (xMode != 0) GRange += (tileSize - xMode);
     }
-  }
-};
-// vectorizable inner_most most dim preserver
-// col reduction
-template <typename Self, typename Op>
-struct OuterReducer<Self, Op, Eigen::SyclDevice> {
-  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+    // getting final out buffer at the moment the created buffer is true because there is no need for assign
+    /// creating the shared memory for calculating reduction.
+    /// This one is used to collect all the reduced value of shared memory as we dont have global barrier on GPU. Once it is saved we can
+    /// recursively apply reduction on it in order to reduce the whole.
+    typedef typename Eigen::internal::remove_all<decltype(self.xprDims())>::type Dims;
+    Dims dims= self.xprDims();
+    Op functor = reducer;
 
-  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
-                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
-                  typename Self::Index num_coeffs_to_preserve) {
-    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
-        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
-                                                                                 num_coeffs_to_reduce,
-                                                                                 num_coeffs_to_preserve);
-  }
-};
-// row reduction
-template <typename Self, typename Op>
-struct InnerReducer<Self, Op, Eigen::SyclDevice> {
-  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
+    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+      // create a tuple of accessors from Evaluator
+      auto tuple_of_accessors =  TensorSycl::internal::createTupleOfAccessors(cgh, self.impl());
+      auto output_accessor = dev.template get_sycl_accessor<cl::sycl::access::mode::discard_write>(num_coeffs_to_preserve,cgh, output);
 
-  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
-                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
-                  typename Self::Index num_coeffs_to_preserve) {
-    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
-        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
-                                                                                 num_coeffs_to_reduce,
-                                                                                 num_coeffs_to_preserve);
-  }
-};
-
-// ArmgMax uses this kernel for partial reduction//
-// TODO(@mehdi.goli) come up with a better kernel
-// generic partial reduction
-template <typename Self, typename Op>
-struct GenericReducer<Self, Op, Eigen::SyclDevice> {
-  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;
-  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
-                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
-                  typename Self::Index num_coeffs_to_preserve) {
-    typename Self::Index range, GRange, tileSize;
-    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
-
-    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
-                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
-        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
-        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
+      cgh.parallel_for<Self>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+        typedef typename TensorSycl::internal::ConvertToDeviceExpression<const HostExpr>::Type DevExpr;
+        auto device_expr = TensorSycl::internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+        /// reduction cannot be captured automatically through our device conversion recursion. The reason is that reduction has two behaviours:
+        /// the first behaviour is when it is used as a root to launch the sub-kernel. The second one is when it is treated as a leaf node to pass the
+        /// calculated result to its parent kernel. While the latter is automatically detected through our device expression generator, the former is created here.
+        const auto device_self_expr= TensorReductionOp<Op, Dims, decltype(device_expr.expr) ,MakeGlobalPointer>(device_expr.expr, dims, functor);
+        /// This is the evaluator for device_self_expr. This is exactly similar to the self which has been passed to run function. The difference is
+        /// the device_evaluator is detectable and recognisable on the device.
+        typedef Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice> DeiceSelf;
+        auto device_self_evaluator = Eigen::TensorEvaluator<decltype(device_self_expr), Eigen::DefaultDevice>(device_self_expr, Eigen::DefaultDevice());
+        /// const cast added as a naive solution to solve the qualifier drop error
+        auto globalid=itemID.get_global_linear_id();
+        if (globalid< static_cast<size_t>(num_coeffs_to_preserve)) {
+          typename DeiceSelf::CoeffReturnType accum = functor.initialize();
+          GenericDimReducer<DeiceSelf::NumReducedDims-1, DeiceSelf, Op>::reduce(device_self_evaluator, device_self_evaluator.firstInput(globalid),const_cast<Op&>(functor), &accum);
+          functor.finalize(accum);
+          output_accessor.get_pointer()[globalid]= accum;
+        }
+      });
+    });
+  dev.m_queue.throw_asynchronous();
     return false;
   }
 };
 
-}  // namespace internal
+}  // end namespace internal
 }  // namespace Eigen
 
 #endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
index 030d19844..99245f778 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -31,7 +31,7 @@ class TensorLazyBaseEvaluator {
   int refCount() const { return m_refcount; }
 
  private:
-  // No copy, no assignment;
+  // No copy, no assignment;
   TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
   TensorLazyBaseEvaluator& operator = (const TensorLazyBaseEvaluator& other);
 
@@ -44,9 +44,6 @@ class TensorLazyEvaluatorReadOnly : public TensorLazyBaseEvaluator<Dimensions, t
  public:
   //  typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
   typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
-  typedef StorageMemory<Scalar, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-  typedef  TensorEvaluator<Expr, Device> EvalType;
 
   TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
     m_dims = m_impl.dimensions();
@@ -82,8 +79,6 @@ class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimension
  public:
   typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
   typedef typename Base::Scalar Scalar;
-  typedef StorageMemory<Scalar, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
 
   TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {
   }
@@ -141,17 +136,11 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef
     enum {
       IsAligned = false,
       PacketAccess = false,
-      BlockAccess = false,
-      PreferBlockAccess = false,
       Layout = PlainObjectType::Layout,
       CoordAccess = false,  // to be implemented
       RawAccess = false
     };
 
-    //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
-    typedef internal::TensorBlockNotImplemented TensorBlock;
-    //===------------------------------------------------------------------===//
-
     EIGEN_STRONG_INLINE TensorRef() : m_evaluator(NULL) {
     }
 
@@ -371,30 +360,22 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
   typedef typename Derived::Scalar CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
   typedef typename Derived::Dimensions Dimensions;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
 
   enum {
     IsAligned = false,
     PacketAccess = false,
-    BlockAccess = false,
-    PreferBlockAccess = false,
     Layout = TensorRef<Derived>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&)
       : m_ref(m)
   { }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     return true;
   }
 
@@ -408,7 +389,7 @@ struct TensorEvaluator<const TensorRef<Derived>, Device>
     return m_ref.coeffRef(index);
   }
 
-  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return m_ref.data(); }
 
  protected:
   TensorRef<Derived> m_ref;
@@ -430,15 +411,9 @@ struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<cons
   enum {
     IsAligned = false,
     PacketAccess = false,
-    BlockAccess = false,
-    PreferBlockAccess = false,
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d)
   { }
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
index 2fc85c13c..14e392e36 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -31,7 +31,6 @@ struct traits<TensorReverseOp<ReverseDimensions,
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename ReverseDimensions, typename XprType>
@@ -108,39 +107,19 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned         = false,
-    PacketAccess      = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess       = NumDims > 0,
-    PreferBlockAccess = true,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = false
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
   };
 
-  typedef internal::TensorIntDivisor<Index> IndexDivisor;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
-      ArgTensorBlock;
-
-  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims,
-                                                     Layout, Index>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
                                                         const Device& device)
-      : m_impl(op.expression(), device),
-        m_reverse(op.reverse()),
-        m_device(device)
+      : m_impl(op.expression(), device), m_reverse(op.reverse())
   {
     // Reversing a scalar isn't supported yet. It would be a no-op anyway.
     EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
@@ -151,13 +130,11 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
       m_strides[0] = 1;
       for (int i = 1; i < NumDims; ++i) {
         m_strides[i] = m_strides[i-1] * m_dimensions[i-1];
-        if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
       }
     } else {
       m_strides[NumDims-1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
         m_strides[i] = m_strides[i+1] * m_dimensions[i+1];
-        if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
       }
     }
   }
@@ -165,19 +142,10 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType, EvalSubExprsCallback done) {
-    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
-  }
-#endif  // EIGEN_USE_THREADS
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
@@ -187,9 +155,8 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     eigen_assert(index < dimensions().TotalSize());
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
-        Index idx = index / m_fastStrides[i];
+        Index idx = index / m_strides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -202,9 +169,8 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
         inputIndex += index;
       }
     } else {
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
-        Index idx = index / m_fastStrides[i];
+        Index idx = index / m_strides[i];
         index -= idx * m_strides[i];
         if (m_reverse[i]) {
           idx = m_dimensions[i] - idx - 1;
@@ -236,7 +202,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     // local structure.
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type
                                                             values[PacketSize];
-    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -244,130 +209,6 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
     return rslt;
   }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    const size_t target_size = m_device.lastLevelCacheSize();
-    // Block evaluation reads underlying memory in reverse order, and default
-    // cost model does not properly catch this in bytes stored/loaded.
-    return internal::TensorBlockResourceRequirements::skewed<Scalar>(
-               target_size)
-        .addCostPerCoeff({0, 0, 24});
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool /*root_of_expr_ast*/ = false) const {
-    // TODO(ezhulenev): If underlying tensor expression supports and prefers
-    // block evaluation we must use it. Currently we use coeff and packet
-    // access into the underlying tensor expression.
-    // static const bool useBlockAccessForArgType =
-    //     TensorEvaluator<ArgType, Device>::BlockAccess &&
-    //     TensorEvaluator<ArgType, Device>::PreferBlockAccess;
-
-    static const bool isColMajor =
-        static_cast<int>(Layout) == static_cast<int>(ColMajor);
-
-    static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
-    const bool inner_dim_reversed = m_reverse[inner_dim_idx];
-
-    // Offset in the output block.
-    Index block_offset = 0;
-
-    // Offset in the input Tensor.
-    Index input_offset = reverseIndex(desc.offset());
-
-    // Initialize output block iterator state. Dimension in this array are
-    // always in inner_most -> outer_most order (col major layout).
-    array<BlockIteratorState, NumDims> it;
-    for (int i = 0; i < NumDims; ++i) {
-      const int dim = isColMajor ? i : NumDims - 1 - i;
-      it[i].size = desc.dimension(dim);
-      it[i].count = 0;
-      it[i].reverse = m_reverse[dim];
-
-      it[i].block_stride =
-          i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride);
-      it[i].block_span = it[i].block_stride * (it[i].size - 1);
-
-      it[i].input_stride = m_strides[dim];
-      it[i].input_span = it[i].input_stride * (it[i].size - 1);
-
-      if (it[i].reverse) {
-        it[i].input_stride = -1 * it[i].input_stride;
-        it[i].input_span = -1 * it[i].input_span;
-      }
-    }
-
-    // If multiple inner dimensions have the same reverse flag, check if we can
-    // merge them into a single virtual inner dimension.
-    int effective_inner_dim = 0;
-    for (int i = 1; i < NumDims; ++i) {
-      if (it[i].reverse != it[effective_inner_dim].reverse) break;
-      if (it[i].block_stride != it[effective_inner_dim].size) break;
-      if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
-
-      it[i].size = it[effective_inner_dim].size * it[i].size;
-
-      it[i].block_stride = 1;
-      it[i].input_stride = (inner_dim_reversed ? -1 : 1);
-
-      it[i].block_span = it[i].block_stride * (it[i].size - 1);
-      it[i].input_span = it[i].input_stride * (it[i].size - 1);
-
-      effective_inner_dim = i;
-    }
-
-    eigen_assert(it[effective_inner_dim].block_stride == 1);
-    eigen_assert(it[effective_inner_dim].input_stride ==
-                 (inner_dim_reversed ? -1 : 1));
-
-    const Index inner_dim_size = it[effective_inner_dim].size;
-
-    // Prepare storage for the materialized reverse result.
-    const typename TensorBlock::Storage block_storage =
-        TensorBlock::prepareStorage(desc, scratch);
-    CoeffReturnType* block_buffer = block_storage.data();
-
-    while (it[NumDims - 1].count < it[NumDims - 1].size) {
-      // Copy inner-most dimension data from reversed location in input.
-      Index dst = block_offset;
-      Index src = input_offset;
-
-      // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
-      // worse results in benchmarks than a simple coefficient loop.
-      if (inner_dim_reversed) {
-        for (Index i = 0; i < inner_dim_size; ++i) {
-          block_buffer[dst] = m_impl.coeff(src);
-          ++dst;
-          --src;
-        }
-      } else {
-        for (Index i = 0; i < inner_dim_size; ++i) {
-          block_buffer[dst] = m_impl.coeff(src);
-          ++dst;
-          ++src;
-        }
-      }
-
-      // For the 1d tensor we need to generate only one inner-most dimension.
-      if ((NumDims - effective_inner_dim) == 1) break;
-
-      // Update offset.
-      for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
-        if (++it[i].count < it[i].size) {
-          block_offset += it[i].block_stride;
-          input_offset += it[i].input_stride;
-          break;
-        }
-        if (i != NumDims - 1) it[i].count = 0;
-        block_offset -= it[i].block_span;
-        input_offset -= it[i].input_span;
-      }
-    }
-
-    return block_storage.AsTensorMaterializedBlock();
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
     double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                      2 * TensorOpCost::MulCost<Index>() +
@@ -381,42 +222,13 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device
            TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
  protected:
   Dimensions m_dimensions;
   array<Index, NumDims> m_strides;
-  array<IndexDivisor, NumDims> m_fastStrides;
   TensorEvaluator<ArgType, Device> m_impl;
   ReverseDimensions m_reverse;
-  const Device EIGEN_DEVICE_REF m_device;
-
- private:
-  struct BlockIteratorState {
-    BlockIteratorState()
-        : size(0),
-          count(0),
-          reverse(false),
-          block_stride(0),
-          block_span(0),
-          input_stride(0),
-          input_span(0) {}
-
-    Index size;
-    Index count;
-    bool reverse;
-    Index block_stride;
-    Index block_span;
-    Index input_stride;
-    Index input_span;
-  };
 };
 
 // Eval as lvalue
@@ -435,8 +247,6 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
@@ -448,12 +258,8 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-  
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const Dimensions& dimensions() const { return this->m_dimensions; }
 
@@ -469,11 +275,11 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
     // This code is pilfered from TensorMorphing.h
     EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
-    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       this->coeffRef(index+i) = values[i];
     }
   }
+
 };
 
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index bef8d261f..8501466ce 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -24,7 +24,6 @@ struct traits<TensorScanOp<Op, XprType> >
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Op, typename XprType>
@@ -77,257 +76,172 @@ protected:
   const bool m_exclusive;
 };
 
+template <typename Self, typename Reducer, typename Device>
+struct ScanLauncher;
 
-namespace internal {
+// Eval as rvalue
+template <typename Op, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
 
-template <typename Self>
-EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset,
-                                      typename Self::CoeffReturnType* data) {
-  // Compute the scan along the axis, starting at the given offset
-  typename Self::CoeffReturnType accum = self.accumulator().initialize();
-  if (self.stride() == 1) {
-    if (self.exclusive()) {
-      for (Index curr = offset; curr < offset + self.size(); ++curr) {
-        data[curr] = self.accumulator().finalize(accum);
-        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+  typedef TensorScanOp<Op, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1),
+    BlockAccess = false,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,
+    RawAccess = true
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
+                                                        const Device& device)
+      : m_impl(op.expression(), device),
+        m_device(device),
+        m_exclusive(op.exclusive()),
+        m_accumulator(op.accumulator()),
+        m_size(m_impl.dimensions()[op.axis()]),
+        m_stride(1),
+        m_output(NULL) {
+
+    // Accumulating a scalar isn't supported.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
+
+    // Compute stride of scan axis
+    const Dimensions& dims = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < op.axis(); ++i) {
+        m_stride = m_stride * dims[i];
       }
     } else {
-      for (Index curr = offset; curr < offset + self.size(); ++curr) {
-        self.accumulator().reduce(self.inner().coeff(curr), &accum);
-        data[curr] = self.accumulator().finalize(accum);
-      }
-    }
-  } else {
-    if (self.exclusive()) {
-      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
-        Index curr = offset + idx3 * self.stride();
-        data[curr] = self.accumulator().finalize(accum);
-        self.accumulator().reduce(self.inner().coeff(curr), &accum);
-      }
-    } else {
-      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
-        Index curr = offset + idx3 * self.stride();
-        self.accumulator().reduce(self.inner().coeff(curr), &accum);
-        data[curr] = self.accumulator().finalize(accum);
+      for (int i = NumDims - 1; i > op.axis(); --i) {
+        m_stride = m_stride * dims[i];
       }
     }
   }
-}
 
-template <typename Self>
-EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset,
-                                      typename Self::CoeffReturnType* data) {
-  using Scalar = typename Self::CoeffReturnType;
-  using Packet = typename Self::PacketReturnType;
-  // Compute the scan along the axis, starting at the calculated offset
-  Packet accum = self.accumulator().template initializePacket<Packet>();
-  if (self.stride() == 1) {
-    if (self.exclusive()) {
-      for (Index curr = offset; curr < offset + self.size(); ++curr) {
-        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
-        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
-      }
-    } else {
-      for (Index curr = offset; curr < offset + self.size(); ++curr) {
-        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
-        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
-      }
-    }
-  } else {
-    if (self.exclusive()) {
-      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
-        const Index curr = offset + idx3 * self.stride();
-        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
-        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
-      }
-    } else {
-      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
-        const Index curr = offset + idx3 * self.stride();
-        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
-        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
-      }
-    }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
+    return m_impl.dimensions();
   }
-}
 
-template <typename Self, bool Vectorize, bool Parallel>
-struct ReduceBlock {
-  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
-                                      typename Self::CoeffReturnType* data) {
-    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
-      // Calculate the starting offset for the scan
-      Index offset = idx1 + idx2;
-      ReduceScalar(self, offset, data);
-    }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
+    return m_stride;
   }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
+    return m_size;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
+    return m_accumulator;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
+    return m_exclusive;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
+    return m_impl;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
+    return m_device;
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    ScanLauncher<Self, Op, Device> launcher;
+    if (data) {
+      launcher(*this, data);
+      return false;
+    }
+
+    const Index total_size = internal::array_prod(dimensions());
+    m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar)));
+    launcher(*this, m_output);
+    return true;
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const
+  {
+    return m_output;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
+  {
+    return m_output[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    if (m_output != NULL) {
+      m_device.deallocate(m_output);
+      m_output = NULL;
+    }
+    m_impl.cleanup();
+  }
+
+protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Device& m_device;
+  const bool m_exclusive;
+  Op m_accumulator;
+  const Index m_size;
+  Index m_stride;
+  CoeffReturnType* m_output;
 };
 
-// Specialization for vectorized reduction.
-template <typename Self>
-struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
-  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
-                                      typename Self::CoeffReturnType* data) {
-    using Packet = typename Self::PacketReturnType;
-    const int PacketSize = internal::unpacket_traits<Packet>::size;
-    Index idx2 = 0;
-    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
-      // Calculate the starting offset for the packet scan
-      Index offset = idx1 + idx2;
-      ReducePacket(self, offset, data);
-    }
-    for (; idx2 < self.stride(); idx2++) {
-      // Calculate the starting offset for the scan
-      Index offset = idx1 + idx2;
-      ReduceScalar(self, offset, data);
-    }
-  }
-};
-
-// Single-threaded CPU implementation of scan
-template <typename Self, typename Reducer, typename Device,
-          bool Vectorize =
-              (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
-               internal::reducer_traits<Reducer, Device>::PacketAccess)>
+// CPU implementation of scan
+// TODO(ibab) This single-threaded implementation should be parallelized,
+// at least by running multiple scans at the same time.
+template <typename Self, typename Reducer, typename Device>
 struct ScanLauncher {
-  void operator()(Self& self, typename Self::CoeffReturnType* data) {
+  void operator()(Self& self, typename Self::CoeffReturnType *data) {
     Index total_size = internal::array_prod(self.dimensions());
 
     // We fix the index along the scan axis to 0 and perform a
     // scan per remaining entry. The iteration is split into two nested
-    // loops to avoid an integer division by keeping track of each idx1 and
-    // idx2.
+    // loops to avoid an integer division by keeping track of each idx1 and idx2.
     for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
-      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
-      block_reducer(self, idx1, data);
-    }
-  }
-};
+      for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
+        // Calculate the starting offset for the scan
+        Index offset = idx1 + idx2;
 
-#ifdef EIGEN_USE_THREADS
+        // Compute the scan along the axis, starting at the calculated offset
+        typename Self::CoeffReturnType accum = self.accumulator().initialize();
+        for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+          Index curr = offset + idx3 * self.stride();
 
-// Adjust block_size to avoid false sharing of cachelines among
-// threads. Currently set to twice the cache line size on Intel and ARM
-// processors.
-EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
-  EIGEN_CONSTEXPR Index kBlockAlignment = 128;
-  const Index items_per_cacheline =
-      numext::maxi<Index>(1, kBlockAlignment / item_size);
-  return items_per_cacheline * divup(block_size, items_per_cacheline);
-}
-
-template <typename Self>
-struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
-  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
-                                      typename Self::CoeffReturnType* data) {
-    using Scalar = typename Self::CoeffReturnType;
-    using Packet = typename Self::PacketReturnType;
-    const int PacketSize = internal::unpacket_traits<Packet>::size;
-    Index num_scalars = self.stride();
-    Index num_packets = 0;
-    if (self.stride() >= PacketSize) {
-      num_packets = self.stride() / PacketSize;
-      self.device().parallelFor(
-          num_packets,
-        TensorOpCost(PacketSize * self.size(), PacketSize * self.size(),
-                     16 * PacketSize * self.size(), true, PacketSize),
-        // Make the shard size large enough that two neighboring threads
-        // won't write to the same cacheline of `data`.
-        [=](Index blk_size) {
-          return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size);
-        },
-        [&](Index first, Index last) {
-          for (Index packet = first; packet < last; ++packet) {
-            const Index idx2 = packet * PacketSize;
-            ReducePacket(self, idx1 + idx2, data);
+          if (self.exclusive()) {
+            data[curr] = self.accumulator().finalize(accum);
+            self.accumulator().reduce(self.inner().coeff(curr), &accum);
+          } else {
+            self.accumulator().reduce(self.inner().coeff(curr), &accum);
+            data[curr] = self.accumulator().finalize(accum);
           }
-        });
-      num_scalars -= num_packets * PacketSize;
-    }
-    self.device().parallelFor(
-        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
-        // Make the shard size large enough that two neighboring threads
-        // won't write to the same cacheline of `data`.
-        [=](Index blk_size) {
-          return AdjustBlockSize(sizeof(Scalar), blk_size);
-        },
-        [&](Index first, Index last) {
-          for (Index scalar = first; scalar < last; ++scalar) {
-            const Index idx2 = num_packets * PacketSize + scalar;
-            ReduceScalar(self, idx1 + idx2, data);
-          }
-        });
-  }
-};
-
-template <typename Self>
-struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
-  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1,
-                                      typename Self::CoeffReturnType* data) {
-    using Scalar = typename Self::CoeffReturnType;
-    self.device().parallelFor(
-        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
-        // Make the shard size large enough that two neighboring threads
-        // won't write to the same cacheline of `data`.
-        [=](Index blk_size) {
-          return AdjustBlockSize(sizeof(Scalar), blk_size);
-        },
-        [&](Index first, Index last) {
-          for (Index idx2 = first; idx2 < last; ++idx2) {
-            ReduceScalar(self, idx1 + idx2, data);
-          }
-        });
-  }
-};
-
-// Specialization for multi-threaded execution.
-template <typename Self, typename Reducer, bool Vectorize>
-struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
-  void operator()(Self& self, typename Self::CoeffReturnType* data) {
-    using Scalar = typename Self::CoeffReturnType;
-    using Packet = typename Self::PacketReturnType;
-    const int PacketSize = internal::unpacket_traits<Packet>::size;
-    const Index total_size = internal::array_prod(self.dimensions());
-    const Index inner_block_size = self.stride() * self.size();
-    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));
-
-    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
-        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
-      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
-      launcher(self, data);
-      return;
-    }
-
-    if (parallelize_by_outer_blocks) {
-      // Parallelize over outer blocks.
-      const Index num_outer_blocks = total_size / inner_block_size;
-      self.device().parallelFor(
-          num_outer_blocks,
-          TensorOpCost(inner_block_size, inner_block_size,
-                       16 * PacketSize * inner_block_size, Vectorize,
-                       PacketSize),
-          [=](Index blk_size) {
-            return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size);
-          },
-          [&](Index first, Index last) {
-            for (Index idx1 = first; idx1 < last; ++idx1) {
-              ReduceBlock<Self, Vectorize, /*Parallelize=*/false> block_reducer;
-              block_reducer(self, idx1 * inner_block_size, data);
-            }
-          });
-    } else {
-      // Parallelize over inner packets/scalars dimensions when the reduction
-      // axis is not an inner dimension.
-      ReduceBlock<Self, Vectorize, /*Parallelize=*/true> block_reducer;
-      for (Index idx1 = 0; idx1 < total_size;
-           idx1 += self.stride() * self.size()) {
-        block_reducer(self, idx1, data);
+        }
       }
     }
   }
 };
-#endif  // EIGEN_USE_THREADS
 
-#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 
 // GPU implementation of scan
 // TODO(ibab) This placeholder implementation performs multiple scans in
@@ -358,171 +272,15 @@ __global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffRetu
 }
 
 template <typename Self, typename Reducer>
-struct ScanLauncher<Self, Reducer, GpuDevice, false> {
+struct ScanLauncher<Self, Reducer, GpuDevice> {
   void operator()(const Self& self, typename Self::CoeffReturnType* data) {
      Index total_size = internal::array_prod(self.dimensions());
      Index num_blocks = (total_size / self.size() + 63) / 64;
      Index block_size = 64;
-
-     LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+     LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
   }
 };
-#endif  // EIGEN_USE_GPU && (EIGEN_GPUCC)
-
-}  // namespace internal
-
-// Eval as rvalue
-template <typename Op, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
-
-  typedef TensorScanOp<Op, ArgType> XprType;
-  typedef typename XprType::Index Index;
-  typedef const ArgType ChildTypeNoConst;
-  typedef const ArgType ChildType;
-  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
-  typedef DSizes<Index, NumDims> Dimensions;
-  typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
-  typedef StorageMemory<Scalar, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-
-  enum {
-    IsAligned = false,
-    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
-    BlockAccess = false,
-    PreferBlockAccess = false,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = true
-  };
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
-      : m_impl(op.expression(), device),
-        m_device(device),
-        m_exclusive(op.exclusive()),
-        m_accumulator(op.accumulator()),
-        m_size(m_impl.dimensions()[op.axis()]),
-        m_stride(1), m_consume_dim(op.axis()),
-        m_output(NULL) {
-
-    // Accumulating a scalar isn't supported.
-    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
-
-    // Compute stride of scan axis
-    const Dimensions& dims = m_impl.dimensions();
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      for (int i = 0; i < op.axis(); ++i) {
-        m_stride = m_stride * dims[i];
-      }
-    } else {
-      // dims can only be indexed through unsigned integers,
-      // so let's use an unsigned type to let the compiler knows.
-      // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized in this function"
-      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
-      for (unsigned int i = NumDims - 1; i > axis; --i) {
-        m_stride = m_stride * dims[i];
-      }
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
-    return m_impl.dimensions();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const {
-    return m_stride;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const {
-    return m_consume_dim;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const {
-    return m_size;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const {
-    return m_accumulator;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const {
-    return m_exclusive;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const {
-    return m_impl;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const {
-    return m_device;
-  }
-
-  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
-    m_impl.evalSubExprsIfNeeded(NULL);
-    internal::ScanLauncher<Self, Op, Device> launcher;
-    if (data) {
-      launcher(*this, data);
-      return false;
-    }
-
-    const Index total_size = internal::array_prod(dimensions());
-    m_output = static_cast<EvaluatorPointerType>(m_device.get((Scalar*) m_device.allocate_temp(total_size * sizeof(Scalar))));
-    launcher(*this, m_output);
-    return true;
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
-    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const
-  {
-    return m_output;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
-  {
-    return m_output[index];
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
-    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    if (m_output) {
-      m_device.deallocate_temp(m_output);
-      m_output = NULL;
-    }
-    m_impl.cleanup();
-  }
-
-#ifdef EIGEN_USE_SYCL
- // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-    m_output.bind(cgh);
-  }
-#endif
-protected:
-  TensorEvaluator<ArgType, Device> m_impl;
-  const Device EIGEN_DEVICE_REF m_device;
-  const bool m_exclusive;
-  Op m_accumulator;
-  const Index m_size;
-  Index m_stride;
-  Index m_consume_dim;
-  EvaluatorPointerType m_output;
-};
+#endif  // EIGEN_USE_GPU && __CUDACC__
 
 }  // end namespace Eigen
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
deleted file mode 100644
index 0078692cd..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
+++ /dev/null
@@ -1,512 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Mehdi Goli    Codeplay Software Ltd.
-// Ralph Potter  Codeplay Software Ltd.
-// Luke Iwanski  Codeplay Software Ltd.
-// Contact: <eigen@codeplay.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-/*****************************************************************
- * TensorScanSycl.h
- *
- * \brief:
- *  Tensor Scan Sycl implement the extend  version of
- * "Efficient parallel scan algorithms for GPUs." .for Tensor operations.
- * The algorithm requires up to 3 stage (consequently 3 kernels) depending on
- * the size of the tensor. In the first kernel (ScanKernelFunctor), each
- * threads within the work-group individually reduces the allocated elements per
- * thread in order to reduces the total number of blocks. In the next step all
- * thread within the work-group will reduce the associated blocks into the
- * temporary buffers. In the next kernel(ScanBlockKernelFunctor), the temporary
- * buffer is given as an input and all the threads within a work-group scan and
- * reduces the boundaries between the blocks (generated from the previous
- * kernel). and write the data on the temporary buffer. If the second kernel is
- * required, the third and final kerenl (ScanAdjustmentKernelFunctor) will
- * adjust the final result into the output buffer.
- * The original algorithm for the parallel prefix sum can be found here:
- *
- * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
- * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
- *1, no. 1 (2008): 1-17.
- *****************************************************************/
-
-#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
-#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
-
-namespace Eigen {
-namespace TensorSycl {
-namespace internal {
-
-#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
-#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
-#endif
-
-template <typename index_t>
-struct ScanParameters {
-  // must be power of 2
-  static EIGEN_CONSTEXPR index_t ScanPerThread = 8;
-  const index_t total_size;
-  const index_t non_scan_size;
-  const index_t scan_size;
-  const index_t non_scan_stride;
-  const index_t scan_stride;
-  const index_t panel_threads;
-  const index_t group_threads;
-  const index_t block_threads;
-  const index_t elements_per_group;
-  const index_t elements_per_block;
-  const index_t loop_range;
-
-  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
-                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
-                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
-      : total_size(total_size_),
-        non_scan_size(non_scan_size_),
-        scan_size(scan_size_),
-        non_scan_stride(non_scan_stride_),
-        scan_stride(scan_stride_),
-        panel_threads(panel_threads_),
-        group_threads(group_threads_),
-        block_threads(block_threads_),
-        elements_per_group(elements_per_group_),
-        elements_per_block(elements_per_block_),
-        loop_range(loop_range_) {}
-};
-
-enum class scan_step { first, second };
-template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
-          scan_step stp>
-struct ScanKernelFunctor {
-  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
-      LocalAccessor;
-  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
-
-  LocalAccessor scratch;
-  Evaluator dev_eval;
-  OutAccessor out_accessor;
-  OutAccessor temp_accessor;
-  const ScanParameters<Index> scanParameters;
-  Op accumulator;
-  const bool inclusive;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
-                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
-                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
-                                                          const bool inclusive_)
-      : scratch(scratch_),
-        dev_eval(dev_eval_),
-        out_accessor(out_accessor_),
-        temp_accessor(temp_accessor_),
-        scanParameters(scanParameters_),
-        accumulator(accumulator_),
-        inclusive(inclusive_) {}
-
-  template <scan_step sst = stp, typename Input>
-  typename ::Eigen::internal::enable_if<sst == scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
-      EIGEN_STRONG_INLINE
-      read(const Input &inpt, Index global_id) {
-    return inpt.coeff(global_id);
-  }
-
-  template <scan_step sst = stp, typename Input>
-  typename ::Eigen::internal::enable_if<sst != scan_step::first, CoeffReturnType>::type EIGEN_DEVICE_FUNC
-      EIGEN_STRONG_INLINE
-      read(const Input &inpt, Index global_id) {
-    return inpt[global_id];
-  }
-
-  template <scan_step sst = stp, typename InclusiveOp>
-  typename ::Eigen::internal::enable_if<sst == scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  first_step_inclusive_Operation(InclusiveOp inclusive_op) {
-    inclusive_op();
-  }
-
-  template <scan_step sst = stp, typename InclusiveOp>
-  typename ::Eigen::internal::enable_if<sst != scan_step::first>::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  first_step_inclusive_Operation(InclusiveOp) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
-    auto out_ptr = out_accessor.get_pointer();
-    auto tmp_ptr = temp_accessor.get_pointer();
-    auto scratch_ptr = scratch.get_pointer().get();
-
-    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
-      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
-      Index tmp = data_offset % scanParameters.panel_threads;
-      const Index panel_id = data_offset / scanParameters.panel_threads;
-      const Index group_id = tmp / scanParameters.group_threads;
-      tmp = tmp % scanParameters.group_threads;
-      const Index block_id = tmp / scanParameters.block_threads;
-      const Index local_id = tmp % scanParameters.block_threads;
-      // we put one element per packet in scratch_mem
-      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
-      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
-      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
-      CoeffReturnType inclusive_scan;
-      // the actual panel size is scan_size * non_scan_size.
-      // elements_per_panel is roundup to power of 2 for binary tree
-      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
-      const Index group_offset = group_id * scanParameters.non_scan_stride;
-      // This will be effective when the size is bigger than elements_per_block
-      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
-      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
-      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
-      Index next_elements = 0;
-      EIGEN_UNROLL_LOOP
-      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
-        Index global_id = global_offset + next_elements;
-        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
-                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
-                           (global_id < scanParameters.total_size))
-                              ? read(dev_eval, global_id)
-                              : accumulator.initialize();
-        next_elements += scanParameters.scan_stride;
-      }
-      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
-        if (inclusive) {
-          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
-        }
-      });
-      // This for loop must be 2
-      EIGEN_UNROLL_LOOP
-      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
-        Index private_offset = 1;
-        // build sum in place up the tree
-        EIGEN_UNROLL_LOOP
-        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
-          EIGEN_UNROLL_LOOP
-          for (Index l = 0; l < d; l++) {
-            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
-            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
-            CoeffReturnType accum = accumulator.initialize();
-            accumulator.reduce(private_scan[ai], &accum);
-            accumulator.reduce(private_scan[bi], &accum);
-            private_scan[bi] = accumulator.finalize(accum);
-          }
-          private_offset *= 2;
-        }
-        scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
-            private_scan[PacketSize - 1 + packetIndex];
-        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
-        // traverse down tree & build scan
-        EIGEN_UNROLL_LOOP
-        for (Index d = 1; d < PacketSize; d *= 2) {
-          private_offset >>= 1;
-          EIGEN_UNROLL_LOOP
-          for (Index l = 0; l < d; l++) {
-            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
-            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
-            CoeffReturnType accum = accumulator.initialize();
-            accumulator.reduce(private_scan[ai], &accum);
-            accumulator.reduce(private_scan[bi], &accum);
-            private_scan[ai] = private_scan[bi];
-            private_scan[bi] = accumulator.finalize(accum);
-          }
-        }
-      }
-
-      Index offset = 1;
-      // build sum in place up the tree
-      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
-        // Synchronise
-        itemID.barrier(cl::sycl::access::fence_space::local_space);
-        if (local_id < d) {
-          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
-          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
-          CoeffReturnType accum = accumulator.initialize();
-          accumulator.reduce(scratch_ptr[ai], &accum);
-          accumulator.reduce(scratch_ptr[bi], &accum);
-          scratch_ptr[bi] = accumulator.finalize(accum);
-        }
-        offset *= 2;
-      }
-      // Synchronise
-      itemID.barrier(cl::sycl::access::fence_space::local_space);
-      // next step optimisation
-      if (local_id == 0) {
-        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
-          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
-                                    scanParameters.non_scan_size +
-                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
-                                block_id;
-          tmp_ptr[temp_id] = scratch_ptr[scratch_stride - 1 + scratch_offset];
-        }
-        // clear the last element
-        scratch_ptr[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
-      }
-      // traverse down tree & build scan
-      for (Index d = 1; d < scratch_stride; d *= 2) {
-        offset >>= 1;
-        // Synchronise
-        itemID.barrier(cl::sycl::access::fence_space::local_space);
-        if (local_id < d) {
-          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
-          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
-          CoeffReturnType accum = accumulator.initialize();
-          accumulator.reduce(scratch_ptr[ai], &accum);
-          accumulator.reduce(scratch_ptr[bi], &accum);
-          scratch_ptr[ai] = scratch_ptr[bi];
-          scratch_ptr[bi] = accumulator.finalize(accum);
-        }
-      }
-      // Synchronise
-      itemID.barrier(cl::sycl::access::fence_space::local_space);
-      // This for loop must be 2
-      EIGEN_UNROLL_LOOP
-      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
-        EIGEN_UNROLL_LOOP
-        for (Index i = 0; i < PacketSize; i++) {
-          CoeffReturnType accum = private_scan[packetIndex + i];
-          accumulator.reduce(scratch_ptr[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
-          private_scan[packetIndex + i] = accumulator.finalize(accum);
-        }
-      }
-      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
-        if (inclusive) {
-          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
-          private_scan[0] = accumulator.finalize(inclusive_scan);
-        }
-      });
-      next_elements = 0;
-      // right the first set of private param
-      EIGEN_UNROLL_LOOP
-      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
-        Index global_id = global_offset + next_elements;
-        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
-             scanParameters.scan_size) &&
-            (global_id < scanParameters.total_size)) {
-          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
-          out_ptr[global_id] = private_scan[private_id];
-        }
-        next_elements += scanParameters.scan_stride;
-      }
-    }  // end for loop
-  }
-};
-
-template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
-struct ScanAdjustmentKernelFunctor {
-  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
-      LocalAccessor;
-  static EIGEN_CONSTEXPR int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
-  InAccessor in_accessor;
-  OutAccessor out_accessor;
-  const ScanParameters<Index> scanParameters;
-  Op accumulator;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
-                                                                    OutAccessor out_accessor_,
-                                                                    const ScanParameters<Index> scanParameters_,
-                                                                    Op accumulator_)
-      : in_accessor(in_accessor_),
-        out_accessor(out_accessor_),
-        scanParameters(scanParameters_),
-        accumulator(accumulator_) {}
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
-    auto in_ptr = in_accessor.get_pointer();
-    auto out_ptr = out_accessor.get_pointer();
-
-    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
-      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
-      Index tmp = data_offset % scanParameters.panel_threads;
-      const Index panel_id = data_offset / scanParameters.panel_threads;
-      const Index group_id = tmp / scanParameters.group_threads;
-      tmp = tmp % scanParameters.group_threads;
-      const Index block_id = tmp / scanParameters.block_threads;
-      const Index local_id = tmp % scanParameters.block_threads;
-
-      // the actual panel size is scan_size * non_scan_size.
-      // elements_per_panel is roundup to power of 2 for binary tree
-      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
-      const Index group_offset = group_id * scanParameters.non_scan_stride;
-      // This will be effective when the size is bigger than elements_per_block
-      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
-      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
-
-      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
-      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
-      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
-      CoeffReturnType adjust_val = in_ptr[in_id];
-
-      Index next_elements = 0;
-      EIGEN_UNROLL_LOOP
-      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
-        Index global_id = global_offset + next_elements;
-        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
-             scanParameters.scan_size) &&
-            (global_id < scanParameters.total_size)) {
-          CoeffReturnType accum = adjust_val;
-          accumulator.reduce(out_ptr[global_id], &accum);
-          out_ptr[global_id] = accumulator.finalize(accum);
-        }
-        next_elements += scanParameters.scan_stride;
-      }
-    }
-  }
-};
-
-template <typename Index>
-struct ScanInfo {
-  const Index &total_size;
-  const Index &scan_size;
-  const Index &panel_size;
-  const Index &non_scan_size;
-  const Index &scan_stride;
-  const Index &non_scan_stride;
-
-  Index max_elements_per_block;
-  Index block_size;
-  Index panel_threads;
-  Index group_threads;
-  Index block_threads;
-  Index elements_per_group;
-  Index elements_per_block;
-  Index loop_range;
-  Index global_range;
-  Index local_range;
-  const Eigen::SyclDevice &dev;
-  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
-                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
-                               const Eigen::SyclDevice &dev_)
-      : total_size(total_size_),
-        scan_size(scan_size_),
-        panel_size(panel_size_),
-        non_scan_size(non_scan_size_),
-        scan_stride(scan_stride_),
-        non_scan_stride(non_scan_stride_),
-        dev(dev_) {
-    // must be power of 2
-    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
-                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
-
-    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;
-
-    elements_per_group =
-        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
-    const Index elements_per_panel = elements_per_group * non_scan_size;
-    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
-    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
-    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
-    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
-    block_size = elements_per_group / elements_per_block;
-#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
-    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
-#else
-    const Index max_threads = panel_threads * panel_size;
-#endif
-    global_range = roundUp(max_threads, local_range);
-    loop_range = Index(
-        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
-  }
-  inline ScanParameters<Index> get_scan_parameter() {
-    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
-                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
-  }
-  inline cl::sycl::nd_range<1> get_thread_range() {
-    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
-  }
-};
-
-template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
-struct SYCLAdjustBlockOffset {
-  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
-                                                           Reducer &accumulator, const Index total_size,
-                                                           const Index scan_size, const Index panel_size,
-                                                           const Index non_scan_size, const Index scan_stride,
-                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
-    auto scan_info =
-        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
-
-    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
-        AdjustFuctor;
-    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
-                                                                      scan_info.max_elements_per_block,
-                                                                      scan_info.get_scan_parameter(), accumulator);
-  }
-};
-
-template <typename CoeffReturnType, scan_step stp>
-struct ScanLauncher_impl {
-  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
-  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
-                                             const Index total_size, const Index scan_size, const Index panel_size,
-                                             const Index non_scan_size, const Index scan_stride,
-                                             const Index non_scan_stride, const bool inclusive,
-                                             const Eigen::SyclDevice &dev) {
-    auto scan_info =
-        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
-    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
-    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
-    CoeffReturnType *temp_pointer =
-        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
-    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
-
-    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
-    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
-        in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
-        scan_info.get_scan_parameter(), accumulator, inclusive);
-
-    if (scan_info.block_size > 1) {
-      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
-          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
-          non_scan_size, Index(1), scan_info.block_size, false, dev);
-
-      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
-          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
-          non_scan_stride, dev);
-    }
-    dev.deallocate_temp(temp_pointer);
-  }
-};
-
-}  // namespace internal
-}  // namespace TensorSycl
-
-template <typename Self, typename Reducer>
-struct ScanLauncher<Self, Reducer, Eigen::SyclDevice> {
-  typedef typename Self::Index Index;
-  typedef typename Self::CoeffReturnType CoeffReturnType;
-  typedef typename Self::Storage Storage;
-  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
-  void operator()(Self &self, EvaluatorPointerType data) {
-    const Index total_size = internal::array_prod(self.dimensions());
-    const Index scan_size = self.size();
-    const Index scan_stride = self.stride();
-    // this is the scan op (can be sum or ...)
-    auto accumulator = self.accumulator();
-    auto inclusive = !self.exclusive();
-    auto consume_dim = self.consume_dim();
-    auto dev = self.device();
-
-    auto dims = self.inner().dimensions();
-
-    Index non_scan_size = 1;
-    Index panel_size = 1;
-    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
-      for (int i = 0; i < consume_dim; i++) {
-        non_scan_size *= dims[i];
-      }
-      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
-        panel_size *= dims[i];
-      }
-    } else {
-      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
-        non_scan_size *= dims[i];
-      }
-      for (int i = consume_dim - 1; i >= 0; i--) {
-        panel_size *= dims[i];
-      }
-    }
-    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
-    auto eval_impl = self.inner();
-    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
-        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
-        inclusive, dev);
-  }
-};
-}  // namespace Eigen
-
-#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
index 597ca64cd..113c060e3 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -31,7 +31,6 @@ struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Shuffle, typename XprType>
@@ -61,8 +60,8 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType>
   typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
-      : m_xpr(expr), m_shuffle(shfl) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shuffle)
+      : m_xpr(expr), m_shuffle(shuffle) {}
 
     EIGEN_DEVICE_FUNC
     const Shuffle& shufflePermutation() const { return m_shuffle; }
@@ -100,7 +99,6 @@ class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType>
 template<typename Shuffle, typename ArgType, typename Device>
 struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
 {
-  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
   typedef TensorShufflingOp<Shuffle, ArgType> XprType;
   typedef typename XprType::Index Index;
   static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
@@ -108,245 +106,100 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned         = false,
-    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
-    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
-    PreferBlockAccess = true,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess       = false,  // to be implemented
-    RawAccess         = false
+    IsAligned = false,
+    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
   };
 
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
-
-  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
-                                                     Layout, Index>
-      TensorBlock;
-  //===--------------------------------------------------------------------===//
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op,
-                                                        const Device& device)
-      : m_device(device),
-        m_impl(op.expression(), device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device)
   {
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
     const Shuffle& shuffle = op.shufflePermutation();
-    m_is_identity = true;
     for (int i = 0; i < NumDims; ++i) {
-      m_shuffle[i] = static_cast<int>(shuffle[i]);
       m_dimensions[i] = input_dims[shuffle[i]];
-      m_inverseShuffle[shuffle[i]] = i;
-      if (m_is_identity && shuffle[i] != i) {
-        m_is_identity = false;
-      }
     }
 
+    array<Index, NumDims> inputStrides;
+
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      m_unshuffledInputStrides[0] = 1;
+      inputStrides[0] = 1;
       m_outputStrides[0] = 1;
-
       for (int i = 1; i < NumDims; ++i) {
-        m_unshuffledInputStrides[i] =
-            m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
+        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
         m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     } else {
-      m_unshuffledInputStrides[NumDims - 1] = 1;
+      inputStrides[NumDims - 1] = 1;
       m_outputStrides[NumDims - 1] = 1;
       for (int i = NumDims - 2; i >= 0; --i) {
-        m_unshuffledInputStrides[i] =
-            m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
+        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
         m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
-        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     }
 
     for (int i = 0; i < NumDims; ++i) {
-      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
+      m_inputStrides[i] = inputStrides[shuffle[i]];
     }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
-
-#ifdef EIGEN_USE_THREADS
-  template <typename EvalSubExprsCallback>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
-      EvaluatorPointerType, EvalSubExprsCallback done) {
-    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
-  }
-#endif  // EIGEN_USE_THREADS
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
     m_impl.cleanup();
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
   {
-    if (m_is_identity) {
-      return m_impl.coeff(index);
-    } else {
-      return m_impl.coeff(srcCoeff(index));
-    }
+    return m_impl.coeff(srcCoeff(index));
   }
 
-  template <int LoadMode, typename Self, bool ImplPacketAccess>
-  struct PacketLoader {
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    static PacketReturnType Run(const Self& self, Index index) {
-      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-      EIGEN_UNROLL_LOOP
-      for (int i = 0; i < PacketSize; ++i) {
-        values[i] = self.coeff(index + i);
-      }
-      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-      return rslt;
-    }
-  };
-
-  template<int LoadMode, typename Self>
-  struct PacketLoader<LoadMode, Self, true> {
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    static PacketReturnType Run(const Self& self, Index index) {
-      if (self.m_is_identity) {
-        return self.m_impl.template packet<LoadMode>(index);
-      } else {
-        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-        EIGEN_UNROLL_LOOP
-        for (int i = 0; i < PacketSize; ++i) {
-          values[i] = self.coeff(index + i);
-        }
-        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
-        return rslt;
-      }
-    }
-  };
-
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
     EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
-        eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
-    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
-  }
+    eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  internal::TensorBlockResourceRequirements getResourceRequirements() const {
-    static const int inner_dim =
-        Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
-
-    const size_t target_size = m_device.firstLevelCacheSize();
-    const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;
-
-    // Shuffled inner dimensions leads to a random memory access, which is not
-    // captured by default cost model bytes loaded/stored. We add this cost
-    // explicitly. The number of cycles picked based on the benchmarks.
-    // TODO(ezhulenev): This number was picked based on a very questionable
-    // benchmarks, add benchmarks that are representative of real workloads.
-    using BlockRequirements = internal::TensorBlockResourceRequirements;
-    if (inner_dim_shuffled) {
-      return BlockRequirements::uniform<Scalar>(target_size)
-          .addCostPerCoeff({0, 0, NumDims * 28});
-    } else {
-      return BlockRequirements::skewed<Scalar>(target_size);
+    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index+i);
     }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
-  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
-          bool root_of_expr_ast = false) const {
-    assert(m_impl.data() != NULL);
-
-    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
-        TensorBlockIO;
-    typedef typename TensorBlockIO::Dst TensorBlockIODst;
-    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
-
-    const typename TensorBlock::Storage block_storage =
-        TensorBlock::prepareStorage(
-            desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);
-
-    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
-    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
-
-    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
-                         block_storage.data());
-
-    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
-    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
-
-    return block_storage.AsTensorMaterializedBlock();
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
-    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
-                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
+    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                            2 * TensorOpCost::MulCost<Index>() +
                                            TensorOpCost::DivCost<Index>());
     return m_impl.costPerCoeff(vectorized) +
-           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
+           TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
-#ifdef EIGEN_USE_SYCL
-   // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
  protected:
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
-      Index input_index,
-      const DSizes<Index, NumDims>& input_block_strides,
-      const DSizes<Index, NumDims>& output_block_strides,
-      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
-    Index output_index = 0;
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = input_index / fast_input_block_strides[i];
-        output_index += idx * output_block_strides[m_inverseShuffle[i]];
-        input_index -= idx * input_block_strides[i];
-      }
-      return output_index + input_index *
-          output_block_strides[m_inverseShuffle[0]];
-    } else {
-      for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = input_index / fast_input_block_strides[i];
-        output_index += idx * output_block_strides[m_inverseShuffle[i]];
-        input_index -= idx * input_block_strides[i];
-      }
-      return output_index + input_index *
-          output_block_strides[m_inverseShuffle[NumDims - 1]];
-    }
-  }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = index / m_fastOutputStrides[i];
+        const Index idx = index / m_outputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
       return inputIndex + index * m_inputStrides[0];
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = index / m_fastOutputStrides[i];
+        const Index idx = index / m_outputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
@@ -355,15 +208,8 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   }
 
   Dimensions m_dimensions;
-  bool m_is_identity;
-  array<int, NumDims> m_shuffle;
-  array<Index, NumDims> m_inverseShuffle;  // TODO(ezhulenev): Make it int type.
   array<Index, NumDims> m_outputStrides;
-  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
-  array<Index, NumDims> m_unshuffledInputStrides;
-
-  const Device EIGEN_DEVICE_REF m_device;
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
@@ -382,23 +228,14 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned         = false,
-    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
-    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
-    PreferBlockAccess = true,
-    Layout            = TensorEvaluator<ArgType, Device>::Layout,
-    RawAccess         = false
+    IsAligned = false,
+    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    RawAccess = false
   };
 
-  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : Base(op, device)
   { }
@@ -415,68 +252,10 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
 
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
     internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
-    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       this->coeffRef(index+i) = values[i];
     }
   }
-
-  template <typename TensorBlock>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
-      const TensorBlockDesc& desc, const TensorBlock& block) {
-    eigen_assert(this->m_impl.data() != NULL);
-
-    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
-        TensorBlockIO;
-    typedef typename TensorBlockIO::Dst TensorBlockIODst;
-    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
-
-    const Scalar* block_buffer = block.data();
-
-    // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
-    // expression with coefficient and packet access as `src`.
-    void* mem = NULL;
-    if (block_buffer == NULL) {
-      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
-      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
-
-      typedef internal::TensorBlockAssignment<
-          ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
-          TensorBlockAssignment;
-
-      TensorBlockAssignment::Run(
-          TensorBlockAssignment::target(
-              desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
-              buf),
-          block.expr());
-
-      block_buffer = buf;
-    }
-
-    // Read from block.
-    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
-                         block_buffer);
-
-    // Write to the output buffer.
-    typename TensorBlockIO::Dimensions output_strides(
-        this->m_unshuffledInputStrides);
-    typename TensorBlockIO::Dimensions output_dimensions;
-    for (int i = 0; i < NumDims; ++i) {
-      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
-    }
-    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
-                         this->srcCoeff(desc.offset()));
-
-    // Reorder dimensions according to the shuffle.
-    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
-    for (int i = 0; i < NumDims; ++i) {
-      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
-    }
-    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
-
-    // Deallocate temporary buffer used for the block materialization.
-    if (mem != NULL) this->m_device.deallocate(mem);
-  }
 };
 
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
index d05f37532..6c35bfdb6 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -31,13 +31,12 @@ struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType>
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
 };
 
 template<typename Strides, typename XprType>
 struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense>
 {
-  typedef const TensorStridingOp<Strides, XprType>EIGEN_DEVICE_REF type;
+  typedef const TensorStridingOp<Strides, XprType>& type;
 };
 
 template<typename Strides, typename XprType>
@@ -107,30 +106,22 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
       : m_impl(op.expression(), device)
   {
     m_dimensions = m_impl.dimensions();
     for (int i = 0; i < NumDims; ++i) {
-      m_dimensions[i] =Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+      m_dimensions[i] = ceilf(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
     }
 
     const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
@@ -155,10 +146,9 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     }
   }
 
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -180,7 +170,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + PacketSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / m_outputStrides[i];
         const Index idx1 = indices[1] / m_outputStrides[i];
@@ -192,7 +181,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
       inputIndices[0] += indices[0] * m_inputStrides[0];
       inputIndices[1] += indices[1] * m_inputStrides[0];
     } else {  // RowMajor
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / m_outputStrides[i];
         const Index idx1 = indices[1] / m_outputStrides[i];
@@ -212,7 +200,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
       EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
       values[0] = m_impl.coeff(inputIndices[0]);
       values[PacketSize-1] = m_impl.coeff(inputIndices[1]);
-      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         values[i] = coeff(index+i);
       }
@@ -235,20 +222,13 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
         TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const
   {
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx = index / m_outputStrides[i];
         inputIndex += idx * m_inputStrides[i];
@@ -256,7 +236,6 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
       }
       inputIndex += index * m_inputStrides[0];
     } else {  // RowMajor
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx = index / m_outputStrides[i];
         inputIndex += idx * m_inputStrides[i];
@@ -273,6 +252,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device>
   TensorEvaluator<ArgType, Device> m_impl;
 };
 
+
 // Eval as lvalue
 template<typename Strides, typename ArgType, typename Device>
 struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
@@ -287,7 +267,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
   enum {
     IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    PreferBlockAccess = false,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
     RawAccess = false
@@ -300,7 +279,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
   typedef typename XprType::Scalar Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
   {
@@ -316,7 +295,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
     Index inputIndices[] = {0, 0};
     Index indices[] = {index, index + PacketSize - 1};
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      EIGEN_UNROLL_LOOP
       for (int i = NumDims - 1; i > 0; --i) {
         const Index idx0 = indices[0] / this->m_outputStrides[i];
         const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -328,7 +306,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
       inputIndices[0] += indices[0] * this->m_inputStrides[0];
       inputIndices[1] += indices[1] * this->m_inputStrides[0];
     } else {  // RowMajor
-      EIGEN_UNROLL_LOOP
       for (int i = 0; i < NumDims - 1; ++i) {
         const Index idx0 = indices[0] / this->m_outputStrides[i];
         const Index idx1 = indices[1] / this->m_outputStrides[i];
@@ -348,7 +325,6 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
       internal::pstore<Scalar, PacketReturnType>(values, x);
       this->m_impl.coeffRef(inputIndices[0]) = values[0];
       this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize-1];
-      EIGEN_UNROLL_LOOP
       for (int i = 1; i < PacketSize-1; ++i) {
         this->coeffRef(index+i) = values[i];
       }
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
new file mode 100644
index 000000000..bb8800d45
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h
@@ -0,0 +1,82 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: eigen@codeplay.com
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// General include header of SYCL target for Tensor Module
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
+
+#ifdef EIGEN_USE_SYCL
+
+// global pointer to set different attribute state for a class
+template <class T>
+struct MakeGlobalPointer {
+  typedef typename cl::sycl::global_ptr<T>::pointer_t Type;
+};
+
+// local pointer to set different attribute state for a class
+template <class T>
+struct MakeLocalPointer {
+  typedef typename cl::sycl::local_ptr<T>::pointer_t Type;
+};
+
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+/// This struct is used for special expression nodes with no operations (for example assign and selectOP).
+  struct NoOP;
+
+template<bool IsConst, typename T> struct GetType{
+  typedef const T Type;
+};
+template<typename T> struct GetType<false, T>{
+  typedef T Type;
+};
+
+}
+}
+}
+
+// tuple construction
+#include "TensorSyclTuple.h"
+
+// counting the number of leaf nodes at compile time
+#include "TensorSyclLeafCount.h"
+
+// The index PlaceHolder takes the actual expression and replaces the actual
+// data on it with the place holder. It uses the same pre-order expression tree
+// traversal as the leaf count in order to give the right access number to each
+// node in the expression
+#include "TensorSyclPlaceHolderExpr.h"
+
+// creation of an accessor tuple from a tuple of SYCL buffers
+#include "TensorSyclExtractAccessor.h"
+
+// this is used to change the address space type in tensor map for GPU
+#include "TensorSyclConvertToDeviceExpression.h"
+
+// this is used to extract the functors
+#include "TensorSyclExtractFunctors.h"
+
+// this is used to create tensormap on the device
+// this is used to construct the expression on the device
+#include "TensorSyclExprConstructor.h"
+
+/// this is used for extracting tensor reduction
+#include "TensorReductionSycl.h"
+
+// kernel execution using fusion
+#include "TensorSyclRun.h"
+
+#endif  // end of EIGEN_USE_SYCL
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
new file mode 100644
index 000000000..8729c86ee
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h
@@ -0,0 +1,121 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclConvertToDeviceExpression.h
+ *
+ * \brief:
+ *  Conversion from host pointer to device pointer
+ *  inside leaf nodes of the expression.
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+/// \struct ConvertToDeviceExpression
+/// \brief This struct converts the MakePointer used by the host expression
+/// into MakeGlobalPointer for the device expression, in the leaf nodes that
+/// contain a pointer. This is needed because the address space of
+/// the pointer T* is different on the host and the device.
+template <typename Expr>
+struct ConvertToDeviceExpression;  // primary template is only declared; each node type provides a specialisation
+
+template<template<class...> class NonOpCategory, bool IsConst, typename... Args>
+struct NonOpConversion{  // Type = NonOpCategory over the device-converted Args; GetType makes it const when IsConst is true
+  typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type...> >::Type Type;
+};
+
+
+template<template<class, template <class> class > class NonOpCategory, bool IsConst, typename Args>
+struct DeviceConvertor{  // Type = NonOpCategory<converted Args, MakeGlobalPointer>; GetType makes it const when IsConst is true
+  typedef typename GetType<IsConst, NonOpCategory<typename ConvertToDeviceExpression<Args>::Type, MakeGlobalPointer> >::Type Type;
+};
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node
+/// type is TensorMap: the MakePointer_ argument is replaced by MakeGlobalPointer
+#define TENSORMAPCONVERT(CVQual)\
+template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_>\
+struct ConvertToDeviceExpression<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_> > {\
+  typedef CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer> Type;\
+};
+
+TENSORMAPCONVERT(const)  // const TensorMap
+TENSORMAPCONVERT()  // non-const TensorMap
+#undef TENSORMAPCONVERT
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node
+/// type is TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorCwiseBinaryOp, TensorCwiseTernaryOp, TensorBroadcastingOp
+#define CATEGORYCONVERT(CVQual)\
+template <template<class, class...> class Category, typename OP, typename... subExprs>\
+struct ConvertToDeviceExpression<CVQual Category<OP, subExprs...> > {\
+  typedef CVQual Category<OP, typename ConvertToDeviceExpression<subExprs>::Type... > Type;\
+};
+CATEGORYCONVERT(const)  // const Category<OP, subExprs...>: OP is kept, every sub-expression is converted
+CATEGORYCONVERT()  // non-const Category<OP, subExprs...>
+#undef CATEGORYCONVERT
+
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node
+/// type is TensorSelectOp (Res is forwarded to NonOpConversion as IsConst)
+#define SELECTOPCONVERT(CVQual, Res)\
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>\
+struct ConvertToDeviceExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >\
+: NonOpConversion<TensorSelectOp, Res, IfExpr, ThenExpr, ElseExpr> {};
+SELECTOPCONVERT(const, true)  // const TensorSelectOp
+SELECTOPCONVERT(, false)  // non-const TensorSelectOp
+#undef SELECTOPCONVERT
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node
+/// type is TensorAssignOp (Res is forwarded to NonOpConversion as IsConst)
+#define ASSIGNCONVERT(CVQual, Res)\
+template <typename LHSExpr, typename RHSExpr>\
+struct ConvertToDeviceExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr> >\
+: NonOpConversion<TensorAssignOp, Res, LHSExpr, RHSExpr>{};
+
+ASSIGNCONVERT(const, true)  // const TensorAssignOp
+ASSIGNCONVERT(, false)  // non-const TensorAssignOp
+#undef ASSIGNCONVERT
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node
+/// type is either TensorForcedEvalOp or TensorEvalToOp (Res is forwarded to DeviceConvertor as IsConst)
+#define KERNELBROKERCONVERT(CVQual, Res, ExprNode)\
+template <typename Expr>\
+struct ConvertToDeviceExpression<CVQual ExprNode<Expr> > \
+: DeviceConvertor<ExprNode, Res, Expr>{};
+
+KERNELBROKERCONVERT(const, true, TensorForcedEvalOp)  // const TensorForcedEvalOp
+KERNELBROKERCONVERT(, false, TensorForcedEvalOp)  // non-const TensorForcedEvalOp
+KERNELBROKERCONVERT(const, true, TensorEvalToOp)  // const TensorEvalToOp
+KERNELBROKERCONVERT(, false, TensorEvalToOp)  // non-const TensorEvalToOp
+#undef KERNELBROKERCONVERT
+
+/// specialisation of the \ref ConvertToDeviceExpression struct when the node type is TensorReductionOp: subExpr is converted and MakePointer_ becomes MakeGlobalPointer
+#define KERNELBROKERCONVERTREDUCTION(CVQual)\
+template <typename OP, typename Dim, typename subExpr, template <class> class MakePointer_>\
+struct ConvertToDeviceExpression<CVQual TensorReductionOp<OP, Dim, subExpr, MakePointer_> > {\
+  typedef CVQual TensorReductionOp<OP, Dim, typename ConvertToDeviceExpression<subExpr>::Type, MakeGlobalPointer> Type;\
+};
+
+KERNELBROKERCONVERTREDUCTION(const)  // const TensorReductionOp
+KERNELBROKERCONVERTREDUCTION()  // non-const TensorReductionOp
+#undef KERNELBROKERCONVERTREDUCTION
+
+}  // namespace internal
+}  // namespace TensorSycl
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_CONVERT_TO_DEVICE_EXPRESSION_HPP
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
new file mode 100644
index 000000000..983f63180
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h
@@ -0,0 +1,239 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclExprConstructor.h
+ *
+ * \brief:
+ *  This file re-creates an expression on the SYCL device in order
+ *  to use the original tensor evaluator.
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+/// this class is used by the TensorEvalToOp constructor to create an lhs expression
+/// which is a raw pointer taken from the N-th accessor of the tuple
+template <typename PtrType, size_t N, typename... Params>
+struct EvalToLHSConstructor {
+  PtrType expr;  // pointer handed to the TensorEvalToOp being rebuilt
+  EvalToLHSConstructor(const utility::tuple::Tuple<Params...> &t): expr((&(*(utility::tuple::get<N>(t).get_pointer())))) {}  // &* turns the result of get_pointer() into a plain address
+};
+
+/// struct ExprConstructor is used to reconstruct the expression on the device and
+/// recreate the expression with MakeGlobalPointer containing the device address
+/// space for the TensorMap pointers used in eval function.
+/// OrigExpr is the expression type to rebuild, IndexExpr is the matching
+/// PlaceHolder expression type, and Params are the element types of the tuple
+/// of accessors; the constructors additionally receive the functor extractor.
+template <typename OrigExpr, typename IndexExpr, typename... Params>
+struct ExprConstructor;
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorMap: expr is built from the pointer of the N-th accessor and fd.dimensions()
+#define TENSORMAP(CVQual)\
+template <typename Scalar_, int Options_, int Options2_, int Options3_, int NumIndices_, typename IndexType_,\
+template <class> class MakePointer_, size_t N, typename... Params>\
+struct ExprConstructor< CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>,\
+CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options3_, MakePointer_>, N>, Params...>{\
+  typedef  CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakeGlobalPointer>  Type;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
+};
+
+TENSORMAP(const)  // const TensorMap
+TENSORMAP()  // non-const TensorMap
+#undef TENSORMAP
+
+#define UNARYCATEGORY(CVQual)\
+template <template<class, class> class UnaryCategory, typename OP, typename OrigRHSExpr, typename RHSExpr, typename... Params>\
+struct ExprConstructor<CVQual UnaryCategory<OP, OrigRHSExpr>, CVQual UnaryCategory<OP, RHSExpr>, Params...> {\
+  typedef  ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_type;\
+  my_type rhsExpr;\
+  typedef CVQual UnaryCategory<OP, typename my_type::Type> Type;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : rhsExpr(funcD.rhsExpr, t), expr(rhsExpr.expr, funcD.func) {}\
+};
+
+UNARYCATEGORY(const)  // const UnaryCategory<OP, RHSExpr>: the child is rebuilt first, then expr(rhsExpr.expr, funcD.func)
+UNARYCATEGORY()  // non-const UnaryCategory<OP, RHSExpr>
+#undef UNARYCATEGORY
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorCwiseBinaryOp (any BinaryCategory<OP, LHSExpr, RHSExpr>)
+#define BINARYCATEGORY(CVQual)\
+template <template<class, class, class> class BinaryCategory, typename OP, typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr,\
+typename RHSExpr, typename... Params>\
+struct ExprConstructor<CVQual BinaryCategory<OP, OrigLHSExpr, OrigRHSExpr>,  CVQual BinaryCategory<OP, LHSExpr, RHSExpr>, Params...> {\
+  typedef  ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
+  typedef  ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
+  typedef  CVQual BinaryCategory<OP, typename my_left_type::Type, typename my_right_type::Type> Type;\
+  my_left_type lhsExpr;\
+  my_right_type rhsExpr;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : lhsExpr(funcD.lhsExpr, t),rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr, funcD.func) {}\
+};
+
+BINARYCATEGORY(const)  // const node: both children are rebuilt, then expr(lhsExpr.expr, rhsExpr.expr, funcD.func)
+BINARYCATEGORY()  // non-const node
+#undef BINARYCATEGORY
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorCwiseTernaryOp (any TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>)
+#define TERNARYCATEGORY(CVQual)\
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename OrigArg1Expr, typename OrigArg2Expr,typename OrigArg3Expr,\
+typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename... Params>\
+struct ExprConstructor<CVQual TernaryCategory<OP, OrigArg1Expr, OrigArg2Expr, OrigArg3Expr>, CVQual TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Params...> {\
+  typedef ExprConstructor<OrigArg1Expr, Arg1Expr, Params...> my_arg1_type;\
+  typedef ExprConstructor<OrigArg2Expr, Arg2Expr, Params...> my_arg2_type;\
+  typedef ExprConstructor<OrigArg3Expr, Arg3Expr, Params...> my_arg3_type;\
+  typedef  CVQual TernaryCategory<OP, typename my_arg1_type::Type, typename my_arg2_type::Type, typename my_arg3_type::Type> Type;\
+  my_arg1_type arg1Expr;\
+  my_arg2_type arg2Expr;\
+  my_arg3_type arg3Expr;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD,const utility::tuple::Tuple<Params...> &t)\
+  : arg1Expr(funcD.arg1Expr, t), arg2Expr(funcD.arg2Expr, t), arg3Expr(funcD.arg3Expr, t), expr(arg1Expr.expr, arg2Expr.expr, arg3Expr.expr, funcD.func) {}\
+};
+
+TERNARYCATEGORY(const)  // const node: the three children are rebuilt, then expr is built from them and funcD.func
+TERNARYCATEGORY()  // non-const node
+#undef TERNARYCATEGORY
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorSelectOp (no functor is passed to expr)
+#define SELECTOP(CVQual)\
+template <typename OrigIfExpr, typename OrigThenExpr, typename OrigElseExpr, typename IfExpr, typename ThenExpr, typename ElseExpr, typename... Params>\
+struct ExprConstructor< CVQual TensorSelectOp<OrigIfExpr, OrigThenExpr, OrigElseExpr>, CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Params...> {\
+  typedef  ExprConstructor<OrigIfExpr, IfExpr, Params...> my_if_type;\
+  typedef  ExprConstructor<OrigThenExpr, ThenExpr, Params...> my_then_type;\
+  typedef  ExprConstructor<OrigElseExpr, ElseExpr, Params...> my_else_type;\
+  typedef CVQual TensorSelectOp<typename my_if_type::Type, typename my_then_type::Type, typename my_else_type::Type> Type;\
+  my_if_type ifExpr;\
+  my_then_type thenExpr;\
+  my_else_type elseExpr;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : ifExpr(funcD.ifExpr, t), thenExpr(funcD.thenExpr, t), elseExpr(funcD.elseExpr, t), expr(ifExpr.expr, thenExpr.expr, elseExpr.expr) {}\
+};
+
+SELECTOP(const)  // const TensorSelectOp
+SELECTOP()  // non-const TensorSelectOp
+#undef SELECTOP
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorAssignOp (no functor is passed to expr)
+#define ASSIGN(CVQual)\
+template <typename OrigLHSExpr, typename OrigRHSExpr, typename LHSExpr, typename RHSExpr, typename... Params>\
+struct ExprConstructor<CVQual TensorAssignOp<OrigLHSExpr, OrigRHSExpr>,  CVQual TensorAssignOp<LHSExpr, RHSExpr>, Params...> {\
+  typedef ExprConstructor<OrigLHSExpr, LHSExpr, Params...> my_left_type;\
+  typedef ExprConstructor<OrigRHSExpr, RHSExpr, Params...> my_right_type;\
+  typedef CVQual TensorAssignOp<typename my_left_type::Type, typename my_right_type::Type>  Type;\
+  my_left_type lhsExpr;\
+  my_right_type rhsExpr;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : lhsExpr(funcD.lhsExpr, t), rhsExpr(funcD.rhsExpr, t), expr(lhsExpr.expr, rhsExpr.expr) {}\
+ };
+
+ ASSIGN(const)  // const TensorAssignOp
+ ASSIGN()  // non-const TensorAssignOp
+ #undef ASSIGN
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorEvalToOp: the output buffer pointer comes from tuple element 0
+#define EVALTO(CVQual)\
+template <typename OrigExpr, typename Expr, typename... Params>\
+struct ExprConstructor<CVQual TensorEvalToOp<OrigExpr, MakeGlobalPointer>, CVQual TensorEvalToOp<Expr>, Params...> {\
+  typedef ExprConstructor<OrigExpr, Expr, Params...> my_expr_type;\
+  typedef typename TensorEvalToOp<OrigExpr, MakeGlobalPointer>::PointerType my_buffer_type;\
+  typedef CVQual TensorEvalToOp<typename my_expr_type::Type, MakeGlobalPointer> Type;\
+  my_expr_type nestedExpression;\
+  EvalToLHSConstructor<my_buffer_type, 0, Params...> buffer;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &funcD, const utility::tuple::Tuple<Params...> &t)\
+  : nestedExpression(funcD.rhsExpr, t), buffer(t), expr(buffer.expr, nestedExpression.expr) {}\
+};
+
+EVALTO(const)  // const TensorEvalToOp
+EVALTO()  // non-const TensorEvalToOp
+#undef EVALTO
+
+/// specialisation of the \ref ExprConstructor struct when the node type is
+/// TensorForcedEvalOp: the node is rebuilt as a TensorMap over the N-th accessor's pointer
+#define FORCEDEVAL(CVQual)\
+template <typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorForcedEvalOp<OrigExpr, MakeGlobalPointer>,\
+CVQual PlaceHolder<CVQual TensorForcedEvalOp<DevExpr>, N>, Params...> {\
+  typedef CVQual TensorMap<Tensor<typename TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::Scalar,\
+  TensorForcedEvalOp<DevExpr, MakeGlobalPointer>::NumDimensions, 0, typename TensorForcedEvalOp<DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
+};
+
+FORCEDEVAL(const)  // const TensorForcedEvalOp
+FORCEDEVAL()  // non-const TensorForcedEvalOp
+#undef FORCEDEVAL
+
+template <bool Conds,  size_t X , size_t Y > struct ValueCondition {  // compile-time select: primary template, used when Conds is true
+  static const size_t Res =X;  // Res is X
+};
+template<size_t X, size_t Y> struct ValueCondition<false, X , Y> {  // specialisation for Conds == false
+  static const size_t Res =Y;  // Res is Y
+};
+
+/// specialisation of the \ref ExprConstructor struct when the node type is TensorReductionOp: rebuilt as a TensorMap whose rank is NumDimensions, or 1 when NumDimensions is 0
+#define SYCLREDUCTIONEXPR(CVQual)\
+template <typename OP, typename Dim, typename OrigExpr, typename DevExpr, size_t N, typename... Params>\
+struct ExprConstructor<CVQual TensorReductionOp<OP, Dim, OrigExpr, MakeGlobalPointer>,\
+CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dim, DevExpr>, N>, Params...> {\
+  static const size_t NumIndices= ValueCondition< TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions==0,  1, TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::NumDimensions >::Res;\
+  typedef CVQual TensorMap<Tensor<typename TensorReductionOp<OP, Dim, DevExpr, MakeGlobalPointer>::Scalar,\
+  NumIndices, 0, typename TensorReductionOp<OP, Dim, DevExpr>::Index>, 0, MakeGlobalPointer> Type;\
+  Type expr;\
+  template <typename FuncDetector>\
+  ExprConstructor(FuncDetector &fd, const utility::tuple::Tuple<Params...> &t)\
+  : expr(Type((&(*(utility::tuple::get<N>(t).get_pointer()))), fd.dimensions())) {}\
+};
+
+SYCLREDUCTIONEXPR(const)  // const TensorReductionOp
+SYCLREDUCTIONEXPR()  // non-const TensorReductionOp
+#undef SYCLREDUCTIONEXPR
+
+/// helper that deduces FuncD and Params for \ref ExprConstructor; OrigExpr and IndexExpr must be given explicitly
+template <typename OrigExpr, typename IndexExpr, typename FuncD, typename... Params>
+auto createDeviceExpression(FuncD &funcD, const utility::tuple::Tuple<Params...> &t)
+    -> decltype(ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t)) {
+  return ExprConstructor<OrigExpr, IndexExpr, Params...>(funcD, t);  // returns the constructor object; the rebuilt expression is its expr member
+}
+
+} /// namespace internal
+} /// namespace TensorSycl
+} /// namespace Eigen
+
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXPR_CONSTRUCTOR_HPP
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
new file mode 100644
index 000000000..cc18fcdf9
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h
@@ -0,0 +1,204 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclExtractAccessor.h
+ *
+ * \brief:
+ * ExtractAccessor takes Expression placeHolder expression and the tuple of sycl
+ * buffers as an input. Using pre-order tree traversal, ExtractAccessor
+ * recursively calls itself for its children in the expression tree. The
+ * leaf node in the PlaceHolder expression is nothing but a container preserving
+ * the order of the actual data in the tuple of sycl buffer. By invoking the
+ * extract accessor for the PlaceHolder<N>, an accessor is created for the Nth
+ * buffer in the tuple of buffers. This accessor is then added as an Nth
+ * element in the tuple of accessors. In this case we preserve the order of data
+ * in the expression tree.
+ *
+ * This is the specialisation of extract accessor method for different operation
+ * type in the PlaceHolder expression.
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+/// struct ExtractAccessor: Extract Accessor Class is used to extract the
+/// accessor from a buffer.
+/// Depending on the type of the node we get a read, a write or a
+/// read_write accessor
+template <typename Evaluator>
+struct ExtractAccessor;  // primary template is only declared; it is specialised per TensorEvaluator node type
+
+struct AccessorConstructor{  // static helpers shared by the ExtractAccessor specialisations
+  template<typename Arg> static inline auto getTuple(cl::sycl::handler& cgh, Arg eval)  // one child: forwards to ExtractAccessor<Arg>
+  -> decltype(ExtractAccessor<Arg>::getTuple(cgh, eval)) {
+  return ExtractAccessor<Arg>::getTuple(cgh, eval);
+  }
+
+  template<typename Arg1, typename Arg2> static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1, Arg2 eval2)  // two children: tuples appended in argument order
+  -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2))) {
+    return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1), ExtractAccessor<Arg2>::getTuple(cgh, eval2));
+  }
+  template<typename Arg1, typename Arg2, typename Arg3>	static inline auto getTuple(cl::sycl::handler& cgh, Arg1 eval1 , Arg2 eval2 , Arg3 eval3)  // three children: tuples appended in argument order
+  -> decltype(utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)))) {
+    return utility::tuple::append(ExtractAccessor<Arg1>::getTuple(cgh, eval1),utility::tuple::append(ExtractAccessor<Arg2>::getTuple(cgh, eval2), ExtractAccessor<Arg3>::getTuple(cgh, eval3)));
+  }
+  template< cl::sycl::access::mode AcM, typename Arg> static inline auto getAccessor(cl::sycl::handler& cgh, Arg eval)  // leaf: one-element tuple holding an accessor with mode AcM
+  -> decltype(utility::tuple::make_tuple( eval.device().template get_sycl_accessor<AcM,
+  typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()))){
+    return utility::tuple::make_tuple(eval.device().template get_sycl_accessor<AcM, typename Eigen::internal::remove_all<typename Arg::CoeffReturnType>::type>(eval.dimensions().TotalSize(), cgh,eval.data()));
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp and const TensorBroadcastingOp
+template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> eval)  // recurses into the single child evaluator
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.impl())){
+    return AccessorConstructor::getTuple(cgh, eval.impl());
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseNullaryOp, TensorCwiseUnaryOp and TensorBroadcastingOp
+template <template<class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {};  // non-const node inherits getTuple from the const specialisation
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP,  typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> eval)  // left child's accessors first, then the right child's
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+  }
+};
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP,  typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> >{};  // non-const node inherits getTuple from the const specialisation
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorCwiseTernaryOp
+template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> eval)  // accessors of arg1, arg2, arg3 in that order
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl())){
+    return AccessorConstructor::getTuple(cgh, eval.arg1Impl(), eval.arg2Impl(), eval.arg3Impl());
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorCwiseTernaryOp
+template <template<class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};  // non-const node inherits getTuple from the const specialisation
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// const TensorSelectOp. This is a special case where there is no OP
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> eval)  // accessors of the condition, then and else children in that order
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl())){
+    return AccessorConstructor::getTuple(cgh, eval.cond_impl(), eval.then_impl(), eval.else_impl());
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is
+/// TensorSelectOp. This is a special case where there is no OP
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >{};  // non-const node inherits getTuple from the const specialisation
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorAssignOp
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> eval)  // left-hand side accessors first, then the right-hand side's
+  -> decltype(AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl())){
+    return AccessorConstructor::getTuple(cgh, eval.left_impl(), eval.right_impl());
+ }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorAssignOp
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};  // non-const node inherits getTuple from the const specialisation
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorMap (ACCType is the access mode requested for the leaf)
+#define TENSORMAPEXPR(CVQual, ACCType)\
+template <typename PlainObjectType, int Options_, typename Dev>\
+struct ExtractAccessor<TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> > {\
+  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<CVQual TensorMap<PlainObjectType, Options_>, Dev> eval)\
+  -> decltype(AccessorConstructor::template getAccessor<ACCType>(cgh, eval)){\
+    return AccessorConstructor::template getAccessor<ACCType>(cgh, eval);\
+  }\
+};
+TENSORMAPEXPR(const, cl::sycl::access::mode::read)  // const TensorMap: read accessor
+TENSORMAPEXPR(, cl::sycl::access::mode::read_write)  // non-const TensorMap: read_write accessor
+#undef TENSORMAPEXPR
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorForcedEvalOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> eval)  // treated as a leaf: one read accessor, no recursion into Expr
+  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
+    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorForcedEvalOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorForcedEvalOp<Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorForcedEvalOp<Expr>, Dev> >{};  // non-const node inherits getTuple from the const specialisation
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorEvalToOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh,const TensorEvaluator<const TensorEvalToOp<Expr>, Dev> eval)  // write accessor for the node itself first, then the accessors of eval.impl()
+  -> decltype(utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()))){
+    return utility::tuple::append(AccessorConstructor::template getAccessor<cl::sycl::access::mode::write>(cgh, eval), AccessorConstructor::getTuple(cgh, eval.impl()));
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorEvalToOp
+template <typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorEvalToOp<Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorEvalToOp<Expr>, Dev> >{};  // non-const node inherits getTuple from the const specialisation
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is const TensorReductionOp
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> > {
+  static inline auto getTuple(cl::sycl::handler& cgh, const TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> eval)  // treated as a leaf: one read accessor, no recursion into Expr
+  -> decltype(AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval)){
+    return AccessorConstructor::template getAccessor<cl::sycl::access::mode::read>(cgh, eval);
+  }
+};
+
+/// specialisation of the \ref ExtractAccessor struct when the node type is TensorReductionOp
+template <typename OP, typename Dim, typename Expr, typename Dev>
+struct ExtractAccessor<TensorEvaluator<TensorReductionOp<OP, Dim, Expr>, Dev> >
+: ExtractAccessor<TensorEvaluator<const TensorReductionOp<OP, Dim, Expr>, Dev> >{};  // non-const node inherits getTuple from the const specialisation
+
+/// entry point that deduces the Evaluator type and forwards to \ref ExtractAccessor
+template <typename Evaluator>
+auto createTupleOfAccessors(cl::sycl::handler& cgh, const Evaluator& expr)
+-> decltype(ExtractAccessor<Evaluator>::getTuple(cgh, expr)) {
+  return ExtractAccessor<Evaluator>::getTuple(cgh, expr);  // tuple of accessors for the whole expression tree
+}
+
+} /// namespace internal
+} /// namespace TensorSycl
+} /// namespace Eigen
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_ACCESSOR_HPP
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
new file mode 100644
index 000000000..9edd38ea4
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h
@@ -0,0 +1,177 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclExtractFunctors.h
+ *
+ * \brief:
+ *  Used to extract all the functors allocated to each node of the
+ *  expression tree.
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+/// struct FunctorExtractor: This struct is used to extract the functors
+/// constructed on the host-side, to pack them and to reuse them in the
+/// reconstruction of the expression on the device.
+/// We have to do that because in Eigen the functors are not stateless, so we
+/// cannot re-instantiate them on the device: the already instantiated functors
+/// have to be passed to the device instead.
+/// This primary template is used for leaf nodes (TensorMap) and for nodes
+/// behaving like leaf nodes (TensorForcedEval); it only stores the dimensions.
+template <typename Evaluator> struct FunctorExtractor{
+  typedef typename Evaluator::Dimensions Dimensions;
+  const Dimensions m_dimensions;  // copy of the evaluator's dimensions, taken at construction
+  const Dimensions& dimensions() const { return m_dimensions; }
+  FunctorExtractor(const Evaluator& expr)
+  : m_dimensions(expr.dimensions()) {}
+
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseNullaryOp, const TensorCwiseUnaryOp, and const TensorBroadcastingOp
+template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+  OP func;
+  FunctorExtractor(const TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev>& expr)
+  : rhsExpr(expr.impl()), func(expr.functor()) {}
+};
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, and TensorBroadcastingOp
+template <template <class, class> class UnaryCategory, typename OP, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<UnaryCategory<OP, RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const UnaryCategory<OP, RHSExpr>, Dev> >{};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseBinaryOp
+template <template<class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
+  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+  OP func;
+  FunctorExtractor(const TensorEvaluator<const BinaryCategory<OP, LHSExpr, RHSExpr>, Dev>& expr)
+  : lhsExpr(expr.left_impl()),rhsExpr(expr.right_impl()),func(expr.functor()) {}
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseBinaryOp
+template <template <class, class, class> class BinaryCategory, typename OP, typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<BinaryCategory<OP,  LHSExpr, RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const BinaryCategory<OP,  LHSExpr, RHSExpr>, Dev> >{};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorCwiseTernaryOp
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr,typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<Arg1Expr, Dev> > arg1Expr;
+  FunctorExtractor<TensorEvaluator<Arg2Expr, Dev> > arg2Expr;
+  FunctorExtractor<TensorEvaluator<Arg3Expr, Dev> > arg3Expr;
+  OP func;
+  FunctorExtractor(const TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev>& expr)
+  : arg1Expr(expr.arg1Impl()), arg2Expr(expr.arg2Impl()), arg3Expr(expr.arg3Impl()), func(expr.functor()) {}
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorCwiseTernaryOp
+template <template <class, class, class, class> class TernaryCategory, typename OP, typename Arg1Expr, typename Arg2Expr, typename Arg3Expr, typename Dev>
+struct FunctorExtractor<TensorEvaluator< TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >
+:FunctorExtractor<TensorEvaluator<const TernaryCategory<OP, Arg1Expr, Arg2Expr, Arg3Expr>, Dev> >{};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorSelectOp. This is a specialisation without OP so it has to be separated.
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<IfExpr, Dev> > ifExpr;
+  FunctorExtractor<TensorEvaluator<ThenExpr, Dev> > thenExpr;
+  FunctorExtractor<TensorEvaluator<ElseExpr, Dev> > elseExpr;
+  FunctorExtractor(const TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev>& expr)
+  : ifExpr(expr.cond_impl()), thenExpr(expr.then_impl()), elseExpr(expr.else_impl()) {}
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorSelectOp. This is a specialisation without OP so it has to be separated
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> >
+:FunctorExtractor< TensorEvaluator<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, Dev> > {};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorAssignOp. This is a specialisation without OP so it has to be separated.
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<LHSExpr, Dev> > lhsExpr;
+  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+  FunctorExtractor(const TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev>& expr)
+  : lhsExpr(expr.left_impl()), rhsExpr(expr.right_impl()) {}
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorAssignOp. This is a specialisation without OP so it has to be separated.
+template <typename LHSExpr, typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorAssignOp<LHSExpr, RHSExpr>, Dev> >
+:FunctorExtractor<TensorEvaluator<const TensorAssignOp<LHSExpr, RHSExpr>, Dev> >{};
+
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// const TensorEvalToOp. This is a specialisation without OP so it has to be separated.
+template <typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {
+  FunctorExtractor<TensorEvaluator<RHSExpr, Dev> > rhsExpr;
+  FunctorExtractor(const TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev>& expr)
+  : rhsExpr(expr.impl()) {}
+};
+
+/// specialisation of the \ref FunctorExtractor struct when the node type is
+/// TensorEvalToOp. This is a specialisation without OP so it has to be separated.
+template <typename RHSExpr, typename Dev>
+struct FunctorExtractor<TensorEvaluator<TensorEvalToOp<RHSExpr>, Dev> >
+: FunctorExtractor<TensorEvaluator<const TensorEvalToOp<RHSExpr>, Dev> > {};
+
+template<typename Dim, size_t NumOutputDim> struct DimConstr {  // generic case: the input dimensions are returned as (converted to) Dim
+template<typename InDim>
+  static inline Dim getDim(InDim dims ) {return dims;}
+};
+
+template<typename Dim> struct DimConstr<Dim, 0> {  // NumOutputDim == 0: build a Dim from the single value dims.TotalSize()
+  template<typename InDim>
+    static inline Dim getDim(InDim dims ) {return Dim(dims.TotalSize());}
+};
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{  // const TensorReductionOp: only the dimensions are stored
+  typedef TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Evaluator;
+  typedef typename Eigen::internal::conditional<Evaluator::NumOutputDims==0, DSizes<typename Evaluator::Index, 1>, typename Evaluator::Dimensions >::type Dimensions;  // a one-dimensional DSizes when NumOutputDims==0, the evaluator's own Dimensions otherwise
+  const Dimensions m_dimensions;
+  const Dimensions& dimensions() const { return m_dimensions; }
+  FunctorExtractor(const TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>& expr)
+  : m_dimensions(DimConstr<Dimensions, Evaluator::NumOutputDims>::getDim(expr.dimensions())) {}  // DimConstr selects the conversion matching NumOutputDims
+};
+
+
+template<typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct FunctorExtractor<TensorEvaluator<TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>
+: FunctorExtractor<TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>>{};
+/// template deduction function for FunctorExtractor
+template <typename Evaluator>
+auto inline extractFunctors(const Evaluator& evaluator)-> FunctorExtractor<Evaluator> {
+  return FunctorExtractor<Evaluator>(evaluator);
+}
+}  // namespace internal
+}  // namespace TensorSycl
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_EXTRACT_FUNCTORS_HPP
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
new file mode 100644
index 000000000..25d1fac9b
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h
@@ -0,0 +1,114 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclLeafCount.h
+ *
+ * \brief:
+ *  The leaf count uses a pre-order traversal of the expression tree in order
+ *  to count the number of leaf nodes in the expression
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+/// \brief LeafCount is used to count terminal nodes. The total number of
+/// leaf nodes is used by MakePlaceHolderExprHelper to find the order
+/// of the leaf node in an expression tree at compile time.
+template <typename Expr>
+struct LeafCount;
+
+template<typename... Args> struct CategoryCount;  // sums LeafCount<Arg>::Count over all Args
+
+template<> struct CategoryCount<>  // end of the recursion: no sub-expression, no leaf
+{
+  static const size_t Count =0;
+};
+
+template<typename Arg, typename... Args>
+struct CategoryCount<Arg,Args...>{  // leaves of the first sub-expression plus the leaves of the remaining ones
+  static const size_t Count = LeafCount<Arg>::Count + CategoryCount<Args...>::Count;
+};
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorMap
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> > {
+  static const size_t Count =1;
+};
+
+/// specialisation of the \ref LeafCount struct when the node type is TensorMap
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct LeafCount<TensorMap<PlainObjectType, Options_, MakePointer_> > :LeafCount<const TensorMap<PlainObjectType, Options_, MakePointer_> >{};
+
+// const TensorCwiseUnaryOp, const TensorCwiseNullaryOp, const TensorCwiseBinaryOp, const TensorCwiseTernaryOp, and Const TensorBroadcastingOp
+template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
+struct LeafCount<const CategoryExpr<OP, RHSExpr...> >: CategoryCount<RHSExpr...> {};
+// TensorCwiseUnaryOp,  TensorCwiseNullaryOp,  TensorCwiseBinaryOp,  TensorCwiseTernaryOp, and  TensorBroadcastingOp
+template <template <class, class...> class CategoryExpr, typename OP, typename... RHSExpr>
+struct LeafCount<CategoryExpr<OP, RHSExpr...> > :LeafCount<const CategoryExpr<OP, RHSExpr...> >{};
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorSelectOp. TensorSelectOp is an exception: it has no OP parameter
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>
+struct LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > : CategoryCount<IfExpr, ThenExpr, ElseExpr> {};
+/// specialisation of the \ref LeafCount struct when the node type is TensorSelectOp
+template <typename IfExpr, typename ThenExpr, typename ElseExpr>
+struct LeafCount<TensorSelectOp<IfExpr, ThenExpr, ElseExpr> >: LeafCount<const TensorSelectOp<IfExpr, ThenExpr, ElseExpr> > {};
+
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorAssignOp
+template <typename LHSExpr, typename RHSExpr>
+struct LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >: CategoryCount<LHSExpr,RHSExpr> {};
+
+/// specialisation of the \ref LeafCount struct when the node type is
+/// TensorAssignOp is an exception. It is not the same as Unary
+template <typename LHSExpr, typename RHSExpr>
+struct LeafCount<TensorAssignOp<LHSExpr, RHSExpr> > :LeafCount<const TensorAssignOp<LHSExpr, RHSExpr> >{};
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorForcedEvalOp
+template <typename Expr>
+struct LeafCount<const TensorForcedEvalOp<Expr> > {
+    static const size_t Count =1;
+};
+
+/// specialisation of the \ref LeafCount struct when the node type is TensorForcedEvalOp
+template <typename Expr>
+struct LeafCount<TensorForcedEvalOp<Expr> >: LeafCount<const TensorForcedEvalOp<Expr> > {};
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorEvalToOp
+template <typename Expr>
+struct LeafCount<const TensorEvalToOp<Expr> > {
+  static const size_t Count = 1 + CategoryCount<Expr>::Count;
+};
+
+/// specialisation of the \ref LeafCount struct when the node type is const TensorReductionOp
+template <typename OP, typename Dim, typename Expr>
+struct LeafCount<const TensorReductionOp<OP, Dim, Expr> > {
+    static const size_t Count =1;
+};
+
+/// specialisation of the \ref LeafCount struct when the node type is TensorReductionOp
+template <typename OP, typename Dim, typename Expr>
+struct LeafCount<TensorReductionOp<OP, Dim, Expr> >: LeafCount<const TensorReductionOp<OP, Dim, Expr> >{};
+
+/// specialisation of the \ref LeafCount struct when the node type is TensorEvalToOp
+template <typename Expr>
+struct LeafCount<TensorEvalToOp<Expr> >: LeafCount<const TensorEvalToOp<Expr> >{};
+
+} /// namespace internal
+} /// namespace TensorSycl
+} /// namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_LEAF_COUNT_HPP
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
new file mode 100644
index 000000000..d4c250c6d
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h
@@ -0,0 +1,181 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclPlaceHolderExpr.h
+ *
+ * \brief:
+ *  This is the specialisation of the placeholder expression based on the
+ * operation type
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+/// \struct PlaceHolder
+/// \brief PlaceHolder is used to replace the \ref TensorMap in the expression
+/// tree.
+/// PlaceHolder contains the order of the leaf node in the expression tree.
+template <typename Scalar, size_t N>
+struct PlaceHolder {
+  static constexpr size_t I = N;
+  typedef Scalar Type;
+};
+
+/// \struct PlaceHolderExpression
+/// \brief it is used to create the PlaceHolder expression. The PlaceHolder
+/// expression is a copy of the expression type in which each leaf node (e.g. a
+/// TensorMap) has been replaced with a PlaceHolder.
+template <typename Expr, size_t N>
+struct PlaceHolderExpression;
+
+template<size_t N, typename... Args>
+struct CalculateIndex;  // builds the placeholder types of the sub-expressions Args; N is the index handed to the right-most sub-expression
+
+template<size_t N, typename Arg>
+struct CalculateIndex<N, Arg>{  // single sub-expression: it receives N unchanged
+  typedef typename PlaceHolderExpression<Arg, N>::Type ArgType;
+  typedef utility::tuple::Tuple<ArgType> ArgsTuple;
+};
+
+template<size_t N, typename Arg1, typename Arg2>
+struct CalculateIndex<N, Arg1, Arg2>{  // two sub-expressions
+  static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
+  typedef typename PlaceHolderExpression<Arg1, N - Arg2LeafCount>::Type Arg1Type;  // skip the indices taken by the leaves of Arg2
+  typedef typename PlaceHolderExpression<Arg2, N>::Type Arg2Type;
+  typedef utility::tuple::Tuple<Arg1Type, Arg2Type> ArgsTuple;
+};
+
+template<size_t N, typename Arg1, typename Arg2, typename Arg3>
+struct CalculateIndex<N, Arg1, Arg2, Arg3> {  // three sub-expressions
+  static const size_t Arg3LeafCount = LeafCount<Arg3>::Count;
+  static const size_t Arg2LeafCount = LeafCount<Arg2>::Count;
+  typedef typename PlaceHolderExpression<Arg1, N - Arg3LeafCount - Arg2LeafCount>::Type Arg1Type;  // skip the leaves of Arg3 and Arg2
+  typedef typename PlaceHolderExpression<Arg2, N - Arg3LeafCount>::Type Arg2Type;  // skip the leaves of Arg3
+  typedef typename PlaceHolderExpression<Arg3, N>::Type Arg3Type;
+  typedef utility::tuple::Tuple<Arg1Type, Arg2Type, Arg3Type> ArgsTuple;
+};
+
+template<template<class...> class Category , class OP, class TPL>
+struct CategoryHelper;  // rebuilds a Category<...> node type from an OP and a Tuple of sub-expression types
+
+template<template<class...> class Category , class OP, class ...T >
+struct CategoryHelper<Category, OP, utility::tuple::Tuple<T...> > {
+  typedef Category<OP, T... > Type;  // OP stays the first template argument, followed by the tuple's element types
+};
+
+template<template<class...> class Category , class ...T >
+struct CategoryHelper<Category, NoOP, utility::tuple::Tuple<T...> > {
+  typedef Category<T... > Type;  // NoOP: the node type has no OP argument (used below for TensorSelectOp and TensorAssignOp)
+};
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorCwiseNullaryOp, TensorCwiseUnaryOp, TensorBroadcastingOp, TensorCwiseBinaryOp,  TensorCwiseTernaryOp
+#define OPEXPRCATEGORY(CVQual)\
+template <template <class, class... > class Category, typename OP, typename... SubExpr, size_t N>\
+struct PlaceHolderExpression<CVQual Category<OP, SubExpr...>, N>{\
+  typedef CVQual typename CategoryHelper<Category, OP, typename CalculateIndex<N, SubExpr...>::ArgsTuple>::Type Type;\
+};
+
+OPEXPRCATEGORY(const)
+OPEXPRCATEGORY()
+#undef OPEXPRCATEGORY
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorCwiseSelectOp
+#define SELECTEXPR(CVQual)\
+template <typename IfExpr, typename ThenExpr, typename ElseExpr, size_t N>\
+struct PlaceHolderExpression<CVQual TensorSelectOp<IfExpr, ThenExpr, ElseExpr>, N> {\
+  typedef CVQual typename CategoryHelper<TensorSelectOp, NoOP, typename CalculateIndex<N, IfExpr, ThenExpr, ElseExpr>::ArgsTuple>::Type Type;\
+};
+
+SELECTEXPR(const)
+SELECTEXPR()
+#undef SELECTEXPR
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorAssignOp
+#define ASSIGNEXPR(CVQual)\
+template <typename LHSExpr, typename RHSExpr, size_t N>\
+struct PlaceHolderExpression<CVQual TensorAssignOp<LHSExpr, RHSExpr>, N> {\
+  typedef CVQual typename CategoryHelper<TensorAssignOp, NoOP, typename CalculateIndex<N, LHSExpr, RHSExpr>::ArgsTuple>::Type Type;\
+};
+
+ASSIGNEXPR(const)
+ASSIGNEXPR()
+#undef ASSIGNEXPR
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorMap
+#define TENSORMAPEXPR(CVQual)\
+template <typename Scalar_, int Options_, int Options2_, int NumIndices_, typename IndexType_, template <class> class MakePointer_, size_t N>\
+struct PlaceHolderExpression< CVQual TensorMap< Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> {\
+  typedef CVQual PlaceHolder<CVQual TensorMap<Tensor<Scalar_, NumIndices_, Options_, IndexType_>, Options2_, MakePointer_>, N> Type;\
+};
+
+TENSORMAPEXPR(const)
+TENSORMAPEXPR()
+#undef TENSORMAPEXPR
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorForcedEvalOp
+#define FORCEDEVAL(CVQual)\
+template <typename Expr, size_t N>\
+struct PlaceHolderExpression<CVQual TensorForcedEvalOp<Expr>, N> {\
+  typedef CVQual PlaceHolder<CVQual TensorForcedEvalOp<Expr>, N> Type;\
+};
+
+FORCEDEVAL(const)
+FORCEDEVAL()
+#undef FORCEDEVAL
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorEvalToOp
+#define EVALTO(CVQual)\
+template <typename Expr, size_t N>\
+struct PlaceHolderExpression<CVQual TensorEvalToOp<Expr>, N> {\
+  typedef CVQual TensorEvalToOp<typename CalculateIndex <N, Expr>::ArgType> Type;\
+};
+
+EVALTO(const)
+EVALTO()
+#undef EVALTO
+
+
+/// specialisation of the \ref PlaceHolderExpression when the node is
+/// TensorReductionOp
+#define SYCLREDUCTION(CVQual)\
+template <typename OP, typename Dims, typename Expr, size_t N>\
+struct PlaceHolderExpression<CVQual TensorReductionOp<OP, Dims, Expr>, N>{\
+  typedef CVQual PlaceHolder<CVQual TensorReductionOp<OP, Dims,Expr>, N> Type;\
+};
+SYCLREDUCTION(const)
+SYCLREDUCTION()
+#undef SYCLREDUCTION
+
+/// entry point computing the \ref PlaceHolderExpression type of a whole expression
+template <typename Expr>
+struct createPlaceHolderExpression {
+  static const size_t TotalLeaves = LeafCount<Expr>::Count;
+  typedef typename PlaceHolderExpression<Expr, TotalLeaves - 1>::Type Type;  // start from the last leaf index; CalculateIndex hands smaller indices to sub-expressions on the left
+};
+
+}  // internal
+}  // TensorSycl
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_PLACEHOLDER_EXPR_HPP
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
new file mode 100644
index 000000000..7914b6fad
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h
@@ -0,0 +1,70 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Cummins Chris PhD student at The University of Edinburgh.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclRun.h
+ *
+ * \brief:
+ * Schedule_kernel invokes a specialised version of the kernel struct. The
+ * specialisation is based on the data dimension in the sycl buffer
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
+
+namespace Eigen {
+namespace TensorSycl {
+/// The run function in tensor sycl converts the expression tree to a buffer
+/// based expression tree;
+/// creates the expression tree for the device with accessors to buffers;
+/// constructs the kernel and submits it to the sycl queue.
+template <typename Expr, typename Dev>
+void run(Expr &expr, Dev &dev) {
+  Eigen::TensorEvaluator<Expr, Dev> evaluator(expr, dev);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+  if (needs_assign) {
+    typedef  typename internal::createPlaceHolderExpression<Expr>::Type PlaceHolderExpr;
+    auto functors = internal::extractFunctors(evaluator);
+    // work-group size: half of the device maximum, but never 0 (a maximum of 1 would otherwise make the modulo below divide by zero)
+    size_t tileSize =dev.m_queue.get_device(). template get_info<cl::sycl::info::device::max_work_group_size>()/2;
+    if (tileSize == 0) tileSize = 1;
+    dev.m_queue.submit([&](cl::sycl::handler &cgh) {
+      // create a tuple of accessors from Evaluator
+      auto tuple_of_accessors = internal::createTupleOfAccessors<decltype(evaluator)>(cgh, evaluator);
+      const auto range = utility::tuple::get<0>(tuple_of_accessors).get_range()[0];
+      size_t GRange=range;
+      if (tileSize>GRange) tileSize=GRange;  // fewer items than one work-group: shrink it. NOTE(review): a range of 0 still gives tileSize == 0 here — confirm empty ranges never reach this point
+      else if(GRange>tileSize){
+        size_t xMode = GRange % tileSize;
+        if (xMode != 0) GRange += (tileSize - xMode);  // round the global range up to a multiple of the work-group size
+      }
+      // run the kernel
+      cgh.parallel_for<PlaceHolderExpr>( cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), [=](cl::sycl::nd_item<1> itemID) {
+        typedef  typename internal::ConvertToDeviceExpression<Expr>::Type DevExpr;
+        auto device_expr =internal::createDeviceExpression<DevExpr, PlaceHolderExpr>(functors, tuple_of_accessors);
+        auto device_evaluator = Eigen::TensorEvaluator<decltype(device_expr.expr), Eigen::DefaultDevice>(device_expr.expr, Eigen::DefaultDevice());
+        if (itemID.get_global_linear_id() < range) {  // work-items added by the rounding above do nothing
+          device_evaluator.evalScalar(static_cast<int>(itemID.get_global_linear_id()));
+        }
+      });
+    });
+    dev.m_queue.throw_asynchronous();
+  }
+
+  evaluator.cleanup();
+}
+}  // namespace TensorSycl
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_SYCLRUN_HPP
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
new file mode 100644
index 000000000..83915f31a
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h
@@ -0,0 +1,237 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorSyclTuple.h
+ *
+ * \brief:
+ *  Minimal implementation of std::tuple that can be used inside a SYCL kernel.
+ *
+*****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
+namespace utility {
+namespace tuple {
+/// \struct StaticIf
+/// \brief The StaticIf struct is used to statically choose the type based on the
+/// condition.
+template <bool, typename T = void> struct StaticIf;
+/// \brief specialisation of the \ref StaticIf when the condition is true
+template <typename T>
+struct StaticIf<true, T> {
+  typedef T type;
+};
+
+/// \struct Tuple
+/// \brief is a fixed-size collection of heterogeneous values
+/// \tparam Ts...	-	the types of the elements that the tuple stores.
+/// Empty list is supported.
+template <class... Ts>
+struct Tuple {};
+
+/// \brief specialisation of the \ref Tuple class when the tuple has at least
+/// one element.
+/// \tparam T : the type of the first element in the tuple.
+/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
+template <class T, class... Ts>
+struct Tuple<T, Ts...> {
+  Tuple(T t, Ts... ts) : head(t), tail(ts...) {}
+  T head;
+  Tuple<Ts...> tail;
+};
+
+/// \struct ElemTypeHolder
+/// \brief ElemTypeHolder class is used to specify the types of the
+/// elements inside the tuple
+/// \tparam size_t the index of the element inside the tuple
+/// \tparam class the tuple class
+template <size_t, class>
+struct ElemTypeHolder;
+
+/// \brief specialisation of the \ref ElemTypeHolder class when the index of
+/// the requested element is 0, i.e. the first element of the tuple
+template <class T, class... Ts>
+struct ElemTypeHolder<0, Tuple<T, Ts...> > {
+  typedef T type;
+};
+
+/// \brief specialisation of the \ref ElemTypeHolder class when the index of
+/// the requested element is bigger than 0. It recursively calls itself on the
+/// tail of the tuple until the index reaches 0
+/// \tparam T : the type of the first element in the tuple.
+/// \tparam Ts... the rest of the elements in the tuple. Ts... can be empty.
+/// \tparam K is the Kth element in the tuple
+template <size_t k, class T, class... Ts>
+struct ElemTypeHolder<k, Tuple<T, Ts...> > {
+  typedef typename ElemTypeHolder<k - 1, Tuple<Ts...> >::type type;
+};
+
+/// get
+/// \brief Extracts the first element from the tuple (terminating case k == 0).
+/// K=0 represents the first element of the tuple. The tuple cannot be empty.
+/// \tparam Ts... are the type of the elements in the tuple.
+/// \param t is the tuple whose contents to extract
+/// \return a (const) reference of type ElemTypeHolder<0, Tuple<Ts...> >::type &
+
+#define TERMINATE_CONDS_TUPLE_GET(CVQual) \
+template <size_t k, class... Ts> \
+typename StaticIf<k == 0, CVQual typename ElemTypeHolder<0, Tuple<Ts...> >::type &>::type \
+get(CVQual Tuple<Ts...> &t) { \
+  static_assert(sizeof...(Ts)!=0, "The requested value is bigger than the size of the tuple"); \
+  return t.head; \
+}
+
+TERMINATE_CONDS_TUPLE_GET(const)
+TERMINATE_CONDS_TUPLE_GET()
+#undef TERMINATE_CONDS_TUPLE_GET
+/// get
+/// \brief Extracts the Kth element from the tuple.
+///\tparam K is an integer value in [0,sizeof...(Types)).
+/// \tparam T is the (sizeof...(Types) -(K+1)) element in the tuple
+/// \tparam Ts... are the type of the elements  in the tuple.
+/// \param t is the tuple whose contents to extract
+/// \return  typename ElemTypeHolder<K, Tuple<Ts...> >::type &>::type
+#define RECURSIVE_TUPLE_GET(CVQual) \
+template <size_t k, class T, class... Ts> \
+typename StaticIf<k != 0, CVQual typename ElemTypeHolder<k, Tuple<T, Ts...> >::type &>::type \
+get(CVQual Tuple<T, Ts...> &t) { \
+  return utility::tuple::get<k - 1>(t.tail); \
+}
+RECURSIVE_TUPLE_GET(const)
+RECURSIVE_TUPLE_GET()
+#undef RECURSIVE_TUPLE_GET
+
+/// make_tuple
+/// \brief Creates a tuple object, deducing the target type from the types of
+/// arguments.
+/// \tparam Args the type of the arguments to construct the tuple from
+/// \param args zero or more arguments to construct the tuple from
+/// \return Tuple<Args...>
+template <typename... Args>
+Tuple<Args...> make_tuple(Args... args) {
+  return Tuple<Args...>(args...);
+}
+
+/// size
+/// \brief Provides the number of elements in a tuple as a compile-time
+/// constant expression; const tuples and temporaries are accepted as well.
+/// \tparam Args the types of the elements stored in the tuple
+/// \return size_t
+template <typename... Args>
+static constexpr size_t size(const Tuple<Args...> &) {
+  return sizeof...(Args);
+}
+
+/// \struct IndexList
+/// \brief Creates a list of index from the elements in the tuple
+/// \tparam Is... a list of index from [0 to sizeof...(tuple elements))
+template <size_t... Is>
+struct IndexList {};
+
+/// \struct RangeBuilder
+/// \brief Collects internal details for generating index ranges [MIN, MAX).
+/// Forward declaration of the primary template for the index range builder.
+/// \tparam MIN is the starting index in the tuple
+/// \tparam N is the current upper bound; it is decremented at each step until it equals MIN
+/// \tparam Is... is the list of indices generated so far
+template <size_t MIN, size_t N, size_t... Is>
+struct RangeBuilder;
+
+// FIXME Doxygen has problems with recursive inheritance
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+/// \brief Base step: specialisation of \ref RangeBuilder selected when
+/// N==MIN. At this point Is... is the finished index list, exposed as \c type.
+/// \tparam MIN is the starting index of the tuple
+/// \tparam Is is the complete list of generated indices
+template <size_t MIN, size_t... Is>
+struct RangeBuilder<MIN, MIN, Is...> {
+  typedef IndexList<Is...> type;
+};
+
+/// Induction step: primary definition of the RangeBuilder class, used when N!=MIN.
+/// Each step inherits from the builder with N decremented by one and with the
+/// index N-1 prepended to the Is... list, until MIN==N.
+/// \tparam MIN is the starting index in the tuple
+/// \tparam N is the current (exclusive) upper bound of the indices still to be generated
+/// \tparam Is... is the list of indices generated so far
+template <size_t MIN, size_t N, size_t... Is>
+struct RangeBuilder : public RangeBuilder<MIN, N - 1, N - 1, Is...> {};
+#endif // EIGEN_PARSED_BY_DOXYGEN
+
+/// \brief IndexRange derives from the IndexList built by RangeBuilder<MIN, MAX>, i.e. the index range [MIN, MAX)
+/// \tparam MIN is the starting index in the tuple
+/// \tparam MAX is the exclusive end index (the size of the tuple when the whole tuple is indexed)
+template <size_t MIN, size_t MAX>
+struct IndexRange: RangeBuilder<MIN, MAX>::type {};
+
+/// append_base
+/// \brief Unpacks the elements of the input tuple t and creates a new tuple
+/// by adding element a at the end of it.
+/// \tparam Args... the types of the elements inside the tuple t
+/// \tparam T the type of the new element to be added at the end of the tuple
+/// \tparam I... is the list of indices used to unpack t, e.g. [0 to sizeof...(Args))
+/// \param t the tuple to which we want to append a.
+/// \param a the new element to be added to the tuple
+/// \return Tuple<Args..., T>
+template <typename... Args, typename T, size_t... I>
+Tuple<Args..., T> append_base(Tuple<Args...> t, T a,IndexList<I...>) {
+  return utility::tuple::make_tuple(get<I>(t)..., a);
+}
+
+/// append
+/// \brief The deduction function for \ref append_base that automatically
+/// generates the \ref IndexRange <0, sizeof...(Args)> covering the tuple t.
+/// \tparam Args... the types of the elements inside the tuple t
+/// \tparam T the type of the new element to be added at the end of the tuple
+/// \param t the tuple to which we want to append a.
+/// \param a the new element to be added to the tuple
+/// \return Tuple<Args..., T>
+template <typename... Args, typename T>
+Tuple<Args..., T> append(Tuple<Args...> t, T a) {
+  return utility::tuple::append_base(t, a,  IndexRange<0, sizeof...(Args)>());
+}
+
+/// append_base
+/// \brief Overload of \ref append_base used when we want to
+/// concatenate
+/// tuple t2 at the end of the tuple t1. Here we unpack both tuples using the
+/// index lists supplied for each of them and create an output tuple that
+/// contains the elements of t1 followed by the elements of t2.
+/// \tparam Args1... the types of the elements inside the tuple t1
+/// \tparam Args2... the types of the elements inside the tuple t2
+/// \tparam I1... is the list of indices used to unpack t1, e.g. [0 to sizeof...(Args1))
+/// \tparam I2... is the list of indices used to unpack t2, e.g. [0 to sizeof...(Args2))
+/// \param t1 is the tuple to which we want to append t2.
+/// \param t2 is the tuple that is going to be added to t1.
+/// \return Tuple<Args1..., Args2...>
+template <typename... Args1, typename... Args2, size_t... I1, size_t... I2>
+Tuple<Args1..., Args2...> append_base(Tuple<Args1...> t1, Tuple<Args2...> t2, IndexList<I1...>, IndexList<I2...>) {
+  return utility::tuple::make_tuple(get<I1>(t1)...,get<I2>(t2)...);
+}
+
+/// append
+/// \brief Deduction function for \ref append_base when we are appending tuple
+/// t2 to tuple t1. In this case the \ref IndexRange for both tuples is
+/// automatically generated.
+/// \tparam Args1... the types of the elements inside the tuple t1
+/// \tparam Args2... the types of the elements inside the tuple t2
+/// \param t1 is the tuple to which we want to append t2.
+/// \param t2 is the tuple that is going to be added to t1.
+/// \return Tuple<Args1..., Args2...>
+template <typename... Args1, typename... Args2>
+Tuple<Args1..., Args2...> append(Tuple<Args1...> t1,Tuple<Args2...> t2) {
+  return utility::tuple::append_base(t1, t2, IndexRange<0, sizeof...(Args1)>(), IndexRange<0, sizeof...(Args2)>());
+}
+}  // tuple
+}  // utility
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSORSYCL_TUPLE_HPP
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
deleted file mode 100644
index 24d22c189..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
+++ /dev/null
@@ -1,303 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
-// Copyright (C) 2017 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
-#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
-
-namespace Eigen {
-
-/** \class TensorTrace
-  * \ingroup CXX11_Tensor_Module
-  *
-  * \brief Tensor Trace class.
-  *
-  *
-  */
-
-namespace internal {
-template<typename Dims, typename XprType>
-struct traits<TensorTraceOp<Dims, XprType> > : public traits<XprType>
-{
-  typedef typename XprType::Scalar Scalar;
-  typedef traits<XprType> XprTraits;
-  typedef typename XprTraits::StorageKind StorageKind;
-  typedef typename XprTraits::Index Index;
-  typedef typename XprType::Nested Nested;
-  typedef typename remove_reference<Nested>::type _Nested;
-  static const int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
-  static const int Layout = XprTraits::Layout;
-};
-
-template<typename Dims, typename XprType>
-struct eval<TensorTraceOp<Dims, XprType>, Eigen::Dense>
-{
-  typedef const TensorTraceOp<Dims, XprType>& type;
-};
-
-template<typename Dims, typename XprType>
-struct nested<TensorTraceOp<Dims, XprType>, 1, typename eval<TensorTraceOp<Dims, XprType> >::type>
-{
-  typedef TensorTraceOp<Dims, XprType> type;
-};
-
-} // end namespace internal
-
-
-template<typename Dims, typename XprType>
-class TensorTraceOp : public TensorBase<TensorTraceOp<Dims, XprType> >
-{
-  public:
-    typedef typename Eigen::internal::traits<TensorTraceOp>::Scalar Scalar;
-    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
-    typedef typename XprType::CoeffReturnType CoeffReturnType;
-    typedef typename Eigen::internal::nested<TensorTraceOp>::type Nested;
-    typedef typename Eigen::internal::traits<TensorTraceOp>::StorageKind StorageKind;
-    typedef typename Eigen::internal::traits<TensorTraceOp>::Index Index;
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims)
-      : m_xpr(expr), m_dims(dims) {
-    }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const Dims& dims() const { return m_dims; }
-
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const typename internal::remove_all<typename XprType::Nested>::type& expression() const { return m_xpr; }
-
-  protected:
-    typename XprType::Nested m_xpr;
-    const Dims m_dims;
-};
-
-
-// Eval as rvalue
-template<typename Dims, typename ArgType, typename Device>
-struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device>
-{
-  typedef TensorTraceOp<Dims, ArgType> XprType;
-  static const int NumInputDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
-  static const int NumReducedDims = internal::array_size<Dims>::value;
-  static const int NumOutputDims = NumInputDims - NumReducedDims;
-  typedef typename XprType::Index Index;
-  typedef DSizes<Index, NumOutputDims> Dimensions;
-  typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::CoeffReturnType CoeffReturnType;
-  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
-
-  enum {
-    IsAligned = false,
-    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
-    BlockAccess = false,
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
-    Layout = TensorEvaluator<ArgType, Device>::Layout,
-    CoordAccess = false,
-    RawAccess = false
-  };
-
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
-    : m_impl(op.expression(), device), m_traceDim(1), m_device(device)
-  {
-
-    EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)), YOU_MADE_A_PROGRAMMING_MISTAKE);
-
-    for (int i = 0; i < NumInputDims; ++i) {
-      m_reduced[i] = false;
-    }
-
-    const Dims& op_dims = op.dims();
-    for (int i = 0; i < NumReducedDims; ++i) {
-      eigen_assert(op_dims[i] >= 0);
-      eigen_assert(op_dims[i] < NumInputDims);
-      m_reduced[op_dims[i]] = true;
-    }
-
-    // All the dimensions should be distinct to compute the trace
-    int num_distinct_reduce_dims = 0;
-    for (int i = 0; i < NumInputDims; ++i) {
-      if (m_reduced[i]) {
-        ++num_distinct_reduce_dims;
-      }
-    }
-
-    eigen_assert(num_distinct_reduce_dims == NumReducedDims);
-
-    // Compute the dimensions of the result.
-    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
-
-    int output_index = 0;
-    int reduced_index = 0;
-    for (int i = 0; i < NumInputDims; ++i) {
-      if (m_reduced[i]) {
-        m_reducedDims[reduced_index] = input_dims[i];
-        if (reduced_index > 0) {
-          // All the trace dimensions must have the same size
-          eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]);
-        }
-        ++reduced_index;
-      }
-      else {
-        m_dimensions[output_index] = input_dims[i];
-        ++output_index;
-      }
-    }
-
-    if (NumReducedDims != 0) {
-      m_traceDim = m_reducedDims[0];
-    }
-
-    // Compute the output strides
-    if (NumOutputDims > 0) {
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        m_outputStrides[0] = 1;
-        for (int i = 1; i < NumOutputDims; ++i) {
-          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
-        }
-      }
-      else {
-        m_outputStrides.back() = 1;
-        for (int i = NumOutputDims - 2; i >= 0; --i) {
-          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
-        }
-      }
-    }
-
-    // Compute the input strides
-    if (NumInputDims > 0) {
-      array<Index, NumInputDims> input_strides;
-      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-        input_strides[0] = 1;
-        for (int i = 1; i < NumInputDims; ++i) {
-          input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
-        }
-      }
-      else {
-        input_strides.back() = 1;
-        for (int i = NumInputDims - 2; i >= 0; --i) {
-          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
-        }
-      }
-
-      output_index = 0;
-      reduced_index = 0;
-      for (int i = 0; i < NumInputDims; ++i) {
-        if(m_reduced[i]) {
-          m_reducedStrides[reduced_index] = input_strides[i];
-          ++reduced_index;
-        }
-        else {
-          m_preservedStrides[output_index] = input_strides[i];
-          ++output_index;
-        }
-      }
-    }
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
-    return m_dimensions;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
-    m_impl.evalSubExprsIfNeeded(NULL);
-    return true;
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
-    m_impl.cleanup();
-  }
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
-  {
-    // Initialize the result
-    CoeffReturnType result = internal::cast<int, CoeffReturnType>(0);
-    Index index_stride = 0;
-    for (int i = 0; i < NumReducedDims; ++i) {
-      index_stride += m_reducedStrides[i];
-    }
-
-    // If trace is requested along all dimensions, starting index would be 0
-    Index cur_index = 0;
-    if (NumOutputDims != 0)
-      cur_index = firstInput(index);
-    for (Index i = 0; i < m_traceDim; ++i) {
-        result += m_impl.coeff(cur_index);
-        cur_index += index_stride;
-    }
-
-    return result;
-  }
-
-  template<int LoadMode>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
-
-    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
-    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
-
-    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    for (int i = 0; i < PacketSize; ++i) {
-        values[i] = coeff(index + i);
-    }
-    PacketReturnType result = internal::ploadt<PacketReturnType, LoadMode>(values);
-    return result;
-  }
-
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
-
- protected:
-  // Given the output index, finds the first index in the input tensor used to compute the trace
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
-    Index startInput = 0;
-    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
-      for (int i = NumOutputDims - 1; i > 0; --i) {
-        const Index idx = index / m_outputStrides[i];
-        startInput += idx * m_preservedStrides[i];
-        index -= idx * m_outputStrides[i];
-      }
-      startInput += index * m_preservedStrides[0];
-    }
-    else {
-      for (int i = 0; i < NumOutputDims - 1; ++i) {
-        const Index idx = index / m_outputStrides[i];
-        startInput += idx * m_preservedStrides[i];
-        index -= idx * m_outputStrides[i];
-      }
-      startInput += index * m_preservedStrides[NumOutputDims - 1];
-    }
-    return startInput;
-  }
-
-  Dimensions m_dimensions;
-  TensorEvaluator<ArgType, Device> m_impl;
-  // Initialize the size of the trace dimension
-  Index m_traceDim;
-  const Device EIGEN_DEVICE_REF m_device;
-  array<bool, NumInputDims> m_reduced;
-  array<Index, NumReducedDims> m_reducedDims;
-  array<Index, NumOutputDims> m_outputStrides;
-  array<Index, NumReducedDims> m_reducedStrides;
-  array<Index, NumOutputDims> m_preservedStrides;
-};
-
-
-} // End namespace Eigen
-
-#endif // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
index 4f7fd340e..ffcf8b00f 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -59,7 +59,6 @@ struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
   template <typename T> struct MakePointer {
     typedef T* Type;
   };
-  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
 
@@ -78,7 +77,6 @@ struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> >
   template <typename T> struct MakePointer {
     typedef T* Type;
   };
-  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
 
@@ -101,7 +99,6 @@ struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> >
     typedef MakePointer_<T> MakePointerT;
     typedef typename MakePointerT::Type Type;
   };
-  typedef typename MakePointer<Scalar>::Type PointerType;
 };
 
 template<typename PlainObjectType>
@@ -118,56 +115,55 @@ struct traits<TensorRef<PlainObjectType> >
     Options = BaseTraits::Options,
     Flags = BaseTraits::Flags
   };
-  typedef typename BaseTraits::PointerType PointerType;
 };
 
 
 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
 };
 
 template<typename _Scalar, int NumIndices_, int Options, typename IndexType_>
 struct eval<const Tensor<_Scalar, NumIndices_, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>EIGEN_DEVICE_REF type;
+  typedef const Tensor<_Scalar, NumIndices_, Options, IndexType_>& type;
 };
 
 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
 };
 
 template<typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense>
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
 };
 
 template<typename PlainObjectType, int Options, template <class> class MakePointer>
 struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
 };
 
 template<typename PlainObjectType, int Options, template <class> class MakePointer>
 struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense>
 {
-  typedef const TensorMap<PlainObjectType, Options, MakePointer>EIGEN_DEVICE_REF type;
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
 };
 
 template<typename PlainObjectType>
 struct eval<TensorRef<PlainObjectType>, Eigen::Dense>
 {
-  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
+  typedef const TensorRef<PlainObjectType>& type;
 };
 
 template<typename PlainObjectType>
 struct eval<const TensorRef<PlainObjectType>, Eigen::Dense>
 {
-  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
+  typedef const TensorRef<PlainObjectType>& type;
 };
 
 // TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
@@ -179,38 +175,50 @@ template<typename T, int n=1, typename PlainObject = void> struct nested
 template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
 struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 {
-  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
 };
 
 template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
 struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> >
 {
-  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>EIGEN_DEVICE_REF type;
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_>& type;
 };
 
 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
 };
 
 template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
 struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> >
 {
-  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>EIGEN_DEVICE_REF type;
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>& type;
 };
 
 
+template <typename PlainObjectType, int Options, template <class> class MakePointer>
+struct nested<TensorMap<PlainObjectType, Options, MakePointer> >
+{
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+};
+
+template <typename PlainObjectType, int Options, template <class> class MakePointer>
+struct nested<const TensorMap<PlainObjectType, Options, MakePointer> >
+{
+  typedef const TensorMap<PlainObjectType, Options, MakePointer>& type;
+};
+
 template <typename PlainObjectType>
 struct nested<TensorRef<PlainObjectType> >
 {
-  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
+  typedef const TensorRef<PlainObjectType>& type;
 };
 
 template <typename PlainObjectType>
 struct nested<const TensorRef<PlainObjectType> >
 {
-  typedef const TensorRef<PlainObjectType>EIGEN_DEVICE_REF type;
+  typedef const TensorRef<PlainObjectType>& type;
 };
 
 }  // end namespace internal
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
index d23f2e4c8..3523e7c94 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -23,7 +23,6 @@ struct static_val {
 
   template <typename T>
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
-    EIGEN_UNUSED_VARIABLE(v);
     eigen_assert(v == n);
   }
 };
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
index 000ed5b41..0ca2cac84 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -22,7 +22,6 @@ namespace Eigen {
   * dimensions.
   */
 namespace internal {
-
 template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
 struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType>
 {
@@ -34,8 +33,6 @@ struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits
   typedef typename remove_reference<Nested>::type _Nested;
   static const int NumDimensions = XprTraits::NumDimensions + 1;
   static const int Layout = XprTraits::Layout;
-  typedef typename XprTraits::PointerType PointerType;
-
 };
 
 template<DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
@@ -68,12 +65,12 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows,
                                                             DenseIndex in_plane_strides, DenseIndex in_row_strides, DenseIndex in_col_strides,
                                                             DenseIndex plane_inflate_strides, DenseIndex row_inflate_strides, DenseIndex col_inflate_strides,
                                                             PaddingType padding_type, Scalar padding_value)
-                                                            : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-                                                            m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
-                                                            m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-                                                            m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-                                                            m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
-                                                            m_padding_type(padding_type), m_padding_value(padding_value) {}
+      : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+        m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+        m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+        m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(false), m_padding_top_z(0), m_padding_bottom_z(0), m_padding_top(0), m_padding_bottom(0), m_padding_left(0), m_padding_right(0),
+        m_padding_type(padding_type), m_padding_value(padding_value) {}
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
                                                            DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides,
@@ -83,13 +80,13 @@ class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows,
                                                            DenseIndex padding_top, DenseIndex padding_bottom,
                                                            DenseIndex padding_left, DenseIndex padding_right,
                                                            Scalar padding_value)
-                                                           : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
-                                                           m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
-                                                           m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
-                                                           m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
-                                                           m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
-                                                           m_padding_left(padding_left), m_padding_right(padding_right),
-                                                           m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
+      : m_xpr(expr), m_patch_planes(patch_planes), m_patch_rows(patch_rows), m_patch_cols(patch_cols),
+        m_plane_strides(plane_strides), m_row_strides(row_strides), m_col_strides(col_strides),
+        m_in_plane_strides(in_plane_strides), m_in_row_strides(in_row_strides), m_in_col_strides(in_col_strides),
+        m_plane_inflate_strides(plane_inflate_strides), m_row_inflate_strides(row_inflate_strides), m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(true), m_padding_top_z(padding_top_z), m_padding_bottom_z(padding_bottom_z), m_padding_top(padding_top), m_padding_bottom(padding_bottom),
+        m_padding_left(padding_left), m_padding_right(padding_right),
+        m_padding_type(PADDING_VALID), m_padding_value(padding_value) {}
 
     EIGEN_DEVICE_FUNC
     DenseIndex patch_planes() const { return m_patch_planes; }
@@ -176,26 +173,19 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar;
   typedef typename XprType::CoeffReturnType CoeffReturnType;
   typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
-  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
-  typedef StorageMemory<CoeffReturnType, Device> Storage;
-  typedef typename Storage::Type EvaluatorPointerType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
     IsAligned = false,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     BlockAccess = false,
-    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,
     RawAccess = false
   };
 
-  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
-  typedef internal::TensorBlockNotImplemented TensorBlock;
-  //===--------------------------------------------------------------------===//
-
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) :
- m_impl(op.expression(), device)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device)
   {
     EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
 
@@ -332,7 +322,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 
     // Fast representations of different variables.
     m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
-
     m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
     m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
     m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
@@ -352,7 +341,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
 
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) {
     m_impl.evalSubExprsIfNeeded(NULL);
     return true;
   }
@@ -513,11 +502,10 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
     return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
   }
 
-  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }
 
   const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
 
-
   Index planePaddingTop() const { return m_planePaddingTop; }
   Index rowPaddingTop() const { return m_rowPaddingTop; }
   Index colPaddingLeft() const { return m_colPaddingLeft; }
@@ -534,17 +522,10 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   Index rowInflateStride() const { return m_row_inflate_strides; }
   Index colInflateStride() const { return m_col_inflate_strides; }
 
-#ifdef EIGEN_USE_SYCL
-  // binding placeholder accessors to a command group handler for SYCL
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
-    m_impl.bind(cgh);
-  }
-#endif
  protected:
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const
   {
     EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
-    EIGEN_UNROLL_LOOP
     for (int i = 0; i < PacketSize; ++i) {
       values[i] = coeff(index+i);
     }
@@ -554,7 +535,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
 
   Dimensions m_dimensions;
 
-  // Parameters passed to the constructor.
+  // Parameters passed to the constructor.
   Index m_plane_strides;
   Index m_row_strides;
   Index m_col_strides;
@@ -619,8 +600,6 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D
   Scalar m_paddingValue;
 
   TensorEvaluator<ArgType, Device> m_impl;
-
-
 };
 
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
index 54bf9dbb3..5e97d07a9 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
@@ -244,7 +244,7 @@ struct dimino_first_step_elements
   * multiplying all elements in the given subgroup with the new
   * coset representative. Note that the first element of the
   * subgroup is always the identity element, so the first element of
-  * the result of this template is going to be the coset
+  * the result of this template is going to be the coset
   * representative itself.
   *
   * Note that this template accepts an additional boolean parameter
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h
deleted file mode 100644
index e4c59dc3d..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// Barrier is an object that allows one or more threads to wait until
-// Notify has been called a specified number of times.
-
-#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H
-#define EIGEN_CXX11_THREADPOOL_BARRIER_H
-
-namespace Eigen {
-
-class Barrier {
- public:
-  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
-    eigen_plain_assert(((count << 1) >> 1) == count);
-  }
-  ~Barrier() { eigen_plain_assert((state_ >> 1) == 0); }
-
-  void Notify() {
-    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
-    if (v != 1) {
-      // Clear the lowest bit (waiter flag) and check that the original state
-      // value was not zero. If it was zero, it means that notify was called
-      // more times than the original count.
-      eigen_plain_assert(((v + 2) & ~1) != 0);
-      return;  // either count has not dropped to 0, or waiter is not waiting
-    }
-    std::unique_lock<std::mutex> l(mu_);
-    eigen_plain_assert(!notified_);
-    notified_ = true;
-    cv_.notify_all();
-  }
-
-  void Wait() {
-    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
-    if ((v >> 1) == 0) return;
-    std::unique_lock<std::mutex> l(mu_);
-    while (!notified_) {
-      cv_.wait(l);
-    }
-  }
-
- private:
-  std::mutex mu_;
-  std::condition_variable cv_;
-  std::atomic<unsigned int> state_;  // low bit is waiter flag
-  bool notified_;
-};
-
-// Notification is an object that allows a user to to wait for another
-// thread to signal a notification that an event has occurred.
-//
-// Multiple threads can wait on the same Notification object,
-// but only one caller must call Notify() on the object.
-struct Notification : Barrier {
-  Notification() : Barrier(1){};
-};
-
-}  // namespace Eigen
-
-#endif  // EIGEN_CXX11_THREADPOOL_BARRIER_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 4549aa069..4749d6240 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -33,10 +33,10 @@ namespace Eigen {
 //   ec.Notify(true);
 //
 // Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
-// cheap, but they are executed only if the preceding predicate check has
+// cheap, but they are executed only if the preceding predicate check has
 // failed.
 //
-// Algorithm outline:
+// Algorithm outline:
 // There are two main variables: predicate (managed by user) and state_.
 // Operation closely resembles Dekker mutual algorithm:
 // https://en.wikipedia.org/wiki/Dekker%27s_algorithm
@@ -50,114 +50,117 @@ class EventCount {
  public:
   class Waiter;
 
-  EventCount(MaxSizeVector<Waiter>& waiters)
-      : state_(kStackMask), waiters_(waiters) {
-    eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
+  EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) {
+    eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
+    // Initialize epoch to something close to overflow to test overflow.
+    state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
   }
 
   ~EventCount() {
     // Ensure there are no waiters.
-    eigen_plain_assert(state_.load() == kStackMask);
+    eigen_plain_assert((state_.load() & (kStackMask | kWaiterMask)) == kStackMask);
   }
 
   // Prewait prepares for waiting.
-  // After calling Prewait, the thread must re-check the wait predicate
-  // and then call either CancelWait or CommitWait.
-  void Prewait() {
-    uint64_t state = state_.load(std::memory_order_relaxed);
-    for (;;) {
-      CheckState(state);
-      uint64_t newstate = state + kWaiterInc;
-      CheckState(newstate);
-      if (state_.compare_exchange_weak(state, newstate,
-                                       std::memory_order_seq_cst))
-        return;
-    }
+  // After calling this function the thread must re-check the wait predicate
+  // and call either CancelWait or CommitWait passing the same Waiter object.
+  void Prewait(Waiter* w) {
+    w->epoch = state_.fetch_add(kWaiterInc, std::memory_order_relaxed);
+    std::atomic_thread_fence(std::memory_order_seq_cst);
   }
 
-  // CommitWait commits waiting after Prewait.
+  // CommitWait commits waiting.
   void CommitWait(Waiter* w) {
-    eigen_plain_assert((w->epoch & ~kEpochMask) == 0);
     w->state = Waiter::kNotSignaled;
-    const uint64_t me = (w - &waiters_[0]) | w->epoch;
+    // Modification epoch of this waiter.
+    uint64_t epoch =
+        (w->epoch & kEpochMask) +
+        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
     uint64_t state = state_.load(std::memory_order_seq_cst);
     for (;;) {
-      CheckState(state, true);
-      uint64_t newstate;
-      if ((state & kSignalMask) != 0) {
-        // Consume the signal and return immidiately.
-        newstate = state - kWaiterInc - kSignalInc;
-      } else {
-        // Remove this thread from pre-wait counter and add to the waiter stack.
-        newstate = ((state & kWaiterMask) - kWaiterInc) | me;
-        w->next.store(state & (kStackMask | kEpochMask),
-                      std::memory_order_relaxed);
+      if (int64_t((state & kEpochMask) - epoch) < 0) {
+        // The preceding waiter has not decided on its fate. Wait until it
+        // calls either CancelWait or CommitWait, or is notified.
+        EIGEN_THREAD_YIELD();
+        state = state_.load(std::memory_order_seq_cst);
+        continue;
       }
-      CheckState(newstate);
+      // We've already been notified.
+      if (int64_t((state & kEpochMask) - epoch) > 0) return;
+      // Remove this thread from prewait counter and add it to the waiter list.
+      eigen_assert((state & kWaiterMask) != 0);
+      uint64_t newstate = state - kWaiterInc + kEpochInc;
+      newstate = (newstate & ~kStackMask) | (w - &waiters_[0]);
+      if ((state & kStackMask) == kStackMask)
+        w->next.store(nullptr, std::memory_order_relaxed);
+      else
+        w->next.store(&waiters_[state & kStackMask], std::memory_order_relaxed);
       if (state_.compare_exchange_weak(state, newstate,
-                                       std::memory_order_acq_rel)) {
-        if ((state & kSignalMask) == 0) {
-          w->epoch += kEpochInc;
-          Park(w);
-        }
-        return;
-      }
+                                       std::memory_order_release))
+        break;
     }
+    Park(w);
   }
 
   // CancelWait cancels effects of the previous Prewait call.
-  void CancelWait() {
+  void CancelWait(Waiter* w) {
+    uint64_t epoch =
+        (w->epoch & kEpochMask) +
+        (((w->epoch & kWaiterMask) >> kWaiterShift) << kEpochShift);
     uint64_t state = state_.load(std::memory_order_relaxed);
     for (;;) {
-      CheckState(state, true);
-      uint64_t newstate = state - kWaiterInc;
-      // We don't know if the thread was also notified or not,
-      // so we should not consume a signal unconditionaly.
-      // Only if number of waiters is equal to number of signals,
-      // we know that the thread was notified and we must take away the signal.
-      if (((state & kWaiterMask) >> kWaiterShift) ==
-          ((state & kSignalMask) >> kSignalShift))
-        newstate -= kSignalInc;
-      CheckState(newstate);
-      if (state_.compare_exchange_weak(state, newstate,
-                                       std::memory_order_acq_rel))
+      if (int64_t((state & kEpochMask) - epoch) < 0) {
+        // The preceding waiter has not decided on its fate. Wait until it
+        // calls either CancelWait or CommitWait, or is notified.
+        EIGEN_THREAD_YIELD();
+        state = state_.load(std::memory_order_relaxed);
+        continue;
+      }
+      // We've already been notified.
+      if (int64_t((state & kEpochMask) - epoch) > 0) return;
+      // Remove this thread from prewait counter.
+      eigen_assert((state & kWaiterMask) != 0);
+      if (state_.compare_exchange_weak(state, state - kWaiterInc + kEpochInc,
+                                       std::memory_order_relaxed))
         return;
     }
   }
 
   // Notify wakes one or all waiting threads.
   // Must be called after changing the associated wait predicate.
-  void Notify(bool notifyAll) {
+  void Notify(bool all) {
     std::atomic_thread_fence(std::memory_order_seq_cst);
     uint64_t state = state_.load(std::memory_order_acquire);
     for (;;) {
-      CheckState(state);
-      const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
-      const uint64_t signals = (state & kSignalMask) >> kSignalShift;
       // Easy case: no waiters.
-      if ((state & kStackMask) == kStackMask && waiters == signals) return;
+      if ((state & kStackMask) == kStackMask && (state & kWaiterMask) == 0)
+        return;
+      uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
       uint64_t newstate;
-      if (notifyAll) {
-        // Empty wait stack and set signal to number of pre-wait threads.
-        newstate =
-            (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask;
-      } else if (signals < waiters) {
+      if (all) {
+        // Reset prewait counter and empty wait list.
+        newstate = (state & kEpochMask) + (kEpochInc * waiters) + kStackMask;
+      } else if (waiters) {
         // There is a thread in pre-wait state, unblock it.
-        newstate = state + kSignalInc;
+        newstate = state + kEpochInc - kWaiterInc;
       } else {
         // Pop a waiter from list and unpark it.
         Waiter* w = &waiters_[state & kStackMask];
-        uint64_t next = w->next.load(std::memory_order_relaxed);
-        newstate = (state & (kWaiterMask | kSignalMask)) | next;
+        Waiter* wnext = w->next.load(std::memory_order_relaxed);
+        uint64_t next = kStackMask;
+        if (wnext != nullptr) next = wnext - &waiters_[0];
+        // Note: we don't add kEpochInc here. ABA problem on the lock-free stack
+        // can't happen because a waiter is re-pushed onto the stack only after
+        // it was in the pre-wait state which inevitably leads to epoch
+        // increment.
+        newstate = (state & kEpochMask) + next;
       }
-      CheckState(newstate);
       if (state_.compare_exchange_weak(state, newstate,
-                                       std::memory_order_acq_rel)) {
-        if (!notifyAll && (signals < waiters))
-          return;  // unblocked pre-wait thread
+                                       std::memory_order_acquire)) {
+        if (!all && waiters) return;  // unblocked pre-wait thread
         if ((state & kStackMask) == kStackMask) return;
         Waiter* w = &waiters_[state & kStackMask];
-        if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed);
+        if (!all) w->next.store(nullptr, std::memory_order_relaxed);
         Unpark(w);
         return;
       }
@@ -166,13 +169,12 @@ class EventCount {
 
   class Waiter {
     friend class EventCount;
-    // Align to 128 byte boundary to prevent false sharing with other Waiter
-    // objects in the same vector.
-    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<uint64_t> next;
+    // Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector.
+    EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next;
     std::mutex mu;
     std::condition_variable cv;
-    uint64_t epoch = 0;
-    unsigned state = kNotSignaled;
+    uint64_t epoch;
+    unsigned state;
     enum {
       kNotSignaled,
       kWaiting,
@@ -182,41 +184,23 @@ class EventCount {
 
  private:
   // State_ layout:
-  // - low kWaiterBits is a stack of waiters committed wait
-  //   (indexes in waiters_ array are used as stack elements,
-  //   kStackMask means empty stack).
+  // - low kStackBits is a stack of waiters committed wait.
   // - next kWaiterBits is count of waiters in prewait state.
-  // - next kWaiterBits is count of pending signals.
-  // - remaining bits are ABA counter for the stack.
-  //   (stored in Waiter node and incremented on push).
-  static const uint64_t kWaiterBits = 14;
-  static const uint64_t kStackMask = (1ull << kWaiterBits) - 1;
-  static const uint64_t kWaiterShift = kWaiterBits;
+  // - next kEpochBits is modification counter.
+  static const uint64_t kStackBits = 16;
+  static const uint64_t kStackMask = (1ull << kStackBits) - 1;
+  static const uint64_t kWaiterBits = 16;
+  static const uint64_t kWaiterShift = 16;
   static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1)
                                       << kWaiterShift;
-  static const uint64_t kWaiterInc = 1ull << kWaiterShift;
-  static const uint64_t kSignalShift = 2 * kWaiterBits;
-  static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1)
-                                      << kSignalShift;
-  static const uint64_t kSignalInc = 1ull << kSignalShift;
-  static const uint64_t kEpochShift = 3 * kWaiterBits;
-  static const uint64_t kEpochBits = 64 - kEpochShift;
+  static const uint64_t kWaiterInc = 1ull << kWaiterBits;
+  static const uint64_t kEpochBits = 32;
+  static const uint64_t kEpochShift = 32;
   static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
   static const uint64_t kEpochInc = 1ull << kEpochShift;
   std::atomic<uint64_t> state_;
   MaxSizeVector<Waiter>& waiters_;
 
-  static void CheckState(uint64_t state, bool waiter = false) {
-    static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
-    const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
-    const uint64_t signals = (state & kSignalMask) >> kSignalShift;
-    eigen_plain_assert(waiters >= signals);
-    eigen_plain_assert(waiters < (1 << kWaiterBits) - 1);
-    eigen_plain_assert(!waiter || waiters > 0);
-    (void)waiters;
-    (void)signals;
-  }
-
   void Park(Waiter* w) {
     std::unique_lock<std::mutex> lock(w->mu);
     while (w->state != Waiter::kSignaled) {
@@ -225,10 +209,10 @@ class EventCount {
     }
   }
 
-  void Unpark(Waiter* w) {
-    for (Waiter* next; w; w = next) {
-      uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask;
-      next = wnext == kStackMask ? nullptr : &waiters_[wnext];
+  void Unpark(Waiter* waiters) {
+    Waiter* next = nullptr;
+    for (Waiter* w = waiters; w; w = next) {
+      next = w->next.load(std::memory_order_relaxed);
       unsigned state;
       {
         std::unique_lock<std::mutex> lock(w->mu);
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 43a274651..354bce52a 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -10,116 +10,79 @@
 #ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
 #define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
 
+
 namespace Eigen {
 
 template <typename Environment>
-class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
+class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
  public:
   typedef typename Environment::Task Task;
   typedef RunQueue<Task, 1024> Queue;
 
-  ThreadPoolTempl(int num_threads, Environment env = Environment())
-      : ThreadPoolTempl(num_threads, true, env) {}
-
-  ThreadPoolTempl(int num_threads, bool allow_spinning,
-                  Environment env = Environment())
+  NonBlockingThreadPoolTempl(int num_threads, Environment env = Environment())
       : env_(env),
-        num_threads_(num_threads),
-        allow_spinning_(allow_spinning),
-        thread_data_(num_threads),
-        all_coprimes_(num_threads),
+        threads_(num_threads),
+        queues_(num_threads),
+        coprimes_(num_threads),
         waiters_(num_threads),
-        global_steal_partition_(EncodePartition(0, num_threads_)),
         blocked_(0),
         spinning_(0),
         done_(false),
-        cancelled_(false),
         ec_(waiters_) {
-    waiters_.resize(num_threads_);
-    // Calculate coprimes of all numbers [1, num_threads].
-    // Coprimes are used for random walks over all threads in Steal
+    waiters_.resize(num_threads);
+
+    // Calculate coprimes of num_threads.
+    // Coprimes are used for a random walk over all threads in Steal
     // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
-    // a random starting thread index t and calculate num_threads - 1 subsequent
+    // a random starting thread index t and calculate num_threads - 1 subsequent
     // indices as (t + coprime) % num_threads, we will cover all threads without
     // repetitions (effectively getting a presudo-random permutation of thread
     // indices).
-    eigen_plain_assert(num_threads_ < kMaxThreads);
-    for (int i = 1; i <= num_threads_; ++i) {
-      all_coprimes_.emplace_back(i);
-      ComputeCoprimes(i, &all_coprimes_.back());
+    for (int i = 1; i <= num_threads; i++) {
+      unsigned a = i;
+      unsigned b = num_threads;
+      // If GCD(a, b) == 1, then a and b are coprimes.
+      while (b != 0) {
+        unsigned tmp = a;
+        a = b;
+        b = tmp % b;
+      }
+      if (a == 1) {
+        coprimes_.push_back(i);
+      }
     }
-#ifndef EIGEN_THREAD_LOCAL
-    init_barrier_.reset(new Barrier(num_threads_));
-#endif
-    thread_data_.resize(num_threads_);
-    for (int i = 0; i < num_threads_; i++) {
-      SetStealPartition(i, EncodePartition(0, num_threads_));
-      thread_data_[i].thread.reset(
-          env_.CreateThread([this, i]() { WorkerLoop(i); }));
+    for (int i = 0; i < num_threads; i++) {
+      queues_.push_back(new Queue());
+    }
+    for (int i = 0; i < num_threads; i++) {
+      threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
     }
-#ifndef EIGEN_THREAD_LOCAL
-    // Wait for workers to initialize per_thread_map_. Otherwise we might race
-    // with them in Schedule or CurrentThreadId.
-    init_barrier_->Wait();
-#endif
   }
 
-  ~ThreadPoolTempl() {
+  ~NonBlockingThreadPoolTempl() {
     done_ = true;
-
     // Now if all threads block without work, they will start exiting.
     // But note that threads can continue to work arbitrary long,
     // block, submit new work, unblock and otherwise live full life.
-    if (!cancelled_) {
-      ec_.Notify(true);
-    } else {
-      // Since we were cancelled, there might be entries in the queues.
-      // Empty them to prevent their destructor from asserting.
-      for (size_t i = 0; i < thread_data_.size(); i++) {
-        thread_data_[i].queue.Flush();
-      }
-    }
-    // Join threads explicitly (by destroying) to avoid destruction order within
-    // this class.
-    for (size_t i = 0; i < thread_data_.size(); ++i)
-      thread_data_[i].thread.reset();
+    ec_.Notify(true);
+
+    // Join threads explicitly to avoid destruction order issues.
+    for (size_t i = 0; i < threads_.size(); i++) delete threads_[i];
+    for (size_t i = 0; i < threads_.size(); i++) delete queues_[i];
   }
 
-  void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions) {
-    eigen_plain_assert(partitions.size() == static_cast<std::size_t>(num_threads_));
-
-    // Pass this information to each thread queue.
-    for (int i = 0; i < num_threads_; i++) {
-      const auto& pair = partitions[i];
-      unsigned start = pair.first, end = pair.second;
-      AssertBounds(start, end);
-      unsigned val = EncodePartition(start, end);
-      SetStealPartition(i, val);
-    }
-  }
-
-  void Schedule(std::function<void()> fn) EIGEN_OVERRIDE {
-    ScheduleWithHint(std::move(fn), 0, num_threads_);
-  }
-
-  void ScheduleWithHint(std::function<void()> fn, int start,
-                        int limit) override {
+  void Schedule(std::function<void()> fn) {
     Task t = env_.CreateTask(std::move(fn));
     PerThread* pt = GetPerThread();
     if (pt->pool == this) {
       // Worker thread of this pool, push onto the thread's queue.
-      Queue& q = thread_data_[pt->thread_id].queue;
-      t = q.PushFront(std::move(t));
+      Queue* q = queues_[pt->thread_id];
+      t = q->PushFront(std::move(t));
     } else {
       // A free-standing thread (or worker of another pool), push onto a random
       // queue.
-      eigen_plain_assert(start < limit);
-      eigen_plain_assert(limit <= num_threads_);
-      int num_queues = limit - start;
-      int rnd = Rand(&pt->rand) % num_queues;
-      eigen_plain_assert(start + rnd < limit);
-      Queue& q = thread_data_[start + rnd].queue;
-      t = q.PushBack(std::move(t));
+      Queue* q = queues_[Rand(&pt->rand) % queues_.size()];
+      t = q->PushBack(std::move(t));
     }
     // Note: below we touch this after making w available to worker threads.
     // Strictly speaking, this can lead to a racy-use-after-free. Consider that
@@ -128,32 +91,19 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
     // completes overall computations, which in turn leads to destruction of
     // this. We expect that such scenario is prevented by program, that is,
     // this is kept alive while any threads can potentially be in Schedule.
-    if (!t.f) {
+    if (!t.f)
       ec_.Notify(false);
-    } else {
+    else
       env_.ExecuteTask(t);  // Push failed, execute directly.
-    }
   }
 
-  void Cancel() EIGEN_OVERRIDE {
-    cancelled_ = true;
-    done_ = true;
-
-    // Let each thread know it's been cancelled.
-#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
-    for (size_t i = 0; i < thread_data_.size(); i++) {
-      thread_data_[i].thread->OnCancel();
-    }
-#endif
-
-    // Wake up the threads without work to let them exit on their own.
-    ec_.Notify(true);
+  int NumThreads() const final {
+    return static_cast<int>(threads_.size());
   }
 
-  int NumThreads() const EIGEN_FINAL { return num_threads_; }
-
-  int CurrentThreadId() const EIGEN_FINAL {
-    const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
+  int CurrentThreadId() const final {
+    const PerThread* pt =
+        const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread();
     if (pt->pool == this) {
       return pt->thread_id;
     } else {
@@ -162,189 +112,72 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
   }
 
  private:
-  // Create a single atomic<int> that encodes start and limit information for
-  // each thread.
-  // We expect num_threads_ < 65536, so we can store them in a single
-  // std::atomic<unsigned>.
-  // Exposed publicly as static functions so that external callers can reuse
-  // this encode/decode logic for maintaining their own thread-safe copies of
-  // scheduling and steal domain(s).
-  static const int kMaxPartitionBits = 16;
-  static const int kMaxThreads = 1 << kMaxPartitionBits;
-
-  inline unsigned EncodePartition(unsigned start, unsigned limit) {
-    return (start << kMaxPartitionBits) | limit;
-  }
-
-  inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
-    *limit = val & (kMaxThreads - 1);
-    val >>= kMaxPartitionBits;
-    *start = val;
-  }
-
-  void AssertBounds(int start, int end) {
-    eigen_plain_assert(start >= 0);
-    eigen_plain_assert(start < end);  // non-zero sized partition
-    eigen_plain_assert(end <= num_threads_);
-  }
-
-  inline void SetStealPartition(size_t i, unsigned val) {
-    thread_data_[i].steal_partition.store(val, std::memory_order_relaxed);
-  }
-
-  inline unsigned GetStealPartition(int i) {
-    return thread_data_[i].steal_partition.load(std::memory_order_relaxed);
-  }
-
-  void ComputeCoprimes(int N, MaxSizeVector<unsigned>* coprimes) {
-    for (int i = 1; i <= N; i++) {
-      unsigned a = i;
-      unsigned b = N;
-      // If GCD(a, b) == 1, then a and b are coprimes.
-      while (b != 0) {
-        unsigned tmp = a;
-        a = b;
-        b = tmp % b;
-      }
-      if (a == 1) {
-        coprimes->push_back(i);
-      }
-    }
-  }
-
   typedef typename Environment::EnvThread Thread;
 
   struct PerThread {
-    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
-    ThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
-    uint64_t rand;          // Random generator state.
-    int thread_id;          // Worker thread index in pool.
-#ifndef EIGEN_THREAD_LOCAL
-    // Prevent false sharing.
-    char pad_[128];
-#endif
-  };
-
-  struct ThreadData {
-    constexpr ThreadData() : thread(), steal_partition(0), queue() {}
-    std::unique_ptr<Thread> thread;
-    std::atomic<unsigned> steal_partition;
-    Queue queue;
+    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { }
+    NonBlockingThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
+    uint64_t rand;  // Random generator state.
+    int thread_id;  // Worker thread index in pool.
   };
 
   Environment env_;
-  const int num_threads_;
-  const bool allow_spinning_;
-  MaxSizeVector<ThreadData> thread_data_;
-  MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
+  MaxSizeVector<Thread*> threads_;
+  MaxSizeVector<Queue*> queues_;
+  MaxSizeVector<unsigned> coprimes_;
   MaxSizeVector<EventCount::Waiter> waiters_;
-  unsigned global_steal_partition_;
   std::atomic<unsigned> blocked_;
   std::atomic<bool> spinning_;
   std::atomic<bool> done_;
-  std::atomic<bool> cancelled_;
   EventCount ec_;
-#ifndef EIGEN_THREAD_LOCAL
-  std::unique_ptr<Barrier> init_barrier_;
-  std::mutex per_thread_map_mutex_;  // Protects per_thread_map_.
-  std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
-#endif
 
   // Main worker thread loop.
   void WorkerLoop(int thread_id) {
-#ifndef EIGEN_THREAD_LOCAL
-    std::unique_ptr<PerThread> new_pt(new PerThread());
-    per_thread_map_mutex_.lock();
-    eigen_plain_assert(per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second);
-    per_thread_map_mutex_.unlock();
-    init_barrier_->Notify();
-    init_barrier_->Wait();
-#endif
     PerThread* pt = GetPerThread();
     pt->pool = this;
-    pt->rand = GlobalThreadIdHash();
+    pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id());
     pt->thread_id = thread_id;
-    Queue& q = thread_data_[thread_id].queue;
+    Queue* q = queues_[thread_id];
     EventCount::Waiter* waiter = &waiters_[thread_id];
-    // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is
-    // proportional to num_threads_ and we assume that new work is scheduled at
-    // a constant rate, so we set spin_count to 5000 / num_threads_. The
-    // constant was picked based on a fair dice roll, tune it.
-    const int spin_count =
-        allow_spinning_ && num_threads_ > 0 ? 5000 / num_threads_ : 0;
-    if (num_threads_ == 1) {
-      // For num_threads_ == 1 there is no point in going through the expensive
-      // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
-      // victim queues it might reverse the order in which ops are executed
-      // compared to the order in which they are scheduled, which tends to be
-      // counter-productive for the types of I/O workloads the single thread
-      // pools tend to be used for.
-      while (!cancelled_) {
-        Task t = q.PopFront();
-        for (int i = 0; i < spin_count && !t.f; i++) {
-          if (!cancelled_.load(std::memory_order_relaxed)) {
-            t = q.PopFront();
-          }
-        }
+    for (;;) {
+      Task t = q->PopFront();
+      if (!t.f) {
+        t = Steal();
         if (!t.f) {
-          if (!WaitForWork(waiter, &t)) {
-            return;
+          // Leave one thread spinning. This reduces latency.
+          // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it.
+          // Also, the time it takes to attempt to steal work 1000 times depends
+          // on the size of the thread pool. However the speed at which the user
+          // of the thread pool submits tasks is independent of the size of the
+          // pool. Consider a time based limit instead.
+          if (!spinning_ && !spinning_.exchange(true)) {
+            for (int i = 0; i < 1000 && !t.f; i++) {
+              t = Steal();
+            }
+            spinning_ = false;
           }
-        }
-        if (t.f) {
-          env_.ExecuteTask(t);
-        }
-      }
-    } else {
-      while (!cancelled_) {
-        Task t = q.PopFront();
-        if (!t.f) {
-          t = LocalSteal();
           if (!t.f) {
-            t = GlobalSteal();
-            if (!t.f) {
-              // Leave one thread spinning. This reduces latency.
-              if (allow_spinning_ && !spinning_ && !spinning_.exchange(true)) {
-                for (int i = 0; i < spin_count && !t.f; i++) {
-                  if (!cancelled_.load(std::memory_order_relaxed)) {
-                    t = GlobalSteal();
-                  } else {
-                    return;
-                  }
-                }
-                spinning_ = false;
-              }
-              if (!t.f) {
-                if (!WaitForWork(waiter, &t)) {
-                  return;
-                }
-              }
+            if (!WaitForWork(waiter, &t)) {
+              return;
             }
           }
         }
-        if (t.f) {
-          env_.ExecuteTask(t);
-        }
+      }
+      if (t.f) {
+        env_.ExecuteTask(t);
       }
     }
   }
 
-  // Steal tries to steal work from other worker threads in the range [start,
-  // limit) in best-effort manner.
-  Task Steal(unsigned start, unsigned limit) {
+  // Steal tries to steal work from other worker threads in best-effort manner.
+  Task Steal() {
     PerThread* pt = GetPerThread();
-    const size_t size = limit - start;
+    const size_t size = queues_.size();
     unsigned r = Rand(&pt->rand);
-    // Reduce r into [0, size) range, this utilizes trick from
-    // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
-    eigen_plain_assert(all_coprimes_[size - 1].size() < (1<<30));
-    unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
-    unsigned index = ((uint64_t) all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
-    unsigned inc = all_coprimes_[size - 1][index];
-
+    unsigned inc = coprimes_[r % coprimes_.size()];
+    unsigned victim = r % size;
     for (unsigned i = 0; i < size; i++) {
-      eigen_plain_assert(start + victim < limit);
-      Task t = thread_data_[start + victim].queue.PopBack();
+      Task t = queues_[victim]->PopBack();
       if (t.f) {
         return t;
       }
@@ -356,52 +189,27 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
     return Task();
   }
 
-  // Steals work within threads belonging to the partition.
-  Task LocalSteal() {
-    PerThread* pt = GetPerThread();
-    unsigned partition = GetStealPartition(pt->thread_id);
-    // If thread steal partition is the same as global partition, there is no
-    // need to go through the steal loop twice.
-    if (global_steal_partition_ == partition) return Task();
-    unsigned start, limit;
-    DecodePartition(partition, &start, &limit);
-    AssertBounds(start, limit);
-
-    return Steal(start, limit);
-  }
-
-  // Steals work from any other thread in the pool.
-  Task GlobalSteal() {
-    return Steal(0, num_threads_);
-  }
-
-
   // WaitForWork blocks until new work is available (returns true), or if it is
   // time to exit (returns false). Can optionally return a task to execute in t
   // (in such case t.f != nullptr on return).
   bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
-    eigen_plain_assert(!t->f);
+    eigen_assert(!t->f);
     // We already did best-effort emptiness check in Steal, so prepare for
     // blocking.
-    ec_.Prewait();
+    ec_.Prewait(waiter);
     // Now do a reliable emptiness check.
     int victim = NonEmptyQueueIndex();
     if (victim != -1) {
-      ec_.CancelWait();
-      if (cancelled_) {
-        return false;
-      } else {
-        *t = thread_data_[victim].queue.PopBack();
-        return true;
-      }
+      ec_.CancelWait(waiter);
+      *t = queues_[victim]->PopBack();
+      return true;
     }
     // Number of blocked threads is used as termination condition.
     // If we are shutting down and all worker threads blocked without work,
     // that's we are done.
     blocked_++;
-    // TODO is blocked_ required to be unsigned?
-    if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
-      ec_.CancelWait();
+    if (done_ && blocked_ == threads_.size()) {
+      ec_.CancelWait(waiter);
       // Almost done, but need to re-check queues.
       // Consider that all queues are empty and all worker threads are preempted
       // right after incrementing blocked_ above. Now a free-standing thread
@@ -428,15 +236,12 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
 
   int NonEmptyQueueIndex() {
     PerThread* pt = GetPerThread();
-    // We intentionally design NonEmptyQueueIndex to steal work from
-    // anywhere in the queue so threads don't block in WaitForWork() forever
-    // when all threads in their partition go to sleep. Steal is still local.
-    const size_t size = thread_data_.size();
+    const size_t size = queues_.size();
     unsigned r = Rand(&pt->rand);
-    unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
+    unsigned inc = coprimes_[r % coprimes_.size()];
     unsigned victim = r % size;
     for (unsigned i = 0; i < size; i++) {
-      if (!thread_data_[victim].queue.Empty()) {
+      if (!queues_[victim]->Empty()) {
         return victim;
       }
       victim += inc;
@@ -447,24 +252,10 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
     return -1;
   }
 
-  static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
-    return std::hash<std::thread::id>()(std::this_thread::get_id());
-  }
-
-  EIGEN_STRONG_INLINE PerThread* GetPerThread() {
-#ifndef EIGEN_THREAD_LOCAL
-    static PerThread dummy;
-    auto it = per_thread_map_.find(GlobalThreadIdHash());
-    if (it == per_thread_map_.end()) {
-      return &dummy;
-    } else {
-      return it->second.get();
-    }
-#else
+  static EIGEN_STRONG_INLINE PerThread* GetPerThread() {
     EIGEN_THREAD_LOCAL PerThread per_thread_;
     PerThread* pt = &per_thread_;
     return pt;
-#endif
   }
 
   static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
@@ -472,12 +263,11 @@ class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
     // Update the internal state
     *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
     // Generate the random output (using the PCG-XSH-RS scheme)
-    return static_cast<unsigned>((current ^ (current >> 22)) >>
-                                 (22 + (current >> 61)));
+    return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
   }
 };
 
-typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
+typedef NonBlockingThreadPoolTempl<StlThreadEnvironment> NonBlockingThreadPool;
 
 }  // namespace Eigen
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
index b572ebcdf..6e505fc14 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h
@@ -10,6 +10,7 @@
 #ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
 #define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H_
 
+
 namespace Eigen {
 
 // RunQueue is a fixed-size, partially non-blocking deque or Work items.
@@ -39,9 +40,9 @@ class RunQueue {
  public:
   RunQueue() : front_(0), back_(0) {
     // require power-of-two for fast masking
-    eigen_plain_assert((kSize & (kSize - 1)) == 0);
-    eigen_plain_assert(kSize > 2);            // why would you do this?
-    eigen_plain_assert(kSize <= (64 << 10));  // leave enough space for counter
+    eigen_assert((kSize & (kSize - 1)) == 0);
+    eigen_assert(kSize > 2);            // why would you do this?
+    eigen_assert(kSize <= (64 << 10));  // leave enough space for counter
     for (unsigned i = 0; i < kSize; i++)
       array_[i].state.store(kEmpty, std::memory_order_relaxed);
   }
@@ -97,9 +98,11 @@ class RunQueue {
   }
 
   // PopBack removes and returns the last elements in the queue.
+  // Can fail spuriously.
   Work PopBack() {
     if (Empty()) return Work();
-    std::unique_lock<std::mutex> lock(mutex_);
+    std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
+    if (!lock) return Work();
     unsigned back = back_.load(std::memory_order_relaxed);
     Elem* e = &array_[back & kMask];
     uint8_t s = e->state.load(std::memory_order_relaxed);
@@ -113,10 +116,11 @@ class RunQueue {
   }
 
   // PopBackHalf removes and returns half last elements in the queue.
-  // Returns number of elements removed.
+  // Returns number of elements removed. But can also fail spuriously.
   unsigned PopBackHalf(std::vector<Work>* result) {
     if (Empty()) return 0;
-    std::unique_lock<std::mutex> lock(mutex_);
+    std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock);
+    if (!lock) return 0;
     unsigned back = back_.load(std::memory_order_relaxed);
     unsigned size = Size();
     unsigned mid = back;
@@ -127,14 +131,15 @@ class RunQueue {
       Elem* e = &array_[mid & kMask];
       uint8_t s = e->state.load(std::memory_order_relaxed);
       if (n == 0) {
-        if (s != kReady || !e->state.compare_exchange_strong(
-                               s, kBusy, std::memory_order_acquire))
+        if (s != kReady ||
+            !e->state.compare_exchange_strong(s, kBusy,
+                                              std::memory_order_acquire))
           continue;
         start = mid;
       } else {
         // Note: no need to store temporal kBusy, we exclusively own these
         // elements.
-        eigen_plain_assert(s == kReady);
+        eigen_assert(s == kReady);
       }
       result->push_back(std::move(e->w));
       e->state.store(kEmpty, std::memory_order_release);
@@ -147,18 +152,30 @@ class RunQueue {
 
   // Size returns current queue size.
   // Can be called by any thread at any time.
-  unsigned Size() const { return SizeOrNotEmpty<true>(); }
+  unsigned Size() const {
+    // Emptiness plays critical role in thread pool blocking. So we go to great
+    // effort to not produce false positives (claim non-empty queue as empty).
+    for (;;) {
+      // Capture a consistent snapshot of front/tail.
+      unsigned front = front_.load(std::memory_order_acquire);
+      unsigned back = back_.load(std::memory_order_acquire);
+      unsigned front1 = front_.load(std::memory_order_relaxed);
+      if (front != front1) continue;
+      int size = (front & kMask2) - (back & kMask2);
+      // Fix overflow.
+      if (size < 0) size += 2 * kSize;
+      // Order of modification in push/pop is crafted to make the queue look
+      // larger than it is during concurrent modifications. E.g. push can
+      // increment size before the corresponding pop has decremented it.
+      // So the computed size can be up to kSize + 1, fix it.
+      if (size > static_cast<int>(kSize)) size = kSize;
+      return size;
+    }
+  }
 
   // Empty tests whether container is empty.
   // Can be called by any thread at any time.
-  bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
-
-  // Delete all the elements from the queue.
-  void Flush() {
-    while (!Empty()) {
-      PopFront();
-    }
-  }
+  bool Empty() const { return Size() == 0; }
 
  private:
   static const unsigned kMask = kSize - 1;
@@ -174,7 +191,7 @@ class RunQueue {
   };
   std::mutex mutex_;
   // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
-  // front/back, respectively. The remaining bits contain modification counters
+  // front/back, respectively. The remaining bits contain modification counters
   // that are incremented on Push operations. This allows us to (1) distinguish
   // between empty and full conditions (if we would use log(kSize) bits for
   // position, these conditions would be indistinguishable); (2) obtain
@@ -184,49 +201,6 @@ class RunQueue {
   std::atomic<unsigned> back_;
   Elem array_[kSize];
 
-  // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
-  // only whether the size is 0 is guaranteed to be correct.
-  // Can be called by any thread at any time.
-  template<bool NeedSizeEstimate>
-  unsigned SizeOrNotEmpty() const {
-    // Emptiness plays critical role in thread pool blocking. So we go to great
-    // effort to not produce false positives (claim non-empty queue as empty).
-    unsigned front = front_.load(std::memory_order_acquire);
-    for (;;) {
-      // Capture a consistent snapshot of front/tail.
-      unsigned back = back_.load(std::memory_order_acquire);
-      unsigned front1 = front_.load(std::memory_order_relaxed);
-      if (front != front1) {
-        front = front1;
-        std::atomic_thread_fence(std::memory_order_acquire);
-        continue;
-      }
-      if (NeedSizeEstimate) {
-        return CalculateSize(front, back);
-      } else {
-        // This value will be 0 if the queue is empty, and undefined otherwise.
-        unsigned maybe_zero = ((front ^ back) & kMask2);
-        // Queue size estimate must agree with maybe zero check on the queue
-        // empty/non-empty state.
-        eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
-        return maybe_zero;
-      }
-    }
-  }
-
-  EIGEN_ALWAYS_INLINE
-  unsigned CalculateSize(unsigned front, unsigned back) const {
-    int size = (front & kMask2) - (back & kMask2);
-    // Fix overflow.
-    if (size < 0) size += 2 * kSize;
-    // Order of modification in push/pop is crafted to make the queue look
-    // larger than it is during concurrent modifications. E.g. push can
-    // increment size before the corresponding pop has decremented it.
-    // So the computed size can be up to kSize + 1, fix it.
-    if (size > static_cast<int>(kSize)) size = kSize;
-    return static_cast<unsigned>(size);
-  }
-
   RunQueue(const RunQueue&) = delete;
   void operator=(const RunQueue&) = delete;
 };
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
new file mode 100644
index 000000000..e75d0f467
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
+#define EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
+
+namespace Eigen {
+
+// The implementation of the ThreadPool type ensures that the Schedule method
+// runs the functions it is provided in FIFO order when the scheduling is done
+// by a single thread.
+// Environment provides a way to create threads and also allows to intercept
+// task submission and execution.
+template <typename Environment>
+class SimpleThreadPoolTempl : public ThreadPoolInterface {
+ public:
+  // Construct a pool of "num_threads" threads created via the stored env_.
+  explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment())
+      : env_(env), threads_(num_threads), waiters_(num_threads) {
+    for (int i = 0; i < num_threads; i++) {
+      threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+    }
+  }
+
+  // Wait until all scheduled work has finished and then destroy the
+  // set of threads.
+  ~SimpleThreadPoolTempl() {
+    {
+      // Wait for all work to get done.
+      std::unique_lock<std::mutex> l(mu_);
+      while (!pending_.empty()) {
+        empty_.wait(l);
+      }
+      exiting_ = true;
+
+      // Wakeup all waiters.
+      for (auto w : waiters_) {
+        w->ready = true;
+        w->task.f = nullptr;
+        w->cv.notify_one();
+      }
+    }
+
+    // Wait for threads to finish.
+    for (auto t : threads_) {
+      delete t;
+    }
+  }
+
+  // Schedule fn() for execution in the pool of threads. The functions are
+  // executed in the order in which they are scheduled.
+  void Schedule(std::function<void()> fn) final {
+    Task t = env_.CreateTask(std::move(fn));
+    std::unique_lock<std::mutex> l(mu_);
+    if (waiters_.empty()) {
+      pending_.push_back(std::move(t));
+    } else {
+      Waiter* w = waiters_.back();
+      waiters_.pop_back();
+      w->ready = true;
+      w->task = std::move(t);
+      w->cv.notify_one();
+    }
+  }
+
+  int NumThreads() const final {
+    return static_cast<int>(threads_.size());
+  }
+
+  int CurrentThreadId() const final {
+    const PerThread* pt = this->GetPerThread();
+    if (pt->pool == this) {
+      return pt->thread_id;
+    } else {
+      return -1;
+    }
+  }
+
+ protected:
+  // Body of each pool thread: runs tasks until exiting_ is observed.
+  // mu_ is held through `l` except while a task executes.
+  void WorkerLoop(int thread_id) {
+    std::unique_lock<std::mutex> l(mu_);
+    PerThread* pt = GetPerThread();
+    pt->pool = this;
+    pt->thread_id = thread_id;
+    Waiter w;
+    Task t;
+    while (!exiting_) {
+      if (pending_.empty()) {
+        // Wait for work to be assigned to me
+        w.ready = false;
+        waiters_.push_back(&w);
+        w.cv.wait(l, [&w] { return w.ready; });
+        t = w.task;
+        w.task.f = nullptr;
+      } else {
+        // Pick up pending work
+        t = std::move(pending_.front());
+        pending_.pop_front();
+        if (pending_.empty()) empty_.notify_all();
+      }
+      if (t.f) {
+        // Unlock via the guard rather than mu_ directly, so the ownership
+        // state recorded in `l` stays accurate while the task runs.
+        l.unlock();
+        env_.ExecuteTask(t);
+        t.f = nullptr;
+        l.lock();
+      }
+    }
+  }
+
+ private:
+  typedef typename Environment::Task Task;
+  typedef typename Environment::EnvThread Thread;
+
+  struct Waiter {
+    std::condition_variable cv;
+    Task task;
+    bool ready;
+  };
+
+  struct PerThread {
+    constexpr PerThread() : pool(NULL), thread_id(-1) { }
+    SimpleThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
+    int thread_id;                // Worker thread index in pool.
+  };
+
+  Environment env_;
+  std::mutex mu_;
+  MaxSizeVector<Thread*> threads_;  // All threads
+  MaxSizeVector<Waiter*> waiters_;  // Stack of waiting threads.
+  std::deque<Task> pending_;        // Queue of pending work
+  std::condition_variable empty_;   // Signaled on pending_.empty()
+  bool exiting_ = false;
+
+  PerThread* GetPerThread() const {
+    EIGEN_THREAD_LOCAL PerThread per_thread;
+    return &per_thread;
+  }
+};
+
+typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool;
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_SIMPLE_THREAD_POOL_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
deleted file mode 100644
index a05685f11..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h
+++ /dev/null
@@ -1,23 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
-#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
-
-// Try to come up with a portable way to cancel a thread
-#if EIGEN_OS_GNULINUX
-  #define EIGEN_THREAD_CANCEL(t)                  \
-    pthread_cancel(t.native_handle());
-  #define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
-#else
-#define EIGEN_THREAD_CANCEL(t)
-#endif
-
-
-#endif  // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
index d94a06416..399f95cc1 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h
@@ -23,8 +23,6 @@ struct StlThreadEnvironment {
    public:
     EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
     ~EnvThread() { thr_.join(); }
-    // This function is called when the threadpool is cancelled.
-    void OnCancel() { }
 
    private:
     std::thread thr_;
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
index 4e6847404..cfa221732 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadLocal.h
@@ -10,292 +10,13 @@
 #ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
 #define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
 
-#ifdef EIGEN_AVOID_THREAD_LOCAL
-
-#ifdef EIGEN_THREAD_LOCAL
-#undef EIGEN_THREAD_LOCAL
-#endif
-
+// Try to come up with a portable implementation of thread local variables
+#if EIGEN_COMP_GNUC && EIGEN_GNUC_AT_MOST(4, 7)
+#define EIGEN_THREAD_LOCAL static __thread
+#elif EIGEN_COMP_CLANG
+#define EIGEN_THREAD_LOCAL static __thread
 #else
-
-#if EIGEN_MAX_CPP_VER >= 11 &&                         \
-    ((EIGEN_COMP_GNUC && EIGEN_GNUC_AT_LEAST(4, 8)) || \
-     __has_feature(cxx_thread_local)                || \
-     (EIGEN_COMP_MSVC >= 1900) )
 #define EIGEN_THREAD_LOCAL static thread_local
 #endif
 
-// Disable TLS for Apple and Android builds with older toolchains.
-#if defined(__APPLE__)
-// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
-// __IPHONE_8_0.
-#include <Availability.h>
-#include <TargetConditionals.h>
-#endif
-// Checks whether C++11's `thread_local` storage duration specifier is
-// supported.
-#if defined(__apple_build_version__) &&     \
-    ((__apple_build_version__ < 8000042) || \
-     (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
-// Notes: Xcode's clang did not support `thread_local` until version
-// 8, and even then not for all iOS < 9.0.
-#undef EIGEN_THREAD_LOCAL
-
-#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
-// There are platforms for which TLS should not be used even though the compiler
-// makes it seem like it's supported (Android NDK < r12b for example).
-// This is primarily because of linker problems and toolchain misconfiguration:
-// TLS isn't supported until NDK r12b per
-// https://developer.android.com/ndk/downloads/revision_history.html
-// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
-// <android/ndk-version.h>. For NDK < r16, users should define these macros,
-// e.g. `-D__NDK_MAJOR__=11 -D__NKD_MINOR__=0` for NDK r11.
-#if __has_include(<android/ndk-version.h>)
-#include <android/ndk-version.h>
-#endif  // __has_include(<android/ndk-version.h>)
-#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && \
-    defined(__NDK_MINOR__) &&                                               \
-    ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
-#undef EIGEN_THREAD_LOCAL
-#endif
-#endif  // defined(__ANDROID__) && defined(__clang__)
-
-#endif  // EIGEN_AVOID_THREAD_LOCAL
-
-namespace Eigen {
-
-namespace internal {
-template <typename T>
-struct ThreadLocalNoOpInitialize {
-  void operator()(T&) const {}
-};
-
-template <typename T>
-struct ThreadLocalNoOpRelease {
-  void operator()(T&) const {}
-};
-
-}  // namespace internal
-
-// Thread local container for elements of type T, that does not use thread local
-// storage. As long as the number of unique threads accessing this storage
-// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it will
-// use a mutex for synchronization.
-//
-// Type `T` has to be default constructible, and by default each thread will get
-// a default constructed value. It is possible to specify custom `initialize`
-// callable, that will be called lazily from each thread accessing this object,
-// and will be passed a default initialized object of type `T`. Also it's
-// possible to pass a custom `release` callable, that will be invoked before
-// calling ~T().
-//
-// Example:
-//
-//   struct Counter {
-//     int value = 0;
-//   }
-//
-//   Eigen::ThreadLocal<Counter> counter(10);
-//
-//   // Each thread will have access to it's own counter object.
-//   Counter& cnt = counter.local();
-//   cnt++;
-//
-// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
-// std::this_thread::get_id() to identify threads. This value is not guaranteed
-// to be unique except for the life of the thread. A newly created thread may
-// get an OS-specific ID equal to that of an already destroyed thread.
-//
-// Somewhat similar to TBB thread local storage, with similar restrictions:
-// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
-//
-template <typename T,
-          typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
-          typename Release = internal::ThreadLocalNoOpRelease<T>>
-class ThreadLocal {
-  // We preallocate default constructed elements in MaxSizedVector.
-  static_assert(std::is_default_constructible<T>::value,
-                "ThreadLocal data type must be default constructible");
-
- public:
-  explicit ThreadLocal(int capacity)
-      : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(),
-                    internal::ThreadLocalNoOpRelease<T>()) {}
-
-  ThreadLocal(int capacity, Initialize initialize)
-      : ThreadLocal(capacity, std::move(initialize),
-                    internal::ThreadLocalNoOpRelease<T>()) {}
-
-  ThreadLocal(int capacity, Initialize initialize, Release release)
-      : initialize_(std::move(initialize)),
-        release_(std::move(release)),
-        capacity_(capacity),
-        data_(capacity_),
-        ptr_(capacity_),
-        filled_records_(0) {
-    eigen_assert(capacity_ >= 0);
-    data_.resize(capacity_);
-    for (int i = 0; i < capacity_; ++i) {
-      ptr_.emplace_back(nullptr);
-    }
-  }
-
-  T& local() {
-    std::thread::id this_thread = std::this_thread::get_id();
-    if (capacity_ == 0) return SpilledLocal(this_thread);
-
-    std::size_t h = std::hash<std::thread::id>()(this_thread);
-    const int start_idx = h % capacity_;
-
-    // NOTE: From the definition of `std::this_thread::get_id()` it is
-    // guaranteed that we never can have concurrent insertions with the same key
-    // to our hash-map like data structure. If we didn't find an element during
-    // the initial traversal, it's guaranteed that no one else could have
-    // inserted it while we are in this function. This allows to massively
-    // simplify out lock-free insert-only hash map.
-
-    // Check if we already have an element for `this_thread`.
-    int idx = start_idx;
-    while (ptr_[idx].load() != nullptr) {
-      ThreadIdAndValue& record = *(ptr_[idx].load());
-      if (record.thread_id == this_thread) return record.value;
-
-      idx += 1;
-      if (idx >= capacity_) idx -= capacity_;
-      if (idx == start_idx) break;
-    }
-
-    // If we are here, it means that we found an insertion point in lookup
-    // table at `idx`, or we did a full traversal and table is full.
-
-    // If lock-free storage is full, fallback on mutex.
-    if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);
-
-    // We double check that we still have space to insert an element into a lock
-    // free storage. If old value in `filled_records_` is larger than the
-    // records capacity, it means that some other thread added an element while
-    // we were traversing lookup table.
-    int insertion_index =
-        filled_records_.fetch_add(1, std::memory_order_relaxed);
-    if (insertion_index >= capacity_) return SpilledLocal(this_thread);
-
-    // At this point it's guaranteed that we can access to
-    // data_[insertion_index_] without a data race.
-    data_[insertion_index].thread_id = this_thread;
-    initialize_(data_[insertion_index].value);
-
-    // That's the pointer we'll put into the lookup table.
-    ThreadIdAndValue* inserted = &data_[insertion_index];
-
-    // We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop.
-    ThreadIdAndValue* empty = nullptr;
-
-    // Now we have to find an insertion point into the lookup table. We start
-    // from the `idx` that was identified as an insertion point above, it's
-    // guaranteed that we will have an empty record somewhere in a lookup table
-    // (because we created a record in the `data_`).
-    const int insertion_idx = idx;
-
-    do {
-      // Always start search from the original insertion candidate.
-      idx = insertion_idx;
-      while (ptr_[idx].load() != nullptr) {
-        idx += 1;
-        if (idx >= capacity_) idx -= capacity_;
-        // If we did a full loop, it means that we don't have any free entries
-        // in the lookup table, and this means that something is terribly wrong.
-        eigen_assert(idx != insertion_idx);
-      }
-      // Atomic CAS of the pointer guarantees that any other thread, that will
-      // follow this pointer will see all the mutations in the `data_`.
-    } while (!ptr_[idx].compare_exchange_weak(empty, inserted));
-
-    return inserted->value;
-  }
-
-  // WARN: It's not thread safe to call it concurrently with `local()`.
-  void ForEach(std::function<void(std::thread::id, T&)> f) {
-    // Reading directly from `data_` is unsafe, because only CAS to the
-    // record in `ptr_` makes all changes visible to other threads.
-    for (auto& ptr : ptr_) {
-      ThreadIdAndValue* record = ptr.load();
-      if (record == nullptr) continue;
-      f(record->thread_id, record->value);
-    }
-
-    // We did not spill into the map based storage.
-    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
-
-    // Adds a happens before edge from the last call to SpilledLocal().
-    std::unique_lock<std::mutex> lock(mu_);
-    for (auto& kv : per_thread_map_) {
-      f(kv.first, kv.second);
-    }
-  }
-
-  // WARN: It's not thread safe to call it concurrently with `local()`.
-  ~ThreadLocal() {
-    // Reading directly from `data_` is unsafe, because only CAS to the record
-    // in `ptr_` makes all changes visible to other threads.
-    for (auto& ptr : ptr_) {
-      ThreadIdAndValue* record = ptr.load();
-      if (record == nullptr) continue;
-      release_(record->value);
-    }
-
-    // We did not spill into the map based storage.
-    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
-
-    // Adds a happens before edge from the last call to SpilledLocal().
-    std::unique_lock<std::mutex> lock(mu_);
-    for (auto& kv : per_thread_map_) {
-      release_(kv.second);
-    }
-  }
-
- private:
-  struct ThreadIdAndValue {
-    std::thread::id thread_id;
-    T value;
-  };
-
-  // Use unordered map guarded by a mutex when lock free storage is full.
-  T& SpilledLocal(std::thread::id this_thread) {
-    std::unique_lock<std::mutex> lock(mu_);
-
-    auto it = per_thread_map_.find(this_thread);
-    if (it == per_thread_map_.end()) {
-      auto result = per_thread_map_.emplace(this_thread, T());
-      eigen_assert(result.second);
-      initialize_((*result.first).second);
-      return (*result.first).second;
-    } else {
-      return it->second;
-    }
-  }
-
-  Initialize initialize_;
-  Release release_;
-  const int capacity_;
-
-  // Storage that backs lock-free lookup table `ptr_`. Records stored in this
-  // storage contiguously starting from index 0.
-  MaxSizeVector<ThreadIdAndValue> data_;
-
-  // Atomic pointers to the data stored in `data_`. Used as a lookup table for
-  // linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing).
-  MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;
-
-  // Number of records stored in the `data_`.
-  std::atomic<int> filled_records_;
-
-  // We fallback on per thread map if lock-free storage is full. In practice
-  // this should never happen, if `capacity_` is a reasonable estimate of the
-  // number of threads running in a system.
-  std::mutex mu_;  // Protects per_thread_map_.
-  std::unordered_map<std::thread::id, T> per_thread_map_;
-};
-
-}  // namespace Eigen
-
 #endif  // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
index 25030dc0b..a65ee97c9 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h
@@ -16,23 +16,8 @@ namespace Eigen {
 // custom thread pools underneath.
 class ThreadPoolInterface {
  public:
-  // Submits a closure to be run by a thread in the pool.
   virtual void Schedule(std::function<void()> fn) = 0;
 
-  // Submits a closure to be run by threads in the range [start, end) in the
-  // pool.
-  virtual void ScheduleWithHint(std::function<void()> fn, int /*start*/,
-                                int /*end*/) {
-    // Just defer to Schedule in case sub-classes aren't interested in
-    // overriding this functionality.
-    Schedule(fn);
-  }
-
-  // If implemented, stop processing the closures that have been enqueued.
-  // Currently running closures may still be processed.
-  // If not implemented, does nothing.
-  virtual void Cancel() {}
-
   // Returns the number of threads in the pool.
   virtual int NumThreads() const = 0;
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index 149ceaff0..ec27eddb8 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -13,6 +13,11 @@
 #include <vector>
 #include "EmulateArray.h"
 
+// Emulate the cxx11 functionality that we need if the compiler doesn't support it.
+// Visual studio 2015 doesn't advertise itself as cxx11 compliant, although it
+// supports enough of the standard for our needs
+#if __cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900
+
 #include "CXX11Workarounds.h"
 
 namespace Eigen {
@@ -35,9 +40,8 @@ template<typename T, T... nn>
 struct numeric_list { constexpr static std::size_t count = sizeof...(nn); };
 
 template<typename T, T n, T... nn>
-struct numeric_list<T, n, nn...> { static const std::size_t count = sizeof...(nn) + 1; const static T first_value = n; };
+struct numeric_list<T, n, nn...> { constexpr static std::size_t count = sizeof...(nn) + 1; constexpr static T first_value = n; };
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
 /* numeric list constructors
  *
  * equivalencies:
@@ -96,14 +100,13 @@ template<int n, typename t, typename... tt> struct h_skip_helper_type<n, t, tt..
 template<typename t, typename... tt>        struct h_skip_helper_type<0, t, tt...> { typedef type_list<t, tt...> type; };
 template<int n>                             struct h_skip_helper_type<n>           { typedef type_list<> type; };
 template<>                                  struct h_skip_helper_type<0>           { typedef type_list<> type; };
-#endif //not EIGEN_PARSED_BY_DOXYGEN
 
 template<int n>
 struct h_skip {
   template<typename T, T... ii>
-  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
+  constexpr static inline typename h_skip_helper_numeric<T, n, ii...>::type helper(numeric_list<T, ii...>) { return typename h_skip_helper_numeric<T, n, ii...>::type(); }
   template<typename... tt>
-  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
+  constexpr static inline typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) { return typename h_skip_helper_type<n, tt...>::type(); }
 };
 
 template<int n, typename a> struct skip { typedef decltype(h_skip<n>::helper(a())) type; };
@@ -120,10 +123,6 @@ template<typename a, typename... as>                      struct get<0, type_lis
 template<typename T, int n, T a, T... as>                        struct get<n, numeric_list<T, a, as...>>   : get<n-1, numeric_list<T, as...>> {};
 template<typename T, T a, T... as>                               struct get<0, numeric_list<T, a, as...>>   { constexpr static T value = a; };
 
-template<std::size_t n, typename T, T a, T... as> constexpr T       array_get(const numeric_list<T, a, as...>&) {
-   return get<(int)n, numeric_list<T, a, as...>>::value;
-}
-
 /* always get type, regardless of dummy; good for parameter pack expansion */
 
 template<typename T, T dummy, typename t> struct id_numeric  { typedef t type; };
@@ -265,7 +264,7 @@ template<
   typename Reducer
 > struct reduce<Reducer>
 {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; }
+  constexpr static inline int run() { return Reducer::Identity; }
 };
 
 template<
@@ -273,7 +272,7 @@ template<
   typename A
 > struct reduce<Reducer, A>
 {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; }
+  constexpr static inline A run(A a) { return a; }
 };
 
 template<
@@ -282,7 +281,7 @@ template<
   typename... Ts
 > struct reduce<Reducer, A, Ts...>
 {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
+  constexpr static inline auto run(A a, Ts... ts) -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
     return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
   }
 };
@@ -290,29 +289,29 @@ template<
 /* generic binary operations */
 
 struct sum_op           {
-  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b)   { return a + b;   }
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a + b)   { return a + b;   }
   static constexpr int Identity = 0;
 };
 struct product_op       {
-  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b)   { return a * b;   }
+  template<typename A, typename B> EIGEN_DEVICE_FUNC constexpr static inline auto run(A a, B b) -> decltype(a * b)   { return a * b;   }
   static constexpr int Identity = 1;
 };
 
-struct logical_and_op   { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b)  { return a && b;  } };
-struct logical_or_op    { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b)  { return a || b;  } };
+struct logical_and_op   { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a && b)  { return a && b;  } };
+struct logical_or_op    { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a || b)  { return a || b;  } };
 
-struct equal_op         { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b)  { return a == b;  } };
-struct not_equal_op     { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b)  { return a != b;  } };
-struct lesser_op        { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b)   { return a < b;   } };
-struct lesser_equal_op  { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b)  { return a <= b;  } };
-struct greater_op       { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b)   { return a > b;   } };
-struct greater_equal_op { template<typename A, typename B> constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b)  { return a >= b;  } };
+struct equal_op         { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a == b)  { return a == b;  } };
+struct not_equal_op     { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a != b)  { return a != b;  } };
+struct lesser_op        { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a < b)   { return a < b;   } };
+struct lesser_equal_op  { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a <= b)  { return a <= b;  } };
+struct greater_op       { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a > b)   { return a > b;   } };
+struct greater_equal_op { template<typename A, typename B> constexpr static inline auto run(A a, B b) -> decltype(a >= b)  { return a >= b;  } };
 
 /* generic unary operations */
 
-struct not_op                { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a)      { return !a;      } };
-struct negation_op           { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a)      { return -a;      } };
-struct greater_equal_zero_op { template<typename A> constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0)  { return a >= 0;  } };
+struct not_op                { template<typename A> constexpr static inline auto run(A a) -> decltype(!a)      { return !a;      } };
+struct negation_op           { template<typename A> constexpr static inline auto run(A a) -> decltype(-a)      { return -a;      } };
+struct greater_equal_zero_op { template<typename A> constexpr static inline auto run(A a) -> decltype(a >= 0)  { return a >= 0;  } };
 
 
 /* reductions for lists */
@@ -321,13 +320,13 @@ struct greater_equal_zero_op { template<typename A> constexpr static EIGEN_STRON
 // together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
 // does...
 template<typename... Ts>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
+constexpr inline decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(Ts... ts)
 {
   return reduce<product_op, Ts...>::run(ts...);
 }
 
 template<typename... Ts>
-constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
+constexpr inline decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts)
 {
   return reduce<sum_op, Ts...>::run(ts...);
 }
@@ -335,13 +334,13 @@ constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...
 /* reverse arrays */
 
 template<typename Array, int... n>
-constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>)
+constexpr inline Array h_array_reverse(Array arr, numeric_list<int, n...>)
 {
   return {{array_get<sizeof...(n) - n - 1>(arr)...}};
 }
 
 template<typename T, std::size_t N>
-constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr)
+constexpr inline array<T, N> array_reverse(array<T, N> arr)
 {
   return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
 }
@@ -356,7 +355,7 @@ constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr)
 // an infinite loop)
 template<typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
 struct h_array_reduce {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
+  EIGEN_DEVICE_FUNC constexpr static inline auto run(array<T, N> arr, T identity) -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr)))
   {
     return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
   }
@@ -365,7 +364,7 @@ struct h_array_reduce {
 template<typename Reducer, typename T, std::size_t N>
 struct h_array_reduce<Reducer, T, N, 0>
 {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T)
+  EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, N>& arr, T)
   {
     return array_get<0>(arr);
   }
@@ -374,14 +373,14 @@ struct h_array_reduce<Reducer, T, N, 0>
 template<typename Reducer, typename T>
 struct h_array_reduce<Reducer, T, 0>
 {
-  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity)
+  EIGEN_DEVICE_FUNC constexpr static inline T run(const array<T, 0>&, T identity)
   {
     return identity;
   }
 };
 
 template<typename Reducer, typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
+EIGEN_DEVICE_FUNC constexpr inline auto array_reduce(const array<T, N>& arr, T identity) -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity))
 {
   return h_array_reduce<Reducer, T, N>::run(arr, identity);
 }
@@ -389,13 +388,13 @@ EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T,
 /* standard array reductions */
 
 template<typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
+EIGEN_DEVICE_FUNC constexpr inline auto array_sum(const array<T, N>& arr) -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0)))
 {
   return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
 }
 
 template<typename T, std::size_t N>
-EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
+EIGEN_DEVICE_FUNC constexpr inline auto array_prod(const array<T, N>& arr) -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1)))
 {
   return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
 }
@@ -411,13 +410,13 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
 /* zip an array */
 
 template<typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
+constexpr inline array<decltype(Op::run(A(), B())),N> h_array_zip(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
 {
   return array<decltype(Op::run(A(), B())),N>{{ Op::run(array_get<n>(a), array_get<n>(b))... }};
 }
 
 template<typename Op, typename A, typename B, std::size_t N>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
+constexpr inline array<decltype(Op::run(A(), B())),N> array_zip(array<A, N> a, array<B, N> b)
 {
   return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
 }
@@ -425,13 +424,13 @@ constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())),N> array_zip(arr
 /* zip an array and reduce the result */
 
 template<typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
+constexpr inline auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...))
 {
   return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A(), B()))>::type...>::run(Op::run(array_get<n>(a), array_get<n>(b))...);
 }
 
 template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
-constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
+constexpr inline auto array_zip_and_reduce(array<A, N> a, array<B, N> b) -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type()))
 {
   return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
 }
@@ -439,13 +438,13 @@ constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B,
 /* apply stuff to an array */
 
 template<typename Op, typename A, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
+constexpr inline array<decltype(Op::run(A())),N> h_array_apply(array<A, N> a, numeric_list<int, n...>)
 {
   return array<decltype(Op::run(A())),N>{{ Op::run(array_get<n>(a))... }};
 }
 
 template<typename Op, typename A, std::size_t N>
-constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
+constexpr inline array<decltype(Op::run(A())),N> array_apply(array<A, N> a)
 {
   return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
 }
@@ -453,13 +452,13 @@ constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())),N> array_apply(array<
 /* apply stuff to an array and reduce */
 
 template<typename Reducer, typename Op, typename A, std::size_t N, int... n>
-constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
+constexpr inline auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>) -> decltype(reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...))
 {
   return reduce<Reducer, typename id_numeric<int,n,decltype(Op::run(A()))>::type...>::run(Op::run(array_get<n>(arr))...);
 }
 
 template<typename Reducer, typename Op, typename A, std::size_t N>
-constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
+constexpr inline auto array_apply_and_reduce(array<A, N> a) -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type()))
 {
   return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
 }
@@ -473,7 +472,7 @@ template<int n>
 struct h_repeat
 {
   template<typename t, int... ii>
-  constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>)
+  constexpr static inline array<t, n> run(t v, numeric_list<int, ii...>)
   {
     return {{ typename id_numeric<int, ii, t>::type(v)... }};
   }
@@ -534,4 +533,10 @@ InstType instantiate_by_c_array(ArrType* arr)
 
 } // end namespace Eigen
 
+#else // Non C++11, fallback to emulation mode
+
+#include "EmulateCXX11Meta.h"
+
+#endif
+
 #endif // EIGEN_CXX11META_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
index f1c0284ea..fe4d22803 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
@@ -47,9 +47,9 @@ namespace internal {
  */
 
 
-template<std::size_t I_, class T> constexpr inline T&       array_get(std::vector<T>&       a) { return a[I_]; }
-template<std::size_t I_, class T> constexpr inline T&&      array_get(std::vector<T>&&      a) { return a[I_]; }
-template<std::size_t I_, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I_]; }
+template<std::size_t I, class T> constexpr inline T&       array_get(std::vector<T>&       a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T&&      array_get(std::vector<T>&&      a) { return a[I]; }
+template<std::size_t I, class T> constexpr inline T const& array_get(std::vector<T> const& a) { return a[I]; }
 
 /* Suppose you have a template of the form
  * template<typename T> struct X;
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
index 834b20b55..30d3ebcff 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/EmulateArray.h
@@ -15,20 +15,15 @@
 // The array class is only available starting with cxx11. Emulate our own here
 // if needed. Beware, msvc still doesn't advertise itself as a c++11 compiler!
 // Moreover, CUDA doesn't support the STL containers, so we use our own instead.
-#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
+#if (__cplusplus <= 199711L && EIGEN_COMP_MSVC < 1900) || defined(__CUDACC__) || defined(EIGEN_AVOID_STL_ARRAY)
 
 namespace Eigen {
 template <typename T, size_t n> class array {
  public:
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE T& operator[] (size_t index) { eigen_internal_assert(index < size()); return values[index]; }
+  EIGEN_STRONG_INLINE T& operator[] (size_t index) { return values[index]; }
   EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { eigen_internal_assert(index < size()); return values[index]; }
-
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE T& at(size_t index) { eigen_assert(index < size()); return values[index]; }
-  EIGEN_DEVICE_FUNC
-  EIGEN_STRONG_INLINE const T& at(size_t index) const { eigen_assert(index < size()); return values[index]; }
+  EIGEN_STRONG_INLINE const T& operator[] (size_t index) const { return values[index]; }
 
   EIGEN_DEVICE_FUNC
   EIGEN_STRONG_INLINE T& front() { return values[0]; }
@@ -174,7 +169,6 @@ template <typename T> class array<T, 0> {
 
 #if EIGEN_HAS_VARIADIC_TEMPLATES
   EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
-    EIGEN_UNUSED_VARIABLE(l);
     eigen_assert(l.size() == 0);
   }
 #endif
@@ -197,26 +191,30 @@ EIGEN_DEVICE_FUNC bool operator==(const array<T,N>& lhs, const array<T,N>& rhs)
 
 
 namespace internal {
-template<std::size_t I_, class T, std::size_t N>
+template<std::size_t I, class T, std::size_t N>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T,N>& a) {
-  return a[I_];
+  return a[I];
 }
-template<std::size_t I_, class T, std::size_t N>
+template<std::size_t I, class T, std::size_t N>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T,N>& a) {
-  return a[I_];
+  return a[I];
 }
 
+template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<array<T,N> > {
-  enum { value = N };
+  static const size_t value = N;
 };
+template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<array<T,N>& > {
-  enum { value = N };
+  static const size_t value = N;
 };
+template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<const array<T,N> > {
-  enum { value = N };
+  static const size_t value = N;
 };
+template <typename T> struct array_size;
 template<class T, std::size_t N> struct array_size<const array<T,N>& > {
-  enum { value = N };
+  static const size_t value = N;
 };
 
 }  // end namespace internal
@@ -224,7 +222,7 @@ template<class T, std::size_t N> struct array_size<const array<T,N>& > {
 
 #else
 
-// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
+// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
 #include <array>
 namespace Eigen {
 
@@ -240,19 +238,27 @@ namespace internal {
  *                       this may not be constexpr
  */
 #if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
-#define STD_GET_ARR_HACK             a._M_instance[I_]
+#define STD_GET_ARR_HACK             a._M_instance[I]
 #elif defined(_LIBCPP_VERSION)
-#define STD_GET_ARR_HACK             a.__elems_[I_]
+#define STD_GET_ARR_HACK             a.__elems_[I]
 #else
-#define STD_GET_ARR_HACK             std::template get<I_, T, N>(a)
+#define STD_GET_ARR_HACK             std::template get<I, T, N>(a)
 #endif
 
-template<std::size_t I_, class T, std::size_t N> constexpr inline T&       array_get(std::array<T,N>&       a) { return (T&)       STD_GET_ARR_HACK; }
-template<std::size_t I_, class T, std::size_t N> constexpr inline T&&      array_get(std::array<T,N>&&      a) { return (T&&)      STD_GET_ARR_HACK; }
-template<std::size_t I_, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
+template<std::size_t I, class T, std::size_t N> constexpr inline T&       array_get(std::array<T,N>&       a) { return (T&)       STD_GET_ARR_HACK; }
+template<std::size_t I, class T, std::size_t N> constexpr inline T&&      array_get(std::array<T,N>&&      a) { return (T&&)      STD_GET_ARR_HACK; }
+template<std::size_t I, class T, std::size_t N> constexpr inline T const& array_get(std::array<T,N> const& a) { return (T const&) STD_GET_ARR_HACK; }
 
 #undef STD_GET_ARR_HACK
 
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<const std::array<T,N> > {
+  static const size_t value = N;
+};
+template <typename T> struct array_size;
+template<class T, std::size_t N> struct array_size<std::array<T,N> > {
+  static const size_t value = N;
+};
 }  // end namespace internal
 }  // end namespace Eigen
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
new file mode 100644
index 000000000..8a536faf6
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h
@@ -0,0 +1,311 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EMULATE_CXX11_META_H
+#define EIGEN_EMULATE_CXX11_META_H
+
+
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+  * \file CXX11/util/EmulateCXX11Meta.h
+  * This file emulates a subset of the functionality provided by CXX11Meta.h for
+  * compilers that don't yet support cxx11 such as nvcc.
+  */
+
+struct empty_list { static const std::size_t count = 0; };
+
+template<typename T, typename Tail=empty_list> struct type_list {
+  typedef T HeadType;
+  typedef Tail TailType;
+  static const T head;
+  static const Tail tail;
+  static const std::size_t count = 1 + Tail::count;
+};
+
+struct null_type { };
+
+template<typename T1 = null_type, typename T2 = null_type, typename T3 = null_type,
+         typename T4 = null_type, typename T5 = null_type, typename T6 = null_type,
+         typename T7 = null_type, typename T8 = null_type>
+struct make_type_list {
+  typedef typename make_type_list<T2, T3, T4, T5, T6, T7, T8>::type tailresult;
+
+  typedef type_list<T1, tailresult> type;
+};
+
+template<> struct make_type_list<> {
+  typedef empty_list type;
+};
+
+
+template <std::size_t index, class TList> struct get_type;
+
+template <class Head, class Tail>
+struct get_type<0, type_list<Head, Tail> >
+{
+  typedef Head type;
+};
+
+template <std::size_t i, class Head, class Tail>
+struct get_type<i, type_list<Head, Tail> >
+{
+  typedef typename get_type<i-1, Tail>::type type;
+};
+
+
+/* numeric list */
+template <typename T, T n>
+struct type2val {
+  typedef T type;
+  static const T value = n;
+};
+
+
+template<typename T, size_t n, T V> struct gen_numeric_list_repeated;
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 1, V> {
+  typedef typename make_type_list<type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 2, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 3, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 4, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 5, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 6, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 7, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V> >::type type;
+};
+
+template<typename T, T V> struct gen_numeric_list_repeated<T, 8, V> {
+  typedef typename make_type_list<type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V>, type2val<T, V>,
+                                  type2val<T, V>, type2val<T, V> >::type type;
+};
+
+
+template <std::size_t index, class NList> struct get;
+
+template <std::size_t i>
+struct get<i, empty_list>
+{
+  get() { eigen_assert(false && "index overflow"); }
+  typedef void type;
+  static const char value = '\0';
+};
+
+template <std::size_t i, class Head>
+struct get<i, type_list<Head, empty_list> >
+{
+  get() { eigen_assert(false && "index overflow"); }
+  typedef void type;
+  static const char value = '\0';
+};
+
+template <class Head>
+struct get<0, type_list<Head, empty_list> >
+{
+  typedef typename Head::type type;
+  static const type value = Head::value;
+};
+
+template <class Head, class Tail>
+struct get<0, type_list<Head, Tail> >
+{
+  typedef typename Head::type type;
+  static const type value = Head::value;
+};
+
+template <std::size_t i, class Head, class Tail>
+struct get<i, type_list<Head, Tail> >
+{
+  typedef typename Tail::HeadType::type type;
+  static const type value = get<i-1, Tail>::value;
+};
+
+
+template <class NList> struct arg_prod {
+  static const typename NList::HeadType::type value = get<0, NList>::value * arg_prod<typename NList::TailType>::value;
+};
+template <> struct arg_prod<empty_list> {
+  static const int value = 1;
+};
+
+
+template<int n, typename t>
+array<t, n> repeat(t v) {
+  array<t, n> array;
+  array.fill(v);
+  return array;
+}
+
+template<std::size_t I, class Head, class Tail>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(type_list<Head, Tail>&) {
+  return get<I, type_list<Head, Tail> >::value;
+}
+template<std::size_t I, class Head, class Tail>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Head::type array_get(const type_list<Head, Tail>&) {
+  return get<I, type_list<Head, Tail> >::value;
+}
+
+template <class NList>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NList::HeadType::type array_prod(const NList&) {
+  return arg_prod<NList>::value;
+}
+
+template<typename t, std::size_t n>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, n>& a) {
+  t prod = 1;
+  for (size_t i = 0; i < n; ++i) { prod *= a[i]; }
+  return prod;
+}
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const array<t, 0>& /*a*/) {
+  return 1;
+}
+
+template<typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+  eigen_assert(a.size() > 0);
+  t prod = 1;
+  for (size_t i = 0; i < a.size(); ++i) { prod *= a[i]; }
+  return prod;
+}
+
+
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(std::vector<T>& a) {
+  return a[I];
+}
+template<std::size_t I, class T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const std::vector<T>& a) {
+  return a[I];
+}
+
+struct sum_op {  // Binary functor: run(a, b) yields a + b, converted to the left operand's type A.
+  template<typename A, typename B> static inline A run(A a, B b) { return static_cast<A>(a + b); }  // Was declared bool, which collapsed every non-zero sum to true; no decltype in this C++03 fallback, so A is used.
+};
+struct product_op {  // Binary functor: run(a, b) yields a * b, converted to the left operand's type A.
+  template<typename A, typename B> static inline A run(A a, B b) { return static_cast<A>(a * b); }  // Was declared bool, which collapsed every non-zero product to true; no decltype in this C++03 fallback, so A is used.
+};
+
+struct logical_and_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a && b; }
+};
+struct logical_or_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a || b; }
+};
+
+struct equal_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a == b; }
+};
+struct not_equal_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a != b; }
+};
+struct lesser_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a < b; }
+};
+struct lesser_equal_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a <= b; }
+};
+
+struct greater_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a > b; }
+};
+struct greater_equal_op {
+  template<typename A, typename B> static inline bool run(A a, B b) { return a >= b; }
+};
+
+struct not_op {
+  template<typename A> static inline bool run(A a) { return !a; }
+};
+struct negation_op {  // Unary functor: run(a) yields -a, converted back to the operand's type A.
+  template<typename A> static inline A run(A a) { return static_cast<A>(-a); }  // Was declared bool, which turned every non-zero negation into true.
+};
+struct greater_equal_zero_op {
+  template<typename A> static inline bool run(A a) { return a >= 0; }
+};
+
+
+template<typename Reducer, typename Op, typename A, std::size_t N>
+struct ArrayApplyAndReduce {
+  static inline bool run(const array<A, N>& a) {
+    EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    bool result = Reducer::run(Op::run(a[0]), Op::run(a[1]));
+    for (size_t i = 2; i < N; ++i) {
+      result = Reducer::run(result, Op::run(a[i]));
+    }
+    return result;
+  }
+};
+
+template<typename Reducer, typename Op, typename A>
+struct ArrayApplyAndReduce<Reducer, Op, A, 1>  {
+  static inline bool run(const array<A, 1>& a) {
+    return Op::run(a[0]);
+  }
+};
+
+template<typename Reducer, typename Op, typename A, std::size_t N>
+inline bool array_apply_and_reduce(const array<A, N>& a) {
+  return ArrayApplyAndReduce<Reducer, Op, A, N>::run(a);
+}
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
+struct ArrayZipAndReduce {
+  static inline bool run(const array<A, N>& a, const array<B, N>& b) {
+    EIGEN_STATIC_ASSERT(N >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    bool result = Reducer::run(Op::run(a[0], b[0]), Op::run(a[1], b[1]));
+    for (size_t i = 2; i < N; ++i) {
+      result = Reducer::run(result, Op::run(a[i], b[i]));
+    }
+    return result;
+  }
+};
+
+template<typename Reducer, typename Op, typename A, typename B>
+struct ArrayZipAndReduce<Reducer, Op, A, B, 1> {
+  static inline bool run(const array<A, 1>& a, const array<B, 1>& b) {
+    return Op::run(a[0], b[0]);
+  }
+};
+
+template<typename Reducer, typename Op, typename A, typename B, std::size_t N>
+inline bool array_zip_and_reduce(const array<A, N>& a, const array<B, N>& b) {
+  return ArrayZipAndReduce<Reducer, Op, A, B, N>::run(a, b);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+
+
+#endif  // EIGEN_EMULATE_CXX11_META_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
index 277ab149a..4bc3dd1ba 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
@@ -29,13 +29,13 @@ namespace Eigen {
   */
 template <typename T>
 class MaxSizeVector {
-  static const size_t alignment = EIGEN_PLAIN_ENUM_MAX(EIGEN_ALIGNOF(T), sizeof(void*));
  public:
   // Construct a new MaxSizeVector, reserve n elements.
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   explicit MaxSizeVector(size_t n)
       : reserve_(n), size_(0),
-        data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
+        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
+    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T; }
   }
 
   // Construct a new MaxSizeVector, reserve and resize to n.
@@ -43,56 +43,36 @@ class MaxSizeVector {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   MaxSizeVector(size_t n, const T& init)
       : reserve_(n), size_(n),
-        data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
-    size_t i = 0;
-    EIGEN_TRY
-    {
-      for(; i < size_; ++i) { new (&data_[i]) T(init); }
-    }
-    EIGEN_CATCH(...)
-    {
-      // Construction failed, destruct in reverse order:
-      for(; (i+1) > 0; --i) { data_[i-1].~T(); }
-      internal::handmade_aligned_free(data_);
-      EIGEN_THROW;
-    }
+        data_(static_cast<T*>(internal::aligned_malloc(n * sizeof(T)))) {
+    for (size_t i = 0; i < n; ++i) { new (&data_[i]) T(init); }
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   ~MaxSizeVector() {
-    for (size_t i = size_; i > 0; --i) {
-      data_[i-1].~T();
+    for (size_t i = 0; i < size_; ++i) {
+      data_[i].~T();
     }
-    internal::handmade_aligned_free(data_);
+    internal::aligned_free(data_);
   }
 
   void resize(size_t n) {
     eigen_assert(n <= reserve_);
-    for (; size_ < n; ++size_) {
-      new (&data_[size_]) T;
+    for (size_t i = size_; i < n; ++i) {
+      new (&data_[i]) T;
     }
-    for (; size_ > n; --size_) {
-      data_[size_-1].~T();
+    for (size_t i = n; i < size_; ++i) {
+      data_[i].~T();
     }
-    eigen_assert(size_ == n);
+    size_ = n;
   }
 
   // Append new elements (up to reserved size).
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void push_back(const T& t) {
     eigen_assert(size_ < reserve_);
-    new (&data_[size_++]) T(t);
+    data_[size_++] = t;
   }
 
-  // For C++03 compatibility this only takes one argument
-  template<class X>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-  void emplace_back(const X& x) {
-    eigen_assert(size_ < reserve_);
-    new (&data_[size_++]) T(x);
-  }
-
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   const T& operator[] (size_t i) const {
     eigen_assert(i < size_);
@@ -119,8 +99,11 @@ class MaxSizeVector {
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void pop_back() {
+    // NOTE: This does not destroy the value at the end the way
+    // std::vector's version of pop_back() does.  That happens when
+    // the Vector is destroyed.
     eigen_assert(size_ > 0);
-    data_[--size_].~T();
+    size_--;
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/EulerAngles b/uppsrc/plugin/Eigen/unsupported/Eigen/EulerAngles
index f8f1c5d0b..521fa3f76 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/EulerAngles
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/EulerAngles
@@ -11,10 +11,10 @@
 #define EIGEN_EULERANGLES_MODULE_H
 
 
-#include "../../Eigen/Core"
-#include "../../Eigen/Geometry"
+#include "Eigen/Core"
+#include "Eigen/Geometry"
 
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
 
 namespace Eigen {
 
@@ -38,6 +38,6 @@ namespace Eigen {
 #include "src/EulerAngles/EulerSystem.h"
 #include "src/EulerAngles/EulerAngles.h"
 
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_EULERANGLES_MODULE_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/FFT b/uppsrc/plugin/Eigen/unsupported/Eigen/FFT
index d9ad21a5a..d8cf3e642 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/FFT
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/FFT
@@ -13,7 +13,7 @@
 #include <complex>
 #include <vector>
 #include <map>
-#include "../../Eigen/Core"
+#include <Eigen/Core>
 
 
 /**
@@ -68,8 +68,6 @@
   */
  
 
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
-
 #ifdef EIGEN_FFTW_DEFAULT
 // FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
 #  include <fftw3.h>
@@ -417,8 +415,5 @@ void fft_inv_proxy<T_SrcMat,T_FftIfc>::evalTo(T_DestMat& dst) const
 }
 
 }
-
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
-
 #endif
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/IterativeSolvers b/uppsrc/plugin/Eigen/unsupported/Eigen/IterativeSolvers
index 0fa129a7b..31e880bdc 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/IterativeSolvers
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/IterativeSolvers
@@ -10,9 +10,7 @@
 #ifndef EIGEN_ITERATIVE_SOLVERS_MODULE_H
 #define EIGEN_ITERATIVE_SOLVERS_MODULE_H
 
-#include "../../Eigen/Sparse"
-#include "../../Eigen/Jacobi"
-#include "../../Eigen/Householder"
+#include <Eigen/Sparse>
 
 /**
   * \defgroup IterativeSolvers_Module Iterative solvers module
@@ -26,21 +24,19 @@
   */
 //@{
 
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
-
 #ifndef EIGEN_MPL2_ONLY
 #include "src/IterativeSolvers/IterationController.h"
 #include "src/IterativeSolvers/ConstrainedConjGrad.h"
 #endif
 
 #include "src/IterativeSolvers/IncompleteLU.h"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/Householder"
 #include "src/IterativeSolvers/GMRES.h"
 #include "src/IterativeSolvers/DGMRES.h"
 //#include "src/IterativeSolvers/SSORPreconditioner.h"
 #include "src/IterativeSolvers/MINRES.h"
 
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
-
 //@}
 
 #endif // EIGEN_ITERATIVE_SOLVERS_MODULE_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/LevenbergMarquardt b/uppsrc/plugin/Eigen/unsupported/Eigen/LevenbergMarquardt
index 109050501..0fe2680ba 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/LevenbergMarquardt
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/LevenbergMarquardt
@@ -12,12 +12,12 @@
 
 // #include <vector>
 
-#include "../../Eigen/Core"
-#include "../../Eigen/Jacobi"
-#include "../../Eigen/QR"
-#include "NumericalDiff"
+#include <Eigen/Core>
+#include <Eigen/Jacobi>
+#include <Eigen/QR>
+#include <unsupported/Eigen/NumericalDiff> 
 
-#include "../../Eigen/SparseQR"
+#include <Eigen/SparseQR>
 
 /**
   * \defgroup LevenbergMarquardt_Module Levenberg-Marquardt module
@@ -29,10 +29,7 @@
   * 
   */
 
-#include "../../Eigen/SparseCore"
-
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
-
+#include "Eigen/SparseCore"
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
 #include "src/LevenbergMarquardt/LMqrsolv.h"
@@ -44,6 +41,5 @@
 #include "src/LevenbergMarquardt/LevenbergMarquardt.h"
 #include "src/LevenbergMarquardt/LMonestep.h"
 
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_LEVENBERGMARQUARDT_MODULE
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/MPRealSupport b/uppsrc/plugin/Eigen/unsupported/Eigen/MPRealSupport
index c4ea4ec5f..7f0b70c63 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/MPRealSupport
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/MPRealSupport
@@ -12,7 +12,7 @@
 #ifndef EIGEN_MPREALSUPPORT_MODULE_H
 #define EIGEN_MPREALSUPPORT_MODULE_H
 
-#include "../../Eigen/Core"
+#include <Eigen/Core>
 #include <mpreal.h>
 
 namespace Eigen {
@@ -90,9 +90,6 @@ int main()
 #ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
     static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec())  { return std::numeric_limits<Real>::digits10(Precision); }
     static inline int digits10 (const Real& x)                                      { return std::numeric_limits<Real>::digits10(x); }
-    
-    static inline int digits ()               { return std::numeric_limits<Real>::digits(); }
-    static inline int digits (const Real& x)  { return std::numeric_limits<Real>::digits(x); }
 #endif
 
     static inline Real dummy_precision()
@@ -162,7 +159,6 @@ int main()
       typedef ResScalar LhsPacket;
       typedef ResScalar RhsPacket;
       typedef ResScalar ResPacket;
-      typedef LhsPacket LhsPacket4Packing;
       
     };
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/MatrixFunctions b/uppsrc/plugin/Eigen/unsupported/Eigen/MatrixFunctions
index 20c23d1c5..60dc0a69b 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/MatrixFunctions
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/MatrixFunctions
@@ -14,9 +14,9 @@
 #include <cfloat>
 #include <list>
 
-#include "../../Eigen/Core"
-#include "../../Eigen/LU"
-#include "../../Eigen/Eigenvalues"
+#include <Eigen/Core>
+#include <Eigen/LU>
+#include <Eigen/Eigenvalues>
 
 /**
   * \defgroup MatrixFunctions_Module Matrix functions module
@@ -53,16 +53,12 @@
   *
   */
 
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
-
 #include "src/MatrixFunctions/MatrixExponential.h"
 #include "src/MatrixFunctions/MatrixFunction.h"
 #include "src/MatrixFunctions/MatrixSquareRoot.h"
 #include "src/MatrixFunctions/MatrixLogarithm.h"
 #include "src/MatrixFunctions/MatrixPower.h"
 
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
-
 
 /** 
 \page matrixbaseextra_page
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/MoreVectorization b/uppsrc/plugin/Eigen/unsupported/Eigen/MoreVectorization
index 7662b4780..470e72430 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/MoreVectorization
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/MoreVectorization
@@ -9,7 +9,7 @@
 #ifndef EIGEN_MOREVECTORIZATION_MODULE_H
 #define EIGEN_MOREVECTORIZATION_MODULE_H
 
-#include "../../Eigen/Core"
+#include <Eigen/Core>
 
 namespace Eigen {
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/NonLinearOptimization b/uppsrc/plugin/Eigen/unsupported/Eigen/NonLinearOptimization
index 961f192b5..600ab4c12 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/NonLinearOptimization
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/NonLinearOptimization
@@ -12,10 +12,10 @@
 
 #include <vector>
 
-#include "../../Eigen/Core"
-#include "../../Eigen/Jacobi"
-#include "../../Eigen/QR"
-#include "NumericalDiff"
+#include <Eigen/Core>
+#include <Eigen/Jacobi>
+#include <Eigen/QR>
+#include <unsupported/Eigen/NumericalDiff>
 
 /**
   * \defgroup NonLinearOptimization_Module Non linear optimization module
@@ -30,12 +30,12 @@
   * actually linear. But if this is so, you should probably better use other
   * methods more fitted to this special case.
   *
-  * One algorithm allows to find a least-squares solution of such a system
-  * (Levenberg-Marquardt algorithm) and the second one is used to find 
+  * One algorithm finds a least-squares solution of such a system (the
+  * Levenberg-Marquardt algorithm) and the second one is used to find
   * a zero for the system (Powell hybrid "dogleg" method).
   *
   * This code is a port of minpack (http://en.wikipedia.org/wiki/MINPACK).
-  * Minpack is a very famous, old, robust and well renowned package, written in
+  * Minpack is a very famous, old, robust and well-renowned package, written in
   * fortran. Those implementations have been carefully tuned, tested, and used
   * for several decades.
   *
@@ -58,41 +58,35 @@
   * There are two kinds of tests : those that come from examples bundled with cminpack.
   * They guaranty we get the same results as the original algorithms (value for 'x',
   * for the number of evaluations of the function, and for the number of evaluations
-  * of the Jacobian if ever).
+  * of the Jacobian if ever).
   * 
   * Other tests were added by myself at the very beginning of the 
-  * process and check the results for Levenberg-Marquardt using the reference data 
+  * process and check the results for Levenberg-Marquardt using the reference data 
   * on http://www.itl.nist.gov/div898/strd/nls/nls_main.shtml. Since then i've 
-  * carefully checked that the same results were obtained when modifying the
+  * carefully checked that the same results were obtained when modifying the
   * code. Please note that we do not always get the exact same decimals as they do,
   * but this is ok : they use 128bits float, and we do the tests using the C type 'double',
   * which is 64 bits on most platforms (x86 and amd64, at least).
-  * I've performed those tests on several other implementations of Levenberg-Marquardt, and
+  * I've performed those tests on several other implementations of Levenberg-Marquardt, and
   * (c)minpack performs VERY well compared to those, both in accuracy and speed.
   * 
   * The documentation for running the tests is on the wiki
   * http://eigen.tuxfamily.org/index.php?title=Tests
   * 
-  * \section API API: overview of methods
+  * \section API API : overview of methods
   * 
-  * Both algorithms needs a functor computing the Jacobian. It can be computed by
-  * hand, using auto-differentiation (see \ref AutoDiff_Module), or using numerical
-  * differences (see \ref NumericalDiff_Module). For instance:
-  *\code
-  * MyFunc func;
-  * NumericalDiff<MyFunc> func_with_num_diff(func);
-  * LevenbergMarquardt<NumericalDiff<MyFunc> > lm(func_with_num_diff);
-  * \endcode
-  * For HybridNonLinearSolver, the method solveNumericalDiff() does the above wrapping for
-  * you.
+  * Both algorithms can use either the Jacobian (provided by the user) or compute
+  * an approximation by themselves (using Eigen's \ref NumericalDiff_Module).
+  * The parts of the API referring to the latter use 'NumericalDiff' in the method names
+  * (example: LevenbergMarquardt.minimizeNumericalDiff() )
   * 
   * The methods LevenbergMarquardt.lmder1()/lmdif1()/lmstr1() and 
   * HybridNonLinearSolver.hybrj1()/hybrd1() are specific methods from the original 
   * minpack package that you probably should NOT use until you are porting a code that
-  * was previously using minpack. They just define a 'simple' API with default values 
+  *  was previously using minpack. They just define a 'simple' API with default values 
   * for some parameters.
   * 
-  * All algorithms are provided using two APIs :
+  * All algorithms are provided using two APIs:
   *     - one where the user inits the algorithm, and uses '*OneStep()' as much as he wants : 
   * this way the caller have control over the steps
   *     - one where the user just calls a method (optimize() or solve()) which will 
@@ -100,7 +94,7 @@
   *  convenience.
   * 
   * As an example, the method LevenbergMarquardt::minimize() is 
-  * implemented as follow: 
+  * implemented as follows:
   * \code
   * Status LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType  &x, const int mode)
   * {
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/NumericalDiff b/uppsrc/plugin/Eigen/unsupported/Eigen/NumericalDiff
index 0668f960f..433334ca8 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/NumericalDiff
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/NumericalDiff
@@ -10,7 +10,7 @@
 #ifndef EIGEN_NUMERICALDIFF_MODULE
 #define EIGEN_NUMERICALDIFF_MODULE
 
-#include "../../Eigen/Core"
+#include <Eigen/Core>
 
 namespace Eigen {
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/OpenGLSupport b/uppsrc/plugin/Eigen/unsupported/Eigen/OpenGLSupport
index f8c213003..085325ce1 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/OpenGLSupport
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/OpenGLSupport
@@ -10,7 +10,7 @@
 #ifndef EIGEN_OPENGL_MODULE
 #define EIGEN_OPENGL_MODULE
 
-#include "../../Eigen/Geometry"
+#include <Eigen/Geometry>
 
 #if defined(__APPLE_CC__)
   #include <OpenGL/gl.h>
@@ -25,7 +25,7 @@ namespace Eigen {
   *
   * This module provides wrapper functions for a couple of OpenGL functions
   * which simplify the way to pass Eigen's object to openGL.
-  * Here is an example:
+  * Here is an example:
   * 
   * \code
   * // You need to add path_to_eigen/unsupported to your include path.
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/Polynomials b/uppsrc/plugin/Eigen/unsupported/Eigen/Polynomials
index 146e5c404..334b03142 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/Polynomials
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/Polynomials
@@ -9,11 +9,11 @@
 #ifndef EIGEN_POLYNOMIALS_MODULE_H
 #define EIGEN_POLYNOMIALS_MODULE_H
 
-#include "../../Eigen/Core"
+#include <Eigen/Core>
 
-#include "../../Eigen/Eigenvalues"
+#include <Eigen/Eigenvalues>
 
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
 
 // Note that EIGEN_HIDE_HEAVY_CODE has to be defined per module
 #if (defined EIGEN_EXTERN_INSTANTIATIONS) && (EIGEN_EXTERN_INSTANTIATIONS>=2)
@@ -132,7 +132,7 @@
   Output: \verbinclude PolynomialSolver1.out
 */
 
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
 
 #endif // EIGEN_POLYNOMIALS_MODULE_H
 /* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/Skyline b/uppsrc/plugin/Eigen/unsupported/Eigen/Skyline
index ebdf143f7..71a68cb42 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/Skyline
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/Skyline
@@ -10,9 +10,9 @@
 #define EIGEN_SKYLINE_MODULE_H
 
 
-#include "../../Eigen/Core"
+#include "Eigen/Core"
 
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
 
 #include <map>
 #include <cstdlib>
@@ -34,6 +34,6 @@
 #include "src/Skyline/SkylineInplaceLU.h"
 #include "src/Skyline/SkylineProduct.h"
 
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 #endif // EIGEN_SKYLINE_MODULE_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/SpecialFunctions b/uppsrc/plugin/Eigen/unsupported/Eigen/SpecialFunctions
index a098ce871..a2ad4925e 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/SpecialFunctions
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/SpecialFunctions
@@ -29,29 +29,12 @@ namespace Eigen {
   * - erfc
   * - lgamma
   * - igamma
-  * - igamma_der_a
-  * - gamma_sample_der_alpha
   * - igammac
   * - digamma
-  * - ndtri
   * - polygamma
   * - zeta
   * - betainc
   *
-  * Bessel Functions
-  * - bessel_i0
-  * - bessel_i0e
-  * - bessel_i1
-  * - bessel_i1e
-  * - bessel_j0
-  * - bessel_j1
-  * - bessel_k0
-  * - bessel_k0e
-  * - bessel_k1
-  * - bessel_k1e
-  * - bessel_y0
-  * - bessel_y1
-  *
   * \code
   * #include <unsupported/Eigen/SpecialFunctions>
   * \endcode
@@ -60,22 +43,14 @@ namespace Eigen {
 
 }
 
-#include "src/SpecialFunctions/BesselFunctionsImpl.h"
-#include "src/SpecialFunctions/BesselFunctionsPacketMath.h"
-#include "src/SpecialFunctions/BesselFunctionsHalf.h"
-#include "src/SpecialFunctions/BesselFunctionsFunctors.h"
-#include "src/SpecialFunctions/BesselFunctionsArrayAPI.h"
 #include "src/SpecialFunctions/SpecialFunctionsImpl.h"
-#if defined(EIGEN_HIPCC)
-#include "src/SpecialFunctions/HipVectorCompatibility.h"
-#endif
 #include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
 #include "src/SpecialFunctions/SpecialFunctionsHalf.h"
 #include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
 #include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
 
-#if defined EIGEN_VECTORIZE_GPU
-  #include "src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h"
+#if defined EIGEN_VECTORIZE_CUDA
+  #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h"
 #endif
 
 namespace Eigen {
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/Splines b/uppsrc/plugin/Eigen/unsupported/Eigen/Splines
index 2ca581364..322e6b9f5 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/Splines
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/Splines
@@ -24,12 +24,8 @@ namespace Eigen
   */
 }
 
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
-
 #include "src/Splines/SplineFwd.h"
 #include "src/Splines/Spline.h"
 #include "src/Splines/SplineFitting.h"
 
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
-
 #endif // EIGEN_SPLINES_MODULE_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index 0ef159e30..58f3f3319 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -565,11 +565,6 @@ struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, Bi
     CODE; \
   }
 
-template<typename DerType>
-struct CleanedUpDerType {
-  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> type;
-};
-
 template<typename DerType>
 inline const AutoDiffScalar<DerType>& conj(const AutoDiffScalar<DerType>& x)  { return x; }
 template<typename DerType>
@@ -577,31 +572,31 @@ inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x)  {
 template<typename DerType>
 inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&)    { return 0.; }
 template<typename DerType, typename T>
-inline typename CleanedUpDerType<DerType>::type (min)(const AutoDiffScalar<DerType>& x, const T& y) {
-  typedef typename CleanedUpDerType<DerType>::type ADS;
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
   return (x <= y ? ADS(x) : ADS(y));
 }
 template<typename DerType, typename T>
-inline typename CleanedUpDerType<DerType>::type (max)(const AutoDiffScalar<DerType>& x, const T& y) {
-  typedef typename CleanedUpDerType<DerType>::type ADS;
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
   return (x >= y ? ADS(x) : ADS(y));
 }
 template<typename DerType, typename T>
-inline typename CleanedUpDerType<DerType>::type (min)(const T& x, const AutoDiffScalar<DerType>& y) {
-  typedef typename CleanedUpDerType<DerType>::type ADS;
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
   return (x < y ? ADS(x) : ADS(y));
 }
 template<typename DerType, typename T>
-inline typename CleanedUpDerType<DerType>::type (max)(const T& x, const AutoDiffScalar<DerType>& y) {
-  typedef typename CleanedUpDerType<DerType>::type ADS;
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS;
   return (x > y ? ADS(x) : ADS(y));
 }
 template<typename DerType>
-inline typename CleanedUpDerType<DerType>::type (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
   return (x.value() < y.value() ? x : y);
 }
 template<typename DerType>
-inline typename CleanedUpDerType<DerType>::type (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
   return (x.value() >= y.value() ? x : y);
 }
 
@@ -716,15 +711,10 @@ template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
 }
 
 namespace std {
-
 template <typename T>
 class numeric_limits<Eigen::AutoDiffScalar<T> >
   : public numeric_limits<typename T::Scalar> {};
 
-template <typename T>
-class numeric_limits<Eigen::AutoDiffScalar<T&> >
-  : public numeric_limits<typename T::Scalar> {};
-
 }  // namespace std
 
 #endif // EIGEN_AUTODIFF_SCALAR_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/BVH/KdBVH.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/BVH/KdBVH.h
index 2d5b76ad0..5e39af26c 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/BVH/KdBVH.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/BVH/KdBVH.h
@@ -171,7 +171,7 @@ private:
   typedef internal::vector_int_pair<Scalar, Dim> VIPair;
   typedef std::vector<VIPair, aligned_allocator<VIPair> > VIPairList;
   typedef Matrix<Scalar, Dim, 1> VectorType;
-  struct VectorComparator //compares vectors, or more specifically, VIPairs along a particular dimension
+  struct VectorComparator //compares vectors, or, more specifically, VIPairs along a particular dimension
   {
     VectorComparator(int inDim) : dim(inDim) {}
     inline bool operator()(const VIPair &v1, const VIPair &v2) const { return v1.first[dim] < v2.first[dim]; }
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
index 0fbd84772..4170d26b6 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
 #define EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
 
-#include "../../../../Eigen/Dense"
+#include <Eigen/Dense>
 
 namespace Eigen { 
 
@@ -285,7 +285,7 @@ public:
 
   /** \brief Reports whether previous computation was successful.
    *
-   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
    */
   ComputationInfo info() const
   {
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/CMakeLists.txt b/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
index 22088eb30..40af550e8 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
@@ -1,6 +1,6 @@
-file(GLOB Eigen_EulerAngles_SRCS "*.h")
+FILE(GLOB Eigen_EulerAngles_SRCS "*.h")
 
-install(FILES
+INSTALL(FILES
   ${Eigen_EulerAngles_SRCS}
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles COMPONENT Devel
   )
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h
index e43cdb7fb..13a0da1ab 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/EulerAngles.h
@@ -12,6 +12,11 @@
 
 namespace Eigen
 {
+  /*template<typename Other,
+         int OtherRows=Other::RowsAtCompileTime,
+         int OtherCols=Other::ColsAtCompileTime>
+  struct ei_eulerangles_assign_impl;*/
+
   /** \class EulerAngles
     *
     * \ingroup EulerAngles_Module
@@ -31,7 +36,7 @@ namespace Eigen
     * ### Rotation representation and conversions ###
     *
     * It has been proved(see Wikipedia link below) that every rotation can be represented
-    *  by Euler angles, but there is no single representation (e.g. unlike rotation matrices).
+    *  by Euler angles, but there is no unique representation (unlike, e.g., rotation matrices).
     * Therefore, you can convert from Eigen rotation and to them
     *  (including rotation matrices, which is not called "rotations" by Eigen design).
     *
@@ -50,27 +55,33 @@ namespace Eigen
     * Additionally, some axes related computation is done in compile time.
     *
     * #### Euler angles ranges in conversions ####
-    * Rotations representation as EulerAngles are not single (unlike matrices),
-    *  and even have infinite EulerAngles representations.<BR>
-    * For example, add or subtract 2*PI from either angle of EulerAngles
-    *  and you'll get the same rotation.
-    * This is the general reason for infinite representation,
-    *  but it's not the only general reason for not having a single representation.
     *
-    * When converting rotation to EulerAngles, this class convert it to specific ranges
-    * When converting some rotation to EulerAngles, the rules for ranges are as follow:
-    * - If the rotation we converting from is an EulerAngles
-    *  (even when it represented as RotationBase explicitly), angles ranges are __undefined__.
-    * - otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
-    *   As for Beta angle:
-    *    - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
-    *    - otherwise:
-    *      - If the beta axis is positive, the beta angle will be in the range [0, PI]
-    *      - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+    * When converting some rotation to Euler angles, there are some ways you can guarantee
+    *  the Euler angles ranges.
     *
+    * #### implicit ranges ####
+    * When using implicit ranges, all angles are guaranteed to be in the range [-PI, +PI],
+    *  unless you convert from some other Euler angles.
+    * In this case, the range is __undefined__ (might be even less than -PI or greater than +2*PI).
     * \sa EulerAngles(const MatrixBase<Derived>&)
     * \sa EulerAngles(const RotationBase<Derived, 3>&)
     *
+    * #### explicit ranges ####
+    * When using explicit ranges, all angles are guaranteed to be in the range you choose.
+    * In the range Boolean parameter, you are asked whether you prefer the positive range or not:
+    * - _true_ - force the range between [0, +2*PI]
+    * - _false_ - force the range between [-PI, +PI]
+    *
+    * ##### compile time ranges #####
+    * This is when you have compile time ranges and you prefer to
+    *  use template parameter. (e.g. for performance)
+    * \sa FromRotation()
+    *
+    * ##### run-time ranges #####
+    * Run-time ranges are also supported.
+    * \sa EulerAngles(const MatrixBase<Derived>&, bool, bool, bool)
+    * \sa EulerAngles(const RotationBase<Derived, 3>&, bool, bool, bool)
+    *
     * ### Convenient user typedefs ###
     *
     * Convenient typedefs for EulerAngles exist for float and double scalar,
@@ -92,7 +103,7 @@ namespace Eigen
     *
     * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
     *
-    * \tparam _Scalar the scalar type, i.e. the type of the angles.
+    * \tparam _Scalar the scalar type, i.e., the type of the angles.
     *
     * \tparam _System the EulerSystem to use, which represents the axes of rotation.
     */
@@ -100,11 +111,8 @@ namespace Eigen
   class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3>
   {
     public:
-      typedef RotationBase<EulerAngles<_Scalar, _System>, 3> Base;
-      
       /** the scalar type of the angles */
       typedef _Scalar Scalar;
-      typedef typename NumTraits<Scalar>::Real RealScalar;
       
       /** the EulerSystem to use, which represents the axes of rotation. */
       typedef _System System;
@@ -138,56 +146,67 @@ namespace Eigen
     public:
       /** Default constructor without initialization. */
       EulerAngles() {}
-      /** Constructs and initialize an EulerAngles (\p alpha, \p beta, \p gamma). */
+      /** Constructs and initializes Euler angles (\p alpha, \p beta, \p gamma). */
       EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) :
         m_angles(alpha, beta, gamma) {}
       
-      // TODO: Test this constructor
-      /** Constructs and initialize an EulerAngles from the array data {alpha, beta, gamma} */
-      explicit EulerAngles(const Scalar* data) : m_angles(data) {}
-      
-      /** Constructs and initializes an EulerAngles from either:
-        *  - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
-        *  - a 3D vector expression representing Euler angles.
+      /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m.
         *
-        * \note If \p other is a 3x3 rotation matrix, the angles range rules will be as follow:<BR>
-        *  Alpha and gamma angles will be in the range [-PI, PI].<BR>
-        *  As for Beta angle:
-        *   - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
-        *   - otherwise:
-        *     - If the beta axis is positive, the beta angle will be in the range [0, PI]
-        *     - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
-       */
+        * \note All angles will be in the range [-PI, PI].
+      */
       template<typename Derived>
-      explicit EulerAngles(const MatrixBase<Derived>& other) { *this = other; }
+      EulerAngles(const MatrixBase<Derived>& m) { *this = m; }
+      
+      /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m,
+        *  with options to choose for each angle the requested range.
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param m The 3x3 rotation matrix to convert
+        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<typename Derived>
+      EulerAngles(
+        const MatrixBase<Derived>& m,
+        bool positiveRangeAlpha,
+        bool positiveRangeBeta,
+        bool positiveRangeGamma) {
+        
+        System::CalcEulerAngles(*this, m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+      }
       
       /** Constructs and initialize Euler angles from a rotation \p rot.
         *
-        * \note If \p rot is an EulerAngles (even when it represented as RotationBase explicitly),
-        *  angles ranges are __undefined__.
-        *  Otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
-        *  As for Beta angle:
-        *   - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
-        *   - otherwise:
-        *     - If the beta axis is positive, the beta angle will be in the range [0, PI]
-        *     - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+        * \note All angles will be in the range [-PI, PI], unless \p rot is an EulerAngles.
+        *  If rot is an EulerAngles, expected EulerAngles range is __undefined__.
+        *  (Use other functions here for enforcing range if this effect is desired)
       */
       template<typename Derived>
-      EulerAngles(const RotationBase<Derived, 3>& rot) { System::CalcEulerAngles(*this, rot.toRotationMatrix()); }
+      EulerAngles(const RotationBase<Derived, 3>& rot) { *this = rot; }
       
-      /*EulerAngles(const QuaternionType& q)
-      {
-        // TODO: Implement it in a faster way for quaternions
-        // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
-        //  we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
-        // Currently we compute all matrix cells from quaternion.
-
-        // Special case only for ZYX
-        //Scalar y2 = q.y() * q.y();
-        //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
-        //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
-        //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
-      }*/
+      /** Constructs and initializes Euler angles from a rotation \p rot,
+        *  with options to choose for each angle the requested range.
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param rot The rotation to convert (its 3x3 rotation matrix is used)
+        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<typename Derived>
+      EulerAngles(
+        const RotationBase<Derived, 3>& rot,
+        bool positiveRangeAlpha,
+        bool positiveRangeBeta,
+        bool positiveRangeGamma) {
+        
+        System::CalcEulerAngles(*this, rot.toRotationMatrix(), positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+      }
 
       /** \returns The angle values stored in a vector (alpha, beta, gamma). */
       const Vector3& angles() const { return m_angles; }
@@ -227,48 +246,90 @@ namespace Eigen
         return inverse();
       }
       
-      /** Set \c *this from either:
-        *  - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
-        *  - a 3D vector expression representing Euler angles.
+      /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m,
+        *  with options to choose for each angle the requested range (__only in compile time__).
         *
-        * See EulerAngles(const MatrixBase<Derived, 3>&) for more information about
-        *  angles ranges output.
-      */
-      template<class Derived>
-      EulerAngles& operator=(const MatrixBase<Derived>& other)
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param m The 3x3 rotation matrix to convert
+        * \tparam PositiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        */
+      template<
+        bool PositiveRangeAlpha,
+        bool PositiveRangeBeta,
+        bool PositiveRangeGamma,
+        typename Derived>
+      static EulerAngles FromRotation(const MatrixBase<Derived>& m)
       {
-        EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename Derived::Scalar>::value),
-         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
         
-        internal::eulerangles_assign_impl<System, Derived>::run(*this, other.derived());
+        EulerAngles e;
+        System::template CalcEulerAngles<
+          PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma, _Scalar>(e, m);
+        return e;
+      }
+      
+      /** Constructs and initializes Euler angles from a rotation \p rot,
+        *  with options to choose for each angle the requested range (__only in compile time__).
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param rot The rotation to convert (its 3x3 rotation matrix is used)
+        * \tparam PositiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<
+        bool PositiveRangeAlpha,
+        bool PositiveRangeBeta,
+        bool PositiveRangeGamma,
+        typename Derived>
+      static EulerAngles FromRotation(const RotationBase<Derived, 3>& rot)
+      {
+        return FromRotation<PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma>(rot.toRotationMatrix());
+      }
+      
+      /*EulerAngles& fromQuaternion(const QuaternionType& q)
+      {
+        // TODO: Implement it in a faster way for quaternions
+        // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+        //  we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
+        // Currently we compute all matrix cells from quaternion.
+
+        // Special case only for ZYX
+        //Scalar y2 = q.y() * q.y();
+        //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+        //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+        //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+      }*/
+      
+      /** Set \c *this from a rotation matrix(i.e. pure orthogonal matrix with determinant of +1). */
+      template<typename Derived>
+      EulerAngles& operator=(const MatrixBase<Derived>& m) {
+        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+        
+        System::CalcEulerAngles(*this, m);
         return *this;
       }
 
       // TODO: Assign and construct from another EulerAngles (with different system)
       
-      /** Set \c *this from a rotation.
-        *
-        * See EulerAngles(const RotationBase<Derived, 3>&) for more information about
-        *  angles ranges output.
-      */
+      /** Set \c *this from a rotation. */
       template<typename Derived>
       EulerAngles& operator=(const RotationBase<Derived, 3>& rot) {
         System::CalcEulerAngles(*this, rot.toRotationMatrix());
         return *this;
       }
       
-      /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-        * determined by \a prec.
-        *
-        * \sa MatrixBase::isApprox() */
-      bool isApprox(const EulerAngles& other,
-        const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
-      { return angles().isApprox(other.angles(), prec); }
+      // TODO: Support isApprox function
 
       /** \returns an equivalent 3x3 rotation matrix. */
       Matrix3 toRotationMatrix() const
       {
-        // TODO: Calc it faster
         return static_cast<QuaternionType>(*this).toRotationMatrix();
       }
 
@@ -286,15 +347,6 @@ namespace Eigen
         s << eulerAngles.angles().transpose();
         return s;
       }
-      
-      /** \returns \c *this with scalar type casted to \a NewScalarType */
-      template <typename NewScalarType>
-      EulerAngles<NewScalarType, System> cast() const
-      {
-        EulerAngles<NewScalarType, System> e;
-        e.angles() = angles().template cast<NewScalarType>();
-        return e;
-      }
   };
 
 #define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \
@@ -327,29 +379,8 @@ EIGEN_EULER_ANGLES_TYPEDEFS(double, d)
     {
       typedef _Scalar Scalar;
     };
-    
-    // set from a rotation matrix
-    template<class System, class Other>
-    struct eulerangles_assign_impl<System,Other,3,3>
-    {
-      typedef typename Other::Scalar Scalar;
-      static void run(EulerAngles<Scalar, System>& e, const Other& m)
-      {
-        System::CalcEulerAngles(e, m);
-      }
-    };
-    
-    // set from a vector of Euler angles
-    template<class System, class Other>
-    struct eulerangles_assign_impl<System,Other,3,1>
-    {
-      typedef typename Other::Scalar Scalar;
-      static void run(EulerAngles<Scalar, System>& e, const Other& vec)
-      {
-        e.angles() = vec;
-      }
-    };
   }
+  
 }
 
 #endif // EIGEN_EULERANGLESCLASS_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h
index 2a833b0a4..98f9f647d 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -12,13 +12,13 @@
 
 namespace Eigen
 {
-  // Forward declarations
+  // Forward declarations
   template <typename _Scalar, class _System>
   class EulerAngles;
   
   namespace internal
   {
-    // TODO: Add this trait to the Eigen internal API?
+    // TODO: Check whether this trait already exists in the rest of the API
     template <int Num, bool IsPositive = (Num > 0)>
     struct Abs
     {
@@ -36,12 +36,6 @@ namespace Eigen
     {
       enum { value = Axis != 0 && Abs<Axis>::value <= 3 };
     };
-    
-    template<typename System,
-            typename Other,
-            int OtherRows=Other::RowsAtCompileTime,
-            int OtherCols=Other::ColsAtCompileTime>
-    struct eulerangles_assign_impl;
   }
   
   #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1]
@@ -75,7 +69,7 @@ namespace Eigen
     *
     * You can use this class to get two things:
     *  - Build an Euler system, and then pass it as a template parameter to EulerAngles.
-    *  - Query some compile time data about an Euler system. (e.g. Whether it's Tait-Bryan)
+    *  - Query some compile time data about an Euler system. (e.g. Whether it's Tait-Bryan)
     *
     * Euler rotation is a set of three rotation on fixed axes. (see \ref EulerAngles)
     * This meta-class store constantly those signed axes. (see \ref EulerAxis)
@@ -86,7 +80,7 @@ namespace Eigen
     *  signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported:
     *  - all axes X, Y, Z in each valid order (see below what order is valid)
     *  - rotation over the axis is supported both over the positive and negative directions.
-    *  - both Tait-Bryan and proper/classic Euler angles (i.e. the opposite).
+    *  - both Tait-Bryan and proper/classic Euler angles (i.e. the opposite).
     *
     * Since EulerSystem support both positive and negative directions,
     *  you may call this rotation distinction in other names:
@@ -96,7 +90,7 @@ namespace Eigen
     * Notice all axed combination are valid, and would trigger a static assertion.
     * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid.
     * This yield two and only two classes:
-    *  - _Tait-Bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
+    *  - _Tait-Bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
     *  - _proper/classic Euler angles_ - The first and the third unsigned axes is equal,
     *     and the second is different, e.g. {X,Y,X}
     *
@@ -118,9 +112,9 @@ namespace Eigen
     *
     * \tparam _AlphaAxis the first fixed EulerAxis
     *
-    * \tparam _BetaAxis the second fixed EulerAxis
+    * \tparam _BetaAxis the second fixed EulerAxis
     *
-    * \tparam _GammaAxis the third fixed EulerAxis
+    * \tparam _GammaAxis the third fixed EulerAxis
     */
   template <int _AlphaAxis, int _BetaAxis, int _GammaAxis>
   class EulerSystem
@@ -144,16 +138,14 @@ namespace Eigen
       BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */
       GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */
       
-      IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */
-      IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether beta axis is negative */
-      IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */
+      IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */
+      IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether beta axis is negative */
+      IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */
+      
+      IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */
+      IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */
 
-      // Parity is even if alpha axis X is followed by beta axis Y, or Y is followed
-      // by Z, or Z is followed by X; otherwise it is odd.
-      IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */
-      IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */
-
-      IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is Tait-Bryan */
+      IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is Tait-Bryan */
     };
     
     private:
@@ -173,113 +165,142 @@ namespace Eigen
     EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs,
       BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS);
 
-    static const int
+    enum
+    {
       // I, J, K are the pivot indexes permutation for the rotation matrix, that match this Euler system. 
       // They are used in this class converters.
       // They are always different from each other, and their possible values are: 0, 1, or 2.
-      I_ = AlphaAxisAbs - 1,
-      J_ = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
-      K_ = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
-    ;
+      I = AlphaAxisAbs - 1,
+      J = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
+      K = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
+    };
     
     // TODO: Get @mat parameter in form that avoids double evaluation.
     template <typename Derived>
     static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/)
     {
       using std::atan2;
-      using std::sqrt;
+      using std::sin;
+      using std::cos;
       
       typedef typename Derived::Scalar Scalar;
-
-      const Scalar plusMinus = IsEven? 1 : -1;
-      const Scalar minusPlus = IsOdd?  1 : -1;
-
-      const Scalar Rsum = sqrt((mat(I_,I_) * mat(I_,I_) + mat(I_,J_) * mat(I_,J_) + mat(J_,K_) * mat(J_,K_) + mat(K_,K_) * mat(K_,K_))/2);
-      res[1] = atan2(plusMinus * mat(I_,K_), Rsum);
-
-      // There is a singularity when cos(beta) == 0
-      if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// cos(beta) != 0
-        res[0] = atan2(minusPlus * mat(J_, K_), mat(K_, K_));
-        res[2] = atan2(minusPlus * mat(I_, J_), mat(I_, I_));
-      }
-      else if(plusMinus * mat(I_, K_) > 0) {// cos(beta) == 0 and sin(beta) == 1
-        Scalar spos = mat(J_, I_) + plusMinus * mat(K_, J_); // 2*sin(alpha + plusMinus * gamma
-        Scalar cpos = mat(J_, J_) + minusPlus * mat(K_, I_); // 2*cos(alpha + plusMinus * gamma)
-        Scalar alphaPlusMinusGamma = atan2(spos, cpos);
-        res[0] = alphaPlusMinusGamma;
-        res[2] = 0;
-      }
-      else {// cos(beta) == 0 and sin(beta) == -1
-        Scalar sneg = plusMinus * (mat(K_, J_) + minusPlus * mat(J_, I_)); // 2*sin(alpha + minusPlus*gamma)
-        Scalar cneg = mat(J_, J_) + plusMinus * mat(K_, I_);               // 2*cos(alpha + minusPlus*gamma)
-        Scalar alphaMinusPlusBeta = atan2(sneg, cneg);
-        res[0] = alphaMinusPlusBeta;
-        res[2] = 0;
+      typedef Matrix<Scalar,2,1> Vector2;
+      
+      res[0] = atan2(mat(J,K), mat(K,K));
+      Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
+      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
+        res[1] = atan2(-mat(I,K), -c2);
       }
+      else
+        res[1] = atan2(-mat(I,K), c2);
+      Scalar s1 = sin(res[0]);
+      Scalar c1 = cos(res[0]);
+      res[2] = atan2(s1*mat(K,I)-c1*mat(J,I), c1*mat(J,J) - s1 * mat(K,J));
     }
 
     template <typename Derived>
-    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res,
-                                    const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
+    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
     {
       using std::atan2;
-      using std::sqrt;
+      using std::sin;
+      using std::cos;
 
       typedef typename Derived::Scalar Scalar;
-
-      const Scalar plusMinus = IsEven? 1 : -1;
-      const Scalar minusPlus = IsOdd?  1 : -1;
-
-      const Scalar Rsum = sqrt((mat(I_, J_) * mat(I_, J_) + mat(I_, K_) * mat(I_, K_) + mat(J_, I_) * mat(J_, I_) + mat(K_, I_) * mat(K_, I_)) / 2);
-
-      res[1] = atan2(Rsum, mat(I_, I_));
-
-      // There is a singularity when sin(beta) == 0
-      if(Rsum > 4 * NumTraits<Scalar>::epsilon()) {// sin(beta) != 0
-        res[0] = atan2(mat(J_, I_), minusPlus * mat(K_, I_));
-        res[2] = atan2(mat(I_, J_), plusMinus * mat(I_, K_));
+      typedef Matrix<Scalar,2,1> Vector2;
+      
+      res[0] = atan2(mat(J,I), mat(K,I));
+      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
+      {
+        if(res[0] > Scalar(0)) {
+          res[0] -= Scalar(EIGEN_PI);
+        }
+        else {
+          res[0] += Scalar(EIGEN_PI);
+        }
+        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+        res[1] = -atan2(s2, mat(I,I));
       }
-      else if(mat(I_, I_) > 0) {// sin(beta) == 0 and cos(beta) == 1
-        Scalar spos = plusMinus * mat(K_, J_) + minusPlus * mat(J_, K_); // 2*sin(alpha + gamma)
-        Scalar cpos = mat(J_, J_) + mat(K_, K_);                         // 2*cos(alpha + gamma)
-        res[0] = atan2(spos, cpos);
-        res[2] = 0;
-      }
-      else {// sin(beta) == 0 and cos(beta) == -1
-        Scalar sneg = plusMinus * mat(K_, J_) + plusMinus * mat(J_, K_); // 2*sin(alpha - gamma)
-        Scalar cneg = mat(J_, J_) - mat(K_, K_);                         // 2*cos(alpha - gamma)
-        res[0] = atan2(sneg, cneg);
-        res[2] = 0;
+      else
+      {
+        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+        res[1] = atan2(s2, mat(I,I));
       }
+
+      // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
+      // we can compute their respective rotation, and apply its inverse to M. Since the result must
+      // be a rotation around x, we have:
+      //
+      //  c2  s1.s2 c1.s2                   1  0   0 
+      //  0   c1    -s1       *    M    =   0  c3  s3
+      //  -s2 s1.c2 c1.c2                   0 -s3  c3
+      //
+      //  Thus:  m11.c1 - m21.s1 = c3  &   m12.c1 - m22.s1 = s3
+
+      Scalar s1 = sin(res[0]);
+      Scalar c1 = cos(res[0]);
+      res[2] = atan2(c1*mat(J,K)-s1*mat(K,K), c1*mat(J,J) - s1 * mat(K,J));
     }
     
     template<typename Scalar>
     static void CalcEulerAngles(
       EulerAngles<Scalar, EulerSystem>& res,
       const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+    {
+      CalcEulerAngles(res, mat, false, false, false);
+    }
+    
+    template<
+      bool PositiveRangeAlpha,
+      bool PositiveRangeBeta,
+      bool PositiveRangeGamma,
+      typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+    {
+      CalcEulerAngles(res, mat, PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma);
+    }
+    
+    template<typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat,
+      bool PositiveRangeAlpha,
+      bool PositiveRangeBeta,
+      bool PositiveRangeGamma)
     {
       CalcEulerAngles_imp(
         res.angles(), mat,
         typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type());
 
-      if (IsAlphaOpposite)
+      if (IsAlphaOpposite == IsOdd)
         res.alpha() = -res.alpha();
         
-      if (IsBetaOpposite)
+      if (IsBetaOpposite == IsOdd)
         res.beta() = -res.beta();
         
-      if (IsGammaOpposite)
+      if (IsGammaOpposite == IsOdd)
         res.gamma() = -res.gamma();
+      
+      // Wrap negative angles by +2*PI when the positive range [0, 2*PI] is requested
+      if (PositiveRangeAlpha && (res.alpha() < 0))
+        res.alpha() += Scalar(2 * EIGEN_PI);
+      
+      if (PositiveRangeBeta && (res.beta() < 0))
+        res.beta() += Scalar(2 * EIGEN_PI);
+      
+      if (PositiveRangeGamma && (res.gamma() < 0))
+        res.gamma() += Scalar(2 * EIGEN_PI);
     }
     
     template <typename _Scalar, class _System>
     friend class Eigen::EulerAngles;
-    
-    template<typename System,
-            typename Other,
-            int OtherRows,
-            int OtherCols>
-    friend struct internal::eulerangles_assign_impl;
   };
 
 #define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
index 079e88602..be51b4e6f 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
@@ -316,8 +316,8 @@ struct kissfft_impl
 
         // use optimized mode for even real
         fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
-        Complex dc(dst[0].real() +  dst[0].imag());
-        Complex nyquist(dst[0].real() -  dst[0].imag());
+        Complex dc = dst[0].real() +  dst[0].imag();
+        Complex nyquist = dst[0].real() -  dst[0].imag();
         int k;
         for ( k=1;k <= ncfft2 ; ++k ) {
           Complex fpk = dst[k];
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
index 5f7cdf29a..dc0093eb9 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
@@ -31,7 +31,7 @@
 #ifndef EIGEN_CONSTRAINEDCG_H
 #define EIGEN_CONSTRAINEDCG_H
 
-#include "../../../../Eigen/Core"
+#include <Eigen/Core>
 
 namespace Eigen { 
 
@@ -99,7 +99,7 @@ void pseudo_inverse(const CMatrix &C, CINVMatrix &CINV)
 /** \ingroup IterativeSolvers_Module
   * Constrained conjugate gradient
   *
-  * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the constraint \f$ Cx \le f \f$
+  * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the constraint \f$ Cx \le f \f$
   */
 template<typename TMatrix, typename CMatrix,
          typename VectorX, typename VectorB, typename VectorF>
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
index 2ab56b5e7..4079e2367 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_DGMRES_H
 #define EIGEN_DGMRES_H
 
-#include "../../../../Eigen/Eigenvalues"
+#include <Eigen/Eigenvalues>
 
 namespace Eigen { 
   
@@ -88,7 +88,7 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::
  * [1] D. NUENTSA WAKAM and F. PACULL, Memory Efficient Hybrid
  *  Algebraic Solvers for Linear Systems Arising from Compressible
  *  Flows, Computers and Fluids, In Press,
- *  https://doi.org/10.1016/j.compfluid.2012.03.023   
+ *  http://dx.doi.org/10.1016/j.compfluid.2012.03.023   
  * [2] K. Burrage and J. Erhel, On the performance of various 
  * adaptive preconditioned GMRES strategies, 5(1998), 101-121.
  * [3] J. Erhel, K. Burrage and B. Pohl, Restarted GMRES 
@@ -109,7 +109,6 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
     using Base::m_tolerance; 
   public:
     using Base::_solve_impl;
-    using Base::_solve_with_guess_impl;
     typedef _MatrixType MatrixType;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::StorageIndex StorageIndex;
@@ -142,16 +141,30 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
   
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
-  {
-    EIGEN_STATIC_ASSERT(Rhs::ColsAtCompileTime==1 || Dest::ColsAtCompileTime==1, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
-    
-    m_iterations = Base::maxIterations();
-    m_error = Base::m_tolerance;
-    
-    dgmres(matrix(), b, x, Base::m_preconditioner);
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
+  {    
+    bool failed = false;
+    for(Index j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations();
+      m_error = Base::m_tolerance;
+      
+      typename Dest::ColXpr xj(x,j);
+      dgmres(matrix(), b.col(j), xj, Base::m_preconditioner);
+    }
+    m_info = failed ? NumericalIssue
+           : m_error <= Base::m_tolerance ? Success
+           : NoConvergence;
+    m_isInitialized = true;
   }
 
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solve_impl(const Rhs& b, MatrixBase<Dest>& x) const
+  {
+    x = b;
+    _solve_with_guess_impl(b,x.derived());
+  }
   /** 
    * Get the restart value
     */
@@ -199,7 +212,7 @@ class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
     void dgmresInitDeflation(Index& rows) const; 
     mutable DenseMatrix m_V; // Krylov basis vectors
     mutable DenseMatrix m_H; // Hessenberg matrix 
-    mutable DenseMatrix m_Hes; // Initial hessenberg matrix without Givens rotations applied
+    mutable DenseMatrix m_Hes; // Initial Hessenberg matrix without Givens rotations applied
     mutable Index m_restart; // Maximum size of the Krylov subspace
     mutable DenseMatrix m_U; // Vectors that form the basis of the invariant subspace 
     mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles)
@@ -228,30 +241,18 @@ template<typename Rhs, typename Dest>
 void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x,
               const Preconditioner& precond) const
 {
-  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-
-  RealScalar normRhs = rhs.norm();
-  if(normRhs <= considerAsZero) 
-  {
-    x.setZero();
-    m_error = 0;
-    return;
-  }
-
   //Initialization
-  m_isDeflInitialized = false;
   Index n = mat.rows(); 
   DenseVector r0(n); 
   Index nbIts = 0; 
   m_H.resize(m_restart+1, m_restart);
   m_Hes.resize(m_restart, m_restart);
   m_V.resize(n,m_restart+1);
-  //Initial residual vector and initial norm
-  if(x.squaredNorm()==0) 
-    x = precond.solve(rhs);
+  //Initial residual vector and initial norm
+  x = precond.solve(x);
   r0 = rhs - mat * x; 
   RealScalar beta = r0.norm(); 
-  
+  RealScalar normRhs = rhs.norm();
   m_error = beta/normRhs; 
   if(m_error < m_tolerance)
     m_info = Success; 
@@ -264,10 +265,8 @@ void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rh
     dgmresCycle(mat, precond, x, r0, beta, normRhs, nbIts); 
     
     // Compute the new residual vector for the restart 
-    if (nbIts < m_iterations && m_info == NoConvergence) {
-      r0 = rhs - mat * x;
-      beta = r0.norm();
-    }
+    if (nbIts < m_iterations && m_info == NoConvergence)
+      r0 = rhs - mat * x; 
   }
 } 
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/GMRES.h
index ff912094f..92618b107 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/GMRES.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/GMRES.h
@@ -64,15 +64,6 @@ bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Precondition
   typedef Matrix < Scalar, Dynamic, 1 > VectorType;
   typedef Matrix < Scalar, Dynamic, Dynamic, ColMajor> FMatrixType;
 
-  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
-
-  if(rhs.norm() <= considerAsZero) 
-  {
-    x.setZero();
-    tol_error = 0;
-    return true;
-  }
-
   RealScalar tol = tol_error;
   const Index maxIters = iters;
   iters = 0;
@@ -316,14 +307,31 @@ public:
 
   /** \internal */
   template<typename Rhs,typename Dest>
-  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+  void _solve_with_guess_impl(const Rhs& b, Dest& x) const
   {
-    m_iterations = Base::maxIterations();
-    m_error = Base::m_tolerance;
-    bool ret = internal::gmres(matrix(), b, x, Base::m_preconditioner, m_iterations, m_restart, m_error);
-    m_info = (!ret) ? NumericalIssue
+    bool failed = false;
+    for(Index j=0; j<b.cols(); ++j)
+    {
+      m_iterations = Base::maxIterations();
+      m_error = Base::m_tolerance;
+
+      typename Dest::ColXpr xj(x,j);
+      if(!internal::gmres(matrix(), b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error))
+        failed = true;
+    }
+    m_info = failed ? NumericalIssue
           : m_error <= Base::m_tolerance ? Success
           : NoConvergence;
+    m_isInitialized = true;
+  }
+
+  /** \internal */
+  template<typename Rhs,typename Dest>
+  void _solve_impl(const Rhs& b, MatrixBase<Dest> &x) const
+  {
+    x = b;
+    if(x.squaredNorm() == 0) return; // Check Zero right hand side
+    _solve_with_guess_impl(b,x.derived());
   }
 
 protected:
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/MINRES.h
index 5db454d24..256990c1a 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/MINRES.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/MINRES.h
@@ -3,7 +3,6 @@
 //
 // Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
 // Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2018 David Hyde <dabh@stanford.edu>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -65,6 +64,8 @@ namespace Eigen {
             eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
             RealScalar beta_new(sqrt(beta_new2));
             const RealScalar beta_one(beta_new);
+            v_new /= beta_new;
+            w_new /= beta_new;
             // Initialize other variables
             RealScalar c(1.0); // the cosine of the Givens rotation
             RealScalar c_old(1.0);
@@ -82,18 +83,18 @@ namespace Eigen {
                 /* Note that there are 4 variants on the Lanczos algorithm. These are
                  * described in Paige, C. C. (1972). Computational variants of
                  * the Lanczos method for the eigenproblem. IMA Journal of Applied
-                 * Mathematics, 10(3), 373-381. The current implementation corresponds 
+                 * Mathematics, 10(3), 373-381. The current implementation corresponds 
                  * to the case A(2,7) in the paper. It also corresponds to 
-                 * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
+                 * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
                  * Systems, 2003 p.173. For the preconditioned version see 
                  * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987).
                  */
                 const RealScalar beta(beta_new);
                 v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter
-                v_new /= beta_new; // overwrite v_new for next iteration
-                w_new /= beta_new; // overwrite w_new for next iteration
+//                const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT
                 v = v_new; // update
                 w = w_new; // update
+//                const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT
                 v_new.noalias() = mat*w - beta*v_old; // compute v_new
                 const RealScalar alpha = v_new.dot(w);
                 v_new -= alpha*v; // overwrite v_new
@@ -101,6 +102,8 @@ namespace Eigen {
                 beta_new2 = v_new.dot(w_new); // compute beta_new
                 eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
                 beta_new = sqrt(beta_new2); // compute beta_new
+                v_new /= beta_new; // overwrite v_new for next iteration
+                w_new /= beta_new; // overwrite w_new for next iteration
                 
                 // Givens rotation
                 const RealScalar r2 =s*alpha+c*c_old*beta; // s, s_old, c and c_old are still from previous iteration
@@ -114,6 +117,7 @@ namespace Eigen {
                 
                 // Update solution
                 p_oold = p_old;
+//                const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT
                 p_old = p;
                 p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS NOALIAS REQUIRED?
                 x += beta_one*c*eta*p;
@@ -233,7 +237,7 @@ namespace Eigen {
 
         /** \internal */
         template<typename Rhs,typename Dest>
-        void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const
+        void _solve_with_guess_impl(const Rhs& b, Dest& x) const
         {
             typedef typename Base::MatrixWrapper MatrixWrapper;
             typedef typename Base::ActualMatrixType ActualMatrixType;
@@ -253,11 +257,28 @@ namespace Eigen {
             m_iterations = Base::maxIterations();
             m_error = Base::m_tolerance;
             RowMajorWrapper row_mat(matrix());
-            internal::minres(SelfAdjointWrapper(row_mat), b, x,
-                             Base::m_preconditioner, m_iterations, m_error);
+            for(int j=0; j<b.cols(); ++j)
+            {
+                m_iterations = Base::maxIterations();
+                m_error = Base::m_tolerance;
+                
+                typename Dest::ColXpr xj(x,j);
+                internal::minres(SelfAdjointWrapper(row_mat), b.col(j), xj,
+                                 Base::m_preconditioner, m_iterations, m_error);
+            }
+            
+            m_isInitialized = true;
             m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
         }
         
+        /** \internal */
+        template<typename Rhs,typename Dest>
+        void _solve_impl(const Rhs& b, MatrixBase<Dest> &x) const
+        {
+            x.setZero();
+            _solve_with_guess_impl(b,x.derived());
+        }
+        
     protected:
         
     };
@@ -265,3 +286,4 @@ namespace Eigen {
 } // end namespace Eigen
 
 #endif // EIGEN_MINRES_H
+
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/Scaling.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/Scaling.h
index 9b3eb53e0..d113e6e90 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/Scaling.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/IterativeSolvers/Scaling.h
@@ -104,18 +104,12 @@ class IterScaling
         for (int i = 0; i < m; ++i) 
         {
           Dr(i) = std::sqrt(Dr(i));
-        }
-        for (int i = 0; i < n; ++i) 
-        {
           Dc(i) = std::sqrt(Dc(i));
         }
         // Save the scaling factors 
         for (int i = 0; i < m; ++i) 
         {
           m_left(i) /= Dr(i);
-        }
-        for (int i = 0; i < n; ++i) 
-        {
           m_right(i) /= Dc(i);
         }
         // Scale the rows and the columns of the matrix
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
index 123485817..ae9d793b1 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
@@ -73,7 +73,7 @@ void lmqrsolv(
             qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
             wa[k] = temp;
 
-            /*           accumulate the transformation in the row of s. */
+            /*           accumulate the transformation in the row of s. */
             for (i = k+1; i<n; ++i) {
                 temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
                 sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
index 62561da1d..995427978 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
@@ -117,7 +117,7 @@ class LevenbergMarquardt : internal::no_assignment_operator
     typedef typename JacobianType::RealScalar RealScalar; 
     typedef typename QRSolver::StorageIndex PermIndex;
     typedef Matrix<Scalar,Dynamic,1> FVectorType;
-    typedef PermutationMatrix<Dynamic,Dynamic,int> PermutationType;
+    typedef PermutationMatrix<Dynamic,Dynamic> PermutationType;
   public:
     LevenbergMarquardt(FunctorType& functor) 
     : m_functor(functor),m_nfev(0),m_njev(0),m_fnorm(0.0),m_gnorm(0),
@@ -233,9 +233,9 @@ class LevenbergMarquardt : internal::no_assignment_operator
     
     /** 
      * \brief Reports whether the minimization was successful
-     * \returns \c Success if the minimization was successful,
+     * \returns \c Success if the minimization was successful,
      *         \c NumericalIssue if a numerical problem arises during the 
-     *          minimization process, for example during the QR factorization
+     *          minimization process, for example during the QR factorization
      *         \c NoConvergence if the minimization did not converge after 
      *          the maximum number of function evaluation allowed
      *          \c InvalidInput if the input matrix is invalid
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index 02284b0dd..0b0ee6546 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -314,7 +314,7 @@ struct matrix_exp_computeUV<MatrixType, long double>
       matrix_exp_pade17(A, U, V);
     }
   
-#elif LDBL_MANT_DIG <= 113  // quadruple precision
+#elif LDBL_MANT_DIG <= 112  // quadruple precision
   
     if (l1norm < 1.639394610288918690547467954466970e-005L) {
       matrix_exp_pade3(arg, U, V);
@@ -347,7 +347,7 @@ struct matrix_exp_computeUV<MatrixType, long double>
 template<typename T> struct is_exp_known_type : false_type {};
 template<> struct is_exp_known_type<float> : true_type {};
 template<> struct is_exp_known_type<double> : true_type {};
-#if LDBL_MANT_DIG <= 113
+#if LDBL_MANT_DIG <= 112
 template<> struct is_exp_known_type<long double> : true_type {};
 #endif
 
@@ -396,6 +396,7 @@ void matrix_exp_compute(const ArgType& arg, ResultType &result, false_type) // d
 template<typename Derived> struct MatrixExponentialReturnValue
 : public ReturnByValue<MatrixExponentialReturnValue<Derived> >
 {
+    typedef typename Derived::Index Index;
   public:
     /** \brief Constructor.
       *
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index cc12ab62b..3df82394c 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -53,7 +53,7 @@ template <typename MatrixType>
 typename NumTraits<typename MatrixType::Scalar>::Real matrix_function_compute_mu(const MatrixType& A)
 {
   typedef typename plain_col_type<MatrixType>::type VectorType;
-  Index rows = A.rows();
+  typename MatrixType::Index rows = A.rows();
   const MatrixType N = MatrixType::Identity(rows, rows) - A;
   VectorType e = VectorType::Ones(rows);
   N.template triangularView<Upper>().solveInPlace(e);
@@ -65,6 +65,7 @@ MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
 {
   // TODO: Use that A is upper triangular
   typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename MatrixType::Index Index;
   Index rows = A.rows();
   Scalar avgEival = A.trace() / Scalar(RealScalar(rows));
   MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows);
@@ -72,10 +73,10 @@ MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
   MatrixType F = m_f(avgEival, 0) * MatrixType::Identity(rows, rows);
   MatrixType P = Ashifted;
   MatrixType Fincr;
-  for (Index s = 1; double(s) < 1.1 * double(rows) + 10.0; s++) { // upper limit is fairly arbitrary
+  for (Index s = 1; s < 1.1 * rows + 10; s++) { // upper limit is fairly arbitrary
     Fincr = m_f(avgEival, static_cast<int>(s)) * P;
     F += Fincr;
-    P = Scalar(RealScalar(1)/RealScalar(s + 1)) * P * Ashifted;
+    P = Scalar(RealScalar(1.0/(s + 1))) * P * Ashifted;
 
     // test whether Taylor series converged
     const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff();
@@ -130,6 +131,7 @@ typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOf
 template <typename EivalsType, typename Cluster>
 void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters)
 {
+  typedef typename EivalsType::Index Index;
   typedef typename EivalsType::RealScalar RealScalar;
   for (Index i=0; i<eivals.rows(); ++i) {
     // Find cluster containing i-th ei'val, adding a new cluster if necessary
@@ -177,7 +179,7 @@ void matrix_function_compute_block_start(const VectorType& clusterSize, VectorTy
 {
   blockStart.resize(clusterSize.rows());
   blockStart(0) = 0;
-  for (Index i = 1; i < clusterSize.rows(); i++) {
+  for (typename VectorType::Index i = 1; i < clusterSize.rows(); i++) {
     blockStart(i) = blockStart(i-1) + clusterSize(i-1);
   }
 }
@@ -186,6 +188,7 @@ void matrix_function_compute_block_start(const VectorType& clusterSize, VectorTy
 template <typename EivalsType, typename ListOfClusters, typename VectorType>
 void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster)
 {
+  typedef typename EivalsType::Index Index;
   eivalToCluster.resize(eivals.rows());
   Index clusterIndex = 0;
   for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
@@ -202,6 +205,7 @@ void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters&
 template <typename DynVectorType, typename VectorType>
 void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster, VectorType& permutation)
 {
+  typedef typename VectorType::Index Index;
   DynVectorType indexNextEntry = blockStart;
   permutation.resize(eivalToCluster.rows());
   for (Index i = 0; i < eivalToCluster.rows(); i++) {
@@ -215,6 +219,7 @@ void matrix_function_compute_permutation(const DynVectorType& blockStart, const
 template <typename VectorType, typename MatrixType>
 void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T)
 {
+  typedef typename VectorType::Index Index;
   for (Index i = 0; i < permutation.rows() - 1; i++) {
     Index j;
     for (j = i; j < permutation.rows(); j++) {
@@ -242,7 +247,7 @@ template <typename MatrixType, typename AtomicType, typename VectorType>
 void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart, const VectorType& clusterSize, MatrixType& fT)
 { 
   fT.setZero(T.rows(), T.cols());
-  for (Index i = 0; i < clusterSize.rows(); ++i) {
+  for (typename VectorType::Index i = 0; i < clusterSize.rows(); ++i) {
     fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i))
       = atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)));
   }
@@ -280,6 +285,7 @@ MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const
   eigen_assert(C.rows() == A.rows());
   eigen_assert(C.cols() == B.rows());
 
+  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
 
   Index m = A.rows();
@@ -324,8 +330,11 @@ void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorTyp
 { 
   typedef internal::traits<MatrixType> Traits;
   typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Index Index;
+  static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
+  static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
   static const int Options = MatrixType::Options;
-  typedef Matrix<Scalar, Dynamic, Dynamic, Options, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
+  typedef Matrix<Scalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
 
   for (Index k = 1; k < clusterSize.rows(); k++) {
     for (Index i = 0; i < clusterSize.rows() - k; i++) {
@@ -419,8 +428,7 @@ struct matrix_function_compute<MatrixType, 1>
     typedef internal::traits<MatrixType> Traits;
     
     // compute Schur decomposition of A
-    const ComplexSchur<MatrixType> schurOfA(A);
-    eigen_assert(schurOfA.info()==Success);
+    const ComplexSchur<MatrixType> schurOfA(A);  
     MatrixType T = schurOfA.matrixT();
     MatrixType U = schurOfA.matrixU();
 
@@ -472,6 +480,7 @@ template<typename Derived> class MatrixFunctionReturnValue
 {
   public:
     typedef typename Derived::Scalar Scalar;
+    typedef typename Derived::Index Index;
     typedef typename internal::stem_function<Scalar>::type StemFunction;
 
   protected:
@@ -496,8 +505,10 @@ template<typename Derived> class MatrixFunctionReturnValue
       typedef typename internal::nested_eval<Derived, 10>::type NestedEvalType;
       typedef typename internal::remove_all<NestedEvalType>::type NestedEvalTypeClean;
       typedef internal::traits<NestedEvalTypeClean> Traits;
+      static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
+      static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
       typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-      typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
+      typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
 
       typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
       AtomicType atomic(m_f);
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index e917013e0..cf5fffad3 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -62,8 +62,8 @@ void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result)
   else
   {
     // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
-    RealScalar unwindingNumber = ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
-    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,RealScalar(2*EIGEN_PI)*unwindingNumber)) / y;
+    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI)));
+    result(0,1) = A(0,1) * (numext::log1p(y/A(0,0)) + Scalar(0,2*EIGEN_PI*unwindingNumber)) / y;
   }
 }
 
@@ -135,8 +135,7 @@ void matrix_log_compute_pade(MatrixType& result, const MatrixType& T, int degree
   const int minPadeDegree = 3;
   const int maxPadeDegree = 11;
   assert(degree >= minPadeDegree && degree <= maxPadeDegree);
-  // FIXME this creates float-conversion-warnings if these are enabled.
-  // Either manually convert each value, or disable the warning locally
+
   const RealScalar nodes[][maxPadeDegree] = { 
     { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L,  // degree 3
       0.8872983346207416885179265399782400L }, 
@@ -233,13 +232,12 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result)
   int degree;
   MatrixType T = A, sqrtT;
 
-  const int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
-  const RealScalar maxNormForPade = RealScalar(
-                                    maxPadeDegree<= 5? 5.3149729967117310e-1L:                    // single precision
+  int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
+  const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1L:                    // single precision
                                     maxPadeDegree<= 7? 2.6429608311114350e-1L:                    // double precision
                                     maxPadeDegree<= 8? 2.32777776523703892094e-1L:                // extended precision
                                     maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L:    // double-double
-                                                       1.1880960220216759245467951592883642e-1L); // quadruple precision
+                                                       1.1880960220216759245467951592883642e-1L;  // quadruple precision
 
   while (true) {
     RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff();
@@ -256,7 +254,7 @@ void matrix_log_compute_big(const MatrixType& A, MatrixType& result)
   }
 
   matrix_log_compute_pade(result, T, degree);
-  result *= pow(RealScalar(2), RealScalar(numberOfSquareRoots)); // TODO replace by bitshift if possible
+  result *= pow(RealScalar(2), numberOfSquareRoots);
 }
 
 /** \ingroup MatrixFunctions_Module
@@ -334,8 +332,10 @@ public:
     typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
     typedef typename internal::remove_all<DerivedEvalType>::type DerivedEvalTypeClean;
     typedef internal::traits<DerivedEvalTypeClean> Traits;
+    static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
+    static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
     typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
     typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
     AtomicType atomic;
     
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index d7672d7c9..a3273da4e 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -40,6 +40,7 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen
 {
   public:
     typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
 
     /**
      * \brief Constructor.
@@ -80,7 +81,7 @@ class MatrixPowerParenthesesReturnValue : public ReturnByValue< MatrixPowerParen
  *
  * \note Currently this class is only used by MatrixPower. One may
  * insist that this be nested into MatrixPower. This class is here to
- * facilitate future development of triangular matrix functions.
+ * facilitate future development of triangular matrix functions.
  */
 template<typename MatrixType>
 class MatrixPowerAtomic : internal::noncopyable
@@ -93,6 +94,7 @@ class MatrixPowerAtomic : internal::noncopyable
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
     typedef std::complex<RealScalar> ComplexScalar;
+    typedef typename MatrixType::Index Index;
     typedef Block<MatrixType,Dynamic,Dynamic> ResultType;
 
     const MatrixType& m_A;
@@ -160,11 +162,11 @@ template<typename MatrixType>
 void MatrixPowerAtomic<MatrixType>::computePade(int degree, const MatrixType& IminusT, ResultType& res) const
 {
   int i = 2*degree;
-  res = (m_p-RealScalar(degree)) / RealScalar(2*i-2) * IminusT;
+  res = (m_p-degree) / (2*i-2) * IminusT;
 
   for (--i; i; --i) {
     res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res).template triangularView<Upper>()
-	.solve((i==1 ? -m_p : i&1 ? (-m_p-RealScalar(i/2))/RealScalar(2*i) : (m_p-RealScalar(i/2))/RealScalar(2*i-2)) * IminusT).eval();
+	.solve((i==1 ? -m_p : i&1 ? (-m_p-i/2)/(2*i) : (m_p-i/2)/(2*i-2)) * IminusT).eval();
   }
   res += MatrixType::Identity(IminusT.rows(), IminusT.cols());
 }
@@ -194,12 +196,11 @@ void MatrixPowerAtomic<MatrixType>::computeBig(ResultType& res) const
 {
   using std::ldexp;
   const int digits = std::numeric_limits<RealScalar>::digits;
-  const RealScalar maxNormForPade = RealScalar(
-                                    digits <=  24? 4.3386528e-1L                            // single precision
+  const RealScalar maxNormForPade = digits <=  24? 4.3386528e-1L                            // single precision
                                   : digits <=  53? 2.789358995219730e-1L                    // double precision
                                   : digits <=  64? 2.4471944416607995472e-1L                // extended precision
                                   : digits <= 106? 1.1016843812851143391275867258512e-1L    // double-double
-                                  :                9.134603732914548552537150753385375e-2L); // quadruple precision
+                                  :                9.134603732914548552537150753385375e-2L; // quadruple precision
   MatrixType IminusT, sqrtT, T = m_A.template triangularView<Upper>();
   RealScalar normIminusT;
   int degree, degree2, numberOfSquareRoots = 0;
@@ -297,8 +298,8 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const
 
   ComplexScalar logCurr = log(curr);
   ComplexScalar logPrev = log(prev);
-  RealScalar unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
-  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, RealScalar(EIGEN_PI)*unwindingNumber);
+  int unwindingNumber = ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2*EIGEN_PI));
+  ComplexScalar w = numext::log1p((curr-prev)/prev)/RealScalar(2) + ComplexScalar(0, EIGEN_PI*unwindingNumber);
   return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
 
@@ -339,6 +340,7 @@ class MatrixPower : internal::noncopyable
   private:
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
+    typedef typename MatrixType::Index Index;
 
   public:
     /**
@@ -598,6 +600,7 @@ class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Deri
   public:
     typedef typename Derived::PlainObject PlainObject;
     typedef typename Derived::RealScalar RealScalar;
+    typedef typename Derived::Index Index;
 
     /**
      * \brief Constructor.
@@ -645,6 +648,7 @@ class MatrixComplexPowerReturnValue : public ReturnByValue< MatrixComplexPowerRe
   public:
     typedef typename Derived::PlainObject PlainObject;
     typedef typename std::complex<typename Derived::RealScalar> ComplexScalar;
+    typedef typename Derived::Index Index;
 
     /**
      * \brief Constructor.
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
index e363e779d..9de0c3574 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
@@ -17,7 +17,7 @@ namespace internal {
 // pre:  T.block(i,i,2,2) has complex conjugate eigenvalues
 // post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2)
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index i, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, typename MatrixType::Index i, ResultType& sqrtT)
 {
   // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere
   //       in EigenSolver. If we expose it, we could call it directly from here.
@@ -32,7 +32,7 @@ void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index
 //       all blocks of sqrtT to left of and below (i,j) are correct
 // post: sqrtT(i,j) has the correct value
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value();
@@ -41,7 +41,7 @@ void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, In
 
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,1,2> rhs = T.template block<1,2>(i,j);
@@ -54,7 +54,7 @@ void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, In
 
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,2,1> rhs = T.template block<2,1>(i,j);
@@ -101,7 +101,7 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const
 
 // similar to compute1x1offDiagonalBlock()
 template <typename MatrixType, typename ResultType>
-void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT)
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT)
 {
   typedef typename traits<MatrixType>::Scalar Scalar;
   Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
@@ -204,7 +204,7 @@ template <typename MatrixType, typename ResultType>
 void matrix_sqrt_triangular(const MatrixType &arg, ResultType &result)
 {
   using std::sqrt;
-  typedef typename MatrixType::Scalar Scalar;
+      typedef typename MatrixType::Scalar Scalar;
 
   eigen_assert(arg.rows() == arg.cols());
 
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
index 4f2f560b3..feafd62a8 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
@@ -61,7 +61,7 @@ void qrsolv(
             qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
             wa[k] = temp;
 
-            /*           accumulate the transformation in the row of s. */
+            /*           accumulate the transformation in the row of s. */
             for (i = k+1; i<n; ++i) {
                 temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
                 sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/NonLinearOptimization/r1updt.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
index 09fc65255..f28766061 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
@@ -22,7 +22,7 @@ void r1updt(
     Scalar temp;
     JacobiRotation<Scalar> givens;
 
-    // r1updt had a broader usecase, but we don't use it here. And, more
+    // r1updt had a broader usecase, but we don't use it here. And, more
     // importantly, we can not test it.
     eigen_assert(m==n);
     eigen_assert(u.size()==m);
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Polynomials/Companion.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Polynomials/Companion.h
index 6ab8f9714..359836cac 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Polynomials/Companion.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Polynomials/Companion.h
@@ -103,7 +103,7 @@ class companion
     /** Helper function for the balancing algorithm.
      * \returns true if the row and the column, having colNorm and rowNorm
      * as norms, are balanced, false otherwise.
-     * colB and rowB are respectively the multipliers for
+     * colB and rowB are respectively the multipliers for
      * the column and the row in order to balance them.
      * */
     bool balanced( RealScalar colNorm, RealScalar rowNorm,
@@ -112,7 +112,7 @@ class companion
     /** Helper function for the balancing algorithm.
      * \returns true if the row and the column, having colNorm and rowNorm
      * as norms, are balanced, false otherwise.
-     * colB and rowB are respectively the multipliers for
+     * colB and rowB are respectively the multipliers for
      * the column and the row in order to balance them.
      * */
     bool balancedR( RealScalar colNorm, RealScalar rowNorm,
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
index 6d0370d5b..a1f54ed35 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
@@ -41,7 +41,7 @@ public:
 
     /** Sets the relative threshold value used to prune zero coefficients during the decomposition.
      *
-     * Setting a value greater than zero speeds up computation, and yields to an incomplete
+     * Setting a value greater than zero speeds up computation, and yields to an incomplete
      * factorization with fewer non zero coefficients. Such approximate factors are especially
      * useful to initialize an iterative solver.
      *
@@ -349,4 +349,4 @@ bool SkylineInplaceLU<MatrixType>::solve(const MatrixBase<BDerived> &b, MatrixBa
 
 } // end namespace Eigen
 
-#endif // EIGEN_SKYLINEINPLACELU_H
+#endif // EIGEN_SKYLINEINPLACELU_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineMatrix.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineMatrix.h
index 7c7eace7f..a2a8933ca 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineMatrix.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineMatrix.h
@@ -206,26 +206,26 @@ public:
             if (col > row) //upper matrix
             {
                 const Index minOuterIndex = inner - m_data.upperProfile(inner);
-                eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
+                eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
                 return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
             }
             if (col < row) //lower matrix
             {
                 const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-                eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
+                eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
                 return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
             }
         } else {
             if (outer > inner) //upper matrix
             {
                 const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-                eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
+                eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
                 return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
             }
             if (outer < inner) //lower matrix
             {
                 const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-                eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
+                eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
                 return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
             }
         }
@@ -300,11 +300,11 @@ public:
 
         if (IsRowMajor) {
             const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-            eigen_assert(inner >= minInnerIndex && "You tried to access a coeff that does not exist in the storage");
+            eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
             return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
         } else {
             const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-            eigen_assert(inner <= maxInnerIndex && "You tried to access a coeff that does not exist in the storage");
+            eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
             return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
         }
     }
@@ -336,11 +336,11 @@ public:
 
         if (IsRowMajor) {
             const Index minOuterIndex = inner - m_data.upperProfile(inner);
-            eigen_assert(outer >= minOuterIndex && "You tried to access a coeff that does not exist in the storage");
+            eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
             return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
         } else {
             const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-            eigen_assert(outer <= maxOuterIndex && "You tried to access a coeff that does not exist in the storage");
+            eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
             return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
         }
     }
@@ -859,4 +859,4 @@ protected:
 
 } // end namespace Eigen
 
-#endif // EIGEN_SKYLINEMATRIX_H
+#endif // EIGEN_SKYLINEMATRIX_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
index 753c1b33d..b3a237230 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
@@ -209,4 +209,4 @@ protected:
 
 } // end namespace Eigen
 
-#endif // EIGEN_SKYLINEMATRIXBASE_H
+#endif // EIGEN_SKYLINEMATRIXBASE_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineStorage.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineStorage.h
index cc7514f12..378a8deb4 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineStorage.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Skyline/SkylineStorage.h
@@ -256,4 +256,4 @@ public:
 
 } // end namespace Eigen
 
-#endif // EIGEN_SKYLINE_STORAGE_H
+#endif // EIGEN_SKYLINE_STORAGE_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
index 42c99e467..0ffbc43d2 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
@@ -187,7 +187,7 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
     /** Does nothing: provided for compatibility with SparseMatrix */
     inline void finalize() {}
 
-    /** Suppress all nonzeros which are smaller than \a reference under the tolerance \a epsilon */
+    /** Suppress all nonzeros which are smaller than \a reference under the tolerance \a epsilon */
     void prune(Scalar reference, RealScalar epsilon = NumTraits<RealScalar>::dummy_precision())
     {
       for (Index j=0; j<outerSize(); ++j)
@@ -224,7 +224,7 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
       }
     }
 
-    /** The class DynamicSparseMatrix is deprecated */
+    /** The class DynamicSparseMatrix is deprecated */
     EIGEN_DEPRECATED inline DynamicSparseMatrix()
       : m_innerSize(0), m_data(0)
     {
@@ -234,7 +234,7 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
       eigen_assert(innerSize()==0 && outerSize()==0);
     }
 
-    /** The class DynamicSparseMatrix is deprecated */
+    /** The class DynamicSparseMatrix is deprecated */
     EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols)
       : m_innerSize(0)
     {
@@ -244,7 +244,7 @@ template<typename _Scalar, int _Options, typename _StorageIndex>
       resize(rows, cols);
     }
 
-    /** The class DynamicSparseMatrix is deprecated */
+    /** The class DynamicSparseMatrix is deprecated */
     template<typename OtherDerived>
     EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase<OtherDerived>& other)
       : m_innerSize(0)
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 833edd4df..04b7d69ac 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -12,38 +12,38 @@
 #define EIGEN_SPARSE_MARKET_IO_H
 
 #include <iostream>
-#include <vector>
 
 namespace Eigen { 
 
 namespace internal 
 {
-  template <typename Scalar, typename StorageIndex>
-  inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, Scalar& value)
+  template <typename Scalar,typename IndexType>
+  inline bool GetMarketLine (std::stringstream& line, IndexType& M, IndexType& N, IndexType& i, IndexType& j, Scalar& value)
   {
-    std::stringstream sline(line);
-    sline >> i >> j >> value;
+    line >> i >> j >> value;
+    i--;
+    j--;
+    if(i>=0 && j>=0 && i<M && j<N)
+    {
+      return true; 
+    }
+    else
+      return false;
   }
-
-  template<> inline void GetMarketLine (const char* line, int& i, int& j, float& value)
-  { std::sscanf(line, "%d %d %g", &i, &j, &value); }
-
-  template<> inline void GetMarketLine (const char* line, int& i, int& j, double& value)
-  { std::sscanf(line, "%d %d %lg", &i, &j, &value); }
-
-  template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<float>& value)
-  { std::sscanf(line, "%d %d %g %g", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
-
-  template<> inline void GetMarketLine (const char* line, int& i, int& j, std::complex<double>& value)
-  { std::sscanf(line, "%d %d %lg %lg", &i, &j, &numext::real_ref(value), &numext::imag_ref(value)); }
-
-  template <typename Scalar, typename StorageIndex>
-  inline void GetMarketLine (const char* line, StorageIndex& i, StorageIndex& j, std::complex<Scalar>& value)
+  template <typename Scalar,typename IndexType>
+  inline bool GetMarketLine (std::stringstream& line, IndexType& M, IndexType& N, IndexType& i, IndexType& j, std::complex<Scalar>& value)
   {
-    std::stringstream sline(line);
     Scalar valR, valI;
-    sline >> i >> j >> valR >> valI;
-    value = std::complex<Scalar>(valR,valI);
+    line >> i >> j >> valR >> valI;
+    i--;
+    j--;
+    if(i>=0 && j>=0 && i<M && j<N)
+    {
+      value = std::complex<Scalar>(valR, valI);
+      return true; 
+    }
+    else
+      return false;
   }
 
   template <typename RealScalar>
@@ -81,13 +81,13 @@ namespace internal
     }
   }
 
-  template<typename Scalar, typename StorageIndex>
-  inline void PutMatrixElt(Scalar value, StorageIndex row, StorageIndex col, std::ofstream& out)
+  template<typename Scalar>
+  inline void PutMatrixElt(Scalar value, int row, int col, std::ofstream& out)
   {
     out << row << " "<< col << " " << value << "\n";
   }
-  template<typename Scalar, typename StorageIndex>
-  inline void PutMatrixElt(std::complex<Scalar> value, StorageIndex row, StorageIndex col, std::ofstream& out)
+  template<typename Scalar>
+  inline void PutMatrixElt(std::complex<Scalar> value, int row, int col, std::ofstream& out)
   {
     out << row << " " << col << " " << value.real() << " " << value.imag() << "\n";
   }
@@ -104,7 +104,7 @@ namespace internal
     out << value.real << " " << value.imag()<< "\n"; 
   }
 
-} // end namespace internal
+} // end namespace internal
 
 inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isvector)
 {
@@ -138,9 +138,6 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename)
   std::ifstream input(filename.c_str(),std::ios::in);
   if(!input)
     return false;
-
-  char rdbuffer[4096];
-  input.rdbuf()->pubsetbuf(rdbuffer, 4096);
   
   const int maxBuffersize = 2048;
   char buffer[maxBuffersize];
@@ -150,22 +147,24 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename)
   typedef Triplet<Scalar,StorageIndex> T;
   std::vector<T> elements;
   
-  Index M(-1), N(-1), NNZ(-1);
-  Index count = 0;
+  StorageIndex M(-1), N(-1), NNZ(-1);
+  StorageIndex count = 0;
   while(input.getline(buffer, maxBuffersize))
   {
     // skip comments   
     //NOTE An appropriate test should be done on the header to get the  symmetry
     if(buffer[0]=='%')
       continue;
-
+    
+    std::stringstream line(buffer);
+    
     if(!readsizes)
     {
-      std::stringstream line(buffer);
       line >> M >> N >> NNZ;
-      if(M > 0 && N > 0)
+      if(M > 0 && N > 0 && NNZ > 0) 
       {
         readsizes = true;
+        //std::cout << "sizes: " << M << "," << N << "," << NNZ << "\n";
         mat.resize(M,N);
         mat.reserve(NNZ);
       }
@@ -174,20 +173,15 @@ bool loadMarket(SparseMatrixType& mat, const std::string& filename)
     { 
       StorageIndex i(-1), j(-1);
       Scalar value; 
-      internal::GetMarketLine(buffer, i, j, value);
-
-      i--;
-      j--;
-      if(i>=0 && j>=0 && i<M && j<N)
+      if( internal::GetMarketLine(line, M, N, i, j, value) ) 
       {
-        ++count;
+        ++ count;
         elements.push_back(T(i,j,value));
       }
-      else
+      else 
         std::cerr << "Invalid read: " << i << "," << j << "\n";        
     }
   }
-
   mat.setFromTriplets(elements.begin(), elements.end());
   if(count!=NNZ)
     std::cerr << count << "!=" << NNZ << "\n";
@@ -232,13 +226,12 @@ template<typename SparseMatrixType>
 bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sym = 0)
 {
   typedef typename SparseMatrixType::Scalar Scalar;
-  typedef typename SparseMatrixType::RealScalar RealScalar;
   std::ofstream out(filename.c_str(),std::ios::out);
   if(!out)
     return false;
   
   out.flags(std::ios_base::scientific);
-  out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
+  out.precision(64);
   std::string header; 
   internal::putMarketHeader<Scalar>(header, sym); 
   out << header << std::endl; 
@@ -249,6 +242,7 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy
     {
       ++ count;
       internal::PutMatrixElt(it.value(), it.row()+1, it.col()+1, out);
+      // out << it.row()+1 << " " << it.col()+1 << " " << it.value() << "\n";
     }
   out.close();
   return true;
@@ -257,14 +251,13 @@ bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sy
 template<typename VectorType>
 bool saveMarketVector (const VectorType& vec, const std::string& filename)
 {
- typedef typename VectorType::Scalar Scalar;
- typedef typename VectorType::RealScalar RealScalar;
+ typedef typename VectorType::Scalar Scalar; 
  std::ofstream out(filename.c_str(),std::ios::out);
   if(!out)
     return false;
   
   out.flags(std::ios_base::scientific);
-  out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
+  out.precision(64);
   if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
       out << "%%MatrixMarket matrix array complex general\n"; 
   else
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/RandomSetter.h
index 7542cf764..ee97299af 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/RandomSetter.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SparseExtra/RandomSetter.h
@@ -249,10 +249,10 @@ class RandomSetter
           }
         }
         // prefix sum
-        StorageIndex count = 0;
+        Index count = 0;
         for (Index j=0; j<mp_target->outerSize(); ++j)
         {
-          StorageIndex tmp = positions[j];
+          Index tmp = positions[j];
           mp_target->outerIndexPtr()[j] = count;
           positions[j] = count;
           count += tmp;
@@ -281,7 +281,7 @@ class RandomSetter
               mp_target->innerIndexPtr()[i+1] = mp_target->innerIndexPtr()[i];
               --i;
             }
-            mp_target->innerIndexPtr()[i+1] = internal::convert_index<StorageIndex>(inner);
+            mp_target->innerIndexPtr()[i+1] = inner;
             mp_target->valuePtr()[i+1] = it->second.value;
           }
         }
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
deleted file mode 100644
index 41d2bf61c..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
+++ /dev/null
@@ -1,286 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#ifndef EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
-#define EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
-
-namespace Eigen {
-
-/** \returns an expression of the coefficient-wise i0(\a x) to the given
- * arrays.
-  *
-  * It returns the modified Bessel function of the first kind of order zero.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of i0(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_i0()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>, const Derived>
-bessel_i0(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-/** \returns an expression of the coefficient-wise i0e(\a x) to the given
- * arrays.
-  *
-  * It returns the exponentially scaled modified Bessel
-  * function of the first kind of order zero.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of i0e(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_i0e()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>, const Derived>
-bessel_i0e(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-/** \returns an expression of the coefficient-wise i1(\a x) to the given
- * arrays.
-  *
-  * It returns the modified Bessel function of the first kind of order one.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of i1(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_i1()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>, const Derived>
-bessel_i1(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-/** \returns an expression of the coefficient-wise i1e(\a x) to the given
- * arrays.
-  *
-  * It returns the exponentially scaled modified Bessel
-  * function of the first kind of order one.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of i1e(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_i1e()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>, const Derived>
-bessel_i1e(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-/** \returns an expression of the coefficient-wise k0(\a x) to the given
- * arrays.
-  *
-  * It returns the modified Bessel function of the second kind of order zero.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of k0(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_k0()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>, const Derived>
-bessel_k0(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-/** \returns an expression of the coefficient-wise k0e(\a x) to the given
- * arrays.
-  *
-  * It returns the exponentially scaled modified Bessel
-  * function of the second kind of order zero.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of k0e(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_k0e()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>, const Derived>
-bessel_k0e(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-/** \returns an expression of the coefficient-wise k1(\a x) to the given
- * arrays.
-  *
-  * It returns the modified Bessel function of the second kind of order one.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of k1(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_k1()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>, const Derived>
-bessel_k1(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-/** \returns an expression of the coefficient-wise k1e(\a x) to the given
- * arrays.
-  *
-  * It returns the exponentially scaled modified Bessel
-  * function of the second kind of order one.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of k1e(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_k1e()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>, const Derived>
-bessel_k1e(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-/** \returns an expression of the coefficient-wise j0(\a x) to the given
- * arrays.
-  *
-  * It returns the Bessel function of the first kind of order zero.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of j0(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_j0()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>, const Derived>
-bessel_j0(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-/** \returns an expression of the coefficient-wise y0(\a x) to the given
- * arrays.
-  *
-  * It returns the Bessel function of the second kind of order zero.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of y0(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_y0()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>, const Derived>
-bessel_y0(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-/** \returns an expression of the coefficient-wise j1(\a x) to the given
- * arrays.
-  *
-  * It returns the modified Bessel function of the first kind of order one.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of j1(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_j1()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>, const Derived>
-bessel_j1(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-/** \returns an expression of the coefficient-wise y1(\a x) to the given
- * arrays.
-  *
-  * It returns the Bessel function of the second kind of order one.
-  *
-  * \param x is the argument
-  *
-  * \note This function supports only float and double scalar types. To support
-  * other scalar types, the user has to provide implementations of y1(T) for
-  * any scalar type T to be supported.
-  *
-  * \sa ArrayBase::bessel_y1()
-  */
-template <typename Derived>
-EIGEN_STRONG_INLINE const Eigen::CwiseUnaryOp<
-    Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>, const Derived>
-bessel_y1(const Eigen::ArrayBase<Derived>& x) {
-  return Eigen::CwiseUnaryOp<
-      Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>,
-      const Derived>(x.derived());
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
deleted file mode 100644
index 8606a9f8e..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
+++ /dev/null
@@ -1,357 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
-// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_BESSELFUNCTIONS_FUNCTORS_H
-#define EIGEN_BESSELFUNCTIONS_FUNCTORS_H
-
-namespace Eigen {
-
-namespace internal {
-
-/** \internal
- * \brief Template functor to compute the modified Bessel function of the first
- * kind of order zero.
- * \sa class CwiseUnaryOp, Cwise::bessel_i0()
- */
-template <typename Scalar>
-struct scalar_bessel_i0_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_i0;
-    return bessel_i0(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_i0(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_i0_op<Scalar> > {
-  enum {
-    // On average, a Chebyshev polynomial of order N=20 is computed.
-    // The cost is N multiplications and 2N additions. We also add
-    // the cost of an additional exp over i0e.
-    Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the exponentially scaled modified Bessel
- * function of the first kind of order zero
- * \sa class CwiseUnaryOp, Cwise::bessel_i0e()
- */
-template <typename Scalar>
-struct scalar_bessel_i0e_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i0e_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_i0e;
-    return bessel_i0e(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_i0e(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_i0e_op<Scalar> > {
-  enum {
-    // On average, a Chebyshev polynomial of order N=20 is computed.
-    // The cost is N multiplications and 2N additions.
-    Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the modified Bessel function of the first
- * kind of order one
- * \sa class CwiseUnaryOp, Cwise::bessel_i1()
- */
-template <typename Scalar>
-struct scalar_bessel_i1_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_i1;
-    return bessel_i1(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_i1(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_i1_op<Scalar> > {
-  enum {
-    // On average, a Chebyshev polynomial of order N=20 is computed.
-    // The cost is N multiplications and 2N additions. We also add
-    // the cost of an additional exp over i1e.
-    Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the exponentially scaled modified Bessel
- * function of the first kind of order zero
- * \sa class CwiseUnaryOp, Cwise::bessel_i1e()
- */
-template <typename Scalar>
-struct scalar_bessel_i1e_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_i1e_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_i1e;
-    return bessel_i1e(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_i1e(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_i1e_op<Scalar> > {
-  enum {
-    // On average, a Chebyshev polynomial of order N=20 is computed.
-    // The cost is N multiplications and 2N additions.
-    Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the Bessel function of the second kind of
- * order zero
- * \sa class CwiseUnaryOp, Cwise::bessel_j0()
- */
-template <typename Scalar>
-struct scalar_bessel_j0_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j0_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_j0;
-    return bessel_j0(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_j0(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_j0_op<Scalar> > {
-  enum {
-    // 6 polynomial of order ~N=8 is computed.
-    // The cost is N multiplications and N additions each, along with a
-    // sine, cosine and rsqrt cost.
-    Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the Bessel function of the second kind of
- * order zero
- * \sa class CwiseUnaryOp, Cwise::bessel_y0()
- */
-template <typename Scalar>
-struct scalar_bessel_y0_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y0_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_y0;
-    return bessel_y0(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_y0(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_y0_op<Scalar> > {
-  enum {
-    // 6 polynomial of order ~N=8 is computed.
-    // The cost is N multiplications and N additions each, along with a
-    // sine, cosine, rsqrt and j0 cost.
-    Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the Bessel function of the first kind of
- * order one
- * \sa class CwiseUnaryOp, Cwise::bessel_j1()
- */
-template <typename Scalar>
-struct scalar_bessel_j1_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_j1_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_j1;
-    return bessel_j1(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_j1(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_j1_op<Scalar> > {
-  enum {
-    // 6 polynomial of order ~N=8 is computed.
-    // The cost is N multiplications and N additions each, along with a
-    // sine, cosine and rsqrt cost.
-    Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the Bessel function of the second kind of
- * order one
- * \sa class CwiseUnaryOp, Cwise::bessel_j1e()
- */
-template <typename Scalar>
-struct scalar_bessel_y1_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_y1_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_y1;
-    return bessel_y1(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_y1(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_y1_op<Scalar> > {
-  enum {
-    // 6 polynomial of order ~N=8 is computed.
-    // The cost is N multiplications and N additions each, along with a
-    // sine, cosine, rsqrt and j1 cost.
-    Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the modified Bessel function of the second
- * kind of order zero
- * \sa class CwiseUnaryOp, Cwise::bessel_k0()
- */
-template <typename Scalar>
-struct scalar_bessel_k0_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_k0;
-    return bessel_k0(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_k0(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_k0_op<Scalar> > {
-  enum {
-    // On average, a Chebyshev polynomial of order N=10 is computed.
-    // The cost is N multiplications and 2N additions. In addition we compute
-    // i0, a log, exp and prsqrt and sin and cos.
-    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the exponentially scaled modified Bessel
- * function of the second kind of order zero
- * \sa class CwiseUnaryOp, Cwise::bessel_k0e()
- */
-template <typename Scalar>
-struct scalar_bessel_k0e_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k0e_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_k0e;
-    return bessel_k0e(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_k0e(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_k0e_op<Scalar> > {
-  enum {
-    // On average, a Chebyshev polynomial of order N=10 is computed.
-    // The cost is N multiplications and 2N additions. In addition we compute
-    // i0, a log, exp and prsqrt and sin and cos.
-    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the modified Bessel function of the
- * second kind of order one
- * \sa class CwiseUnaryOp, Cwise::bessel_k1()
- */
-template <typename Scalar>
-struct scalar_bessel_k1_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_k1;
-    return bessel_k1(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_k1(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_k1_op<Scalar> > {
-  enum {
-    // On average, a Chebyshev polynomial of order N=10 is computed.
-    // The cost is N multiplications and 2N additions. In addition we compute
-    // i1, a log, exp and prsqrt and sin and cos.
-    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-/** \internal
- * \brief Template functor to compute the exponentially scaled modified Bessel
- * function of the second kind of order one
- * \sa class CwiseUnaryOp, Cwise::bessel_k1e()
- */
-template <typename Scalar>
-struct scalar_bessel_k1e_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_bessel_k1e_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
-    using numext::bessel_k1e;
-    return bessel_k1e(x);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return internal::pbessel_k1e(x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_bessel_k1e_op<Scalar> > {
-  enum {
-    // On average, a Chebyshev polynomial of order N=10 is computed.
-    // The cost is N multiplications and 2N additions. In addition we compute
-    // i1, a log, exp and prsqrt and sin and cos.
-    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasBessel
-  };
-};
-
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_BESSELFUNCTIONS_FUNCTORS_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
deleted file mode 100644
index 8930d1a3c..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
+++ /dev/null
@@ -1,66 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_BESSELFUNCTIONS_HALF_H
-#define EIGEN_BESSELFUNCTIONS_HALF_H
-
-namespace Eigen {
-namespace numext {
-
-#if EIGEN_HAS_C99_MATH
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_i0(static_cast<float>(x)));
-}
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0e(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_i0e(static_cast<float>(x)));
-}
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_i1(static_cast<float>(x)));
-}
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1e(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_i1e(static_cast<float>(x)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j0(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_j0(static_cast<float>(x)));
-}
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j1(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_j1(static_cast<float>(x)));
-}
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y0(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_y0(static_cast<float>(x)));
-}
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y1(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_y1(static_cast<float>(x)));
-}
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_k0(static_cast<float>(x)));
-}
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0e(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_k0e(static_cast<float>(x)));
-}
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_k1(static_cast<float>(x)));
-}
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1e(const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::bessel_k1e(static_cast<float>(x)));
-}
-#endif
-
-}  // end namespace numext
-}  // end namespace Eigen
-
-#endif  // EIGEN_BESSELFUNCTIONS_HALF_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
deleted file mode 100644
index a9b6ad940..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
+++ /dev/null
@@ -1,1959 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_BESSEL_FUNCTIONS_H
-#define EIGEN_BESSEL_FUNCTIONS_H
-
-namespace Eigen {
-namespace internal {
-
-//  Parts of this code are based on the Cephes Math Library.
-//
-//  Cephes Math Library Release 2.8:  June, 2000
-//  Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
-//
-//  Permission has been kindly provided by the original author
-//  to incorporate the Cephes software into the Eigen codebase:
-//
-//    From: Stephen Moshier
-//    To: Eugene Brevdo
-//    Subject: Re: Permission to wrap several cephes functions in Eigen
-//
-//    Hello Eugene,
-//
-//    Thank you for writing.
-//
-//    If your licensing is similar to BSD, the formal way that has been
-//    handled is simply to add a statement to the effect that you are incorporating
-//    the Cephes software by permission of the author.
-//
-//    Good luck with your project,
-//    Steve
-
-
-/****************************************************************************
- * Implementation of Bessel function, based on Cephes                       *
- ****************************************************************************/
-
-template <typename Scalar>
-struct bessel_i0e_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_i0e {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T&) {
-    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return ScalarType(0);
-  }
-};
-
-template <typename T>
-struct generic_i0e<T, float> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  i0ef.c
-     *
-     *  Modified Bessel function of order zero,
-     *  exponentially scaled
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * float x, y, i0ef();
-     *
-     * y = i0ef( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns exponentially scaled modified Bessel function
-     * of order zero of the argument.
-     *
-     * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0,30        100000      3.7e-7      7.0e-8
-     * See i0f().
-     *
-     */
-
-    const float A[] = {-1.30002500998624804212E-8f, 6.04699502254191894932E-8f,
-                       -2.67079385394061173391E-7f, 1.11738753912010371815E-6f,
-                       -4.41673835845875056359E-6f, 1.64484480707288970893E-5f,
-                       -5.75419501008210370398E-5f, 1.88502885095841655729E-4f,
-                       -5.76375574538582365885E-4f, 1.63947561694133579842E-3f,
-                       -4.32430999505057594430E-3f, 1.05464603945949983183E-2f,
-                       -2.37374148058994688156E-2f, 4.93052842396707084878E-2f,
-                       -9.49010970480476444210E-2f, 1.71620901522208775349E-1f,
-                       -3.04682672343198398683E-1f, 6.76795274409476084995E-1f};
-
-    const float B[] = {3.39623202570838634515E-9f, 2.26666899049817806459E-8f,
-                       2.04891858946906374183E-7f, 2.89137052083475648297E-6f,
-                       6.88975834691682398426E-5f, 3.36911647825569408990E-3f,
-                       8.04490411014108831608E-1f};
-    T y = pabs(x);
-    T y_le_eight = internal::pchebevl<T, 18>::run(
-        pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A);
-    T y_gt_eight = pmul(
-        internal::pchebevl<T, 7>::run(
-            psub(pdiv(pset1<T>(32.0f), y), pset1<T>(2.0f)), B),
-        prsqrt(y));
-    // TODO: Perhaps instead check whether all packet elements are in
-    // [-8, 8] and evaluate a branch based off of that. It's possible
-    // in practice most elements are in this region.
-    return pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
-  }
-};
-
-template <typename T>
-struct generic_i0e<T, double> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  i0e.c
-     *
-     *  Modified Bessel function of order zero,
-     *  exponentially scaled
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double x, y, i0e();
-     *
-     * y = i0e( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns exponentially scaled modified Bessel function
-     * of order zero of the argument.
-     *
-     * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0,30        30000       5.4e-16     1.2e-16
-     * See i0().
-     *
-     */
-
-    const double A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17,
-                        -2.43127984654795469359E-16, 1.71539128555513303061E-15,
-                        -1.16853328779934516808E-14, 7.67618549860493561688E-14,
-                        -4.85644678311192946090E-13, 2.95505266312963983461E-12,
-                        -1.72682629144155570723E-11, 9.67580903537323691224E-11,
-                        -5.18979560163526290666E-10, 2.65982372468238665035E-9,
-                        -1.30002500998624804212E-8,  6.04699502254191894932E-8,
-                        -2.67079385394061173391E-7,  1.11738753912010371815E-6,
-                        -4.41673835845875056359E-6,  1.64484480707288970893E-5,
-                        -5.75419501008210370398E-5,  1.88502885095841655729E-4,
-                        -5.76375574538582365885E-4,  1.63947561694133579842E-3,
-                        -4.32430999505057594430E-3,  1.05464603945949983183E-2,
-                        -2.37374148058994688156E-2,  4.93052842396707084878E-2,
-                        -9.49010970480476444210E-2,  1.71620901522208775349E-1,
-                        -3.04682672343198398683E-1,  6.76795274409476084995E-1};
-    const double B[] = {
-        -7.23318048787475395456E-18, -4.83050448594418207126E-18,
-        4.46562142029675999901E-17,  3.46122286769746109310E-17,
-        -2.82762398051658348494E-16, -3.42548561967721913462E-16,
-        1.77256013305652638360E-15,  3.81168066935262242075E-15,
-        -9.55484669882830764870E-15, -4.15056934728722208663E-14,
-        1.54008621752140982691E-14,  3.85277838274214270114E-13,
-        7.18012445138366623367E-13,  -1.79417853150680611778E-12,
-        -1.32158118404477131188E-11, -3.14991652796324136454E-11,
-        1.18891471078464383424E-11,  4.94060238822496958910E-10,
-        3.39623202570838634515E-9,   2.26666899049817806459E-8,
-        2.04891858946906374183E-7,   2.89137052083475648297E-6,
-        6.88975834691682398426E-5,   3.36911647825569408990E-3,
-        8.04490411014108831608E-1};
-    T y = pabs(x);
-    T y_le_eight = internal::pchebevl<T, 30>::run(
-        pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A);
-    T y_gt_eight = pmul(
-        internal::pchebevl<T, 25>::run(
-            psub(pdiv(pset1<T>(32.0), y), pset1<T>(2.0)), B),
-        prsqrt(y));
-    // TODO: Perhaps instead check whether all packet elements are in
-    // [-8, 8] and evaluate a branch based off of that. It's possible
-    // in practice most elements are in this region.
-    return pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
-  }
-};
-
-template <typename Scalar>
-struct bessel_i0e_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_i0e<Scalar, Scalar>::run(x);
-  }
-};
-
-template <typename Scalar>
-struct bessel_i0_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_i0 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    return pmul(
-        pexp(pabs(x)),
-        generic_i0e<T, ScalarType>::run(x));
-  }
-};
-
-template <typename Scalar>
-struct bessel_i0_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_i0<Scalar, Scalar>::run(x);
-  }
-};
-
-template <typename Scalar>
-struct bessel_i1e_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_i1e {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T&) {
-    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return ScalarType(0);
-  }
-};
-
-template <typename T>
-struct generic_i1e<T, float> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /* i1ef.c
-     *
-     *  Modified Bessel function of order one,
-     *  exponentially scaled
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * float x, y, i1ef();
-     *
-     * y = i1ef( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns exponentially scaled modified Bessel function
-     * of order one of the argument.
-     *
-     * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0, 30       30000       1.5e-6      1.5e-7
-     * See i1().
-     *
-     */
-    const float A[] = {9.38153738649577178388E-9f, -4.44505912879632808065E-8f,
-                       2.00329475355213526229E-7f, -8.56872026469545474066E-7f,
-                       3.47025130813767847674E-6f, -1.32731636560394358279E-5f,
-                       4.78156510755005422638E-5f, -1.61760815825896745588E-4f,
-                       5.12285956168575772895E-4f, -1.51357245063125314899E-3f,
-                       4.15642294431288815669E-3f, -1.05640848946261981558E-2f,
-                       2.47264490306265168283E-2f, -5.29459812080949914269E-2f,
-                       1.02643658689847095384E-1f, -1.76416518357834055153E-1f,
-                       2.52587186443633654823E-1f};
-
-    const float B[] = {-3.83538038596423702205E-9f, -2.63146884688951950684E-8f,
-                       -2.51223623787020892529E-7f, -3.88256480887769039346E-6f,
-                       -1.10588938762623716291E-4f, -9.76109749136146840777E-3f,
-                       7.78576235018280120474E-1f};
-
-
-    T y = pabs(x);
-    T y_le_eight = pmul(y, internal::pchebevl<T, 17>::run(
-        pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A));
-    T y_gt_eight = pmul(
-        internal::pchebevl<T, 7>::run(
-            psub(pdiv(pset1<T>(32.0f), y),
-                 pset1<T>(2.0f)), B),
-        prsqrt(y));
-    // TODO: Perhaps instead check whether all packet elements are in
-    // [-8, 8] and evaluate a branch based off of that. It's possible
-    // in practice most elements are in this region.
-    y = pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
-    return pselect(pcmp_lt(x, pset1<T>(0.0f)), pnegate(y), y);
-  }
-};
-
-template <typename T>
-struct generic_i1e<T, double> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  i1e.c
-     *
-     *  Modified Bessel function of order one,
-     *  exponentially scaled
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double x, y, i1e();
-     *
-     * y = i1e( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns exponentially scaled modified Bessel function
-     * of order one of the argument.
-     *
-     * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0, 30       30000       2.0e-15     2.0e-16
-     * See i1().
-     *
-     */
-    const double A[] = {2.77791411276104639959E-18, -2.11142121435816608115E-17,
-                        1.55363195773620046921E-16, -1.10559694773538630805E-15,
-                        7.60068429473540693410E-15, -5.04218550472791168711E-14,
-                        3.22379336594557470981E-13, -1.98397439776494371520E-12,
-                        1.17361862988909016308E-11, -6.66348972350202774223E-11,
-                        3.62559028155211703701E-10, -1.88724975172282928790E-9,
-                        9.38153738649577178388E-9,  -4.44505912879632808065E-8,
-                        2.00329475355213526229E-7,  -8.56872026469545474066E-7,
-                        3.47025130813767847674E-6,  -1.32731636560394358279E-5,
-                        4.78156510755005422638E-5,  -1.61760815825896745588E-4,
-                        5.12285956168575772895E-4,  -1.51357245063125314899E-3,
-                        4.15642294431288815669E-3,  -1.05640848946261981558E-2,
-                        2.47264490306265168283E-2,  -5.29459812080949914269E-2,
-                        1.02643658689847095384E-1,  -1.76416518357834055153E-1,
-                        2.52587186443633654823E-1};
-    const double B[] = {
-        7.51729631084210481353E-18,  4.41434832307170791151E-18,
-        -4.65030536848935832153E-17, -3.20952592199342395980E-17,
-        2.96262899764595013876E-16,  3.30820231092092828324E-16,
-        -1.88035477551078244854E-15, -3.81440307243700780478E-15,
-        1.04202769841288027642E-14,  4.27244001671195135429E-14,
-        -2.10154184277266431302E-14, -4.08355111109219731823E-13,
-        -7.19855177624590851209E-13, 2.03562854414708950722E-12,
-        1.41258074366137813316E-11,  3.25260358301548823856E-11,
-        -1.89749581235054123450E-11, -5.58974346219658380687E-10,
-        -3.83538038596423702205E-9,  -2.63146884688951950684E-8,
-        -2.51223623787020892529E-7,  -3.88256480887769039346E-6,
-        -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
-        7.78576235018280120474E-1};
-    T y = pabs(x);
-    T y_le_eight = pmul(y, internal::pchebevl<T, 29>::run(
-        pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A));
-    T y_gt_eight = pmul(
-        internal::pchebevl<T, 25>::run(
-            psub(pdiv(pset1<T>(32.0), y),
-                 pset1<T>(2.0)), B),
-        prsqrt(y));
-    // TODO: Perhaps instead check whether all packet elements are in
-    // [-8, 8] and evaluate a branch based off of that. It's possible
-    // in practice most elements are in this region.
-    y = pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
-    return pselect(pcmp_lt(x, pset1<T>(0.0)), pnegate(y), y);
-  }
-};
-
-template <typename Scalar>
-struct bessel_i1e_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_i1e<Scalar, Scalar>::run(x);
-  }
-};
-
-template <typename Scalar>
-struct bessel_i1_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_i1 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    return pmul(
-        pexp(pabs(x)),
-        generic_i1e<T, ScalarType>::run(x));
-  }
-};
-
-template <typename Scalar>
-struct bessel_i1_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_i1<Scalar, Scalar>::run(x);
-  }
-};
-
-template <typename Scalar>
-struct bessel_k0e_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_k0e {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T&) {
-    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return ScalarType(0);
-  }
-};
-
-template <typename T>
-struct generic_k0e<T, float> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  k0ef.c
-     *	Modified Bessel function, third kind, order zero,
-     *	exponentially scaled
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * float x, y, k0ef();
-     *
-     * y = k0ef( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns exponentially scaled modified Bessel function
-     * of the third kind of order zero of the argument.
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0, 30       30000       8.1e-7      7.8e-8
-     * See k0().
-     *
-     */
-
-    const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f,
-                       2.28621210311945178607E-5f, 1.26461541144692592338E-3f,
-                       3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
-                       -5.35327393233902768720E-1f};
-
-    const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,
-                       -4.66048989768794782956E-8f, 2.76681363944501510342E-7f,
-                       -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
-                       -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,
-                       -3.14481013119645005427E-2f, 2.44030308206595545468E0f};
-    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
-    const T two = pset1<T>(2.0);
-    T x_le_two = internal::pchebevl<T, 7>::run(
-        pmadd(x, x, pset1<T>(-2.0)), A);
-    x_le_two = pmadd(
-        generic_i0<T, float>::run(x), pnegate(
-            plog(pmul(pset1<T>(0.5), x))), x_le_two);
-    x_le_two = pmul(pexp(x), x_le_two);
-    T x_gt_two = pmul(
-            internal::pchebevl<T, 10>::run(
-                psub(pdiv(pset1<T>(8.0), x), two), B),
-            prsqrt(x));
-    return pselect(
-        pcmp_le(x, pset1<T>(0.0)),
-        MAXNUM,
-        pselect(pcmp_le(x, two), x_le_two, x_gt_two));
-  }
-};
-
-template <typename T>
-struct generic_k0e<T, double> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  k0e.c
-     *	Modified Bessel function, third kind, order zero,
-     *	exponentially scaled
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double x, y, k0e();
-     *
-     * y = k0e( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns exponentially scaled modified Bessel function
-     * of the third kind of order zero of the argument.
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0, 30       30000       1.4e-15     1.4e-16
-     * See k0().
-     *
-     */
-
-    const double A[] = {
-      1.37446543561352307156E-16,
-      4.25981614279661018399E-14,
-      1.03496952576338420167E-11,
-      1.90451637722020886025E-9,
-      2.53479107902614945675E-7,
-      2.28621210311945178607E-5,
-      1.26461541144692592338E-3,
-      3.59799365153615016266E-2,
-      3.44289899924628486886E-1,
-      -5.35327393233902768720E-1};
-    const double B[] = {
-       5.30043377268626276149E-18, -1.64758043015242134646E-17,
-       5.21039150503902756861E-17, -1.67823109680541210385E-16,
-       5.51205597852431940784E-16, -1.84859337734377901440E-15,
-       6.34007647740507060557E-15, -2.22751332699166985548E-14,
-       8.03289077536357521100E-14, -2.98009692317273043925E-13,
-       1.14034058820847496303E-12, -4.51459788337394416547E-12,
-       1.85594911495471785253E-11, -7.95748924447710747776E-11,
-       3.57739728140030116597E-10, -1.69753450938905987466E-9,
-       8.57403401741422608519E-9, -4.66048989768794782956E-8,
-       2.76681363944501510342E-7, -1.83175552271911948767E-6,
-       1.39498137188764993662E-5, -1.28495495816278026384E-4,
-       1.56988388573005337491E-3, -3.14481013119645005427E-2,
-       2.44030308206595545468E0
-    };
-    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
-    const T two = pset1<T>(2.0);
-    T x_le_two = internal::pchebevl<T, 10>::run(
-        pmadd(x, x, pset1<T>(-2.0)), A);
-    x_le_two = pmadd(
-        generic_i0<T, double>::run(x), pmul(
-            pset1<T>(-1.0), plog(pmul(pset1<T>(0.5), x))), x_le_two);
-    x_le_two = pmul(pexp(x), x_le_two);
-    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
-    T x_gt_two = pmul(
-            internal::pchebevl<T, 25>::run(
-                psub(pdiv(pset1<T>(8.0), x), two), B),
-            prsqrt(x));
-    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
-  }
-};
-
-template <typename Scalar>
-struct bessel_k0e_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_k0e<Scalar, Scalar>::run(x);
-  }
-};
-
-template <typename Scalar>
-struct bessel_k0_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_k0 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T&) {
-    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return ScalarType(0);
-  }
-};
-
-template <typename T>
-struct generic_k0<T, float> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  k0f.c
-     *	Modified Bessel function, third kind, order zero
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * float x, y, k0f();
-     *
-     * y = k0f( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns modified Bessel function of the third kind
-     * of order zero of the argument.
-     *
-     * The range is partitioned into the two intervals [0,8] and
-     * (8, infinity).  Chebyshev polynomial expansions are employed
-     * in each interval.
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     * Tested at 2000 random points between 0 and 8.  Peak absolute
-     * error (relative when K0 > 1) was 1.46e-14; rms, 4.26e-15.
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0, 30       30000       7.8e-7      8.5e-8
-     *
-     * ERROR MESSAGES:
-     *
-     *   message         condition      value returned
-     *  K0 domain          x <= 0          MAXNUM
-     *
-     */
-
-    const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f,
-                       2.28621210311945178607E-5f, 1.26461541144692592338E-3f,
-                       3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
-                       -5.35327393233902768720E-1f};
-
-    const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,
-                       -4.66048989768794782956E-8f, 2.76681363944501510342E-7f,
-                       -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
-                       -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,
-                       -3.14481013119645005427E-2f, 2.44030308206595545468E0f};
-    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
-    const T two = pset1<T>(2.0);
-    T x_le_two = internal::pchebevl<T, 7>::run(
-        pmadd(x, x, pset1<T>(-2.0)), A);
-    x_le_two = pmadd(
-        generic_i0<T, float>::run(x), pnegate(
-            plog(pmul(pset1<T>(0.5), x))), x_le_two);
-    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
-    T x_gt_two = pmul(
-        pmul(
-            pexp(pnegate(x)),
-            internal::pchebevl<T, 10>::run(
-                psub(pdiv(pset1<T>(8.0), x), two), B)),
-        prsqrt(x));
-    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
-  }
-};
-
-template <typename T>
-struct generic_k0<T, double> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*
-     *
-     *	Modified Bessel function, third kind, order zero,
-     *	exponentially scaled
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double x, y, k0();
-     *
-     * y = k0( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns exponentially scaled modified Bessel function
-     * of the third kind of order zero of the argument.
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0, 30       30000       1.4e-15     1.4e-16
-     * See k0().
-     *
-     */
-    const double A[] = {
-      1.37446543561352307156E-16,
-      4.25981614279661018399E-14,
-      1.03496952576338420167E-11,
-      1.90451637722020886025E-9,
-      2.53479107902614945675E-7,
-      2.28621210311945178607E-5,
-      1.26461541144692592338E-3,
-      3.59799365153615016266E-2,
-      3.44289899924628486886E-1,
-      -5.35327393233902768720E-1};
-    const double B[] = {
-       5.30043377268626276149E-18, -1.64758043015242134646E-17,
-       5.21039150503902756861E-17, -1.67823109680541210385E-16,
-       5.51205597852431940784E-16, -1.84859337734377901440E-15,
-       6.34007647740507060557E-15, -2.22751332699166985548E-14,
-       8.03289077536357521100E-14, -2.98009692317273043925E-13,
-       1.14034058820847496303E-12, -4.51459788337394416547E-12,
-       1.85594911495471785253E-11, -7.95748924447710747776E-11,
-       3.57739728140030116597E-10, -1.69753450938905987466E-9,
-       8.57403401741422608519E-9, -4.66048989768794782956E-8,
-       2.76681363944501510342E-7, -1.83175552271911948767E-6,
-       1.39498137188764993662E-5, -1.28495495816278026384E-4,
-       1.56988388573005337491E-3, -3.14481013119645005427E-2,
-       2.44030308206595545468E0
-    };
-    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
-    const T two = pset1<T>(2.0);
-    T x_le_two = internal::pchebevl<T, 10>::run(
-        pmadd(x, x, pset1<T>(-2.0)), A);
-    x_le_two = pmadd(
-        generic_i0<T, double>::run(x), pnegate(
-            plog(pmul(pset1<T>(0.5), x))), x_le_two);
-    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
-    T x_gt_two = pmul(
-        pmul(
-            pexp(-x),
-            internal::pchebevl<T, 25>::run(
-                psub(pdiv(pset1<T>(8.0), x), two), B)),
-        prsqrt(x));
-    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
-  }
-};
-
-template <typename Scalar>
-struct bessel_k0_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_k0<Scalar, Scalar>::run(x);
-  }
-};
-
-template <typename Scalar>
-struct bessel_k1e_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_k1e {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T&) {
-    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return ScalarType(0);
-  }
-};
-
-template <typename T>
-struct generic_k1e<T, float> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /* k1ef.c
-     *
-     *	Modified Bessel function, third kind, order one,
-     *	exponentially scaled
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * float x, y, k1ef();
-     *
-     * y = k1ef( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns exponentially scaled modified Bessel function
-     * of the third kind of order one of the argument:
-     *
-     *      k1e(x) = exp(x) * k1(x).
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0, 30       30000       4.9e-7      6.7e-8
-     * See k1().
-     *
-     */
-
-    const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f,
-                        -1.73028895751305206302E-4f, -6.97572385963986435018E-3f,
-                        -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
-                        1.52530022733894777053E0f};
-    const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f,
-                       5.74108412545004946722E-8f, -3.50196060308781257119E-7f,
-                       2.40648494783721712015E-6f, -1.93619797416608296024E-5f,
-                       1.95215518471351631108E-4f, -2.85781685962277938680E-3f,
-                       1.03923736576817238437E-1f, 2.72062619048444266945E0f};
-    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
-    const T two = pset1<T>(2.0);
-    T x_le_two = pdiv(internal::pchebevl<T, 7>::run(
-        pmadd(x, x, pset1<T>(-2.0)), A), x);
-    x_le_two = pmadd(
-        generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
-    x_le_two = pmul(x_le_two, pexp(x));
-    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
-    T x_gt_two = pmul(
-        internal::pchebevl<T, 10>::run(
-            psub(pdiv(pset1<T>(8.0), x), two), B),
-        prsqrt(x));
-    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
-  }
-};
-
-template <typename T>
-struct generic_k1e<T, double> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  k1e.c
-     *
-     *	Modified Bessel function, third kind, order one,
-     *	exponentially scaled
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double x, y, k1e();
-     *
-     * y = k1e( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns exponentially scaled modified Bessel function
-     * of the third kind of order one of the argument:
-     *
-     *      k1e(x) = exp(x) * k1(x).
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0, 30       30000       7.8e-16     1.2e-16
-     * See k1().
-     *
-     */
-    const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15,
-                        -6.66690169419932900609E-13, -1.41148839263352776110E-10,
-                        -2.21338763073472585583E-8, -2.43340614156596823496E-6,
-                        -1.73028895751305206302E-4, -6.97572385963986435018E-3,
-                        -1.22611180822657148235E-1, -3.53155960776544875667E-1,
-                        1.52530022733894777053E0};
-    const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,
-                        -5.68946255844285935196E-17, 1.83809354436663880070E-16,
-                        -6.05704724837331885336E-16, 2.03870316562433424052E-15,
-                        -7.01983709041831346144E-15, 2.47715442448130437068E-14,
-                        -8.97670518232499435011E-14, 3.34841966607842919884E-13,
-                        -1.28917396095102890680E-12, 5.13963967348173025100E-12,
-                        -2.12996783842756842877E-11, 9.21831518760500529508E-11,
-                        -4.19035475934189648750E-10, 2.01504975519703286596E-9,
-                        -1.03457624656780970260E-8, 5.74108412545004946722E-8,
-                        -3.50196060308781257119E-7, 2.40648494783721712015E-6,
-                        -1.93619797416608296024E-5, 1.95215518471351631108E-4,
-                        -2.85781685962277938680E-3, 1.03923736576817238437E-1,
-                        2.72062619048444266945E0};
-    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
-    const T two = pset1<T>(2.0);
-    T x_le_two = pdiv(internal::pchebevl<T, 11>::run(
-        pmadd(x, x, pset1<T>(-2.0)), A), x);
-    x_le_two = pmadd(
-        generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
-    x_le_two = pmul(x_le_two, pexp(x));
-    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
-    T x_gt_two = pmul(
-        internal::pchebevl<T, 25>::run(
-            psub(pdiv(pset1<T>(8.0), x), two), B),
-        prsqrt(x));
-    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
-  }
-};
-
-template <typename Scalar>
-struct bessel_k1e_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_k1e<Scalar, Scalar>::run(x);
-  }
-};
-
-template <typename Scalar>
-struct bessel_k1_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_k1 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T&) {
-    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return ScalarType(0);
-  }
-};
-
-template <typename T>
-struct generic_k1<T, float> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /* k1f.c
-     *	Modified Bessel function, third kind, order one
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * float x, y, k1f();
-     *
-     * y = k1f( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Computes the modified Bessel function of the third kind
-     * of order one of the argument.
-     *
-     * The range is partitioned into the two intervals [0,2] and
-     * (2, infinity).  Chebyshev polynomial expansions are employed
-     * in each interval.
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0, 30       30000       4.6e-7      7.6e-8
-     *
-     * ERROR MESSAGES:
-     *
-     *   message         condition      value returned
-     * k1 domain          x <= 0          MAXNUM
-     *
-     */
-
-    const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f,
-                        -1.73028895751305206302E-4f, -6.97572385963986435018E-3f,
-                        -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
-                        1.52530022733894777053E0f};
-    const float B[] = {2.01504975519703286596E-9f, -1.03457624656780970260E-8f,
-                       5.74108412545004946722E-8f, -3.50196060308781257119E-7f,
-                       2.40648494783721712015E-6f, -1.93619797416608296024E-5f,
-                       1.95215518471351631108E-4f, -2.85781685962277938680E-3f,
-                       1.03923736576817238437E-1f, 2.72062619048444266945E0f};
-    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
-    const T two = pset1<T>(2.0);
-    T x_le_two = pdiv(internal::pchebevl<T, 7>::run(
-        pmadd(x, x, pset1<T>(-2.0)), A), x);
-    x_le_two = pmadd(
-        generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
-    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
-    T x_gt_two = pmul(
-        pexp(pnegate(x)),
-        pmul(
-            internal::pchebevl<T, 10>::run(
-                psub(pdiv(pset1<T>(8.0), x), two), B),
-            prsqrt(x)));
-    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
-  }
-};
-
-template <typename T>
-struct generic_k1<T, double> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  k1.c
-     *	Modified Bessel function, third kind, order one
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * float x, y, k1f();
-     *
-     * y = k1f( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Computes the modified Bessel function of the third kind
-     * of order one of the argument.
-     *
-     * The range is partitioned into the two intervals [0,2] and
-     * (2, infinity).  Chebyshev polynomial expansions are employed
-     * in each interval.
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Relative error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0, 30       30000       4.6e-7      7.6e-8
-     *
-     * ERROR MESSAGES:
-     *
-     *   message         condition      value returned
-     * k1 domain          x <= 0          MAXNUM
-     *
-     */
-    const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15,
-                        -6.66690169419932900609E-13, -1.41148839263352776110E-10,
-                        -2.21338763073472585583E-8, -2.43340614156596823496E-6,
-                        -1.73028895751305206302E-4, -6.97572385963986435018E-3,
-                        -1.22611180822657148235E-1, -3.53155960776544875667E-1,
-                        1.52530022733894777053E0};
-    const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,
-                        -5.68946255844285935196E-17, 1.83809354436663880070E-16,
-                        -6.05704724837331885336E-16, 2.03870316562433424052E-15,
-                        -7.01983709041831346144E-15, 2.47715442448130437068E-14,
-                        -8.97670518232499435011E-14, 3.34841966607842919884E-13,
-                        -1.28917396095102890680E-12, 5.13963967348173025100E-12,
-                        -2.12996783842756842877E-11, 9.21831518760500529508E-11,
-                        -4.19035475934189648750E-10, 2.01504975519703286596E-9,
-                        -1.03457624656780970260E-8, 5.74108412545004946722E-8,
-                        -3.50196060308781257119E-7, 2.40648494783721712015E-6,
-                        -1.93619797416608296024E-5, 1.95215518471351631108E-4,
-                        -2.85781685962277938680E-3, 1.03923736576817238437E-1,
-                        2.72062619048444266945E0};
-    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
-    const T two = pset1<T>(2.0);
-    T x_le_two = pdiv(internal::pchebevl<T, 11>::run(
-        pmadd(x, x, pset1<T>(-2.0)), A), x);
-    x_le_two = pmadd(
-        generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
-    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
-    T x_gt_two = pmul(
-        pexp(-x),
-        pmul(
-            internal::pchebevl<T, 25>::run(
-                psub(pdiv(pset1<T>(8.0), x), two), B),
-            prsqrt(x)));
-    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
-  }
-};
-
-template <typename Scalar>
-struct bessel_k1_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_k1<Scalar, Scalar>::run(x);
-  }
-};
-
-template <typename Scalar>
-struct bessel_j0_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_j0 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T&) {
-    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return ScalarType(0);
-  }
-};
-
-template <typename T>
-struct generic_j0<T, float> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /* j0f.c
-     *	Bessel function of order zero
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * float x, y, j0f();
-     *
-     * y = j0f( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns Bessel function of order zero of the argument.
-     *
-     * The domain is divided into the intervals [0, 2] and
-     * (2, infinity). In the first interval the following polynomial
-     * approximation is used:
-     *
-     *
-     *        2         2         2
-     * (w - r  ) (w - r  ) (w - r  ) P(w)
-     *       1         2         3
-     *
-     *            2
-     * where w = x  and the three r's are zeros of the function.
-     *
-     * In the second interval, the modulus and phase are approximated
-     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
-     * and Phase(x) = x + 1/x R(1/x^2) - pi/4.  The function is
-     *
-     *   j0(x) = Modulus(x) cos( Phase(x) ).
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Absolute error:
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0, 2        100000      1.3e-7      3.6e-8
-     *    IEEE      2, 32       100000      1.9e-7      5.4e-8
-     *
-     */
-
-    const float JP[] = {-6.068350350393235E-008f, 6.388945720783375E-006f,
-                        -3.969646342510940E-004f, 1.332913422519003E-002f,
-                        -1.729150680240724E-001f};
-    const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,
-                        -2.145007480346739E-001f, 1.197549369473540E-001f,
-                        -3.560281861530129E-003f, -4.969382655296620E-002f,
-                        -3.355424622293709E-006f, 7.978845717621440E-001f};
-    const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f,
-                        1.756221482109099E+001f, -4.974978466280903E+000f,
-                        1.001973420681837E+000f, -1.939906941791308E-001f,
-                        6.490598792654666E-002f, -1.249992184872738E-001f};
-    const T DR1 =  pset1<T>(5.78318596294678452118f);
-    const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f); /* -pi / 4 */
-    T y = pabs(x);
-    T z = pmul(y, y);
-    T y_le_two = pselect(
-        pcmp_lt(y, pset1<T>(1.0e-3f)),
-        pmadd(z, pset1<T>(-0.25f), pset1<T>(1.0f)),
-        pmul(psub(z, DR1), internal::ppolevl<T, 4>::run(z, JP)));
-    T q = pdiv(pset1<T>(1.0f), y);
-    T w = prsqrt(y);
-    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
-    w = pmul(q, q);
-    T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH), NEG_PIO4F);
-    T y_gt_two = pmul(p, pcos(padd(yn, y)));
-    return pselect(pcmp_le(y, pset1<T>(2.0)), y_le_two, y_gt_two);
-  }
-};
-
-template <typename T>
-struct generic_j0<T, double> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  j0.c
-     *	Bessel function of order zero
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double x, y, j0();
-     *
-     * y = j0( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns Bessel function of order zero of the argument.
-     *
-     * The domain is divided into the intervals [0, 5] and
-     * (5, infinity). In the first interval the following rational
-     * approximation is used:
-     *
-     *
-     *        2         2
-     * (w - r  ) (w - r  ) P (w) / Q (w)
-     *       1         2    3       8
-     *
-     *            2
-     * where w = x  and the two r's are zeros of the function.
-     *
-     * In the second interval, the Hankel asymptotic expansion
-     * is employed with two rational functions of degree 6/6
-     * and 7/7.
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Absolute error:
-     * arithmetic   domain     # trials      peak         rms
-     *    DEC       0, 30       10000       4.4e-17     6.3e-18
-     *    IEEE      0, 30       60000       4.2e-16     1.1e-16
-     *
-     */
-    const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2,
-                        1.23953371646414299388E0, 5.44725003058768775090E0,
-                        8.74716500199817011941E0, 5.30324038235394892183E0,
-                        9.99999999999999997821E-1};
-    const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2,
-                         1.25352743901058953537E0, 5.47097740330417105182E0,
-                         8.76190883237069594232E0, 5.30605288235394617618E0,
-                         1.00000000000000000218E0};
-    const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0,
-                         -1.95539544257735972385E1, -9.32060152123768231369E1,
-                         -1.77681167980488050595E2, -1.47077505154951170175E2,
-                         -5.14105326766599330220E1, -6.05014350600728481186E0};
-    const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1,
-                         8.56430025976980587198E2, 3.88240183605401609683E3,
-                         7.24046774195652478189E3, 5.93072701187316984827E3,
-                         2.06209331660327847417E3, 2.42005740240291393179E2};
-    const double RP[] = {-4.79443220978201773821E9, 1.95617491946556577543E12,
-                         -2.49248344360967716204E14, 9.70862251047306323952E15};
-    const double RQ[] = {1.00000000000000000000E0, 4.99563147152651017219E2,
-                         1.73785401676374683123E5, 4.84409658339962045305E7,
-                         1.11855537045356834862E10, 2.11277520115489217587E12,
-                         3.10518229857422583814E14, 3.18121955943204943306E16,
-                         1.71086294081043136091E18};
-    const T DR1 = pset1<T>(5.78318596294678452118E0);
-    const T DR2 = pset1<T>(3.04712623436620863991E1);
-    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
-    const T NEG_PIO4 = pset1<T>(-0.7853981633974483096); /* pi / 4 */
-
-    T y = pabs(x);
-    T z = pmul(y, y);
-    T y_le_five = pselect(
-        pcmp_lt(y, pset1<T>(1.0e-5)),
-        pmadd(z, pset1<T>(-0.25), pset1<T>(1.0)),
-        pmul(pmul(psub(z, DR1), psub(z, DR2)),
-             pdiv(internal::ppolevl<T, 3>::run(z, RP),
-                  internal::ppolevl<T, 8>::run(z, RQ))));
-    T s = pdiv(pset1<T>(25.0), z);
-    T p = pdiv(
-        internal::ppolevl<T, 6>::run(s, PP),
-        internal::ppolevl<T, 6>::run(s, PQ));
-    T q = pdiv(
-        internal::ppolevl<T, 7>::run(s, QP),
-        internal::ppolevl<T, 7>::run(s, QQ));
-    T yn = padd(y, NEG_PIO4);
-    T w = pdiv(pset1<T>(-5.0), y);
-    p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
-    T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
-    return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
-  }
-};
-
-template <typename Scalar>
-struct bessel_j0_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_j0<Scalar, Scalar>::run(x);
-  }
-};
-
-template <typename Scalar>
-struct bessel_y0_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_y0 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T&) {
-    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return ScalarType(0);
-  }
-};
-
-template <typename T>
-struct generic_y0<T, float> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /* j0f.c
-     * 	Bessel function of the second kind, order zero
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * float x, y, y0f();
-     *
-     * y = y0f( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns Bessel function of the second kind, of order
-     * zero, of the argument.
-     *
-     * The domain is divided into the intervals [0, 2] and
-     * (2, infinity). In the first interval a rational approximation
-     * R(x) is employed to compute
-     *
-     *                  2         2         2
-     * y0(x)  =  (w - r  ) (w - r  ) (w - r  ) R(x)  +  2/pi ln(x) j0(x).
-     *                 1         2         3
-     *
-     * Thus a call to j0() is required.  The three zeros are removed
-     * from R(x) to improve its numerical stability.
-     *
-     * In the second interval, the modulus and phase are approximated
-     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
-     * and Phase(x) = x + 1/x S(1/x^2) - pi/4.  Then the function is
-     *
-     *   y0(x) = Modulus(x) sin( Phase(x) ).
-     *
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *  Absolute error, when y0(x) < 1; else relative error:
-     *
-     * arithmetic   domain     # trials      peak         rms
-     *    IEEE      0,  2       100000      2.4e-7      3.4e-8
-     *    IEEE      2, 32       100000      1.8e-7      5.3e-8
-     *
-     */
-
-    const float YP[] = {9.454583683980369E-008f, -9.413212653797057E-006f,
-                        5.344486707214273E-004f, -1.584289289821316E-002f,
-                        1.707584643733568E-001f};
-    const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,
-                        -2.145007480346739E-001f, 1.197549369473540E-001f,
-                        -3.560281861530129E-003f, -4.969382655296620E-002f,
-                        -3.355424622293709E-006f, 7.978845717621440E-001f};
-    const float PH[] = {3.242077816988247E+001f, -3.630592630518434E+001f,
-                        1.756221482109099E+001f, -4.974978466280903E+000f,
-                        1.001973420681837E+000f, -1.939906941791308E-001f,
-                        6.490598792654666E-002f, -1.249992184872738E-001f};
-    const T YZ1 = pset1<T>(0.43221455686510834878f);
-    const T TWOOPI =  pset1<T>(0.636619772367581343075535f); /* 2 / pi */
-    const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f); /* -pi / 4 */
-    const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
-    T z = pmul(x, x);
-    T x_le_two = pmul(TWOOPI, pmul(plog(x), generic_j0<T, float>::run(x)));
-    x_le_two = pmadd(
-        psub(z, YZ1), internal::ppolevl<T, 4>::run(z, YP), x_le_two);
-    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_two);
-    T q = pdiv(pset1<T>(1.0), x);
-    T w = prsqrt(x);
-    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
-    T u = pmul(q, q);
-    T xn = pmadd(q, internal::ppolevl<T, 7>::run(u, PH), NEG_PIO4F);
-    T x_gt_two = pmul(p, psin(padd(xn, x)));
-    return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
-  }
-};
-
-template <typename T>
-struct generic_y0<T, double> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  j0.c
-     *	Bessel function of the second kind, order zero
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double x, y, y0();
-     *
-     * y = y0( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns Bessel function of the second kind, of order
-     * zero, of the argument.
-     *
-     * The domain is divided into the intervals [0, 5] and
-     * (5, infinity). In the first interval a rational approximation
-     * R(x) is employed to compute
-     *   y0(x)  = R(x)  +   2 * log(x) * j0(x) / PI.
-     * Thus a call to j0() is required.
-     *
-     * In the second interval, the Hankel asymptotic expansion
-     * is employed with two rational functions of degree 6/6
-     * and 7/7.
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *  Absolute error, when y0(x) < 1; else relative error:
-     *
-     * arithmetic   domain     # trials      peak         rms
-     *    DEC       0, 30        9400       7.0e-17     7.9e-18
-     *    IEEE      0, 30       30000       1.3e-15     1.6e-16
-     *
-     */
-    const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2,
-                        1.23953371646414299388E0, 5.44725003058768775090E0,
-                        8.74716500199817011941E0, 5.30324038235394892183E0,
-                        9.99999999999999997821E-1};
-    const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2,
-                         1.25352743901058953537E0, 5.47097740330417105182E0,
-                         8.76190883237069594232E0, 5.30605288235394617618E0,
-                         1.00000000000000000218E0};
-    const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0,
-                         -1.95539544257735972385E1, -9.32060152123768231369E1,
-                         -1.77681167980488050595E2, -1.47077505154951170175E2,
-                         -5.14105326766599330220E1, -6.05014350600728481186E0};
-    const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1,
-                         8.56430025976980587198E2, 3.88240183605401609683E3,
-                         7.24046774195652478189E3, 5.93072701187316984827E3,
-                         2.06209331660327847417E3, 2.42005740240291393179E2};
-    const double YP[] = {1.55924367855235737965E4, -1.46639295903971606143E7,
-                         5.43526477051876500413E9, -9.82136065717911466409E11,
-                         8.75906394395366999549E13, -3.46628303384729719441E15,
-                         4.42733268572569800351E16, -1.84950800436986690637E16};
-    const double YQ[] = {1.00000000000000000000E0,  1.04128353664259848412E3,
-                         6.26107330137134956842E5, 2.68919633393814121987E8,
-                         8.64002487103935000337E10, 2.02979612750105546709E13,
-                         3.17157752842975028269E15, 2.50596256172653059228E17};
-    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
-    const T TWOOPI =  pset1<T>(0.636619772367581343075535); /* 2 / pi */
-    const T NEG_PIO4 = pset1<T>(-0.7853981633974483096); /* -pi / 4 */
-    const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
-
-    T z = pmul(x, x);
-    T x_le_five = pdiv(internal::ppolevl<T, 7>::run(z, YP),
-                       internal::ppolevl<T, 7>::run(z, YQ));
-    x_le_five = pmadd(
-        pmul(TWOOPI, plog(x)), generic_j0<T, double>::run(x), x_le_five);
-    x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
-    T s = pdiv(pset1<T>(25.0), z);
-    T p = pdiv(
-        internal::ppolevl<T, 6>::run(s, PP),
-        internal::ppolevl<T, 6>::run(s, PQ));
-    T q = pdiv(
-        internal::ppolevl<T, 7>::run(s, QP),
-        internal::ppolevl<T, 7>::run(s, QQ));
-    T xn = padd(x, NEG_PIO4);
-    T w = pdiv(pset1<T>(5.0), x);
-    p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
-    T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
-    return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
-  }
-};
-
-template <typename Scalar>
-struct bessel_y0_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_y0<Scalar, Scalar>::run(x);
-  }
-};
-
-template <typename Scalar>
-struct bessel_j1_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_j1 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T&) {
-    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return ScalarType(0);
-  }
-};
-
-template <typename T>
-struct generic_j1<T, float> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /* j1f.c
-     *	Bessel function of order one
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * float x, y, j1f();
-     *
-     * y = j1f( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns Bessel function of order one of the argument.
-     *
-     * The domain is divided into the intervals [0, 2] and
-     * (2, infinity). In the first interval a polynomial approximation
-     *        2
-     * (w - r  ) x P(w)
-     *       1
-     *                     2
-     * is used, where w = x  and r is the first zero of the function.
-     *
-     * In the second interval, the modulus and phase are approximated
-     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
-     * and Phase(x) = x + 1/x R(1/x^2) - 3pi/4.  The function is
-     *
-     *   j0(x) = Modulus(x) cos( Phase(x) ).
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Absolute error:
-     * arithmetic   domain      # trials      peak       rms
-     *    IEEE      0,  2       100000       1.2e-7     2.5e-8
-     *    IEEE      2, 32       100000       2.0e-7     5.3e-8
-     *
-     *
-     */
-
-    const float JP[] = {-4.878788132172128E-009f, 6.009061827883699E-007f,
-                        -4.541343896997497E-005f, 1.937383947804541E-003f,
-                        -3.405537384615824E-002f};
-    const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f,
-                        3.138238455499697E-001f, -2.102302420403875E-001f,
-                        5.435364690523026E-003f, 1.493389585089498E-001f,
-                        4.976029650847191E-006f, 7.978845453073848E-001f};
-    const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,
-                        -2.485774108720340E+001f, 7.222973196770240E+000f,
-                        -1.544842782180211E+000f, 3.503787691653334E-001f,
-                        -1.637986776941202E-001f, 3.749989509080821E-001f};
-    const T Z1 = pset1<T>(1.46819706421238932572E1f);
-    const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f);    /* -3*pi/4 */
-
-    T y = pabs(x);
-    T z = pmul(y, y);
-    T y_le_two = pmul(
-        psub(z, Z1),
-        pmul(x, internal::ppolevl<T, 4>::run(z, JP)));
-    T q = pdiv(pset1<T>(1.0f), y);
-    T w = prsqrt(y);
-    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
-    w = pmul(q, q);
-    T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
-    T y_gt_two = pmul(p, pcos(padd(yn, y)));
-    // j1 is an odd function. This implementation differs from cephes to
-    // take this fact in to account. Cephes returns -j1(x) for y > 2 range.
-    y_gt_two = pselect(
-        pcmp_lt(x, pset1<T>(0.0f)), pnegate(y_gt_two), y_gt_two);
-    return pselect(pcmp_le(y, pset1<T>(2.0f)), y_le_two, y_gt_two);
-  }
-};
-
-template <typename T>
-struct generic_j1<T, double> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  j1.c
-     *	Bessel function of order one
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double x, y, j1();
-     *
-     * y = j1( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns Bessel function of order one of the argument.
-     *
-     * The domain is divided into the intervals [0, 8] and
-     * (8, infinity). In the first interval a 24 term Chebyshev
-     * expansion is used. In the second, the asymptotic
-     * trigonometric representation is employed using two
-     * rational functions of degree 5/5.
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Absolute error:
-     * arithmetic   domain      # trials      peak         rms
-     *    DEC       0, 30       10000       4.0e-17     1.1e-17
-     *    IEEE      0, 30       30000       2.6e-16     1.1e-16
-     *
-     */
-    const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2,
-                         1.12719608129684925192E0, 5.11207951146807644818E0,
-                         8.42404590141772420927E0, 5.21451598682361504063E0,
-                         1.00000000000000000254E0};
-    const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2,
-                         1.10514232634061696926E0, 5.07386386128601488557E0,
-                         8.39985554327604159757E0, 5.20982848682361821619E0,
-                         9.99999999999999997461E-1};
-    const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0,
-                         7.58238284132545283818E1, 3.66779609360150777800E2,
-                         7.10856304998926107277E2, 5.97489612400613639965E2,
-                         2.11688757100572135698E2, 2.52070205858023719784E1};
-    const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1,
-                         1.05644886038262816351E3, 4.98641058337653607651E3,
-                         9.56231892404756170795E3, 7.99704160447350683650E3,
-                         2.82619278517639096600E3, 3.36093607810698293419E2};
-    const double RP[] = {-8.99971225705559398224E8, 4.52228297998194034323E11,
-                         -7.27494245221818276015E13, 3.68295732863852883286E15};
-    const double RQ[] = {1.00000000000000000000E0, 6.20836478118054335476E2,
-                         2.56987256757748830383E5, 8.35146791431949253037E7,
-                         2.21511595479792499675E10, 4.74914122079991414898E12,
-                         7.84369607876235854894E14, 8.95222336184627338078E16,
-                         5.32278620332680085395E18};
-    const T Z1 = pset1<T>(1.46819706421238932572E1);
-    const T Z2 = pset1<T>(4.92184563216946036703E1);
-    const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885);    /* -3*pi/4 */
-    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
-    T y = pabs(x);
-    T z = pmul(y, y);
-    T y_le_five = pdiv(internal::ppolevl<T, 3>::run(z, RP),
-                       internal::ppolevl<T, 8>::run(z, RQ));
-    y_le_five = pmul(pmul(pmul(y_le_five, x), psub(z, Z1)), psub(z, Z2));
-    T s = pdiv(pset1<T>(25.0), z);
-    T p = pdiv(
-        internal::ppolevl<T, 6>::run(s, PP),
-        internal::ppolevl<T, 6>::run(s, PQ));
-    T q = pdiv(
-        internal::ppolevl<T, 7>::run(s, QP),
-        internal::ppolevl<T, 7>::run(s, QQ));
-    T yn = padd(y, NEG_THPIO4);
-    T w = pdiv(pset1<T>(-5.0), y);
-    p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
-    T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
-    // j1 is an odd function. This implementation differs from cephes to
-    // take this fact in to account. Cephes returns -j1(x) for y > 5 range.
-    y_gt_five = pselect(
-        pcmp_lt(x, pset1<T>(0.0)), pnegate(y_gt_five), y_gt_five);
-    return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
-  }
-};
-
-template <typename Scalar>
-struct bessel_j1_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_j1<Scalar, Scalar>::run(x);
-  }
-};
-
-template <typename Scalar>
-struct bessel_y1_retval {
-  typedef Scalar type;
-};
-
-template <typename T, typename ScalarType>
-struct generic_y1 {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T&) {
-    EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return ScalarType(0);
-  }
-};
-
-template <typename T>
-struct generic_y1<T, float> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /* j1f.c
-     *	Bessel function of second kind of order one
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double x, y, y1();
-     *
-     * y = y1( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns Bessel function of the second kind of order one
-     * of the argument.
-     *
-     * The domain is divided into the intervals [0, 2] and
-     * (2, infinity). In the first interval a rational approximation
-     * R(x) is employed to compute
-     *
-     *                  2
-     * y0(x)  =  (w - r  ) x R(x^2)  +  2/pi (ln(x) j1(x) - 1/x) .
-     *                 1
-     *
-     * Thus a call to j1() is required.
-     *
-     * In the second interval, the modulus and phase are approximated
-     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
-     * and Phase(x) = x + 1/x S(1/x^2) - 3pi/4.  Then the function is
-     *
-     *   y0(x) = Modulus(x) sin( Phase(x) ).
-     *
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Absolute error:
-     * arithmetic   domain      # trials      peak         rms
-     *    IEEE      0,  2       100000       2.2e-7     4.6e-8
-     *    IEEE      2, 32       100000       1.9e-7     5.3e-8
-     *
-     * (error criterion relative when |y1| > 1).
-     *
-     */
-
-    const float YP[] = {8.061978323326852E-009f, -9.496460629917016E-007f,
-                        6.719543806674249E-005f, -2.641785726447862E-003f,
-                        4.202369946500099E-002f};
-    const float MO1[] = {6.913942741265801E-002f, -2.284801500053359E-001f,
-                        3.138238455499697E-001f, -2.102302420403875E-001f,
-                        5.435364690523026E-003f, 1.493389585089498E-001f,
-                        4.976029650847191E-006f, 7.978845453073848E-001f};
-    const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,
-                        -2.485774108720340E+001f, 7.222973196770240E+000f,
-                        -1.544842782180211E+000f, 3.503787691653334E-001f,
-                        -1.637986776941202E-001f, 3.749989509080821E-001f};
-    const T YO1 = pset1<T>(4.66539330185668857532f);
-    const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f);    /* -3*pi/4 */
-    const T TWOOPI = pset1<T>(0.636619772367581343075535f); /* 2/pi */
-    const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
-
-    T z = pmul(x, x);
-    T x_le_two = pmul(psub(z, YO1), internal::ppolevl<T, 4>::run(z, YP));
-    x_le_two = pmadd(
-       x_le_two, x,
-       pmul(TWOOPI, pmadd(
-           generic_j1<T, float>::run(x), plog(x),
-           pdiv(pset1<T>(-1.0f), x))));
-    x_le_two = pselect(pcmp_lt(x, pset1<T>(0.0f)), NEG_MAXNUM, x_le_two);
-
-    T q = pdiv(pset1<T>(1.0), x);
-    T w = prsqrt(x);
-    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
-    w = pmul(q, q);
-    T xn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
-    T x_gt_two = pmul(p, psin(padd(xn, x)));
-    return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
-  }
-};
-
-template <typename T>
-struct generic_y1<T, double> {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T& x) {
-    /*  j1.c
-     *	Bessel function of second kind of order one
-     *
-     *
-     *
-     * SYNOPSIS:
-     *
-     * double x, y, y1();
-     *
-     * y = y1( x );
-     *
-     *
-     *
-     * DESCRIPTION:
-     *
-     * Returns Bessel function of the second kind of order one
-     * of the argument.
-     *
-     * The domain is divided into the intervals [0, 8] and
-     * (8, infinity). In the first interval a 25 term Chebyshev
-     * expansion is used, and a call to j1() is required.
-     * In the second, the asymptotic trigonometric representation
-     * is employed using two rational functions of degree 5/5.
-     *
-     *
-     *
-     * ACCURACY:
-     *
-     *                      Absolute error:
-     * arithmetic   domain      # trials      peak         rms
-     *    DEC       0, 30       10000       8.6e-17     1.3e-17
-     *    IEEE      0, 30       30000       1.0e-15     1.3e-16
-     *
-     * (error criterion relative when |y1| > 1).
-     *
-     */
-    const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2,
-                         1.12719608129684925192E0, 5.11207951146807644818E0,
-                         8.42404590141772420927E0, 5.21451598682361504063E0,
-                         1.00000000000000000254E0};
-    const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2,
-                         1.10514232634061696926E0, 5.07386386128601488557E0,
-                         8.39985554327604159757E0, 5.20982848682361821619E0,
-                         9.99999999999999997461E-1};
-    const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0,
-                         7.58238284132545283818E1, 3.66779609360150777800E2,
-                         7.10856304998926107277E2, 5.97489612400613639965E2,
-                         2.11688757100572135698E2, 2.52070205858023719784E1};
-    const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1,
-                         1.05644886038262816351E3, 4.98641058337653607651E3,
-                         9.56231892404756170795E3, 7.99704160447350683650E3,
-                         2.82619278517639096600E3, 3.36093607810698293419E2};
-    const double YP[] = {1.26320474790178026440E9, -6.47355876379160291031E11,
-                         1.14509511541823727583E14, -8.12770255501325109621E15,
-                         2.02439475713594898196E17, -7.78877196265950026825E17};
-    const double YQ[] = {1.00000000000000000000E0, 5.94301592346128195359E2,
-                         2.35564092943068577943E5, 7.34811944459721705660E7,
-                         1.87601316108706159478E10, 3.88231277496238566008E12,
-                         6.20557727146953693363E14, 6.87141087355300489866E16,
-                         3.97270608116560655612E18};
-    const T SQ2OPI = pset1<T>(.79788456080286535588);
-    const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885);    /* -3*pi/4 */
-    const T TWOOPI = pset1<T>(0.636619772367581343075535); /* 2/pi */
-    const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
-
-    T z = pmul(x, x);
-    T x_le_five = pdiv(internal::ppolevl<T, 5>::run(z, YP),
-                   internal::ppolevl<T, 8>::run(z, YQ));
-    x_le_five = pmadd(
-        x_le_five, x, pmul(
-            TWOOPI, pmadd(generic_j1<T, double>::run(x), plog(x),
-                          pdiv(pset1<T>(-1.0), x))));
-
-    x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
-    T s = pdiv(pset1<T>(25.0), z);
-    T p = pdiv(
-        internal::ppolevl<T, 6>::run(s, PP),
-        internal::ppolevl<T, 6>::run(s, PQ));
-    T q = pdiv(
-        internal::ppolevl<T, 7>::run(s, QP),
-        internal::ppolevl<T, 7>::run(s, QQ));
-    T xn = padd(x, NEG_THPIO4);
-    T w = pdiv(pset1<T>(5.0), x);
-    p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
-    T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
-    return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
-  }
-};
-
-template <typename Scalar>
-struct bessel_y1_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_y1<Scalar, Scalar>::run(x);
-  }
-};
-
-}  // end namespace internal
-
-namespace numext {
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0, Scalar)
-    bessel_i0(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_i0, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0e, Scalar)
-    bessel_i0e(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_i0e, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1, Scalar)
-    bessel_i1(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_i1, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1e, Scalar)
-    bessel_i1e(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_i1e, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0, Scalar)
-    bessel_k0(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_k0, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0e, Scalar)
-    bessel_k0e(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_k0e, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1, Scalar)
-    bessel_k1(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_k1, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1e, Scalar)
-    bessel_k1e(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_k1e, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j0, Scalar)
-    bessel_j0(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_j0, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y0, Scalar)
-    bessel_y0(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_y0, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j1, Scalar)
-    bessel_j1(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_j1, Scalar)::run(x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y1, Scalar)
-    bessel_y1(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(bessel_y1, Scalar)::run(x);
-}
-
-}  // end namespace numext
-
-}  // end namespace Eigen
-
-#endif  // EIGEN_BESSEL_FUNCTIONS_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
deleted file mode 100644
index efc6d9c8f..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
+++ /dev/null
@@ -1,130 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_BESSELFUNCTIONS_PACKETMATH_H
-#define EIGEN_BESSELFUNCTIONS_PACKETMATH_H
-
-namespace Eigen {
-
-namespace internal {
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order zero i0(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_i0(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_i0; return generic_i0<Packet, ScalarType>::run(x);
-}
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order zero i0e(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_i0e(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_i0e; return generic_i0e<Packet, ScalarType>::run(x);
-}
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order one i1(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_i1(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_i1; return generic_i1<Packet, ScalarType>::run(x);
-}
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order one i1e(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_i1e(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_i1e; return generic_i1e<Packet, ScalarType>::run(x);
-}
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order zero j0(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_j0(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_j0; return generic_j0<Packet, ScalarType>::run(x);
-}
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order zero j1(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_j1(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_j1; return generic_j1<Packet, ScalarType>::run(x);
-}
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order one y0(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_y0(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_y0; return generic_y0<Packet, ScalarType>::run(x);
-}
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order one y1(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_y1(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_y1; return generic_y1<Packet, ScalarType>::run(x);
-}
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order zero k0(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_k0(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_k0; return generic_k0<Packet, ScalarType>::run(x);
-}
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order zero k0e(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_k0e(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_k0e; return generic_k0e<Packet, ScalarType>::run(x);
-}
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order one k1e(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_k1(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_k1; return generic_k1<Packet, ScalarType>::run(x);
-}
-
-/** \internal \returns the exponentially scaled modified Bessel function of
- * order one k1e(\a a) (coeff-wise) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pbessel_k1e(const Packet& x) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_k1e; return generic_k1e<Packet, ScalarType>::run(x);
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_BESSELFUNCTIONS_PACKETMATH_H
-
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
deleted file mode 100644
index d7b231adb..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef HIP_VECTOR_COMPATIBILITY_H
-#define HIP_VECTOR_COMPATIBILITY_H
-
-namespace hip_impl {
-  template <typename, typename, unsigned int> struct Scalar_accessor;
-}   // end namespace hip_impl
-
-namespace Eigen {
-namespace internal {
-
-#define HIP_SCALAR_ACCESSOR_BUILDER(NAME) \
-template <typename T, typename U, unsigned int n> \
-struct NAME <hip_impl::Scalar_accessor<T, U, n>> : NAME <T> {};
-
-#define HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(NAME) \
-template <typename T, typename U, unsigned int n> \
-struct NAME##_impl <hip_impl::Scalar_accessor<T, U, n>> : NAME##_impl <T> {}; \
-template <typename T, typename U, unsigned int n> \
-struct NAME##_retval <hip_impl::Scalar_accessor<T, U, n>> : NAME##_retval <T> {};
-
-#define HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(NAME) \
-template <typename T, typename U, unsigned int n, IgammaComputationMode mode> \
-struct NAME <hip_impl::Scalar_accessor<T, U, n>, mode> : NAME <T, mode> {};
-
-#if EIGEN_HAS_C99_MATH
-HIP_SCALAR_ACCESSOR_BUILDER(betainc_helper)
-HIP_SCALAR_ACCESSOR_BUILDER(incbeta_cfe)
-
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erf)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erfc)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igammac)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(lgamma)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(ndtri)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(polygamma)
-
-HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_generic_impl)
-#endif
-
-HIP_SCALAR_ACCESSOR_BUILDER(digamma_impl_maybe_poly)
-HIP_SCALAR_ACCESSOR_BUILDER(zeta_impl_series)
-
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0e)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1e)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j0)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j1)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0e)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1e)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y0)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y1)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(betainc)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(digamma)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(gamma_sample_der_alpha)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma_der_a)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma)
-HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(zeta)
-
-HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_series_impl)
-HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igammac_cf_impl)
-
-}  // end namespace internal
-}  // end namespace Eigen
-
-#endif  // HIP_VECTOR_COMPATIBILITY_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
index 691ff4d03..ed415db99 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
@@ -24,7 +24,7 @@ namespace Eigen {
   * \sa Eigen::igammac(), Eigen::lgamma()
   */
 template<typename Derived,typename ExponentDerived>
-EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
 igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
@@ -33,48 +33,6 @@ igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerive
   );
 }
 
-/** \cpp11 \returns an expression of the coefficient-wise igamma_der_a(\a a, \a x) to the given arrays.
-  *
-  * This function computes the coefficient-wise derivative of the incomplete
-  * gamma function with respect to the parameter a.
-  *
-  * \note This function supports only float and double scalar types in c++11
-  * mode. To support other scalar types,
-  * or float/double in non c++11 mode, the user has to provide implementations
-  * of igamma_der_a(T,T) for any scalar
-  * type T to be supported.
-  *
-  * \sa Eigen::igamma(), Eigen::lgamma()
-  */
-template <typename Derived, typename ExponentDerived>
-EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
-igamma_der_a(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) {
-  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
-    a.derived(),
-    x.derived());
-}
-
-/** \cpp11 \returns an expression of the coefficient-wise gamma_sample_der_alpha(\a alpha, \a sample) to the given arrays.
-  *
-  * This function computes the coefficient-wise derivative of the sample
-  * of a Gamma(alpha, 1) random variable with respect to the parameter alpha.
-  *
-  * \note This function supports only float and double scalar types in c++11
-  * mode. To support other scalar types,
-  * or float/double in non c++11 mode, the user has to provide implementations
-  * of gamma_sample_der_alpha(T,T) for any scalar
-  * type T to be supported.
-  *
-  * \sa Eigen::igamma(), Eigen::lgamma()
-  */
-template <typename AlphaDerived, typename SampleDerived>
-EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>
-gamma_sample_der_alpha(const Eigen::ArrayBase<AlphaDerived>& alpha, const Eigen::ArrayBase<SampleDerived>& sample) {
-  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>, const AlphaDerived, const SampleDerived>(
-      alpha.derived(),
-      sample.derived());
-}
-
 /** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
   *
   * This function computes the coefficient-wise complementary incomplete gamma function.
@@ -86,7 +44,7 @@ gamma_sample_der_alpha(const Eigen::ArrayBase<AlphaDerived>& alpha, const Eigen:
   * \sa Eigen::igamma(), Eigen::lgamma()
   */
 template<typename Derived,typename ExponentDerived>
-EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>
 igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>(
@@ -108,7 +66,7 @@ igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDeriv
 // * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
 // * \sa ArrayBase::polygamma()
 template<typename DerivedN,typename DerivedX>
-EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>
 polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>(
@@ -128,7 +86,7 @@ polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>&
   * \sa Eigen::betainc(), Eigen::lgamma()
   */
 template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
-EIGEN_STRONG_INLINE const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
+inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>
 betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x)
 {
   return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>(
@@ -143,7 +101,7 @@ betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDeriv
   *
   * It returns the Riemann zeta function of two arguments \a x and \a q:
   *
-  * \param x is the exponent, it must be > 1
+  * \param x is the exponent, it must be > 1
   * \param q is the shift, it must be > 0
   *
   * \note This function supports only float and double scalar types. To support other scalar types, the user has
@@ -152,7 +110,7 @@ betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDeriv
   * \sa ArrayBase::zeta()
   */
 template<typename DerivedX,typename DerivedQ>
-EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
+inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
 zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
 {
   return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>(
@@ -161,7 +119,6 @@ zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q)
   );
 }
 
-
 } // end namespace Eigen
 
 #endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
index abefe99b7..d8f2363be 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -41,60 +41,6 @@ struct functor_traits<scalar_igamma_op<Scalar> > {
   };
 };
 
-/** \internal
-  * \brief Template functor to compute the derivative of the incomplete gamma
-  * function igamma_der_a(a, x)
-  *
-  * \sa class CwiseBinaryOp, Cwise::igamma_der_a
-  */
-template <typename Scalar>
-struct scalar_igamma_der_a_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_der_a_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a, const Scalar& x) const {
-    using numext::igamma_der_a;
-    return igamma_der_a(a, x);
-  }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
-    return internal::pigamma_der_a(a, x);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_igamma_der_a_op<Scalar> > {
-  enum {
-    // 2x the cost of igamma
-    Cost = 40 * NumTraits<Scalar>::MulCost + 20 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasIGammaDerA
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the derivative of the sample
-  * of a Gamma(alpha, 1) random variable with respect to the parameter alpha
-  * gamma_sample_der_alpha(alpha, sample)
-  *
-  * \sa class CwiseBinaryOp, Cwise::gamma_sample_der_alpha
-  */
-template <typename Scalar>
-struct scalar_gamma_sample_der_alpha_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_gamma_sample_der_alpha_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& alpha, const Scalar& sample) const {
-    using numext::gamma_sample_der_alpha;
-    return gamma_sample_der_alpha(alpha, sample);
-  }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& alpha, const Packet& sample) const {
-    return internal::pgamma_sample_der_alpha(alpha, sample);
-  }
-};
-template <typename Scalar>
-struct functor_traits<scalar_gamma_sample_der_alpha_op<Scalar> > {
-  enum {
-    // 2x the cost of igamma, minus the lgamma cost (the lgamma cancels out)
-    Cost = 30 * NumTraits<Scalar>::MulCost + 15 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasGammaSampleDerAlpha
-  };
-};
 
 /** \internal
   * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
@@ -155,11 +101,11 @@ struct functor_traits<scalar_betainc_op<Scalar> > {
  */
 template<typename Scalar> struct scalar_lgamma_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
     using numext::lgamma; return lgamma(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_lgamma_op<Scalar> >
@@ -177,11 +123,11 @@ struct functor_traits<scalar_lgamma_op<Scalar> >
  */
 template<typename Scalar> struct scalar_digamma_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
     using numext::digamma; return digamma(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_digamma_op<Scalar> >
@@ -199,11 +145,11 @@ struct functor_traits<scalar_digamma_op<Scalar> >
  */
 template<typename Scalar> struct scalar_zeta_op {
     EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op)
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& q) const {
+    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const {
         using numext::zeta; return zeta(x, q);
     }
     typedef typename packet_traits<Scalar>::type Packet;
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
+    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_zeta_op<Scalar> >
@@ -221,11 +167,11 @@ struct functor_traits<scalar_zeta_op<Scalar> >
  */
 template<typename Scalar> struct scalar_polygamma_op {
     EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op)
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& n, const Scalar& x) const {
+    EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const {
         using numext::polygamma; return polygamma(n, x);
     }
     typedef typename packet_traits<Scalar>::type Packet;
-    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
+    EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_polygamma_op<Scalar> >
@@ -238,40 +184,25 @@ struct functor_traits<scalar_polygamma_op<Scalar> >
 };
 
 /** \internal
- * \brief Template functor to compute the error function of a scalar
- * \sa class CwiseUnaryOp, ArrayBase::erf()
+ * \brief Template functor to compute the Gauss error function of a
+ * scalar
+ * \sa class CwiseUnaryOp, Cwise::erf()
  */
 template<typename Scalar> struct scalar_erf_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
-  operator()(const Scalar& a) const {
-    return numext::erf(a);
-  }
-  template <typename Packet>
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
-    return perf(x);
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
+    using numext::erf; return erf(a);
   }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); }
 };
-template <typename Scalar>
-struct functor_traits<scalar_erf_op<Scalar> > {
+template<typename Scalar>
+struct functor_traits<scalar_erf_op<Scalar> >
+{
   enum {
-    PacketAccess = packet_traits<Scalar>::HasErf,
-    Cost =
-        (PacketAccess
-#ifdef EIGEN_VECTORIZE_FMA
-             // TODO(rmlarsen): Move the FMA cost model to a central location.
-             // Haswell can issue 2 add/mul/madd per cycle.
-             // 10 pmadd, 2 pmul, 1 div, 2 other
-             ? (2 * NumTraits<Scalar>::AddCost +
-                7 * NumTraits<Scalar>::MulCost +
-                scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
-#else
-             ? (12 * NumTraits<Scalar>::AddCost +
-                12 * NumTraits<Scalar>::MulCost +
-                scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
-#endif
-             // Assume for simplicity that this is as expensive as an exp().
-             : (functor_traits<scalar_exp_op<Scalar> >::Cost))
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasErf
   };
 };
 
@@ -282,11 +213,11 @@ struct functor_traits<scalar_erf_op<Scalar> > {
  */
 template<typename Scalar> struct scalar_erfc_op {
   EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const {
     using numext::erfc; return erfc(a);
   }
   typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::perfc(a); }
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perfc(a); }
 };
 template<typename Scalar>
 struct functor_traits<scalar_erfc_op<Scalar> >
@@ -298,31 +229,6 @@ struct functor_traits<scalar_erfc_op<Scalar> >
   };
 };
 
-/** \internal
- * \brief Template functor to compute the Inverse of the normal distribution
- * function of a scalar
- * \sa class CwiseUnaryOp, Cwise::ndtri()
- */
-template<typename Scalar> struct scalar_ndtri_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_ndtri_op)
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const {
-    using numext::ndtri; return ndtri(a);
-  }
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pndtri(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_ndtri_op<Scalar> >
-{
-  enum {
-    // On average, We are evaluating rational functions with degree N=9 in the
-    // numerator and denominator. This results in 2*N additions and 2*N
-    // multiplications.
-    Cost = 18 * NumTraits<Scalar>::MulCost + 18 * NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasNdtri
-  };
-};
-
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
index 2a3a53168..553bcda6a 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
@@ -30,20 +30,9 @@ template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::ha
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
   return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
 }
-template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ndtri(const Eigen::half& a) {
-  return Eigen::half(Eigen::numext::ndtri(static_cast<float>(a)));
-}
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
   return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
 }
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma_der_a(const Eigen::half& a, const Eigen::half& x) {
-  return Eigen::half(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
-}
-template <>
-EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half gamma_sample_der_alpha(const Eigen::half& alpha, const Eigen::half& sample) {
-  return Eigen::half(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
-}
 template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
   return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
 }
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
index 0044b8a27..f524d7137 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -36,6 +36,66 @@ namespace internal {
 //    Good luck with your project,
 //    Steve
 
+namespace cephes {
+
+/* polevl (modified for Eigen)
+ *
+ *      Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>::run( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ *                     2          N
+ * y  =  C  + C x + C x  +...+ C x
+ *        0    1     2          N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C  , ..., coef[N] = C  .
+ *            N                   0
+ *
+ *  The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array.  Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized.  For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
+template <typename Scalar, int N>
+struct polevl {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) {
+    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N];
+  }
+};
+
+template <typename Scalar>
+struct polevl<Scalar, 0> {
+  EIGEN_DEVICE_FUNC
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) {
+    return coef[0];
+  }
+};
+
+}  // end namespace cephes
 
 /****************************************************************************
  * Implementation of lgamma, requires C++11/C99                             *
@@ -61,11 +121,9 @@ template <>
 struct lgamma_impl<float> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE float run(float x) {
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
-    int dummy;
-    return ::lgammaf_r(x, &dummy);
-#elif defined(SYCL_DEVICE_ONLY)
-    return cl::sycl::lgamma(x);
+#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+    int signgam;
+    return ::lgammaf_r(x, &signgam);
 #else
     return ::lgammaf(x);
 #endif
@@ -76,11 +134,9 @@ template <>
 struct lgamma_impl<double> {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE double run(double x) {
-#if !defined(EIGEN_GPU_COMPILE_PHASE) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
-    int dummy;
-    return ::lgamma_r(x, &dummy);
-#elif defined(SYCL_DEVICE_ONLY)
-    return cl::sycl::lgamma(x);
+#if !defined(__CUDA_ARCH__) && (defined(_BSD_SOURCE) || defined(_SVID_SOURCE)) && !defined(__APPLE__)
+    int signgam;
+    return ::lgamma_r(x, &signgam);
 #else
     return ::lgamma(x);
 #endif
@@ -135,7 +191,7 @@ struct digamma_impl_maybe_poly<float> {
     float z;
     if (s < 1.0e8f) {
       z = 1.0f / (s * s);
-      return z * internal::ppolevl<float, 3>::run(z, A);
+      return z * cephes::polevl<float, 3>::run(z, A);
     } else return 0.0f;
   }
 };
@@ -157,7 +213,7 @@ struct digamma_impl_maybe_poly<double> {
     double z;
     if (s < 1.0e17) {
       z = 1.0 / (s * s);
-      return z * internal::ppolevl<double, 6>::run(z, A);
+      return z * cephes::polevl<double, 6>::run(z, A);
     }
     else return 0.0;
   }
@@ -279,63 +335,13 @@ struct digamma_impl {
  * Implementation of erf, requires C++11/C99                                *
  ****************************************************************************/
 
-/** \internal \returns the error function of \a a (coeff-wise)
-    Doesn't do anything fancy, just a 13/8-degree rational interpolant which
-    is accurate up to a couple of ulp in the range [-4, 4], outside of which
-    fl(erf(x)) = +/-1.
-
-    This implementation works on both scalars and Ts.
-*/
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) {
-  // Clamp the inputs to the range [-4, 4] since anything outside
-  // this range is +/-1.0f in single-precision.
-  const T plus_4 = pset1<T>(4.f);
-  const T minus_4 = pset1<T>(-4.f);
-  const T x = pmax(pmin(a_x, plus_4), minus_4);
-  // The monomial coefficients of the numerator polynomial (odd).
-  const T alpha_1 = pset1<T>(-1.60960333262415e-02f);
-  const T alpha_3 = pset1<T>(-2.95459980854025e-03f);
-  const T alpha_5 = pset1<T>(-7.34990630326855e-04f);
-  const T alpha_7 = pset1<T>(-5.69250639462346e-05f);
-  const T alpha_9 = pset1<T>(-2.10102402082508e-06f);
-  const T alpha_11 = pset1<T>(2.77068142495902e-08f);
-  const T alpha_13 = pset1<T>(-2.72614225801306e-10f);
-
-  // The monomial coefficients of the denominator polynomial (even).
-  const T beta_0 = pset1<T>(-1.42647390514189e-02f);
-  const T beta_2 = pset1<T>(-7.37332916720468e-03f);
-  const T beta_4 = pset1<T>(-1.68282697438203e-03f);
-  const T beta_6 = pset1<T>(-2.13374055278905e-04f);
-  const T beta_8 = pset1<T>(-1.45660718464996e-05f);
-
-  // Since the polynomials are odd/even, we need x^2.
-  const T x2 = pmul(x, x);
-
-  // Evaluate the numerator polynomial p.
-  T p = pmadd(x2, alpha_13, alpha_11);
-  p = pmadd(x2, p, alpha_9);
-  p = pmadd(x2, p, alpha_7);
-  p = pmadd(x2, p, alpha_5);
-  p = pmadd(x2, p, alpha_3);
-  p = pmadd(x2, p, alpha_1);
-  p = pmul(x, p);
-
-  // Evaluate the denominator polynomial p.
-  T q = pmadd(x2, beta_8, beta_6);
-  q = pmadd(x2, q, beta_4);
-  q = pmadd(x2, q, beta_2);
-  q = pmadd(x2, q, beta_0);
-
-  // Divide the numerator by the denominator.
-  return pdiv(p, q);
-}
-
-template <typename T>
+template <typename Scalar>
 struct erf_impl {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE T run(const T x) {
-    return generic_fast_erf_float(x);
+  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
+    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
+                        THIS_TYPE_IS_NOT_SUPPORTED);
+    return Scalar(0);
   }
 };
 
@@ -348,25 +354,13 @@ struct erf_retval {
 template <>
 struct erf_impl<float> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE float run(float x) {
-#if defined(SYCL_DEVICE_ONLY)
-    return cl::sycl::erf(x);
-#else
-    return generic_fast_erf_float(x);
-#endif
-  }
+  static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); }
 };
 
 template <>
 struct erf_impl<double> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE double run(double x) {
-#if defined(SYCL_DEVICE_ONLY)
-    return cl::sycl::erf(x);
-#else
-    return ::erf(x);
-#endif
-  }
+  static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); }
 };
 #endif  // EIGEN_HAS_C99_MATH
 
@@ -393,269 +387,16 @@ struct erfc_retval {
 template <>
 struct erfc_impl<float> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE float run(const float x) {
-#if defined(SYCL_DEVICE_ONLY)
-    return cl::sycl::erfc(x);
-#else
-    return ::erfcf(x);
-#endif
-  }
+  static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); }
 };
 
 template <>
 struct erfc_impl<double> {
   EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE double run(const double x) {
-#if defined(SYCL_DEVICE_ONLY)
-    return cl::sycl::erfc(x);
-#else
-    return ::erfc(x);
-#endif
-  }
+  static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); }
 };
 #endif  // EIGEN_HAS_C99_MATH
 
-
-/***************************************************************************
-* Implementation of ndtri.                                                 *
-****************************************************************************/
-
-/* Inverse of Normal distribution function (modified for Eigen).
- *
- *
- * SYNOPSIS:
- *
- * double x, y, ndtri();
- *
- * x = ndtri( y );
- *
- *
- *
- * DESCRIPTION:
- *
- * Returns the argument, x, for which the area under the
- * Gaussian probability density function (integrated from
- * minus infinity to x) is equal to y.
- *
- *
- * For small arguments 0 < y < exp(-2), the program computes
- * z = sqrt( -2.0 * log(y) );  then the approximation is
- * x = z - log(z)/z  - (1/z) P(1/z) / Q(1/z).
- * There are two rational functions P/Q, one for 0 < y < exp(-32)
- * and the other for y up to exp(-2).  For larger arguments,
- * w = y - 0.5, and  x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
- *
- *
- * ACCURACY:
- *
- *                      Relative error:
- * arithmetic   domain        # trials      peak         rms
- *    DEC      0.125, 1         5500       9.5e-17     2.1e-17
- *    DEC      6e-39, 0.135     3500       5.7e-17     1.3e-17
- *    IEEE     0.125, 1        20000       7.2e-16     1.3e-16
- *    IEEE     3e-308, 0.135   50000       4.6e-16     9.8e-17
- *
- *
- * ERROR MESSAGES:
- *
- *   message         condition    value returned
- * ndtri domain       x <= 0        -MAXNUM
- * ndtri domain       x >= 1         MAXNUM
- *
- */
- /*
-   Cephes Math Library Release 2.2: June, 1992
-   Copyright 1985, 1987, 1992 by Stephen L. Moshier
-   Direct inquiries to 30 Frost Street, Cambridge, MA 02140
- */
-
-
-// TODO: Add a cheaper approximation for float.
-
-
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T flipsign(
-    const T& should_flipsign, const T& x) {
-  const T sign_mask = pset1<T>(-0.0);
-  T sign_bit = pand<T>(should_flipsign, sign_mask);
-  return pxor<T>(sign_bit, x);
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double flipsign<double>(
-    const double& should_flipsign, const double& x) {
-  return should_flipsign == 0 ? x : -x;
-}
-
-template<>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float flipsign<float>(
-    const float& should_flipsign, const float& x) {
-  return should_flipsign == 0 ? x : -x;
-}
-
-// We split this computation in to two so that in the scalar path
-// only one branch is evaluated (due to our template specialization of pselect
-// being an if statement.)
-
-template <typename T, typename ScalarType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_gt_exp_neg_two(const T& b) {
-  const ScalarType p0[] = {
-    ScalarType(-5.99633501014107895267e1),
-    ScalarType(9.80010754185999661536e1),
-    ScalarType(-5.66762857469070293439e1),
-    ScalarType(1.39312609387279679503e1),
-    ScalarType(-1.23916583867381258016e0)
-  };
-  const ScalarType q0[] = {
-    ScalarType(1.0),
-    ScalarType(1.95448858338141759834e0),
-    ScalarType(4.67627912898881538453e0),
-    ScalarType(8.63602421390890590575e1),
-    ScalarType(-2.25462687854119370527e2),
-    ScalarType(2.00260212380060660359e2),
-    ScalarType(-8.20372256168333339912e1),
-    ScalarType(1.59056225126211695515e1),
-    ScalarType(-1.18331621121330003142e0)
-  };
-  const T sqrt2pi = pset1<T>(ScalarType(2.50662827463100050242e0));
-  const T half = pset1<T>(ScalarType(0.5));
-  T c, c2, ndtri_gt_exp_neg_two;
-
-  c = psub(b, half);
-  c2 = pmul(c, c);
-  ndtri_gt_exp_neg_two = pmadd(c, pmul(
-      c2, pdiv(
-          internal::ppolevl<T, 4>::run(c2, p0),
-          internal::ppolevl<T, 8>::run(c2, q0))), c);
-  return pmul(ndtri_gt_exp_neg_two, sqrt2pi);
-}
-
-template <typename T, typename ScalarType>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two(
-    const T& b, const T& should_flipsign) {
-  /* Approximation for interval z = sqrt(-2 log a ) between 2 and 8
-   * i.e., a between exp(-2) = .135 and exp(-32) = 1.27e-14.
-   */
-  const ScalarType p1[] = {
-    ScalarType(4.05544892305962419923e0),
-    ScalarType(3.15251094599893866154e1),
-    ScalarType(5.71628192246421288162e1),
-    ScalarType(4.40805073893200834700e1),
-    ScalarType(1.46849561928858024014e1),
-    ScalarType(2.18663306850790267539e0),
-    ScalarType(-1.40256079171354495875e-1),
-    ScalarType(-3.50424626827848203418e-2),
-    ScalarType(-8.57456785154685413611e-4)
-  };
-  const ScalarType q1[] = {
-    ScalarType(1.0),
-    ScalarType(1.57799883256466749731e1),
-    ScalarType(4.53907635128879210584e1),
-    ScalarType(4.13172038254672030440e1),
-    ScalarType(1.50425385692907503408e1),
-    ScalarType(2.50464946208309415979e0),
-    ScalarType(-1.42182922854787788574e-1),
-    ScalarType(-3.80806407691578277194e-2),
-    ScalarType(-9.33259480895457427372e-4)
-  };
-  /* Approximation for interval z = sqrt(-2 log a ) between 8 and 64
-   * i.e., a between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
-   */
-  const ScalarType p2[] = {
-    ScalarType(3.23774891776946035970e0),
-    ScalarType(6.91522889068984211695e0),
-    ScalarType(3.93881025292474443415e0),
-    ScalarType(1.33303460815807542389e0),
-    ScalarType(2.01485389549179081538e-1),
-    ScalarType(1.23716634817820021358e-2),
-    ScalarType(3.01581553508235416007e-4),
-    ScalarType(2.65806974686737550832e-6),
-    ScalarType(6.23974539184983293730e-9)
-  };
-  const ScalarType q2[] = {
-    ScalarType(1.0),
-    ScalarType(6.02427039364742014255e0),
-    ScalarType(3.67983563856160859403e0),
-    ScalarType(1.37702099489081330271e0),
-    ScalarType(2.16236993594496635890e-1),
-    ScalarType(1.34204006088543189037e-2),
-    ScalarType(3.28014464682127739104e-4),
-    ScalarType(2.89247864745380683936e-6),
-    ScalarType(6.79019408009981274425e-9)
-  };
-  const T eight = pset1<T>(ScalarType(8.0));
-  const T one = pset1<T>(ScalarType(1));
-  const T neg_two = pset1<T>(ScalarType(-2));
-  T x, x0, x1, z;
-
-  x = psqrt(pmul(neg_two, plog(b)));
-  x0 = psub(x, pdiv(plog(x), x));
-  z = pdiv(one, x);
-  x1 = pmul(
-      z, pselect(
-          pcmp_lt(x, eight),
-          pdiv(internal::ppolevl<T, 8>::run(z, p1),
-               internal::ppolevl<T, 8>::run(z, q1)),
-          pdiv(internal::ppolevl<T, 8>::run(z, p2),
-               internal::ppolevl<T, 8>::run(z, q2))));
-  return flipsign(should_flipsign, psub(x0, x1));
-}
-
-template <typename T, typename ScalarType>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T generic_ndtri(const T& a) {
-  const T maxnum = pset1<T>(NumTraits<ScalarType>::infinity());
-  const T neg_maxnum = pset1<T>(-NumTraits<ScalarType>::infinity());
-
-  const T zero = pset1<T>(ScalarType(0));
-  const T one = pset1<T>(ScalarType(1));
-  // exp(-2)
-  const T exp_neg_two = pset1<T>(ScalarType(0.13533528323661269189));
-  T b, ndtri, should_flipsign;
-
-  should_flipsign = pcmp_le(a, psub(one, exp_neg_two));
-  b = pselect(should_flipsign, a, psub(one, a));
-
-  ndtri = pselect(
-      pcmp_lt(exp_neg_two, b),
-      generic_ndtri_gt_exp_neg_two<T, ScalarType>(b),
-      generic_ndtri_lt_exp_neg_two<T, ScalarType>(b, should_flipsign));
-
-  return pselect(
-      pcmp_le(a, zero), neg_maxnum,
-      pselect(pcmp_le(one, a), maxnum, ndtri));
-}
-
-template <typename Scalar>
-struct ndtri_retval {
-  typedef Scalar type;
-};
-
-#if !EIGEN_HAS_C99_MATH
-
-template <typename Scalar>
-struct ndtri_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar) {
-    EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
-                        THIS_TYPE_IS_NOT_SUPPORTED);
-    return Scalar(0);
-  }
-};
-
-# else
-
-template <typename Scalar>
-struct ndtri_impl {
-  EIGEN_DEVICE_FUNC
-  static EIGEN_STRONG_INLINE Scalar run(const Scalar x) {
-    return generic_ndtri<Scalar, Scalar>(x);
-  }
-};
-
-#endif  // EIGEN_HAS_C99_MATH
-
-
 /**************************************************************************************************************
  * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
  **************************************************************************************************************/
@@ -711,228 +452,6 @@ struct cephes_helper<double> {
   }
 };
 
-enum IgammaComputationMode { VALUE, DERIVATIVE, SAMPLE_DERIVATIVE };
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC
-static EIGEN_STRONG_INLINE Scalar main_igamma_term(Scalar a, Scalar x) {
-    /* Compute  x**a * exp(-x) / gamma(a)  */
-    Scalar logax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
-    if (logax < -numext::log(NumTraits<Scalar>::highest()) ||
-        // Assuming x and a aren't Nan.
-        (numext::isnan)(logax)) {
-      return Scalar(0);
-    }
-    return numext::exp(logax);
-}
-
-template <typename Scalar, IgammaComputationMode mode>
-EIGEN_DEVICE_FUNC
-int igamma_num_iterations() {
-  /* Returns the maximum number of internal iterations for igamma computation.
-   */
-  if (mode == VALUE) {
-    return 2000;
-  }
-
-  if (internal::is_same<Scalar, float>::value) {
-    return 200;
-  } else if (internal::is_same<Scalar, double>::value) {
-    return 500;
-  } else {
-    return 2000;
-  }
-}
-
-template <typename Scalar, IgammaComputationMode mode>
-struct igammac_cf_impl {
-  /* Computes igamc(a, x) or derivative (depending on the mode)
-   * using the continued fraction expansion of the complementary
-   * incomplete Gamma function.
-   *
-   * Preconditions:
-   *   a > 0
-   *   x >= 1
-   *   x >= a
-   */
-  EIGEN_DEVICE_FUNC
-  static Scalar run(Scalar a, Scalar x) {
-    const Scalar zero = 0;
-    const Scalar one = 1;
-    const Scalar two = 2;
-    const Scalar machep = cephes_helper<Scalar>::machep();
-    const Scalar big = cephes_helper<Scalar>::big();
-    const Scalar biginv = cephes_helper<Scalar>::biginv();
-
-    if ((numext::isinf)(x)) {
-      return zero;
-    }
-
-    Scalar ax = main_igamma_term<Scalar>(a, x);
-    // This is independent of mode. If this value is zero,
-    // then the function value is zero. If the function value is zero,
-    // then we are in a neighborhood where the function value evalutes to zero,
-    // so the derivative is zero.
-    if (ax == zero) {
-      return zero;
-    }
-
-    // continued fraction
-    Scalar y = one - a;
-    Scalar z = x + y + one;
-    Scalar c = zero;
-    Scalar pkm2 = one;
-    Scalar qkm2 = x;
-    Scalar pkm1 = x + one;
-    Scalar qkm1 = z * x;
-    Scalar ans = pkm1 / qkm1;
-
-    Scalar dpkm2_da = zero;
-    Scalar dqkm2_da = zero;
-    Scalar dpkm1_da = zero;
-    Scalar dqkm1_da = -x;
-    Scalar dans_da = (dpkm1_da - ans * dqkm1_da) / qkm1;
-
-    for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
-      c += one;
-      y += one;
-      z += two;
-
-      Scalar yc = y * c;
-      Scalar pk = pkm1 * z - pkm2 * yc;
-      Scalar qk = qkm1 * z - qkm2 * yc;
-
-      Scalar dpk_da = dpkm1_da * z - pkm1 - dpkm2_da * yc + pkm2 * c;
-      Scalar dqk_da = dqkm1_da * z - qkm1 - dqkm2_da * yc + qkm2 * c;
-
-      if (qk != zero) {
-        Scalar ans_prev = ans;
-        ans = pk / qk;
-
-        Scalar dans_da_prev = dans_da;
-        dans_da = (dpk_da - ans * dqk_da) / qk;
-
-        if (mode == VALUE) {
-          if (numext::abs(ans_prev - ans) <= machep * numext::abs(ans)) {
-            break;
-          }
-        } else {
-          if (numext::abs(dans_da - dans_da_prev) <= machep) {
-            break;
-          }
-        }
-      }
-
-      pkm2 = pkm1;
-      pkm1 = pk;
-      qkm2 = qkm1;
-      qkm1 = qk;
-
-      dpkm2_da = dpkm1_da;
-      dpkm1_da = dpk_da;
-      dqkm2_da = dqkm1_da;
-      dqkm1_da = dqk_da;
-
-      if (numext::abs(pk) > big) {
-        pkm2 *= biginv;
-        pkm1 *= biginv;
-        qkm2 *= biginv;
-        qkm1 *= biginv;
-
-        dpkm2_da *= biginv;
-        dpkm1_da *= biginv;
-        dqkm2_da *= biginv;
-        dqkm1_da *= biginv;
-      }
-    }
-
-    /* Compute  x**a * exp(-x) / gamma(a)  */
-    Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a);
-    Scalar dax_da = ax * dlogax_da;
-
-    switch (mode) {
-      case VALUE:
-        return ans * ax;
-      case DERIVATIVE:
-        return ans * dax_da + dans_da * ax;
-      case SAMPLE_DERIVATIVE:
-      default: // this is needed to suppress clang warning
-        return -(dans_da + ans * dlogax_da) * x;
-    }
-  }
-};
-
-template <typename Scalar, IgammaComputationMode mode>
-struct igamma_series_impl {
-  /* Computes igam(a, x) or its derivative (depending on the mode)
-   * using the series expansion of the incomplete Gamma function.
-   *
-   * Preconditions:
-   *   x > 0
-   *   a > 0
-   *   !(x > 1 && x > a)
-   */
-  EIGEN_DEVICE_FUNC
-  static Scalar run(Scalar a, Scalar x) {
-    const Scalar zero = 0;
-    const Scalar one = 1;
-    const Scalar machep = cephes_helper<Scalar>::machep();
-
-    Scalar ax = main_igamma_term<Scalar>(a, x);
-
-    // This is independent of mode. If this value is zero,
-    // then the function value is zero. If the function value is zero,
-    // then we are in a neighborhood where the function value evalutes to zero,
-    // so the derivative is zero.
-    if (ax == zero) {
-      return zero;
-    }
-
-    ax /= a;
-
-    /* power series */
-    Scalar r = a;
-    Scalar c = one;
-    Scalar ans = one;
-
-    Scalar dc_da = zero;
-    Scalar dans_da = zero;
-
-    for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
-      r += one;
-      Scalar term = x / r;
-      Scalar dterm_da = -x / (r * r);
-      dc_da = term * dc_da + dterm_da * c;
-      dans_da += dc_da;
-      c *= term;
-      ans += c;
-
-      if (mode == VALUE) {
-        if (c <= machep * ans) {
-          break;
-        }
-      } else {
-        if (numext::abs(dc_da) <= machep * numext::abs(dans_da)) {
-          break;
-        }
-      }
-    }
-
-    Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a + one);
-    Scalar dax_da = ax * dlogax_da;
-
-    switch (mode) {
-      case VALUE:
-        return ans * ax;
-      case DERIVATIVE:
-        return ans * dax_da + dans_da * ax;
-      case SAMPLE_DERIVATIVE:
-      default: // this is needed to suppress clang warning
-        return -(dans_da + ans * dlogax_da) * x / a;
-    }
-  }
-};
-
 #if !EIGEN_HAS_C99_MATH
 
 template <typename Scalar>
@@ -947,6 +466,8 @@ struct igammac_impl {
 
 #else
 
+template <typename Scalar> struct igamma_impl;  // predeclare igamma_impl
+
 template <typename Scalar>
 struct igammac_impl {
   EIGEN_DEVICE_FUNC
@@ -1014,15 +535,93 @@ struct igammac_impl {
       return nan;
     }
 
-    if ((numext::isnan)(a) || (numext::isnan)(x)) {  // propagate nans
-      return nan;
-    }
-
     if ((x < one) || (x < a)) {
-      return (one - igamma_series_impl<Scalar, VALUE>::run(a, x));
+      /* The checks above ensure that we meet the preconditions for
+       * igamma_impl::Impl(), so call it, rather than igamma_impl::run().
+       * Calling run() would also work, but in that case the compiler may not be
+       * able to prove that igammac_impl::run and igamma_impl::run are not
+       * mutually recursive.  This leads to worse code, particularly on
+       * platforms like nvptx, where recursion is allowed only begrudgingly.
+       */
+      return (one - igamma_impl<Scalar>::Impl(a, x));
     }
 
-    return igammac_cf_impl<Scalar, VALUE>::run(a, x);
+    return Impl(a, x);
+  }
+
+ private:
+  /* igamma_impl calls igammac_impl::Impl. */
+  friend struct igamma_impl<Scalar>;
+
+  /* Actually computes igamc(a, x).
+   *
+   * Preconditions:
+   *   a > 0
+   *   x >= 1
+   *   x >= a
+   */
+  EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar two = 2;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+    const Scalar big = cephes_helper<Scalar>::big();
+    const Scalar biginv = cephes_helper<Scalar>::biginv();
+    const Scalar inf = NumTraits<Scalar>::infinity();
+
+    Scalar ans, ax, c, yc, r, t, y, z;
+    Scalar pk, pkm1, pkm2, qk, qkm1, qkm2;
+
+    if (x == inf) return zero;  // std::isinf crashes on CUDA
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+    if (ax < -maxlog) {  // underflow
+      return zero;
+    }
+    ax = numext::exp(ax);
+
+    // continued fraction: pk/qk is the current convergent, ans the last one
+    y = one - a;
+    z = x + y + one;
+    c = zero;
+    pkm2 = one;
+    qkm2 = x;
+    pkm1 = x + one;
+    qkm1 = z * x;
+    ans = pkm1 / qkm1;
+    // Loop until the relative change t between convergents is <= machep.
+    while (true) {
+      c += one;
+      y += one;
+      z += two;
+      yc = y * c;
+      pk = pkm1 * z - pkm2 * yc;
+      qk = qkm1 * z - qkm2 * yc;
+      if (qk != zero) {
+        r = pk / qk;
+        t = numext::abs((ans - r) / r);
+        ans = r;
+      } else {
+        t = one;
+      }
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+      if (numext::abs(pk) > big) {  // scale all four terms down by biginv
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
+      }
+      if (!(t > machep)) {  // negated test also exits when t is NaN (no hang)
+        break;
+      }
+    }
+
+    return (ans * ax);  // NaN inputs propagate to a NaN result
+  }
 };
 
@@ -1032,10 +631,15 @@ struct igammac_impl {
  * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 *
  ************************************************************************************************/
 
+template <typename Scalar>
+struct igamma_retval {
+  typedef Scalar type;
+};
+
 #if !EIGEN_HAS_C99_MATH
 
-template <typename Scalar, IgammaComputationMode mode>
-struct igamma_generic_impl {
+template <typename Scalar>
+struct igamma_impl {
   EIGEN_DEVICE_FUNC
   static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) {
     EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false),
@@ -1046,17 +650,69 @@ struct igamma_generic_impl {
 
 #else
 
-template <typename Scalar, IgammaComputationMode mode>
-struct igamma_generic_impl {
+template <typename Scalar>
+struct igamma_impl {
   EIGEN_DEVICE_FUNC
   static Scalar run(Scalar a, Scalar x) {
-    /* Depending on the mode, returns
-     * - VALUE: incomplete Gamma function igamma(a, x)
-     * - DERIVATIVE: derivative of incomplete Gamma function d/da igamma(a, x)
-     * - SAMPLE_DERIVATIVE: implicit derivative of a Gamma random variable
-     * x ~ Gamma(x | a, 1), dx/da = -1 / Gamma(x | a, 1) * d igamma(a, x) / dx
+    /*	igam()
+     *	Incomplete gamma integral
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double a, x, y, igam();
+     *
+     * y = igam( a, x );
+     *
+     * DESCRIPTION:
+     *
+     * The function is defined by
+     *
+     *                           x
+     *                            -
+     *                   1       | |  -t  a-1
+     *  igam(a,x)  =   -----     |   e   t   dt.
+     *                  -      | |
+     *                 | (a)    -
+     *                           0
+     *
+     *
+     * In this implementation both arguments must be positive.
+     * The integral is evaluated by either a power series or
+     * continued fraction expansion, depending on the relative
+     * values of a and x.
+     *
+     * ACCURACY (double):
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30       200000       3.6e-14     2.9e-15
+     *    IEEE      0,100      300000       9.9e-14     1.5e-14
+     *
+     *
+     * ACCURACY (float):
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        20000       7.8e-6      5.9e-7
+     *
+     */
+    /*
+      Cephes Math Library Release 2.2: June, 1992
+      Copyright 1985, 1987, 1992 by Stephen L. Moshier
+      Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+    */
+
+
+    /* left tail of incomplete gamma function:
+     *
+     *          inf.      k
+     *   a  -x   -       x
+     *  x  e     >   ----------
+     *           -     -
+     *          k=0   | (a+k+1)
      *
-     * Derivatives are implemented by forward-mode differentiation.
      */
     const Scalar zero = 0;
     const Scalar one = 1;
@@ -1068,167 +724,67 @@ struct igamma_generic_impl {
       return nan;
     }
 
-    if ((numext::isnan)(a) || (numext::isnan)(x)) {  // propagate nans
-      return nan;
+    if ((x > one) && (x > a)) {
+      /* The checks above ensure that we meet the preconditions for
+       * igammac_impl::Impl(), so call it, rather than igammac_impl::run().
+       * Calling run() would also work, but in that case the compiler may not be
+       * able to prove that igammac_impl::run and igamma_impl::run are not
+       * mutually recursive.  This leads to worse code, particularly on
+       * platforms like nvptx, where recursion is allowed only begrudgingly.
+       */
+      return (one - igammac_impl<Scalar>::Impl(a, x));
     }
 
-    if ((x > one) && (x > a)) {
-      Scalar ret = igammac_cf_impl<Scalar, mode>::run(a, x);
-      if (mode == VALUE) {
-        return one - ret;
-      } else {
-        return -ret;
+    return Impl(a, x);
+  }
+
+ private:
+  /* igammac_impl calls igamma_impl::Impl. */
+  friend struct igammac_impl<Scalar>;
+
+  /* Actually computes igam(a, x).
+   *
+   * Preconditions:
+   *   x > 0
+   *   a > 0
+   *   !(x > 1 && x > a)
+   */
+  EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar maxlog = numext::log(NumTraits<Scalar>::highest());
+
+    Scalar ans, ax, c, r;
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+    if (ax < -maxlog) {
+      // underflow
+      return zero;
+    }
+    ax = numext::exp(ax);
+
+    /* power series: c is the current term, ans the running sum */
+    r = a;
+    c = one;
+    ans = one;
+
+    while (true) {
+      r += one;
+      c *= x/r;
+      ans += c;
+      if (!(c/ans > machep)) {  // negated test also exits on NaN (no hang)
+        break;
       }
     }
 
-    return igamma_series_impl<Scalar, mode>::run(a, x);
+    return (ans * ax / a);
   }
 };
 
 #endif  // EIGEN_HAS_C99_MATH
 
-template <typename Scalar>
-struct igamma_retval {
-  typedef Scalar type;
-};
-
-template <typename Scalar>
-struct igamma_impl : igamma_generic_impl<Scalar, VALUE> {
-  /* igam()
-   * Incomplete gamma integral.
-   *
-   * The CDF of Gamma(a, 1) random variable at the point x.
-   *
-   * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
-   * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
-   * The ground truth is computed by mpmath. Mean absolute error:
-   * float: 1.26713e-05
-   * double: 2.33606e-12
-   *
-   * Cephes documentation below.
-   *
-   * SYNOPSIS:
-   *
-   * double a, x, y, igam();
-   *
-   * y = igam( a, x );
-   *
-   * DESCRIPTION:
-   *
-   * The function is defined by
-   *
-   *                           x
-   *                            -
-   *                   1       | |  -t  a-1
-   *  igam(a,x)  =   -----     |   e   t   dt.
-   *                  -      | |
-   *                 | (a)    -
-   *                           0
-   *
-   *
-   * In this implementation both arguments must be positive.
-   * The integral is evaluated by either a power series or
-   * continued fraction expansion, depending on the relative
-   * values of a and x.
-   *
-   * ACCURACY (double):
-   *
-   *                      Relative error:
-   * arithmetic   domain     # trials      peak         rms
-   *    IEEE      0,30       200000       3.6e-14     2.9e-15
-   *    IEEE      0,100      300000       9.9e-14     1.5e-14
-   *
-   *
-   * ACCURACY (float):
-   *
-   *                      Relative error:
-   * arithmetic   domain     # trials      peak         rms
-   *    IEEE      0,30        20000       7.8e-6      5.9e-7
-   *
-   */
-  /*
-    Cephes Math Library Release 2.2: June, 1992
-    Copyright 1985, 1987, 1992 by Stephen L. Moshier
-    Direct inquiries to 30 Frost Street, Cambridge, MA 02140
-  */
-
-  /* left tail of incomplete gamma function:
-   *
-   *          inf.      k
-   *   a  -x   -       x
-   *  x  e     >   ----------
-   *           -     -
-   *          k=0   | (a+k+1)
-   *
-   */
-};
-
-template <typename Scalar>
-struct igamma_der_a_retval : igamma_retval<Scalar> {};
-
-template <typename Scalar>
-struct igamma_der_a_impl : igamma_generic_impl<Scalar, DERIVATIVE> {
-  /* Derivative of the incomplete Gamma function with respect to a.
-   *
-   * Computes d/da igamma(a, x) by forward differentiation of the igamma code.
-   *
-   * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
-   * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
-   * The ground truth is computed by mpmath. Mean absolute error:
-   * float: 6.17992e-07
-   * double: 4.60453e-12
-   *
-   * Reference:
-   * R. Moore. "Algorithm AS 187: Derivatives of the incomplete gamma
-   * integral". Journal of the Royal Statistical Society. 1982
-   */
-};
-
-template <typename Scalar>
-struct gamma_sample_der_alpha_retval : igamma_retval<Scalar> {};
-
-template <typename Scalar>
-struct gamma_sample_der_alpha_impl
-    : igamma_generic_impl<Scalar, SAMPLE_DERIVATIVE> {
-  /* Derivative of a Gamma random variable sample with respect to alpha.
-   *
-   * Consider a sample of a Gamma random variable with the concentration
-   * parameter alpha: sample ~ Gamma(alpha, 1). The reparameterization
-   * derivative that we want to compute is dsample / dalpha =
-   * d igammainv(alpha, u) / dalpha, where u = igamma(alpha, sample).
-   * However, this formula is numerically unstable and expensive, so instead
-   * we use implicit differentiation:
-   *
-   * igamma(alpha, sample) = u, where u ~ Uniform(0, 1).
-   * Apply d / dalpha to both sides:
-   * d igamma(alpha, sample) / dalpha
-   *     + d igamma(alpha, sample) / dsample * dsample/dalpha  = 0
-   * d igamma(alpha, sample) / dalpha
-   *     + Gamma(sample | alpha, 1) dsample / dalpha = 0
-   * dsample/dalpha = - (d igamma(alpha, sample) / dalpha)
-   *                   / Gamma(sample | alpha, 1)
-   *
-   * Here Gamma(sample | alpha, 1) is the PDF of the Gamma distribution
-   * (note that the derivative of the CDF w.r.t. sample is the PDF).
-   * See the reference below for more details.
-   *
-   * The derivative of igamma(alpha, sample) is computed by forward
-   * differentiation of the igamma code. Division by the Gamma PDF is performed
-   * in the same code, increasing the accuracy and speed due to cancellation
-   * of some terms.
-   *
-   * Accuracy estimation. For each alpha in [10^-2, 10^-1...10^3] we sample
-   * 50 Gamma random variables sample ~ Gamma(sample | alpha, 1), a total of 300
-   * points. The ground truth is computed by mpmath. Mean absolute error:
-   * float: 2.1686e-06
-   * double: 1.4774e-12
-   *
-   * Reference:
-   * M. Figurnov, S. Mohamed, A. Mnih "Implicit Reparameterization Gradients".
-   * 2018
-   */
-};
-
 /*****************************************************************************
  * Implementation of Riemann zeta function of two arguments, based on Cephes *
  *****************************************************************************/
@@ -1836,7 +1392,7 @@ struct betainc_helper<double> {
     if ((a + b) < maxgam && numext::abs(u) < maxlog) {
       t = gamma(a + b) / (gamma(a) * gamma(b));
       s = s * t * pow(x, a);
-    }
+    } else {
     */
     t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) -
         lgamma_impl<double>::run(b) + u + numext::log(s);
@@ -1983,30 +1539,12 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar)
   return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
 }
 
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(ndtri, Scalar)
-    ndtri(const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(ndtri, Scalar)::run(x);
-}
-
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar)
     igamma(const Scalar& a, const Scalar& x) {
   return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x);
 }
 
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma_der_a, Scalar)
-    igamma_der_a(const Scalar& a, const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(igamma_der_a, Scalar)::run(a, x);
-}
-
-template <typename Scalar>
-EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(gamma_sample_der_alpha, Scalar)
-    gamma_sample_der_alpha(const Scalar& a, const Scalar& x) {
-  return EIGEN_MATHFUNC_IMPL(gamma_sample_der_alpha, Scalar)::run(a, x);
-}
-
 template <typename Scalar>
 EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar)
     igammac(const Scalar& a, const Scalar& x) {
@@ -2020,6 +1558,8 @@ EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar)
 }
 
 }  // end namespace numext
+
+
 }  // end namespace Eigen
 
 #endif  // EIGEN_SPECIAL_FUNCTIONS_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
index 2bb017921..46d60d323 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@@ -38,32 +38,10 @@ Packet perf(const Packet& a) { using numext::erf; return erf(a); }
 template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
 Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
 
-/** \internal \returns the ndtri(\a a) (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pndtri(const Packet& a) {
-  typedef typename unpacket_traits<Packet>::type ScalarType;
-  using internal::generic_ndtri; return generic_ndtri<Packet, ScalarType>(a);
-}
-
 /** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
 
-/** \internal \returns the derivative of the incomplete gamma function
- * igamma_der_a(\a a, \a x) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma_der_a(const Packet& a, const Packet& x) {
-  using numext::igamma_der_a; return igamma_der_a(a, x);
-}
-
-/** \internal \returns compute the derivative of the sample
-  * of Gamma(alpha, 1) random variable with respect to the parameter a
-  * gamma_sample_der_alpha(\a alpha, \a sample) */
-template <typename Packet>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pgamma_sample_der_alpha(const Packet& alpha, const Packet& sample) {
-  using numext::gamma_sample_der_alpha; return gamma_sample_der_alpha(alpha, sample);
-}
-
 /** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
 template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
 Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
@@ -77,3 +55,4 @@ Packet pbetainc(const Packet& a, const Packet& b,const Packet& x) { using numext
 } // end namespace Eigen
 
 #endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
new file mode 100644
index 000000000..ec4fa8448
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
@@ -0,0 +1,165 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H
+#define EIGEN_CUDA_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU)
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 plgamma<float4>(const float4& a)
+{
+  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 plgamma<double2>(const double2& a)
+{
+  using numext::lgamma;
+  return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pdigamma<float4>(const float4& a)
+{
+  using numext::digamma;
+  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pdigamma<double2>(const double2& a)
+{
+  using numext::digamma;
+  return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pzeta<float4>(const float4& x, const float4& q)
+{
+    using numext::zeta;
+    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pzeta<double2>(const double2& x, const double2& q)
+{
+    using numext::zeta;
+    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 ppolygamma<float4>(const float4& n, const float4& x)
+{
+    using numext::polygamma;
+    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 ppolygamma<double2>(const double2& n, const double2& x)
+{
+    using numext::polygamma;
+    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perf<float4>(const float4& a)
+{
+  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perf<double2>(const double2& a)
+{
+  using numext::erf;
+  return make_double2(erf(a.x), erf(a.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 perfc<float4>(const float4& a)
+{
+  using numext::erfc;
+  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 perfc<double2>(const double2& a)
+{
+  using numext::erfc;
+  return make_double2(erfc(a.x), erfc(a.y));
+}
+
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigamma<float4>(const float4& a, const float4& x)
+{
+  using numext::igamma;
+  return make_float4(
+      igamma(a.x, x.x),
+      igamma(a.y, x.y),
+      igamma(a.z, x.z),
+      igamma(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigamma<double2>(const double2& a, const double2& x)
+{
+  using numext::igamma;
+  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pigammac<float4>(const float4& a, const float4& x)
+{
+  using numext::igammac;
+  return make_float4(
+      igammac(a.x, x.x),
+      igammac(a.y, x.y),
+      igammac(a.z, x.z),
+      igammac(a.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pigammac<double2>(const double2& a, const double2& x)
+{
+  using numext::igammac;
+  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
+{
+  using numext::betainc;
+  return make_float4(
+      betainc(a.x, b.x, x.x),
+      betainc(a.y, b.y, x.y),
+      betainc(a.z, b.z, x.z),
+      betainc(a.w, b.w, x.w));
+}
+
+template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
+{
+  using numext::betainc;
+  return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
+}
+
+#endif
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
deleted file mode 100644
index dd3bf4dd1..000000000
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h
+++ /dev/null
@@ -1,369 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_GPU_SPECIALFUNCTIONS_H
-#define EIGEN_GPU_SPECIALFUNCTIONS_H
-
-namespace Eigen {
-
-namespace internal {
-
-// Make sure this is only available when targeting a GPU: we don't want to
-// introduce conflicts between these packet_traits definitions and the ones
-// we'll use on the host side (SSE, AVX, ...)
-#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 plgamma<float4>(const float4& a)
-{
-  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 plgamma<double2>(const double2& a)
-{
-  using numext::lgamma;
-  return make_double2(lgamma(a.x), lgamma(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pdigamma<float4>(const float4& a)
-{
-  using numext::digamma;
-  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pdigamma<double2>(const double2& a)
-{
-  using numext::digamma;
-  return make_double2(digamma(a.x), digamma(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pzeta<float4>(const float4& x, const float4& q)
-{
-    using numext::zeta;
-    return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pzeta<double2>(const double2& x, const double2& q)
-{
-    using numext::zeta;
-    return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 ppolygamma<float4>(const float4& n, const float4& x)
-{
-    using numext::polygamma;
-    return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 ppolygamma<double2>(const double2& n, const double2& x)
-{
-    using numext::polygamma;
-    return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perf<float4>(const float4& a)
-{
-  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perf<double2>(const double2& a)
-{
-  using numext::erf;
-  return make_double2(erf(a.x), erf(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 perfc<float4>(const float4& a)
-{
-  using numext::erfc;
-  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 perfc<double2>(const double2& a)
-{
-  using numext::erfc;
-  return make_double2(erfc(a.x), erfc(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pndtri<float4>(const float4& a)
-{
-  using numext::ndtri;
-  return make_float4(ndtri(a.x), ndtri(a.y), ndtri(a.z), ndtri(a.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pndtri<double2>(const double2& a)
-{
-  using numext::ndtri;
-  return make_double2(ndtri(a.x), ndtri(a.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pigamma<float4>(const float4& a, const float4& x)
-{
-  using numext::igamma;
-  return make_float4(
-      igamma(a.x, x.x),
-      igamma(a.y, x.y),
-      igamma(a.z, x.z),
-      igamma(a.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pigamma<double2>(const double2& a, const double2& x)
-{
-  using numext::igamma;
-  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigamma_der_a<float4>(
-    const float4& a, const float4& x) {
-  using numext::igamma_der_a;
-  return make_float4(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y),
-                     igamma_der_a(a.z, x.z), igamma_der_a(a.w, x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pigamma_der_a<double2>(const double2& a, const double2& x) {
-  using numext::igamma_der_a;
-  return make_double2(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pgamma_sample_der_alpha<float4>(
-    const float4& alpha, const float4& sample) {
-  using numext::gamma_sample_der_alpha;
-  return make_float4(
-      gamma_sample_der_alpha(alpha.x, sample.x),
-      gamma_sample_der_alpha(alpha.y, sample.y),
-      gamma_sample_der_alpha(alpha.z, sample.z),
-      gamma_sample_der_alpha(alpha.w, sample.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pgamma_sample_der_alpha<double2>(const double2& alpha, const double2& sample) {
-  using numext::gamma_sample_der_alpha;
-  return make_double2(
-      gamma_sample_der_alpha(alpha.x, sample.x),
-      gamma_sample_der_alpha(alpha.y, sample.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pigammac<float4>(const float4& a, const float4& x)
-{
-  using numext::igammac;
-  return make_float4(
-      igammac(a.x, x.x),
-      igammac(a.y, x.y),
-      igammac(a.z, x.z),
-      igammac(a.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pigammac<double2>(const double2& a, const double2& x)
-{
-  using numext::igammac;
-  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x)
-{
-  using numext::betainc;
-  return make_float4(
-      betainc(a.x, b.x, x.x),
-      betainc(a.y, b.y, x.y),
-      betainc(a.z, b.z, x.z),
-      betainc(a.w, b.w, x.w));
-}
-
-template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x)
-{
-  using numext::betainc;
-  return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0e<float4>(const float4& x) {
-  using numext::bessel_i0e;
-  return make_float4(bessel_i0e(x.x), bessel_i0e(x.y), bessel_i0e(x.z), bessel_i0e(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_i0e<double2>(const double2& x) {
-  using numext::bessel_i0e;
-  return make_double2(bessel_i0e(x.x), bessel_i0e(x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0<float4>(const float4& x) {
-  using numext::bessel_i0;
-  return make_float4(bessel_i0(x.x), bessel_i0(x.y), bessel_i0(x.z), bessel_i0(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_i0<double2>(const double2& x) {
-  using numext::bessel_i0;
-  return make_double2(bessel_i0(x.x), bessel_i0(x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1e<float4>(const float4& x) {
-  using numext::bessel_i1e;
-  return make_float4(bessel_i1e(x.x), bessel_i1e(x.y), bessel_i1e(x.z), bessel_i1e(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_i1e<double2>(const double2& x) {
-  using numext::bessel_i1e;
-  return make_double2(bessel_i1e(x.x), bessel_i1e(x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1<float4>(const float4& x) {
-  using numext::bessel_i1;
-  return make_float4(bessel_i1(x.x), bessel_i1(x.y), bessel_i1(x.z), bessel_i1(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_i1<double2>(const double2& x) {
-  using numext::bessel_i1;
-  return make_double2(bessel_i1(x.x), bessel_i1(x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0e<float4>(const float4& x) {
-  using numext::bessel_k0e;
-  return make_float4(bessel_k0e(x.x), bessel_k0e(x.y), bessel_k0e(x.z), bessel_k0e(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_k0e<double2>(const double2& x) {
-  using numext::bessel_k0e;
-  return make_double2(bessel_k0e(x.x), bessel_k0e(x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0<float4>(const float4& x) {
-  using numext::bessel_k0;
-  return make_float4(bessel_k0(x.x), bessel_k0(x.y), bessel_k0(x.z), bessel_k0(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_k0<double2>(const double2& x) {
-  using numext::bessel_k0;
-  return make_double2(bessel_k0(x.x), bessel_k0(x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1e<float4>(const float4& x) {
-  using numext::bessel_k1e;
-  return make_float4(bessel_k1e(x.x), bessel_k1e(x.y), bessel_k1e(x.z), bessel_k1e(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_k1e<double2>(const double2& x) {
-  using numext::bessel_k1e;
-  return make_double2(bessel_k1e(x.x), bessel_k1e(x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1<float4>(const float4& x) {
-  using numext::bessel_k1;
-  return make_float4(bessel_k1(x.x), bessel_k1(x.y), bessel_k1(x.z), bessel_k1(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_k1<double2>(const double2& x) {
-  using numext::bessel_k1;
-  return make_double2(bessel_k1(x.x), bessel_k1(x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j0<float4>(const float4& x) {
-  using numext::bessel_j0;
-  return make_float4(bessel_j0(x.x), bessel_j0(x.y), bessel_j0(x.z), bessel_j0(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_j0<double2>(const double2& x) {
-  using numext::bessel_j0;
-  return make_double2(bessel_j0(x.x), bessel_j0(x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j1<float4>(const float4& x) {
-  using numext::bessel_j1;
-  return make_float4(bessel_j1(x.x), bessel_j1(x.y), bessel_j1(x.z), bessel_j1(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_j1<double2>(const double2& x) {
-  using numext::bessel_j1;
-  return make_double2(bessel_j1(x.x), bessel_j1(x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y0<float4>(const float4& x) {
-  using numext::bessel_y0;
-  return make_float4(bessel_y0(x.x), bessel_y0(x.y), bessel_y0(x.z), bessel_y0(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_y0<double2>(const double2& x) {
-  using numext::bessel_y0;
-  return make_double2(bessel_y0(x.x), bessel_y0(x.y));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y1<float4>(const float4& x) {
-  using numext::bessel_y1;
-  return make_float4(bessel_y1(x.x), bessel_y1(x.y), bessel_y1(x.z), bessel_y1(x.w));
-}
-
-template <>
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
-pbessel_y1<double2>(const double2& x) {
-  using numext::bessel_y1;
-  return make_double2(bessel_y1(x.x), bessel_y1(x.y));
-}
-
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_GPU_SPECIALFUNCTIONS_H
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/Spline.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/Spline.h
index 79edd52ce..f1ae92d79 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/Spline.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/Spline.h
@@ -255,7 +255,7 @@ namespace Eigen
     const KnotVectorType& U = knots;
 
     BasisVectorType left(p+1); left(0) = Scalar(0);
-    BasisVectorType right(p+1); right(0) = Scalar(0);
+    BasisVectorType right(p+1); right(0) = Scalar(0);        
 
     VectorBlock<BasisVectorType,Degree>(left,1,p) = u - VectorBlock<const KnotVectorType,Degree>(U,i+1-p,p).reverse();
     VectorBlock<BasisVectorType,Degree>(right,1,p) = VectorBlock<const KnotVectorType,Degree>(U,i+1,p) - u;
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/SplineFitting.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/SplineFitting.h
index 9f6e8afa0..c761a9b3d 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/SplineFitting.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/SplineFitting.h
@@ -17,8 +17,8 @@
 
 #include "SplineFwd.h"
 
-#include "../../../../Eigen/LU"
-#include "../../../../Eigen/QR"
+#include <Eigen/LU>
+#include <Eigen/QR>
 
 namespace Eigen
 {
@@ -181,7 +181,7 @@ namespace Eigen
    * \ingroup Splines_Module
    *
    * \param[in] pts The data points to which a spline should be fit.
-   * \param[out] chord_lengths The resulting chord length vector.
+   * \param[out] chord_lengths The resulting chord length vector.
    *
    * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
    **/   
@@ -385,7 +385,7 @@ namespace Eigen
     {
       const DenseIndex span = SplineType::Span(parameters[i], degree, knots);
 
-      if (derivativeIndex < derivativeIndices.size() && derivativeIndices[derivativeIndex] == i)
+      if (derivativeIndex < derivativeIndices.size() && derivativeIndices[derivativeIndex] == i) // bounds-check before indexing derivativeIndices
       {
         A.block(row, span - degree, 2, degree + 1)
           = SplineType::BasisFunctionDerivatives(parameters[i], 1, degree, knots);
@@ -395,9 +395,9 @@ namespace Eigen
       }
       else
       {
         A.row(row).segment(span - degree, degree + 1)
           = SplineType::BasisFunctions(parameters[i], degree, knots);
-        b.col(row++) = points.col(i);
+        b.col(row++) = points.col(i); // right-hand side for the row of A written above; also advances row
       }
     }
     b.col(0) = points.col(0);
diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/SplineFwd.h b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/SplineFwd.h
index 00d6b4921..0a95fbf3e 100644
--- a/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/SplineFwd.h
+++ b/uppsrc/plugin/Eigen/unsupported/Eigen/src/Splines/SplineFwd.h
@@ -10,7 +10,7 @@
 #ifndef EIGEN_SPLINES_FWD_H
 #define EIGEN_SPLINES_FWD_H
 
-#include "../../../../Eigen/Core"
+#include <Eigen/Core>
 
 namespace Eigen
 {
diff --git a/uppsrc/plugin/Eigen/unsupported/README.txt b/uppsrc/plugin/Eigen/unsupported/README.txt
index 70793bf13..83479ff0b 100644
--- a/uppsrc/plugin/Eigen/unsupported/README.txt
+++ b/uppsrc/plugin/Eigen/unsupported/README.txt
@@ -20,7 +20,7 @@ However, it:
  - must rely on Eigen,
  - must be highly related to math,
  - should have some general purpose in the sense that it could
-   potentially become an official Eigen module (or be merged into another one).
+   potentially become an official Eigen module (or be merged into another one).
 
 In doubt feel free to contact us. For instance, if your addons is very too specific
 but it shows an interesting way of using Eigen, then it could be a nice demo.
diff --git a/uppsrc/plugin/Eigen/unsupported/bench/bench_svd.cpp b/uppsrc/plugin/Eigen/unsupported/bench/bench_svd.cpp
new file mode 100644
index 000000000..01d8231ae
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/bench/bench_svd.cpp
@@ -0,0 +1,123 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
+// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
+// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
+// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/
+
+// Bench to compare the efficiency of SVD algorithms
+
+#include <iostream>
+#include <bench/BenchTimer.h>
+#include <unsupported/Eigen/SVD>
+
+
+using namespace Eigen;
+using namespace std;
+
+// number of computations of each algorithm before the print of the time
+#ifndef REPEAT
+#define REPEAT 10
+#endif
+
+// number of tests of the same type
+#ifndef NUMBER_SAMPLE
+#define NUMBER_SAMPLE 2
+#endif
+
+template<typename MatrixType>
+void bench_svd(const MatrixType& a = MatrixType())
+{
+  MatrixType m = MatrixType::Random(a.rows(), a.cols());
+  BenchTimer timerJacobi;
+  BenchTimer timerBDC;
+  timerJacobi.reset();
+  timerBDC.reset();
+
+  cout << " Only compute Singular Values" <<endl;
+  for (int k=1; k<=NUMBER_SAMPLE; ++k)
+  {
+    timerBDC.start();
+    for (int i=0; i<REPEAT; ++i) 
+    {
+      BDCSVD<MatrixType> bdc_matrix(m);
+    }
+    timerBDC.stop();
+    
+    timerJacobi.start();
+    for (int i=0; i<REPEAT; ++i) 
+    {
+      JacobiSVD<MatrixType> jacobi_matrix(m);
+    }
+    timerJacobi.stop();
+
+
+    cout << "Sample " << k << " : " << REPEAT << " computations :  Jacobi : " << fixed << timerJacobi.value() << "s ";
+    cout << " || " << " BDC : " << timerBDC.value() << "s " <<endl <<endl;
+      
+    if (timerBDC.value() >= timerJacobi.value())  
+      cout << "KO : BDC is " <<  timerJacobi.value() / timerBDC.value() << "  times faster than Jacobi" <<endl;
+    else 
+      cout << "OK : BDC is " << timerJacobi.value() / timerBDC.value() << "  times faster than Jacobi"  <<endl;
+      
+  }
+  cout << "       =================" <<endl;
+  std::cout<< std::endl;
+  timerJacobi.reset();
+  timerBDC.reset();
+  cout << " Computes rotation matrix" <<endl;
+  for (int k=1; k<=NUMBER_SAMPLE; ++k)
+  {
+    timerBDC.start();
+    for (int i=0; i<REPEAT; ++i) 
+    {
+      BDCSVD<MatrixType> bdc_matrix(m, ComputeFullU|ComputeFullV);
+    }
+    timerBDC.stop();
+    
+    timerJacobi.start();
+    for (int i=0; i<REPEAT; ++i) 
+    {
+      JacobiSVD<MatrixType> jacobi_matrix(m, ComputeFullU|ComputeFullV);
+    }
+    timerJacobi.stop();
+
+
+    cout << "Sample " << k << " : " << REPEAT << " computations :  Jacobi : " << fixed << timerJacobi.value() << "s ";
+    cout << " || " << " BDC : " << timerBDC.value() << "s " <<endl <<endl;
+      
+    if (timerBDC.value() >= timerJacobi.value())  
+      cout << "KO : BDC is " <<  timerJacobi.value() / timerBDC.value() << "  times faster than Jacobi" <<endl;
+    else 
+      cout << "OK : BDC is " << timerJacobi.value() / timerBDC.value() << "  times faster than Jacobi"  <<endl;
+      
+  }
+  std::cout<< std::endl;
+}
+
+
+
+int main(int argc, char* argv[])
+{
+  std::cout<< std::endl;
+
+  std::cout<<"On a (Dynamic, Dynamic) (6, 6) Matrix" <<std::endl;
+  bench_svd<Matrix<double,Dynamic,Dynamic> >(Matrix<double,Dynamic,Dynamic>(6, 6));
+  
+  std::cout<<"On a (Dynamic, Dynamic) (32, 32) Matrix" <<std::endl;
+  bench_svd<Matrix<double,Dynamic,Dynamic> >(Matrix<double,Dynamic,Dynamic>(32, 32));
+
+  //std::cout<<"On a (Dynamic, Dynamic) (128, 128) Matrix" <<std::endl;
+  //bench_svd<Matrix<double,Dynamic,Dynamic> >(Matrix<double,Dynamic,Dynamic>(128, 128));
+
+  std::cout<<"On a (Dynamic, Dynamic) (160, 160) Matrix" <<std::endl;
+  bench_svd<Matrix<double,Dynamic,Dynamic> >(Matrix<double,Dynamic,Dynamic>(160, 160));
+  
+  std::cout<< "--------------------------------------------------------------------"<< std::endl;
+           
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/CMakeLists.txt b/uppsrc/plugin/Eigen/unsupported/doc/CMakeLists.txt
new file mode 100644
index 000000000..9e9ab9800
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/CMakeLists.txt
@@ -0,0 +1,4 @@
+set_directory_properties(PROPERTIES EXCLUDE_FROM_ALL TRUE)
+
+add_subdirectory(examples)
+add_subdirectory(snippets)
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/Overview.dox b/uppsrc/plugin/Eigen/unsupported/doc/Overview.dox
new file mode 100644
index 000000000..45464a545
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/Overview.dox
@@ -0,0 +1,28 @@
+/// \brief Namespace containing all symbols from the %Eigen library.
+namespace Eigen {
+
+/** \mainpage %Eigen's unsupported modules
+
+This is the API documentation for %Eigen's unsupported modules.
+
+These modules are contributions from various users. They are provided "as is", without any support.
+
+Click on the \e Modules tab at the top of this page to get a list of all unsupported modules.
+
+Don't miss the <a href="../index.html">official Eigen documentation</a>.
+
+*/
+
+/*
+
+\defgroup Unsupported_modules Unsupported modules
+
+The unsupported modules are contributions from various users. They are
+provided "as is", without any support. Nevertheless, some of them are
+subject to be included in %Eigen in the future.
+
+*/
+
+/// \internal \brief Namespace containing low-level routines from the %Eigen library.
+namespace internal {}
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/eigendoxy_layout.xml.in b/uppsrc/plugin/Eigen/unsupported/doc/eigendoxy_layout.xml.in
new file mode 100644
index 000000000..c93621ed3
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/eigendoxy_layout.xml.in
@@ -0,0 +1,177 @@
+<?xml version="1.0"?>
+<doxygenlayout version="1.0">
+  <!-- Navigation index tabs for HTML output -->
+  <navindex>
+    <tab type="user" url="index.html" title="Overview" />
+    <tab type="modules" visible="yes" title="Unsupported Modules" intro=""/>
+<!--     <tab type="mainpage" visible="yes" title=""/> -->
+    <tab type="classlist" visible="yes" title="" intro=""/>
+<!--     <tab type="classmembers" visible="yes" title="" intro=""/> -->
+  </navindex>
+
+  <!-- Layout definition for a class page -->
+  <class>
+    <briefdescription visible="no"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <detaileddescription title=""/>
+    <inheritancegraph visible="$CLASS_GRAPH"/>
+    <collaborationgraph visible="$COLLABORATION_GRAPH"/>
+    <allmemberslink visible="yes"/>
+    <memberdecl>
+      <nestedclasses visible="yes" title=""/>
+      <publictypes title=""/>
+      <publicslots title=""/>
+      <signals title=""/>
+      <publicmethods title=""/>
+      <publicstaticmethods title=""/>
+      <publicattributes title=""/>
+      <publicstaticattributes title=""/>
+      <protectedtypes title=""/>
+      <protectedslots title=""/>
+      <protectedmethods title=""/>
+      <protectedstaticmethods title=""/>
+      <protectedattributes title=""/>
+      <protectedstaticattributes title=""/>
+      <packagetypes title=""/>
+      <packagemethods title=""/>
+      <packagestaticmethods title=""/>
+      <packageattributes title=""/>
+      <packagestaticattributes title=""/>
+      <properties title=""/>
+      <events title=""/>
+      <privatetypes title=""/>
+      <privateslots title=""/>
+      <privatemethods title=""/>
+      <privatestaticmethods title=""/>
+      <privateattributes title=""/>
+      <privatestaticattributes title=""/>
+      <friends title=""/>
+      <related title="" subtitle=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <constructors title=""/>
+      <functions title=""/>
+      <related title=""/>
+      <variables title=""/>
+      <properties title=""/>
+      <events title=""/>
+    </memberdef>
+    <usedfiles visible="$SHOW_USED_FILES"/>
+    <authorsection visible="yes"/>
+  </class>
+
+  <!-- Layout definition for a namespace page -->
+  <namespace>
+    <briefdescription visible="yes"/>
+    <memberdecl>
+      <nestednamespaces visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </namespace>
+
+  <!-- Layout definition for a file page -->
+  <file>
+    <briefdescription visible="yes"/>
+    <includes visible="$SHOW_INCLUDE_FILES"/>
+    <includegraph visible="$INCLUDE_GRAPH"/>
+    <includedbygraph visible="$INCLUDED_BY_GRAPH"/>
+    <sourcelink visible="yes"/>
+    <memberdecl>
+      <classes visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+    <memberdef>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <functions title=""/>
+      <variables title=""/>
+    </memberdef>
+    <authorsection/>
+  </file>
+
+  <!-- Layout definition for a group page -->
+  <group>
+    <briefdescription visible="no"/>
+    <detaileddescription title=""/>
+    <groupgraph visible="$GROUP_GRAPHS"/>
+    <memberdecl>
+      <nestedgroups visible="yes" title=""/>
+      <dirs visible="yes" title=""/>
+      <files visible="yes" title=""/>
+      <namespaces visible="yes" title=""/>
+      <classes visible="yes" title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+      <membergroups visible="yes"/>
+    </memberdecl>
+    
+    <memberdef>
+      <pagedocs/>
+      <inlineclasses title=""/>
+      <defines title=""/>
+      <typedefs title=""/>
+      <enums title=""/>
+      <enumvalues title=""/>
+      <functions title=""/>
+      <variables title=""/>
+      <signals title=""/>
+      <publicslots title=""/>
+      <protectedslots title=""/>
+      <privateslots title=""/>
+      <events title=""/>
+      <properties title=""/>
+      <friends title=""/>
+    </memberdef>
+    <authorsection visible="yes"/>
+  </group>
+
+  <!-- Layout definition for a directory page -->
+  <directory>
+    <briefdescription visible="yes"/>
+    <directorygraph visible="yes"/>
+    <memberdecl>
+      <dirs visible="yes"/>
+      <files visible="yes"/>
+    </memberdecl>
+    <detaileddescription title=""/>
+  </directory>
+</doxygenlayout>
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/BVH_Example.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/BVH_Example.cpp
new file mode 100644
index 000000000..afb0c94c2
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/BVH_Example.cpp
@@ -0,0 +1,50 @@
+#include <Eigen/StdVector>
+#include <unsupported/Eigen/BVH>
+#include <iostream>
+
+using namespace Eigen;
+typedef AlignedBox<double, 2> Box2d;
+
+namespace Eigen {
+  Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point
+}
+
+struct PointPointMinimizer //how to compute squared distances between points and rectangles
+{
+  PointPointMinimizer() : calls(0) {}
+  typedef double Scalar;
+
+  double minimumOnVolumeVolume(const Box2d &r1, const Box2d &r2) { ++calls; return r1.squaredExteriorDistance(r2); }
+  double minimumOnVolumeObject(const Box2d &r, const Vector2d &v) { ++calls; return r.squaredExteriorDistance(v); }
+  double minimumOnObjectVolume(const Vector2d &v, const Box2d &r) { ++calls; return r.squaredExteriorDistance(v); }
+  double minimumOnObjectObject(const Vector2d &v1, const Vector2d &v2) { ++calls; return (v1 - v2).squaredNorm(); }
+
+  int calls;
+};
+
+int main()
+{
+  typedef std::vector<Vector2d, aligned_allocator<Vector2d> > StdVectorOfVector2d;
+  StdVectorOfVector2d redPoints, bluePoints;
+  for(int i = 0; i < 100; ++i) { //initialize random set of red points and blue points
+    redPoints.push_back(Vector2d::Random());
+    bluePoints.push_back(Vector2d::Random());
+  }
+
+  PointPointMinimizer minimizer;
+  double minDistSq = std::numeric_limits<double>::max();
+
+  //brute force to find closest red-blue pair
+  for(int i = 0; i < (int)redPoints.size(); ++i)
+    for(int j = 0; j < (int)bluePoints.size(); ++j)
+      minDistSq = std::min(minDistSq, minimizer.minimumOnObjectObject(redPoints[i], bluePoints[j]));
+  std::cout << "Brute force distance = " << sqrt(minDistSq) << ", calls = " << minimizer.calls << std::endl;
+
+  //using BVH to find closest red-blue pair
+  minimizer.calls = 0;
+  KdBVH<double, 2, Vector2d> redTree(redPoints.begin(), redPoints.end()), blueTree(bluePoints.begin(), bluePoints.end()); //construct the trees
+  minDistSq = BVMinimize(redTree, blueTree, minimizer); //actual BVH minimization call
+  std::cout << "BVH distance         = " << sqrt(minDistSq) << ", calls = " << minimizer.calls << std::endl;
+
+  return 0;
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/CMakeLists.txt b/uppsrc/plugin/Eigen/unsupported/doc/examples/CMakeLists.txt
new file mode 100644
index 000000000..c47646dfc
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/CMakeLists.txt
@@ -0,0 +1,20 @@
+FILE(GLOB examples_SRCS "*.cpp")
+
+ADD_CUSTOM_TARGET(unsupported_examples)
+
+INCLUDE_DIRECTORIES(../../../unsupported ../../../unsupported/test)
+
+FOREACH(example_src ${examples_SRCS})
+  GET_FILENAME_COMPONENT(example ${example_src} NAME_WE)
+  ADD_EXECUTABLE(example_${example} ${example_src})
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    target_link_libraries(example_${example} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
+  ADD_CUSTOM_COMMAND(
+    TARGET example_${example}
+    POST_BUILD
+    COMMAND example_${example}
+    ARGS >${CMAKE_CURRENT_BINARY_DIR}/${example}.out
+  )
+  ADD_DEPENDENCIES(unsupported_examples example_${example})
+ENDFOREACH(example_src)
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/EulerAngles.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/EulerAngles.cpp
new file mode 100644
index 000000000..1ef6aee18
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/EulerAngles.cpp
@@ -0,0 +1,46 @@
+#include <unsupported/Eigen/EulerAngles>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  // A common Euler system used by many armies around the world,
+  //  where the first one is the azimuth(the angle from the north -
+  //   the same angle that is shown on a compass)
+  //  and the second one is elevation(the angle from the horizon)
+  //  and the third one is roll(the angle between the horizontal body
+  //   direction and the plane ground surface)
+  // Keep remembering we're using radian angles here!
+  typedef EulerSystem<-EULER_Z, EULER_Y, EULER_X> MyArmySystem;
+  typedef EulerAngles<double, MyArmySystem> MyArmyAngles;
+  
+  MyArmyAngles vehicleAngles(
+    3.14/*PI*/ / 2, /* heading to east, notice that this angle is counter-clockwise */
+    -0.3, /* going down from a mountain */
+    0.1); /* slightly rolled to the right */
+  
+  // Some Euler angles representation that our plane uses.
+  EulerAnglesZYZd planeAngles(0.78474, 0.5271, -0.513794);
+  
+  MyArmyAngles planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeAngles);
+  
+  std::cout << "vehicle angles(MyArmy):     " << vehicleAngles << std::endl;
+  std::cout << "plane angles(ZYZ):        " << planeAngles << std::endl;
+  std::cout << "plane angles(MyArmy):     " << planeAnglesInMyArmyAngles << std::endl;
+  
+  // Now let's rotate the plane a little bit
+  std::cout << "==========================================================\n";
+  std::cout << "rotating plane now!\n";
+  std::cout << "==========================================================\n";
+  
+  Quaterniond planeRotated = AngleAxisd(-0.342, Vector3d::UnitY()) * planeAngles;
+  
+  planeAngles = planeRotated;
+  planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeRotated);
+  
+  std::cout << "new plane angles(ZYZ):     " << planeAngles << std::endl;
+  std::cout << "new plane angles(MyArmy): " << planeAnglesInMyArmyAngles << std::endl;
+  
+  return 0;
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/FFT.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/FFT.cpp
new file mode 100644
index 000000000..85e8a0241
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/FFT.cpp
@@ -0,0 +1,118 @@
+//  To use the simple FFT implementation
+//  g++ -o demofft -I.. -Wall -O3 FFT.cpp 
+
+//  To use the FFTW implementation
+//  g++ -o demofft -I.. -DUSE_FFTW -Wall -O3 FFT.cpp -lfftw3 -lfftw3f -lfftw3l
+
+#ifdef USE_FFTW
+#include <fftw3.h>
+#endif
+
+#include <vector>
+#include <complex>
+#include <algorithm>
+#include <iterator>
+#include <iostream>
+#include <Eigen/Core>
+#include <unsupported/Eigen/FFT>
+
+using namespace std;
+using namespace Eigen;
+
+// Squared magnitude of a real scalar: a*a.
+template <typename T> T mag2(T a)
+{
+    return a * a;
+}
+// Squared magnitude of a complex scalar, i.e. std::norm (no square root taken).
+template <typename T> T mag2(std::complex<T> a)
+{
+    return std::norm(a);
+}
+
+// Sum of mag2() over every element of vec.
+template <typename T> T mag2(const std::vector<T> & vec)
+{
+    T total=0;
+    for (typename std::vector<T>::const_iterator it=vec.begin(); it!=vec.end(); ++it)
+        total += mag2(*it);
+    return total;
+}
+
+// Sum of mag2() over every complex element of vec; the result has the real type T.
+template <typename T> T mag2(const std::vector<std::complex<T> > & vec)
+{
+    T total=0;
+    for (typename std::vector<std::complex<T> >::const_iterator it=vec.begin(); it!=vec.end(); ++it)
+        total += mag2(*it);
+    return total;
+}
+
+// Element-wise a - b; when the lengths differ the shorter operand is zero-padded.
+template <typename T> vector<T> operator-(const vector<T> & a,const vector<T> & b )
+{
+    vector<T> c(a);
+    if (c.size() < b.size()) c.resize(b.size()); // otherwise the loop would write past the end of c
+    for (size_t k=0;k<b.size();++k) c[k] -= b[k];
+    return c;
+}
+
+// Overwrite every element with a rand()-based value in [-0.5, 0.5].
+template <typename T> void RandomFill(std::vector<T> & vec)
+{
+    for (typename std::vector<T>::iterator it=vec.begin(); it!=vec.end(); ++it)
+        *it = T( rand() )/T(RAND_MAX) - T(.5);
+}
+
+// Overwrite every element with a complex value whose two parts are rand()-based, each in [-0.5, 0.5].
+template <typename T> void RandomFill(std::vector<std::complex<T> > & vec)
+{
+    for (typename std::vector<std::complex<T> >::iterator it=vec.begin(); it!=vec.end(); ++it)
+        *it = std::complex<T> ( T( rand() )/T(RAND_MAX) - T(.5), T( rand() )/T(RAND_MAX) - T(.5));
+}
+
+template <typename T_time,typename T_freq> // T_time: element type of the time buffers; T_freq: element type of the frequency buffer
+void fwd_inv(size_t nfft) // forward-transform nfft random samples, invert the result, print the round-trip error
+{
+    typedef typename NumTraits<T_freq>::Real Scalar;
+    vector<T_time> timebuf(nfft);
+    RandomFill(timebuf);
+
+    vector<T_freq> freqbuf;
+    static FFT<Scalar> fft; // function-local static: one FFT object per template instantiation, reused across calls
+    fft.fwd(freqbuf,timebuf);
+
+    vector<T_time> timebuf2;
+    fft.inv(timebuf2,freqbuf);
+
+    T_time rmse = mag2(timebuf - timebuf2) / mag2(timebuf); // despite the name: summed squared error over summed squared signal, no square root
+    cout << "roundtrip rmse: " << rmse << endl;
+}
+
+// Run the round trip twice for T_scalar: real time samples, then complex time samples.
+template <typename T_scalar> void two_demos(int nfft)
+{
+    typedef std::complex<T_scalar> Cplx;
+
+    cout << "     scalar ";  fwd_inv<T_scalar,Cplx>(nfft);
+    cout << "    complex ";  fwd_inv<Cplx,Cplx>(nfft);
+}
+
+// Print the transform length, then run two_demos() for float, double and long double.
+void demo_all_types(int nfft)
+{
+    cout << "nfft=" << nfft << endl << "   float" << endl;
+    two_demos<float>(nfft);
+    cout << "   double" << endl;
+    two_demos<double>(nfft);
+    cout << "   long double" << endl;
+    two_demos<long double>(nfft);
+}
+
+int main()
+{
+    // Two composite lengths (840 and 7200) followed by a power of two.
+    const int lengths[] = { 2*3*4*5*7, 2*9*16*25, 1024 };
+    for (int i = 0; i < 3; ++i) demo_all_types( lengths[i] );
+    return 0;
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixExponential.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixExponential.cpp
new file mode 100644
index 000000000..ebd3b9675
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixExponential.cpp
@@ -0,0 +1,16 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  const double quarterPi = std::acos(-1.0) / 4;
+  // Fill A row by row with the comma initializer; only A(0,1) and A(1,0) are nonzero.
+  MatrixXd A(3,3);
+  A << 0,         -quarterPi, 0,
+       quarterPi, 0,          0,
+       0,         0,          0;
+  std::cout << "The matrix A is:\n" << A << "\n\n"
+            << "The matrix exponential of A is:\n" << A.exp() << "\n\n";
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixFunction.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixFunction.cpp
new file mode 100644
index 000000000..a4172e4ae
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixFunction.cpp
@@ -0,0 +1,23 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+// Complex exponential callback passed to matrixFunction(); the unnamed int argument is not used.
+std::complex<double> expfn(std::complex<double> x, int) {
+  return std::exp(x);
+}
+
+int main()
+{
+  const double quarterPi = std::acos(-1.0) / 4;
+
+  // Comma-initialize A row by row; only A(0,1) and A(1,0) are nonzero.
+  MatrixXd A(3,3);
+  A << 0,         -quarterPi, 0,
+       quarterPi, 0,          0,
+       0,         0,          0;
+
+  std::cout << "The matrix A is:\n" << A << "\n\n"
+            << "The matrix exponential of A is:\n" << A.matrixFunction(expfn) << "\n\n";
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixLogarithm.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixLogarithm.cpp
new file mode 100644
index 000000000..8c5d97054
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixLogarithm.cpp
@@ -0,0 +1,15 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  const double h = 0.5*std::sqrt(2);  // magnitude shared by the four entries of the top-left 2x2 block
+  MatrixXd A(3,3);
+  A << h, -h, 0,
+       h,  h, 0,
+       0,  0, 1;
+  std::cout << "The matrix A is:\n" << A << "\n\n"
+            << "The matrix logarithm of A is:\n" << A.log() << "\n";
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixPower.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixPower.cpp
new file mode 100644
index 000000000..222452476
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixPower.cpp
@@ -0,0 +1,16 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  const double c = cos(1), s = sin(1);  // entries of the top-left 2x2 block
+  Matrix3d A;
+  A << c, -s, 0,
+       s,  c, 0,
+       0,  0, 1;
+  std::cout << "The matrix A is:\n" << A << "\n\n"
+            << "The matrix power A^(pi/4) is:\n" << A.pow(std::acos(-1.0)/4) << std::endl;
+  return 0;
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixPower_optimal.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixPower_optimal.cpp
new file mode 100644
index 000000000..86470ba0a
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixPower_optimal.cpp
@@ -0,0 +1,17 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  Matrix4cd A = Matrix4cd::Random();
+  MatrixPower<Matrix4cd> Apow(A);  // one helper object, reused below for four different exponents
+
+  std::cout << "The matrix A is:\n" << A << "\n\n"
+            << "A^3.1 is:\n" << Apow(3.1) << "\n\n"
+            << "A^3.3 is:\n" << Apow(3.3) << "\n\n"
+            << "A^3.7 is:\n" << Apow(3.7) << "\n\n"
+            << "A^3.9 is:\n" << Apow(3.9) << std::endl;
+  return 0;
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSine.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSine.cpp
new file mode 100644
index 000000000..9eea9a081
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSine.cpp
@@ -0,0 +1,20 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  // Compute both matrix functions first, then print everything.
+  const MatrixXd A = MatrixXd::Random(3,3);
+  const MatrixXd sinA = A.sin();
+  const MatrixXd cosA = A.cos();
+
+  std::cout << "A = \n" << A << "\n\n";
+  std::cout << "sin(A) = \n" << sinA << "\n\n";
+  std::cout << "cos(A) = \n" << cosA << "\n\n";
+
+  // Matrix analogue of the scalar identity: sin^2(A) + cos^2(A) should print
+  // (up to rounding) the identity matrix.
+  std::cout << "sin^2(A) + cos^2(A) = \n" << sinA*sinA + cosA*cosA << "\n\n";
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSinh.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSinh.cpp
new file mode 100644
index 000000000..f77186724
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSinh.cpp
@@ -0,0 +1,20 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  // Compute both matrix functions first, then print everything.
+  const MatrixXf A = MatrixXf::Random(3,3);
+  const MatrixXf sinhA = A.sinh();
+  const MatrixXf coshA = A.cosh();
+
+  std::cout << "A = \n" << A << "\n\n";
+  std::cout << "sinh(A) = \n" << sinhA << "\n\n";
+  std::cout << "cosh(A) = \n" << coshA << "\n\n";
+
+  // Matrix analogue of the scalar identity: cosh^2(A) - sinh^2(A) should print
+  // (up to rounding) the identity matrix.
+  std::cout << "cosh^2(A) - sinh^2(A) = \n" << coshA*coshA - sinhA*sinhA << "\n\n";
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSquareRoot.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSquareRoot.cpp
new file mode 100644
index 000000000..88e7557d7
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSquareRoot.cpp
@@ -0,0 +1,16 @@
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  const double theta = std::acos(-1.0) / 3;    // the angle pi/3
+  const double c = cos(theta), s = sin(theta);
+  MatrixXd A(2,2);
+  A << c, -s,
+       s,  c;
+  std::cout << "The matrix A is:\n" << A << "\n\n"
+            << "The matrix square root of A is:\n" << A.sqrt() << "\n\n"
+            << "The square of the last matrix is:\n" << A.sqrt() * A.sqrt() << "\n";
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/PolynomialSolver1.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/PolynomialSolver1.cpp
new file mode 100644
index 000000000..cd777a4e2
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/PolynomialSolver1.cpp
@@ -0,0 +1,53 @@
+#include <unsupported/Eigen/Polynomials>
+#include <vector>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  typedef Matrix<double,5,1> Vector5d;
+
+  Vector5d roots = Vector5d::Random();
+  cout << "Roots: " << roots.transpose() << endl;
+  Eigen::Matrix<double,6,1> polynomial;
+  roots_to_monicPolynomial( roots, polynomial );
+
+  PolynomialSolver<double,5> psolve( polynomial );
+  cout << "Complex roots: " << psolve.roots().transpose() << endl;
+  // The solver may report fewer than 5 real roots, so map exactly what it returned.
+  std::vector<double> realRoots;
+  psolve.realRoots( realRoots );
+  Map<VectorXd> mapRR( realRoots.empty() ? 0 : &realRoots[0], static_cast<Index>(realRoots.size()) );
+  cout << "Real roots: " << mapRR.transpose() << endl;
+
+  cout << endl;
+  cout << "Illustration of the convergence problem with the QR algorithm: " << endl;
+  cout << "---------------------------------------------------------------" << endl;
+  Eigen::Matrix<float,7,1> hardCase_polynomial;
+  hardCase_polynomial <<
+  -0.957, 0.9219, 0.3516, 0.9453, -0.4023, -0.5508, -0.03125;
+  cout << "Hard case polynomial defined by floats: " << hardCase_polynomial.transpose() << endl;
+  PolynomialSolver<float,6> psolvef( hardCase_polynomial );
+  cout << "Complex roots: " << psolvef.roots().transpose() << endl;
+  Eigen::Matrix<float,6,1> evals; // |p(root)| for each computed root; reused for the double-precision run below
+  for( int i=0; i<6; ++i ){ evals[i] = std::abs( poly_eval( hardCase_polynomial, psolvef.roots()[i] ) ); }
+  cout << "Norms of the evaluations of the polynomial at the roots: " << evals.transpose() << endl << endl;
+
+  cout << "Using double's almost always solves the problem for small degrees: " << endl;
+  cout << "-------------------------------------------------------------------" << endl;
+  PolynomialSolver<double,6> psolve6d( hardCase_polynomial.cast<double>() );
+  cout << "Complex roots: " << psolve6d.roots().transpose() << endl;
+  for( int i=0; i<6; ++i )
+  {
+    std::complex<float> castedRoot( psolve6d.roots()[i].real(), psolve6d.roots()[i].imag() ); // narrow the double root to float before evaluating the float polynomial
+    evals[i] = std::abs( poly_eval( hardCase_polynomial, castedRoot ) );
+  }
+  cout << "Norms of the evaluations of the polynomial at the roots: " << evals.transpose() << endl << endl;
+
+  cout.precision(10);
+  cout << "The last root in float then in double: " << psolvef.roots()[5] << "\t" << psolve6d.roots()[5] << endl;
+  std::complex<float> castedRoot( psolve6d.roots()[5].real(), psolve6d.roots()[5].imag() );
+  cout << "Norm of the difference: " << std::abs( psolvef.roots()[5] - castedRoot ) << endl;
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/examples/PolynomialUtils1.cpp b/uppsrc/plugin/Eigen/unsupported/doc/examples/PolynomialUtils1.cpp
new file mode 100644
index 000000000..dbfe520b5
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/examples/PolynomialUtils1.cpp
@@ -0,0 +1,20 @@
+#include <unsupported/Eigen/Polynomials>
+#include <iostream>
+
+using namespace Eigen;
+using namespace std;
+
+int main()
+{
+  // Pick four random roots and build the degree-4 polynomial via roots_to_monicPolynomial.
+  const Vector4d roots = Vector4d::Random();
+  cout << "Roots: " << roots.transpose() << endl;
+  Eigen::Matrix<double,5,1> polynomial;
+  roots_to_monicPolynomial( roots, polynomial );
+  cout << "Polynomial: ";
+  for( int power=0; power<4; ++power ) cout << polynomial[power] << ".x^" << power << "+ ";
+  cout << polynomial[4] << ".x^4" << endl;
+  Vector4d evaluation;  // poly_eval at each root; printed below without a trailing newline
+  for( int r=0; r<4; ++r ) evaluation[r] = poly_eval( polynomial, roots[r] );
+  cout << "Evaluation of the polynomial at the roots: " << evaluation.transpose();
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/doc/snippets/CMakeLists.txt b/uppsrc/plugin/Eigen/unsupported/doc/snippets/CMakeLists.txt
new file mode 100644
index 000000000..f0c5cc2a8
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/doc/snippets/CMakeLists.txt
@@ -0,0 +1,26 @@
+FILE(GLOB snippets_SRCS "*.cpp")
+
+ADD_CUSTOM_TARGET(unsupported_snippets)
+
+FOREACH(snippet_src ${snippets_SRCS})
+  GET_FILENAME_COMPONENT(snippet ${snippet_src} NAME_WE)
+  SET(compile_snippet_target compile_${snippet})
+  SET(compile_snippet_src ${compile_snippet_target}.cpp)
+  FILE(READ ${snippet_src} snippet_source_code)
+  CONFIGURE_FILE(${PROJECT_SOURCE_DIR}/doc/snippets/compile_snippet.cpp.in
+                 ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
+  ADD_EXECUTABLE(${compile_snippet_target}
+                 ${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src})
+  if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO)
+    target_link_libraries(${compile_snippet_target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO})
+  endif()
+  ADD_CUSTOM_COMMAND(
+    TARGET ${compile_snippet_target}
+    POST_BUILD
+    COMMAND ${compile_snippet_target}
+    ARGS >${CMAKE_CURRENT_BINARY_DIR}/${snippet}.out
+  )
+  ADD_DEPENDENCIES(unsupported_snippets ${compile_snippet_target})
+  set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/${compile_snippet_src}
+                              PROPERTIES OBJECT_DEPENDS ${snippet_src})
+ENDFOREACH(snippet_src)
diff --git a/uppsrc/plugin/Eigen/unsupported/test/BVH.cpp b/uppsrc/plugin/Eigen/unsupported/test/BVH.cpp
new file mode 100644
index 000000000..ff5b3299d
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/BVH.cpp
@@ -0,0 +1,222 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Ilya Baran <ibaran@mit.edu>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/StdVector>
+#include <Eigen/Geometry>
+#include <unsupported/Eigen/BVH>
+
+namespace Eigen {
+// bounding_box overload for a bare point: wraps v in an AlignedBox. NOTE(review): presumably placed in namespace Eigen so the BVH code finds it by argument-dependent lookup - confirm.
+template<typename Scalar, int Dim> AlignedBox<Scalar, Dim> bounding_box(const Matrix<Scalar, Dim, 1> &v) { return AlignedBox<Scalar, Dim>(v); }
+
+}
+
+
+template<int Dim>
+struct Ball // test object: a Dim-dimensional ball stored as a center point plus a radius
+{
+EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(double, Dim)
+
+  typedef Matrix<double, Dim, 1> VectorType;
+
+  Ball() {} // does not initialize radius; NOTE(review): center is presumably left unset as well - confirm
+  Ball(const VectorType &c, double r) : center(c), radius(r) {}
+
+  VectorType center;
+  double radius;
+};
+template<int Dim> AlignedBox<double, Dim> bounding_box(const Ball<Dim> &b) // axis-aligned box spanning center - radius to center + radius on every axis
+{ return AlignedBox<double, Dim>(b.center.array() - b.radius, b.center.array() + b.radius); }
+
+inline double SQR(double x) { const double squared = x * x; return squared; }  // x squared
+
+template<int Dim>
+struct BallPointStuff //this class provides functions to be both an intersector and a minimizer, both for a ball and a point and for two trees
+{
+  typedef double Scalar;
+  typedef Matrix<double, Dim, 1> VectorType;
+  typedef Ball<Dim> BallType;
+  typedef AlignedBox<double, Dim> BoxType;
+
+  BallPointStuff() : calls(0), count(0) {} // p is not set here; only the two-argument callbacks (which never read p) are safe to use
+  BallPointStuff(const VectorType &inP) : p(inP), calls(0), count(0) {}
+
+
+  bool intersectVolume(const BoxType &r) { ++calls; return r.contains(p); }
+  bool intersectObject(const BallType &b) {
+    ++calls;
+    if((b.center - p).squaredNorm() < SQR(b.radius)) // p lies strictly inside the ball
+      ++count;
+    return false; //continue
+  }
+
+  bool intersectVolumeVolume(const BoxType &r1, const BoxType &r2) { ++calls; return !(r1.intersection(r2)).isNull(); }
+  bool intersectVolumeObject(const BoxType &r, const BallType &b) { ++calls; return r.squaredExteriorDistance(b.center) < SQR(b.radius); }
+  bool intersectObjectVolume(const BallType &b, const BoxType &r) { ++calls; return r.squaredExteriorDistance(b.center) < SQR(b.radius); }
+  bool intersectObjectObject(const BallType &b1, const BallType &b2){
+    ++calls;
+    if((b1.center - b2.center).norm() < b1.radius + b2.radius) // the two balls overlap
+      ++count;
+    return false;
+  }
+  bool intersectVolumeObject(const BoxType &r, const VectorType &v) { ++calls; return r.contains(v); }
+  bool intersectObjectObject(const BallType &b, const VectorType &v){
+    ++calls;
+    if((b.center - v).squaredNorm() < SQR(b.radius)) // v lies strictly inside the ball
+      ++count;
+    return false;
+  }
+
+  double minimumOnVolume(const BoxType &r) { ++calls; return r.squaredExteriorDistance(p); }
+  double minimumOnObject(const BallType &b) { ++calls; return (std::max)(0., (b.center - p).squaredNorm() - SQR(b.radius)); }
+  double minimumOnVolumeVolume(const BoxType &r1, const BoxType &r2) { ++calls; return r1.squaredExteriorDistance(r2); }
+  double minimumOnVolumeObject(const BoxType &r, const BallType &b) { ++calls; return SQR((std::max)(0., r.exteriorDistance(b.center) - b.radius)); }
+  double minimumOnObjectVolume(const BallType &b, const BoxType &r) { ++calls; return SQR((std::max)(0., r.exteriorDistance(b.center) - b.radius)); }
+  double minimumOnObjectObject(const BallType &b1, const BallType &b2){ ++calls; return SQR((std::max)(0., (b1.center - b2.center).norm() - b1.radius - b2.radius)); }
+  double minimumOnVolumeObject(const BoxType &r, const VectorType &v) { ++calls; return r.squaredExteriorDistance(v); }
+  double minimumOnObjectObject(const BallType &b, const VectorType &v){ ++calls; return SQR((std::max)(0., (b.center - v).norm() - b.radius)); }
+
+  VectorType p; // query point read by the one-argument callbacks
+  int calls; // incremented by every intersect*/minimumOn* call
+  int count; // incremented each time an object-level intersection test succeeds
+};
+
+
+template<int Dim>
+struct TreeTest // four checks comparing a brute-force loop against the BVH query functions in dimension Dim
+{
+  typedef Matrix<double, Dim, 1> VectorType;
+  typedef std::vector<VectorType, aligned_allocator<VectorType> > VectorTypeList;
+  typedef Ball<Dim> BallType;
+  typedef std::vector<BallType, aligned_allocator<BallType> > BallTypeList;
+  typedef AlignedBox<double, Dim> BoxType;
+
+  void testIntersect1() // one tree of 500 balls vs. one random point: brute-force hit count must equal the BVIntersect count
+  {
+    BallTypeList b;
+    for(int i = 0; i < 500; ++i) {
+        b.push_back(BallType(VectorType::Random(), 0.5 * internal::random(0., 1.)));
+    }
+    KdBVH<double, Dim, BallType> tree(b.begin(), b.end());
+
+    VectorType pt = VectorType::Random();
+    BallPointStuff<Dim> i1(pt), i2(pt);
+
+    for(int i = 0; i < (int)b.size(); ++i)
+      i1.intersectObject(b[i]);
+
+    BVIntersect(tree, i2);
+
+    VERIFY(i1.count == i2.count);
+  }
+
+  void testMinimize1() // one tree of 500 small balls vs. one random point: brute-force minimum must match BVMinimize
+  {
+    BallTypeList b;
+    for(int i = 0; i < 500; ++i) {
+        b.push_back(BallType(VectorType::Random(), 0.01 * internal::random(0., 1.)));
+    }
+    KdBVH<double, Dim, BallType> tree(b.begin(), b.end());
+
+    VectorType pt = VectorType::Random();
+    BallPointStuff<Dim> i1(pt), i2(pt);
+
+    double m1 = (std::numeric_limits<double>::max)(), m2 = m1;
+
+    for(int i = 0; i < (int)b.size(); ++i)
+      m1 = (std::min)(m1, i1.minimumOnObject(b[i]));
+
+    m2 = BVMinimize(tree, i2);
+
+    VERIFY_IS_APPROX(m1, m2);
+  }
+
+  void testIntersect2() // tree of 50 balls vs. tree of 150 points: brute-force pair count must equal the BVIntersect count
+  {
+    BallTypeList b;
+    VectorTypeList v;
+
+    for(int i = 0; i < 50; ++i) {
+        b.push_back(BallType(VectorType::Random(), 0.5 * internal::random(0., 1.)));
+        for(int j = 0; j < 3; ++j)
+            v.push_back(VectorType::Random());
+    }
+
+    KdBVH<double, Dim, BallType> tree(b.begin(), b.end());
+    KdBVH<double, Dim, VectorType> vTree(v.begin(), v.end());
+
+    BallPointStuff<Dim> i1, i2;
+
+    for(int i = 0; i < (int)b.size(); ++i)
+        for(int j = 0; j < (int)v.size(); ++j)
+            i1.intersectObjectObject(b[i], v[j]);
+
+    BVIntersect(tree, vTree, i2);
+
+    VERIFY(i1.count == i2.count);
+  }
+
+  void testMinimize2() // tree of 50 tiny balls vs. tree of 150 points: brute-force pair minimum must match BVMinimize
+  {
+    BallTypeList b;
+    VectorTypeList v;
+
+    for(int i = 0; i < 50; ++i) {
+        b.push_back(BallType(VectorType::Random(), 1e-7 + 1e-6 * internal::random(0., 1.)));
+        for(int j = 0; j < 3; ++j)
+            v.push_back(VectorType::Random());
+    }
+
+    KdBVH<double, Dim, BallType> tree(b.begin(), b.end());
+    KdBVH<double, Dim, VectorType> vTree(v.begin(), v.end());
+
+    BallPointStuff<Dim> i1, i2;
+
+    double m1 = (std::numeric_limits<double>::max)(), m2 = m1;
+
+    for(int i = 0; i < (int)b.size(); ++i)
+        for(int j = 0; j < (int)v.size(); ++j)
+            m1 = (std::min)(m1, i1.minimumOnObjectObject(b[i], v[j]));
+
+    m2 = BVMinimize(tree, vTree, i2);
+
+    VERIFY_IS_APPROX(m1, m2);
+  }
+};
+
+
+void test_BVH() // runs the four TreeTest checks g_repeat times for each dimension enabled at compile time
+{
+  for(int i = 0; i < g_repeat; i++) {
+#ifdef EIGEN_TEST_PART_1 // 2-D trees
+    TreeTest<2> test2;
+    CALL_SUBTEST(test2.testIntersect1());
+    CALL_SUBTEST(test2.testMinimize1());
+    CALL_SUBTEST(test2.testIntersect2());
+    CALL_SUBTEST(test2.testMinimize2());
+#endif
+
+#ifdef EIGEN_TEST_PART_2 // 3-D trees
+    TreeTest<3> test3;
+    CALL_SUBTEST(test3.testIntersect1());
+    CALL_SUBTEST(test3.testMinimize1());
+    CALL_SUBTEST(test3.testIntersect2());
+    CALL_SUBTEST(test3.testMinimize2());
+#endif
+
+#ifdef EIGEN_TEST_PART_3 // 4-D trees
+    TreeTest<4> test4;
+    CALL_SUBTEST(test4.testIntersect1());
+    CALL_SUBTEST(test4.testMinimize1());
+    CALL_SUBTEST(test4.testIntersect2());
+    CALL_SUBTEST(test4.testMinimize2());
+#endif
+  }
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/CMakeLists.txt b/uppsrc/plugin/Eigen/unsupported/test/CMakeLists.txt
new file mode 100644
index 000000000..3a8775a1c
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/CMakeLists.txt
@@ -0,0 +1,263 @@
+# generate split test header file only if it does not yet exist
+# in order to prevent a rebuild every time cmake is configured
+if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h)
+  file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h "")
+  foreach(i RANGE 1 999)
+    file(APPEND ${CMAKE_CURRENT_BINARY_DIR}/split_test_helper.h
+      "#ifdef EIGEN_TEST_PART_${i}\n"
+      "#define CALL_SUBTEST_${i}(FUNC) CALL_SUBTEST(FUNC)\n"
+      "#else\n"
+      "#define CALL_SUBTEST_${i}(FUNC)\n"
+      "#endif\n\n"
+    )
+  endforeach()
+endif()
+
+set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Unsupported")
+add_custom_target(BuildUnsupported)
+
+include_directories(../../test ../../unsupported ../../Eigen
+                    ${CMAKE_CURRENT_BINARY_DIR}/../../test)
+
+find_package (Threads)
+
+find_package(GoogleHash)
+if(GOOGLEHASH_FOUND)
+  add_definitions("-DEIGEN_GOOGLEHASH_SUPPORT")
+  include_directories(${GOOGLEHASH_INCLUDES})
+  ei_add_property(EIGEN_TESTED_BACKENDS  "GoogleHash, ")
+else(GOOGLEHASH_FOUND)
+  ei_add_property(EIGEN_MISSING_BACKENDS  "GoogleHash, ")
+endif(GOOGLEHASH_FOUND)
+
+
+find_package(Adolc)
+if(ADOLC_FOUND)
+  include_directories(${ADOLC_INCLUDES})
+  ei_add_property(EIGEN_TESTED_BACKENDS "Adolc, ")
+  if(EIGEN_TEST_CXX11)
+    ei_add_test(forward_adolc "" ${ADOLC_LIBRARIES})
+  else()
+    message(STATUS "Adolc found, but tests require C++11 mode")
+  endif()
+else(ADOLC_FOUND)
+  ei_add_property(EIGEN_MISSING_BACKENDS "Adolc, ")
+endif(ADOLC_FOUND)
+
+# this test seems to never have been successful on x87, so is considered to contain a FP-related bug.
+# see thread: "non-linear optimization test summary"
+ei_add_test(NonLinearOptimization)
+
+ei_add_test(NumericalDiff)
+ei_add_test(autodiff_scalar)
+ei_add_test(autodiff)
+
+if (NOT CMAKE_CXX_COMPILER MATCHES "clang\\+\\+$")
+ei_add_test(BVH)
+endif()
+
+ei_add_test(matrix_exponential)
+ei_add_test(matrix_function)
+ei_add_test(matrix_power)
+ei_add_test(matrix_square_root)
+ei_add_test(alignedvector3)
+
+ei_add_test(FFT)
+
+ei_add_test(EulerAngles)
+
+find_package(MPFR 2.3.0)
+find_package(GMP)
+if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11)
+  include_directories(${MPFR_INCLUDES} ./mpreal)
+  ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ")
+  set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES})
+ ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" )
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ")
+endif()
+
+ei_add_test(sparse_extra   "" "")
+
+find_package(FFTW)
+if(FFTW_FOUND)
+  ei_add_property(EIGEN_TESTED_BACKENDS "fftw, ")
+  include_directories( ${FFTW_INCLUDES} )
+  if(FFTWL_LIB)
+    ei_add_test(FFTW  "-DEIGEN_FFTW_DEFAULT -DEIGEN_HAS_FFTWL" "${FFTW_LIBRARIES}" )
+  else()
+    ei_add_test(FFTW  "-DEIGEN_FFTW_DEFAULT" "${FFTW_LIBRARIES}" )
+  endif()
+else()
+  ei_add_property(EIGEN_MISSING_BACKENDS "fftw, ")
+endif()
+
+option(EIGEN_TEST_NO_OPENGL "Disable OpenGL support in unit tests" OFF)
+if(NOT EIGEN_TEST_NO_OPENGL)
+  find_package(OpenGL)
+  find_package(GLUT)
+  find_package(GLEW)
+  if(OPENGL_FOUND AND GLUT_FOUND AND GLEW_FOUND)
+    include_directories(${OPENGL_INCLUDE_DIR} ${GLUT_INCLUDE_DIR} ${GLEW_INCLUDE_DIRS})
+    ei_add_property(EIGEN_TESTED_BACKENDS "OpenGL, ")
+    set(EIGEN_GL_LIB ${GLUT_LIBRARIES} ${GLEW_LIBRARIES} ${OPENGL_LIBRARIES})
+    ei_add_test(openglsupport  "" "${EIGEN_GL_LIB}" )
+  else()
+    ei_add_property(EIGEN_MISSING_BACKENDS "OpenGL, ")
+  endif()
+else()
+    ei_add_property(EIGEN_MISSING_BACKENDS "OpenGL, ")
+endif()
+
+ei_add_test(polynomialsolver)
+ei_add_test(polynomialutils)
+ei_add_test(splines)
+ei_add_test(gmres)
+ei_add_test(minres)
+ei_add_test(levenberg_marquardt)
+ei_add_test(kronecker_product)
+ei_add_test(special_functions)
+
+# TODO: The following test names are prefixed with the cxx11 string, since historically
+# the tests depended on c++11. This isn't the case anymore so we ought to rename them.
+# FIXME: Old versions of MSVC fail to compile this code, so we just disable these tests
+# when using visual studio. We should make the check more strict to enable the tests for
+# newer versions of MSVC.
+if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+ei_add_test(cxx11_tensor_dimension)
+ei_add_test(cxx11_tensor_map)
+ei_add_test(cxx11_tensor_assign)
+ei_add_test(cxx11_tensor_comparisons)
+ei_add_test(cxx11_tensor_forced_eval)
+ei_add_test(cxx11_tensor_math)
+ei_add_test(cxx11_tensor_const)
+ei_add_test(cxx11_tensor_intdiv)
+ei_add_test(cxx11_tensor_casts)
+ei_add_test(cxx11_tensor_empty)
+ei_add_test(cxx11_tensor_sugar)
+ei_add_test(cxx11_tensor_roundings)
+ei_add_test(cxx11_tensor_layout_swap)
+ei_add_test(cxx11_tensor_io)
+if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+  # This test requires __uint128_t which is only available on 64bit systems
+  ei_add_test(cxx11_tensor_uint128)
+endif()
+endif()
+
+if(EIGEN_TEST_CXX11)
+  if(EIGEN_TEST_SYCL)
+    ei_add_test_sycl(cxx11_tensor_sycl "-std=c++11")
+    ei_add_test_sycl(cxx11_tensor_forced_eval_sycl "-std=c++11")
+    ei_add_test_sycl(cxx11_tensor_broadcast_sycl "-std=c++11")
+    ei_add_test_sycl(cxx11_tensor_device_sycl "-std=c++11")
+    ei_add_test_sycl(cxx11_tensor_reduction_sycl "-std=c++11")
+  endif(EIGEN_TEST_SYCL)
+  # It should be safe to always run these tests as there is some fallback code for
+  # older compilers that don't support cxx11.
+  # This is already set if EIGEN_TEST_CXX11 is enabled:
+  # set(CMAKE_CXX_STANDARD 11)
+
+  ei_add_test(cxx11_eventcount "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_runqueue "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_non_blocking_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+
+  ei_add_test(cxx11_meta)
+  ei_add_test(cxx11_tensor_simple)
+#  ei_add_test(cxx11_tensor_symmetry)
+  ei_add_test(cxx11_tensor_index_list)
+  ei_add_test(cxx11_tensor_mixed_indices)
+  ei_add_test(cxx11_tensor_contraction)
+  ei_add_test(cxx11_tensor_convolution)
+  ei_add_test(cxx11_tensor_expr)
+  ei_add_test(cxx11_tensor_fixed_size)
+  ei_add_test(cxx11_tensor_of_const_values)
+  ei_add_test(cxx11_tensor_of_complex)
+  ei_add_test(cxx11_tensor_of_strings)
+  ei_add_test(cxx11_tensor_lvalue)
+  ei_add_test(cxx11_tensor_broadcasting)
+  ei_add_test(cxx11_tensor_chipping)
+  ei_add_test(cxx11_tensor_concatenation)
+  ei_add_test(cxx11_tensor_inflation)
+  ei_add_test(cxx11_tensor_morphing)
+  ei_add_test(cxx11_tensor_padding)
+  ei_add_test(cxx11_tensor_patch)
+  ei_add_test(cxx11_tensor_image_patch)
+  ei_add_test(cxx11_tensor_volume_patch)
+  ei_add_test(cxx11_tensor_reduction)
+  ei_add_test(cxx11_tensor_argmax)
+  ei_add_test(cxx11_tensor_shuffling)
+  ei_add_test(cxx11_tensor_striding)
+  ei_add_test(cxx11_tensor_notification "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_thread_pool "-pthread" "${CMAKE_THREAD_LIBS_INIT}")
+  ei_add_test(cxx11_tensor_ref)
+  ei_add_test(cxx11_tensor_random)
+  ei_add_test(cxx11_tensor_generator)
+  ei_add_test(cxx11_tensor_custom_op)
+  ei_add_test(cxx11_tensor_custom_index)
+  ei_add_test(cxx11_tensor_fft)
+  ei_add_test(cxx11_tensor_ifft)
+  ei_add_test(cxx11_tensor_scan)
+
+endif()
+
+# These tests need nvcc
+find_package(CUDA 7.0)
+if(CUDA_FOUND AND EIGEN_TEST_CUDA)
+  # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor
+  # and -fno-check-new flags since they trigger thousands of compilation warnings
+  # in the CUDA runtime
+  # Also remove -ansi that is incompatible with std=c++11.
+  string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+  string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+  message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS})
+
+  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+    set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
+  endif()
+  if(EIGEN_TEST_CUDA_CLANG)
+   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 --cuda-gpu-arch=sm_${EIGEN_CUDA_COMPUTE_ARCH}")
+  endif()
+
+  set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
+  if (${CUDA_VERSION} STREQUAL "7.0")
+    set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
+  endif()
+
+  if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3))
+    set(EIGEN_CUDA_CXX11_FLAG "-std=c++11")
+  else()
+    # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11)
+    set(EIGEN_CUDA_CXX11_FLAG "")
+  endif()
+
+  set(CUDA_NVCC_FLAGS  "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}")
+  cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
+  set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
+
+  ei_add_test(cxx11_tensor_complex_cuda)
+  ei_add_test(cxx11_tensor_complex_cwise_ops_cuda)
+  ei_add_test(cxx11_tensor_reduction_cuda)
+  ei_add_test(cxx11_tensor_argmax_cuda)
+  ei_add_test(cxx11_tensor_cast_float16_cuda)
+  ei_add_test(cxx11_tensor_scan_cuda)
+
+  # Contractions require arch 3.0 or higher
+  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
+    ei_add_test(cxx11_tensor_device)
+    ei_add_test(cxx11_tensor_cuda)
+    ei_add_test(cxx11_tensor_contract_cuda)
+    ei_add_test(cxx11_tensor_of_float16_cuda)
+  endif()
+
+  # The random number generation code requires arch 3.5 or greater.
+  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
+    ei_add_test(cxx11_tensor_random_cuda)
+  endif()
+
+
+  unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
+endif()
diff --git a/uppsrc/plugin/Eigen/unsupported/test/EulerAngles.cpp b/uppsrc/plugin/Eigen/unsupported/test/EulerAngles.cpp
new file mode 100644
index 000000000..a8cb52864
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/EulerAngles.cpp
@@ -0,0 +1,208 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <unsupported/Eigen/EulerAngles>
+
+using namespace Eigen;
+
+template<typename EulerSystem, typename Scalar> // Converts ea to a rotation (matrix and quaternion) in EulerSystem and checks the angles recovered from it.
+void verify_euler_ranged(const Matrix<Scalar,3,1>& ea,
+  bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma)
+{
+  typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Quaternion<Scalar> QuaternionType;
+  typedef AngleAxis<Scalar> AngleAxisType;
+  using std::abs;
+  // Expected closed interval per angle: [0, 2*pi] when its positiveRange flag is set, [-pi, pi] otherwise.
+  Scalar alphaRangeStart, alphaRangeEnd;
+  Scalar betaRangeStart, betaRangeEnd;
+  Scalar gammaRangeStart, gammaRangeEnd;
+  
+  if (positiveRangeAlpha)
+  {
+    alphaRangeStart = Scalar(0);
+    alphaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    alphaRangeStart = -Scalar(EIGEN_PI);
+    alphaRangeEnd = Scalar(EIGEN_PI);
+  }
+  
+  if (positiveRangeBeta)
+  {
+    betaRangeStart = Scalar(0);
+    betaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    betaRangeStart = -Scalar(EIGEN_PI);
+    betaRangeEnd = Scalar(EIGEN_PI);
+  }
+  
+  if (positiveRangeGamma)
+  {
+    gammaRangeStart = Scalar(0);
+    gammaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    gammaRangeStart = -Scalar(EIGEN_PI);
+    gammaRangeEnd = Scalar(EIGEN_PI);
+  }
+  // 0-based axis indices passed to Matrix::eulerAngles(), and the sign applied to each angle it returns.
+  const int i = EulerSystem::AlphaAxisAbs - 1;
+  const int j = EulerSystem::BetaAxisAbs - 1;
+  const int k = EulerSystem::GammaAxisAbs - 1;
+  
+  const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1;
+  const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1;
+  const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1;
+  
+  const Vector3 I = EulerAnglesType::AlphaAxisVector();
+  const Vector3 J = EulerAnglesType::BetaAxisVector();
+  const Vector3 K = EulerAnglesType::GammaAxisVector();
+  
+  EulerAnglesType e(ea[0], ea[1], ea[2]);
+  
+  Matrix3 m(e);
+  Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  
+  // Check that eabis is in range
+  VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd);
+  VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd);
+  VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd);
+  
+  Vector3 eabis2 = m.eulerAngles(i, j, k);
+  
+  // Invert the relevant axes
+  eabis2[0] *= iFactor;
+  eabis2[1] *= jFactor;
+  eabis2[2] *= kFactor;
+  
+  // Shift negative angles by 2*pi where a positive range was requested
+  if (positiveRangeAlpha && (eabis2[0] < 0))
+    eabis2[0] += Scalar(2 * EIGEN_PI);
+  if (positiveRangeBeta && (eabis2[1] < 0))
+    eabis2[1] += Scalar(2 * EIGEN_PI);
+  if (positiveRangeGamma && (eabis2[2] < 0))
+    eabis2[2] += Scalar(2 * EIGEN_PI);
+  
+  VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is
+  
+  Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K));
+  VERIFY_IS_APPROX(m,  mbis);
+  
+  // Tests that are only relevant when no positive range is requested
+  if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma))
+  {
+    /* If I==K and ea[1]==0, then there is no unique solution. */ 
+    /* The same remark applies when I!=K and |ea[1]| is close to pi/2. */ 
+    if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) 
+      VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
+    
+    // approx_or_less_than does not work for 0
+    VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
+  }
+  
+  // Quaternions
+  QuaternionType q(e);
+  eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
+}
+
+template<typename EulerSystem, typename Scalar> // Checks ea under all 8 combinations of the three positive-range flags.
+void verify_euler(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler_ranged<EulerSystem>(ea, false, false, false);
+  verify_euler_ranged<EulerSystem>(ea, false, false, true);
+  verify_euler_ranged<EulerSystem>(ea, false, true, false);
+  verify_euler_ranged<EulerSystem>(ea, false, true, true);
+  verify_euler_ranged<EulerSystem>(ea, true, false, false);
+  verify_euler_ranged<EulerSystem>(ea, true, false, true);
+  verify_euler_ranged<EulerSystem>(ea, true, true, false);
+  verify_euler_ranged<EulerSystem>(ea, true, true, true);
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea) // Runs verify_euler on ea for each of the 12 axis orders listed in the body.
+{
+  verify_euler<EulerSystemXYZ>(ea);
+  verify_euler<EulerSystemXYX>(ea);
+  verify_euler<EulerSystemXZY>(ea);
+  verify_euler<EulerSystemXZX>(ea);
+  
+  verify_euler<EulerSystemYZX>(ea);
+  verify_euler<EulerSystemYZY>(ea);
+  verify_euler<EulerSystemYXZ>(ea);
+  verify_euler<EulerSystemYXY>(ea);
+  
+  verify_euler<EulerSystemZXY>(ea);
+  verify_euler<EulerSystemZXZ>(ea);
+  verify_euler<EulerSystemZYX>(ea);
+  verify_euler<EulerSystemZYZ>(ea);
+}
+
+template<typename Scalar> void eulerangles() // Feeds check_all_var with angles from random rotations, random angles and degenerate cases.
+{
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Array<Scalar,3,1> Array3;
+  typedef Quaternion<Scalar> Quaternionx;
+  typedef AngleAxis<Scalar> AngleAxisType;
+
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+  Quaternionx q1;
+  q1 = AngleAxisType(a, Vector3::Random().normalized());
+  Matrix3 m;
+  m = q1;
+  // Inputs are the angles extracted from m for a three-axis order (0,1,2) and a repeated-axis order (0,1,0).
+  Vector3 ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with purely random Quaternion:
+  q1.coeffs() = Quaternionx::Coefficients::Random().normalized();
+  m = q1;
+  ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with random angles in range [0:pi]x[-pi:pi]x[-pi:pi].
+  ea = (Array3::Random() + Array3(1,0,0))*Scalar(EIGEN_PI)*Array3(0.5,1,1);
+  check_all_var(ea);
+  // Special cases: pairs of equal angles, then ea[1] zeroed, then ea[0] and ea[1] zeroed, then all zeros.
+  ea[2] = ea[0] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[0] = ea[1] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[1] = 0;
+  check_all_var(ea);
+  
+  ea.head(2).setZero();
+  check_all_var(ea);
+  
+  ea.setZero();
+  check_all_var(ea);
+}
+
+void test_EulerAngles() // Test entry point: runs the float and double subtests g_repeat times.
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( eulerangles<float>() );
+    CALL_SUBTEST_2( eulerangles<double>() );
+  }
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/FFT.cpp b/uppsrc/plugin/Eigen/unsupported/test/FFT.cpp
new file mode 100644
index 000000000..45c87f5a7
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/FFT.cpp
@@ -0,0 +1,2 @@
+#define test_FFTW test_FFT
+#include "FFTW.cpp"
diff --git a/uppsrc/plugin/Eigen/unsupported/test/FFTW.cpp b/uppsrc/plugin/Eigen/unsupported/test/FFTW.cpp
new file mode 100644
index 000000000..8b7528fb7
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/FFTW.cpp
@@ -0,0 +1,262 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/FFT>
+
+template <typename T> // Complex sample whose real and imaginary parts are each rand()/RAND_MAX - 0.5, i.e. within [-0.5, 0.5].
+std::complex<T> RandomCpx() { return std::complex<T>( (T)(rand()/(T)RAND_MAX - .5), (T)(rand()/(T)RAND_MAX - .5) ); }
+
+using namespace std;
+using namespace Eigen;
+
+
+template < typename T> // promote(): widen a complex or real sample to complex<long double>; the real overloads pass only a real part.
+complex<long double>  promote(complex<T> x) { return complex<long double>((long double)x.real(),(long double)x.imag()); }
+
+complex<long double>  promote(float x) { return complex<long double>((long double)x); }
+complex<long double>  promote(double x) { return complex<long double>((long double)x); }
+complex<long double>  promote(long double x) { return complex<long double>((long double)x); }
+    
+
+    template <typename VT1,typename VT2> // Relative RMS error of fftbuf against a directly evaluated DFT of timebuf (nested loops, long double).
+    long double fft_rmse( const VT1 & fftbuf,const VT2 & timebuf)
+    {
+        long double totalpower=0; // sum of |reference bin|^2
+        long double difpower=0;   // sum of |reference bin - fftbuf bin|^2
+        long double pi = acos((long double)-1 );
+        for (size_t k0=0;k0<(size_t)fftbuf.size();++k0) { // only the bins present in fftbuf are checked
+            complex<long double> acc = 0;
+            long double phinc = (long double)(-2.)*k0* pi / timebuf.size(); // phase step -2*pi*k0/N
+            for (size_t k1=0;k1<(size_t)timebuf.size();++k1) {
+                acc +=  promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) );
+            }
+            totalpower += numext::abs2(acc);
+            complex<long double> x = promote(fftbuf[k0]); 
+            complex<long double> dif = acc - x;
+            difpower += numext::abs2(dif);
+            //cerr << k0 << "\t" << acc << "\t" <<  x << "\t" << sqrt(numext::abs2(dif)) << endl;
+        }
+        cerr << "rmse:" << sqrt(difpower/totalpower) << endl; // the result is also logged to stderr
+        return sqrt(difpower/totalpower);
+    }
+
+    // Relative RMS difference of two buffers over their common prefix: sqrt( sum|a-b|^2 / sum((|a|^2+|b|^2)/2) ).
+    template <typename VT1,typename VT2>
+    long double dif_rmse( const VT1 & buf1,const VT2 & buf2) // taken by const reference so the buffers are not copied per call
+    {
+        long double totalpower=0, difpower=0; // mean power of the two buffers / power of their difference
+        size_t n = (min)( buf1.size(),buf2.size() ); // compare only the elements both buffers have
+        for (size_t k=0;k<n;++k) {
+            totalpower += (long double)((numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2);
+            difpower += (long double)(numext::abs2(buf1[k] - buf2[k]));
+        }
+        return sqrt(difpower/totalpower); // NaN when both buffers are empty or all-zero (0/0)
+    }
+
+enum { StdVectorContainer, EigenVectorContainer }; // Tag values selecting the container family in VectorType.
+
+template<int Container, typename Scalar> struct VectorType; // Maps (container tag, scalar) to a concrete vector type.
+
+template<typename Scalar> struct VectorType<StdVectorContainer,Scalar>
+{
+  typedef vector<Scalar> type;
+};
+
+template<typename Scalar> struct VectorType<EigenVectorContainer,Scalar>
+{
+  typedef Matrix<Scalar,Dynamic,1> type;
+};
+
+template <int Container, typename T> // Real-input FFT checks (forward, inverse, HalfSpectrum and Unscaled flags) for one container type and length nfft.
+void test_scalar_generic(int nfft)
+{
+    typedef typename FFT<T>::Complex Complex;
+    typedef typename FFT<T>::Scalar Scalar;
+    typedef typename VectorType<Container,Scalar>::type ScalarVector;
+    typedef typename VectorType<Container,Complex>::type ComplexVector;
+
+    FFT<T> fft;
+    ScalarVector tbuf(nfft);
+    ComplexVector freqBuf;
+    for (int k=0;k<nfft;++k)
+        tbuf[k]= (T)( rand()/(double)RAND_MAX - .5);
+
+    // with HalfSpectrum set, the forward transform of real input must NOT return
+    // the full spectrum: only (nfft/2)+1 bins are expected
+    fft.SetFlag(fft.HalfSpectrum );
+    fft.fwd( freqBuf,tbuf);
+    VERIFY((size_t)freqBuf.size() == (size_t)( (nfft>>1)+1) );
+    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
+
+    fft.ClearFlag(fft.HalfSpectrum ); // back to the full spectrum: nfft bins expected
+    fft.fwd( freqBuf,tbuf);
+    VERIFY( (size_t)freqBuf.size() == (size_t)nfft);
+    VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>()  );// gross check
+
+    if (nfft&1)
+        return; // odd FFTs get the wrong size inverse FFT
+
+    ScalarVector tbuf2;
+    fft.inv( tbuf2 , freqBuf);
+    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
+
+
+    // verify that the Unscaled flag takes effect
+    ScalarVector tbuf3;
+    fft.SetFlag(fft.Unscaled);
+
+    fft.inv( tbuf3 , freqBuf);
+
+    for (int k=0;k<nfft;++k)
+        tbuf3[k] *= T(1./nfft); // apply the 1/nfft scaling by hand before comparing
+
+
+    //for (size_t i=0;i<(size_t) tbuf.size();++i)
+    //    cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " -  in=" << tbuf[i] << " => " << (tbuf3[i] - tbuf[i] ) <<  endl;
+
+    VERIFY( T(dif_rmse(tbuf,tbuf3)) < test_precision<T>()  );// gross check
+
+    // verify that ClearFlag works
+    fft.ClearFlag(fft.Unscaled);
+    fft.inv( tbuf2 , freqBuf);
+    VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>()  );// gross check
+}
+
+template <typename T> // Real-input checks for length nfft; only the std::vector variant is enabled.
+void test_scalar(int nfft)
+{
+  test_scalar_generic<StdVectorContainer,T>(nfft);
+  //test_scalar_generic<EigenVectorContainer,T>(nfft);
+}
+
+
+template <int Container, typename T> // Complex-input FFT checks (forward, inverse, Unscaled flag) for one container type and length nfft.
+void test_complex_generic(int nfft)
+{
+    typedef typename FFT<T>::Complex Complex;
+    typedef typename VectorType<Container,Complex>::type ComplexVector;
+
+    FFT<T> fft;
+
+    ComplexVector inbuf(nfft);
+    ComplexVector outbuf;
+    ComplexVector buf3;
+    for (int k=0;k<nfft;++k)
+        inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) );
+    fft.fwd( outbuf , inbuf);
+
+    VERIFY( T(fft_rmse(outbuf,inbuf)) < test_precision<T>()  );// gross check
+    fft.inv( buf3 , outbuf);
+
+    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
+
+    // verify that the Unscaled flag takes effect
+    ComplexVector buf4;
+    fft.SetFlag(fft.Unscaled);
+    fft.inv( buf4 , outbuf);
+    for (int k=0;k<nfft;++k)
+        buf4[k] *= T(1./nfft); // apply the 1/nfft scaling by hand before comparing
+    VERIFY( T(dif_rmse(inbuf,buf4)) < test_precision<T>()  );// gross check
+
+    // verify that ClearFlag works
+    fft.ClearFlag(fft.Unscaled);
+    fft.inv( buf3 , outbuf);
+    VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>()  );// gross check
+}
+
+template <typename T> // Complex-input checks for length nfft with both std::vector and Eigen vector containers.
+void test_complex(int nfft)
+{
+  test_complex_generic<StdVectorContainer,T>(nfft);
+  test_complex_generic<EigenVectorContainer,T>(nfft);
+}
+/*
+template <typename T,int nrows,int ncols>
+void test_complex2d()
+{
+    typedef typename Eigen::FFT<T>::Complex Complex;
+    FFT<T> fft;
+    Eigen::Matrix<Complex,nrows,ncols> src,src2,dst,dst2;
+
+    src = Eigen::Matrix<Complex,nrows,ncols>::Random();
+    //src =  Eigen::Matrix<Complex,nrows,ncols>::Identity();
+
+    for (int k=0;k<ncols;k++) {
+        Eigen::Matrix<Complex,nrows,1> tmpOut;
+        fft.fwd( tmpOut,src.col(k) );
+        dst2.col(k) = tmpOut;
+    }
+
+    for (int k=0;k<nrows;k++) {
+        Eigen::Matrix<Complex,1,ncols> tmpOut;
+        fft.fwd( tmpOut,  dst2.row(k) );
+        dst2.row(k) = tmpOut;
+    }
+
+    fft.fwd2(dst.data(),src.data(),ncols,nrows);
+    fft.inv2(src2.data(),dst.data(),ncols,nrows);
+    VERIFY( (src-src2).norm() < test_precision<T>() );
+    VERIFY( (dst-dst2).norm() < test_precision<T>() );
+}
+*/
+
+
+void test_return_by_value(int len) // Checks that fwd()/inv() returning by value agree with the output-argument form (float, HalfSpectrum).
+{
+    VectorXf in;
+    VectorXf in1;
+    in.setRandom( len );
+    VectorXcf out1,out2;
+    FFT<float> fft;
+
+    fft.SetFlag(fft.HalfSpectrum );
+
+    fft.fwd(out1,in);
+    out2 = fft.fwd(in);
+    VERIFY( (out1-out2).norm() < test_precision<float>() );
+    in1 = fft.inv(out1);
+    VERIFY( (in1-in).norm() < test_precision<float>() );
+}
+
+void test_FFTW() // Test entry point; FFT.cpp re-includes this file with test_FFTW defined as test_FFT.
+{
+  CALL_SUBTEST( test_return_by_value(32) );
+  //CALL_SUBTEST( ( test_complex2d<float,4,8> () ) ); CALL_SUBTEST( ( test_complex2d<double,4,8> () ) );
+  //CALL_SUBTEST( ( test_complex2d<long double,4,8> () ) );
+  CALL_SUBTEST( test_complex<float>(32) ); CALL_SUBTEST( test_complex<double>(32) ); 
+  CALL_SUBTEST( test_complex<float>(256) ); CALL_SUBTEST( test_complex<double>(256) ); 
+  CALL_SUBTEST( test_complex<float>(3*8) ); CALL_SUBTEST( test_complex<double>(3*8) ); 
+  CALL_SUBTEST( test_complex<float>(5*32) ); CALL_SUBTEST( test_complex<double>(5*32) ); 
+  CALL_SUBTEST( test_complex<float>(2*3*4) ); CALL_SUBTEST( test_complex<double>(2*3*4) ); 
+  CALL_SUBTEST( test_complex<float>(2*3*4*5) ); CALL_SUBTEST( test_complex<double>(2*3*4*5) ); 
+  CALL_SUBTEST( test_complex<float>(2*3*4*5*7) ); CALL_SUBTEST( test_complex<double>(2*3*4*5*7) ); 
+
+  CALL_SUBTEST( test_scalar<float>(32) ); CALL_SUBTEST( test_scalar<double>(32) ); 
+  CALL_SUBTEST( test_scalar<float>(45) ); CALL_SUBTEST( test_scalar<double>(45) ); 
+  CALL_SUBTEST( test_scalar<float>(50) ); CALL_SUBTEST( test_scalar<double>(50) ); 
+  CALL_SUBTEST( test_scalar<float>(256) ); CALL_SUBTEST( test_scalar<double>(256) ); 
+  CALL_SUBTEST( test_scalar<float>(2*3*4*5*7) ); CALL_SUBTEST( test_scalar<double>(2*3*4*5*7) ); 
+  
+  #ifdef EIGEN_HAS_FFTWL
+  CALL_SUBTEST( test_complex<long double>(32) );
+  CALL_SUBTEST( test_complex<long double>(256) );
+  CALL_SUBTEST( test_complex<long double>(3*8) );
+  CALL_SUBTEST( test_complex<long double>(5*32) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4*5) );
+  CALL_SUBTEST( test_complex<long double>(2*3*4*5*7) );
+  
+  CALL_SUBTEST( test_scalar<long double>(32) );
+  CALL_SUBTEST( test_scalar<long double>(45) );
+  CALL_SUBTEST( test_scalar<long double>(50) );
+  CALL_SUBTEST( test_scalar<long double>(256) );
+  CALL_SUBTEST( test_scalar<long double>(2*3*4*5*7) );
+  #endif
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/NonLinearOptimization.cpp b/uppsrc/plugin/Eigen/unsupported/test/NonLinearOptimization.cpp
new file mode 100644
index 000000000..dd93c21e9
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/NonLinearOptimization.cpp
@@ -0,0 +1,1849 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+
+#include <stdio.h>
+
+#include "main.h"
+#include <unsupported/Eigen/NonLinearOptimization>
+
+// This disables some useless Warnings on MSVC.
+// It is intended to be done for this test only.
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+// tolerance for checking the number of iterations
+#define LM_EVAL_COUNT_TOL 4/3 // deliberately unparenthesized: "N * LM_EVAL_COUNT_TOL" expands to "N * 4/3"; "(4/3)" would truncate to 1
+
+#define LM_CHECK_N_ITERS(SOLVER,NFEV,NJEV) { \
+            ++g_test_level; \
+            VERIFY_IS_EQUAL(SOLVER.nfev, NFEV); \
+            VERIFY_IS_EQUAL(SOLVER.njev, NJEV); \
+            --g_test_level; \
+            VERIFY(SOLVER.nfev <= NFEV * LM_EVAL_COUNT_TOL); \
+            VERIFY(SOLVER.njev <= NJEV * LM_EVAL_COUNT_TOL); \
+        }
+
+int fcn_chkder(const VectorXd &x, VectorXd &fvec, MatrixXd &fjac, int iflag)
+{
+    /*      subroutine fcn for chkder example. */
+    /*      iflag == 0: no-op; iflag == 2: fill fjac; any other value: fill fvec. Always returns 0. */
+    int i;
+    assert(15 ==  fvec.size());
+    assert(3 ==  x.size());
+    double tmp1, tmp2, tmp3, tmp4;
+    static const double y[15]={1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1, 2.9e-1, 3.2e-1, 3.5e-1,
+        3.9e-1, 3.7e-1, 5.8e-1, 7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};
+
+
+    if (iflag == 0)
+        return 0;
+
+    if (iflag != 2)
+        for (i=0; i<15; i++) {
+            tmp1 = i+1;
+            tmp2 = 16-i-1;
+            tmp3 = tmp1;
+            if (i >= 8) tmp3 = tmp2;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+    else {
+        for (i = 0; i < 15; i++) {
+            tmp1 = i+1;
+            tmp2 = 16-i-1;
+
+            /* error introduced into next statement for illustration. */
+            /* corrected statement should read    tmp3 = tmp1 . */
+
+            tmp3 = tmp2;
+            if (i >= 8) tmp3 = tmp2;
+            tmp4 = (x[1]*tmp2 + x[2]*tmp3); tmp4=tmp4*tmp4;
+            fjac(i,0) = -1.;
+            fjac(i,1) = tmp1*tmp2/tmp4;
+            fjac(i,2) = tmp1*tmp3/tmp4;
+        }
+    }
+    return 0;
+}
+
+
+// Checks internal::chkder() against reference values, using fcn_chkder and its deliberately wrong Jacobian.
+void testChkder()
+{
+  const int m=15, n=3;
+  VectorXd x(n), fvec(m), xp, fvecp(m), err;
+  MatrixXd fjac(m,n);
+
+  /*      the following values should be suitable for */
+  /*      checking the jacobian matrix. */
+  x << 9.2e-1, 1.3e-1, 5.4e-1;
+
+  internal::chkder(x, fvec, fjac, xp, fvecp, 1, err); // mode 1: presumably fills xp with a neighbouring point - confirm in chkder
+  fcn_chkder(x, fvec, fjac, 1);    // residuals at x
+  fcn_chkder(x, fvec, fjac, 2);    // (faulty) Jacobian at x
+  fcn_chkder(xp, fvecp, fjac, 1);  // residuals at xp
+  internal::chkder(x, fvec, fjac, xp, fvecp, 2, err); // mode 2: writes err, compared with err_ref below
+
+  fvecp -= fvec;
+
+  // check those
+  VectorXd fvec_ref(m), fvecp_ref(m), err_ref(m);
+  fvec_ref <<
+      -1.181606, -1.429655, -1.606344,
+      -1.745269, -1.840654, -1.921586,
+      -1.984141, -2.022537, -2.468977,
+      -2.827562, -3.473582, -4.437612,
+      -6.047662, -9.267761, -18.91806;
+  fvecp_ref <<
+      -7.724666e-09, -3.432406e-09, -2.034843e-10,
+      2.313685e-09,  4.331078e-09,  5.984096e-09,
+      7.363281e-09,   8.53147e-09,  1.488591e-08,
+      2.33585e-08,  3.522012e-08,  5.301255e-08,
+      8.26666e-08,  1.419747e-07,   3.19899e-07;
+  err_ref <<
+      0.1141397,  0.09943516,  0.09674474,
+      0.09980447,  0.1073116, 0.1220445,
+      0.1526814, 1, 1,
+      1, 1, 1,
+      1, 1, 1;
+
+  VERIFY_IS_APPROX(fvec, fvec_ref);
+  VERIFY_IS_APPROX(fvecp, fvecp_ref);
+  VERIFY_IS_APPROX(err, err_ref);
+}
+
+// Generic functor
+template<typename _Scalar, int NX=Dynamic, int NY=Dynamic>
+struct Functor // Base of the test functors: scalar/vector/Jacobian typedefs plus the input and value counts.
+{
+  typedef _Scalar Scalar;
+  enum {
+    InputsAtCompileTime = NX,
+    ValuesAtCompileTime = NY
+  };
+  typedef Matrix<Scalar,InputsAtCompileTime,1> InputType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,1> ValueType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,InputsAtCompileTime> JacobianType;
+
+  const int m_inputs, m_values;
+
+  Functor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {} // sizes taken from the template arguments
+  Functor(int inputs, int values) : m_inputs(inputs), m_values(values) {}     // sizes given at run time
+
+  int inputs() const { return m_inputs; }
+  int values() const { return m_values; }
+
+  // you should define that in the subclass :
+//  void operator() (const InputType& x, ValueType* v, JacobianType* _j=0) const;
+};
+
+struct lmder_functor : Functor<double> // 3 parameters, 15 residuals; operator() fills the residuals, df() the full Jacobian.
+{
+    lmder_functor(void): Functor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const
+    {
+        double tmp1, tmp2, tmp3;
+        static const double y[15] = {1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1, 2.9e-1, 3.2e-1, 3.5e-1,
+            3.9e-1, 3.7e-1, 5.8e-1, 7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};
+
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &x, MatrixXd &fjac) const
+    {
+        double tmp1, tmp2, tmp3, tmp4;
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            tmp4 = (x[1]*tmp2 + x[2]*tmp3); tmp4 = tmp4*tmp4; // squared denominator of the model term
+            fjac(i,0) = -1;
+            fjac(i,1) = tmp1*tmp2/tmp4;
+            fjac(i,2) = tmp1*tmp3/tmp4;
+        }
+        return 0;
+    }
+};
+
+void testLmder1() // Runs LevenbergMarquardt::lmder1 on lmder_functor; checks status, evaluation counts, residual norm and solution.
+{
+  int n=3, info;
+
+  VectorXd x;
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmder_functor functor;
+  LevenbergMarquardt<lmder_functor> lm(functor);
+  info = lm.lmder1(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 6, 5);
+
+  // check norm
+  VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+void testLmder() // Runs LevenbergMarquardt::minimize on lmder_functor; also checks the scaled covariance from internal::covar.
+{
+  const int m=15, n=3;
+  int info;
+  double fnorm, covfac;
+  VectorXd x;
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmder_functor functor;
+  LevenbergMarquardt<lmder_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return values
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 6, 5);
+
+  // check norm
+  fnorm = lm.fvec.blueNorm();
+  VERIFY_IS_APPROX(fnorm, 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+
+  // check covariance
+  covfac = fnorm*fnorm/(m-n);
+  internal::covar(lm.fjac, lm.permutation.indices()); // TODO : move this as a function of lm
+
+  MatrixXd cov_ref(n,n);
+  cov_ref <<
+      0.0001531202,   0.002869941,  -0.002656662,
+      0.002869941,    0.09480935,   -0.09098995,
+      -0.002656662,   -0.09098995,    0.08778727;
+
+//  std::cout << fjac*covfac << std::endl;
+
+  MatrixXd cov;
+  cov =  covfac*lm.fjac.topLeftCorner<n,n>();
+  VERIFY_IS_APPROX( cov, cov_ref);
+  // TODO: why isn't this allowed ? :
+  // VERIFY_IS_APPROX( covfac*fjac.topLeftCorner<n,n>() , cov_ref);
+}
+
+struct hybrj_functor : Functor<double> // 9 equations in 9 unknowns; df() fills a tridiagonal Jacobian.
+{
+    hybrj_functor(void) : Functor<double>(9,9) {}
+
+    int operator()(const VectorXd &x, VectorXd &fvec)
+    {
+        double temp, temp1, temp2;
+        const VectorXd::Index n = x.size();
+        assert(fvec.size()==n);
+        for (VectorXd::Index k = 0; k < n; k++)
+        {
+            temp = (3. - 2.*x[k])*x[k];
+            temp1 = 0.;
+            if (k) temp1 = x[k-1];         // left neighbour, absent for the first equation
+            temp2 = 0.;
+            if (k != n-1) temp2 = x[k+1];  // right neighbour, absent for the last equation
+            fvec[k] = temp - temp1 - 2.*temp2 + 1.;
+        }
+        return 0;
+    }
+    int df(const VectorXd &x, MatrixXd &fjac)
+    {
+        const VectorXd::Index n = x.size();
+        assert(fjac.rows()==n);
+        assert(fjac.cols()==n);
+        for (VectorXd::Index k = 0; k < n; k++)
+        {
+            for (VectorXd::Index j = 0; j < n; j++)
+                fjac(k,j) = 0.;
+            fjac(k,k) = 3.- 4.*x[k];
+            if (k) fjac(k,k-1) = -1.;
+            if (k != n-1) fjac(k,k+1) = -2.;
+        }
+        return 0;
+    }
+};
+
+
+void testHybrj1() // Runs HybridNonLinearSolver::hybrj1 on hybrj_functor; checks status, evaluation counts, residual norm and solution.
+{
+  const int n=9;
+  int info;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, -1.);
+
+  // do the computation
+  hybrj_functor functor;
+  HybridNonLinearSolver<hybrj_functor> solver(functor);
+  info = solver.hybrj1(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(solver, 11, 1);
+
+  // check norm
+  VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
+
+
+// check x
+  VectorXd x_ref(n);
+  x_ref <<
+     -0.5706545,    -0.6816283,    -0.7017325,
+     -0.7042129,     -0.701369,    -0.6918656,
+     -0.665792,    -0.5960342,    -0.4164121;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+void testHybrj() // Runs HybridNonLinearSolver::solve on hybrj_functor with external scaling (diag set to all ones).
+{
+  const int n=9;
+  int info;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, -1.);
+
+
+  // do the computation
+  hybrj_functor functor;
+  HybridNonLinearSolver<hybrj_functor> solver(functor);
+  solver.diag.setConstant(n, 1.);
+  solver.useExternalScaling = true;
+  info = solver.solve(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(solver, 11, 1);
+
+  // check norm
+  VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
+
+
+// check x
+  VectorXd x_ref(n);
+  x_ref <<
+     -0.5706545,    -0.6816283,    -0.7017325,
+     -0.7042129,     -0.701369,    -0.6918656,
+     -0.665792,    -0.5960342,    -0.4164121;
+  VERIFY_IS_APPROX(x, x_ref);
+
+}
+
+struct hybrd_functor : Functor<double> // Same 9x9 system as hybrj_functor, but without df(): residuals only.
+{
+    hybrd_functor(void) : Functor<double>(9,9) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const
+    {
+        double temp, temp1, temp2;
+        const VectorXd::Index n = x.size();
+
+        assert(fvec.size()==n);
+        for (VectorXd::Index k=0; k < n; k++)
+        {
+            temp = (3. - 2.*x[k])*x[k];
+            temp1 = 0.;
+            if (k) temp1 = x[k-1];         // left neighbour, absent for the first equation
+            temp2 = 0.;
+            if (k != n-1) temp2 = x[k+1];  // right neighbour, absent for the last equation
+            fvec[k] = temp - temp1 - 2.*temp2 + 1.;
+        }
+        return 0;
+    }
+};
+
+void testHybrd1() // Runs HybridNonLinearSolver::hybrd1 on hybrd_functor; checks status, nfev, residual norm and solution.
+{
+  int n=9, info;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough solution. */
+  x.setConstant(n, -1.);
+
+  // do the computation
+  hybrd_functor functor;
+  HybridNonLinearSolver<hybrd_functor> solver(functor);
+  info = solver.hybrd1(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(solver.nfev, 20);
+
+  // check norm
+  VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << -0.5706545, -0.6816283, -0.7017325, -0.7042129, -0.701369, -0.6918656, -0.665792, -0.5960342, -0.4164121;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+void testHybrd() // Runs HybridNonLinearSolver::solveNumericalDiff on hybrd_functor with one sub- and one super-diagonal declared.
+{
+  const int n=9;
+  int info;
+  VectorXd x;
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, -1.);
+
+  // do the computation
+  hybrd_functor functor;
+  HybridNonLinearSolver<hybrd_functor> solver(functor);
+  solver.parameters.nb_of_subdiagonals = 1;
+  solver.parameters.nb_of_superdiagonals = 1;
+  solver.diag.setConstant(n, 1.);
+  solver.useExternalScaling = true;
+  info = solver.solveNumericalDiff(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(solver.nfev, 14);
+
+  // check norm
+  VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref <<
+      -0.5706545,    -0.6816283,    -0.7017325,
+      -0.7042129,     -0.701369,    -0.6918656,
+      -0.665792,    -0.5960342,    -0.4164121;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+struct lmstr_functor : Functor<double> // 3 parameters, 15 residuals; df() fills one Jacobian row per call.
+{
+    lmstr_functor(void) : Functor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec)
+    {
+        /*  subroutine fcn for lmstr1 example. */
+        double tmp1, tmp2, tmp3;
+        static const double y[15]={1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1, 2.9e-1, 3.2e-1, 3.5e-1,
+            3.9e-1, 3.7e-1, 5.8e-1, 7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};
+
+        assert(15==fvec.size());
+        assert(3==x.size());
+
+        for (int i=0; i<15; i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+    int df(const VectorXd &x, VectorXd &jac_row, VectorXd::Index rownb)
+    {
+        assert(x.size()==3);
+        assert(jac_row.size()==x.size());
+        double tmp1, tmp2, tmp3, tmp4;
+
+        VectorXd::Index i = rownb-2; // NOTE(review): rownb appears to be offset by 2 from the 0-based residual index - confirm against the solver's call site
+        tmp1 = i+1;
+        tmp2 = 16 - i - 1;
+        tmp3 = (i>=8)? tmp2 : tmp1;
+        tmp4 = (x[1]*tmp2 + x[2]*tmp3); tmp4 = tmp4*tmp4;
+        jac_row[0] = -1;
+        jac_row[1] = tmp1*tmp2/tmp4;
+        jac_row[2] = tmp1*tmp3/tmp4;
+        return 0;
+    }
+};
+
+void testLmstr1() // Runs LevenbergMarquardt::lmstr1 on lmstr_functor; checks status, evaluation counts, residual norm and solution.
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmstr_functor functor;
+  LevenbergMarquardt<lmstr_functor> lm(functor);
+  info = lm.lmstr1(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 6, 5);
+
+  // check norm
+  VERIFY_IS_APPROX(lm.fvec.blueNorm(), 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695 ;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+void testLmstr() // Runs LevenbergMarquardt::minimizeOptimumStorage on lmstr_functor; checks status, counts, norm and solution.
+{
+  const int n=3;
+  int info;
+  double fnorm;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmstr_functor functor;
+  LevenbergMarquardt<lmstr_functor> lm(functor);
+  info = lm.minimizeOptimumStorage(x);
+
+  // check return values
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 6, 5);
+
+  // check norm
+  fnorm = lm.fvec.blueNorm();
+  VERIFY_IS_APPROX(fnorm, 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+
+}
+
+struct lmdif_functor : Functor<double> // 3 parameters, 15 residuals; residuals only, no df().
+{
+    lmdif_functor(void) : Functor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const
+    {
+        int i;
+        double tmp1,tmp2,tmp3;
+        static const double y[15]={1.4e-1,1.8e-1,2.2e-1,2.5e-1,2.9e-1,3.2e-1,3.5e-1,3.9e-1,
+            3.7e-1,5.8e-1,7.3e-1,9.6e-1,1.34e0,2.1e0,4.39e0};
+
+        assert(x.size()==3);
+        assert(fvec.size()==15);
+        for (i=0; i<15; i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 15 - i;
+            tmp3 = tmp1;
+
+            if (i >= 8) tmp3 = tmp2; // from the ninth residual on, the third term uses tmp2 instead of tmp1
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+};
+
+void testLmdif1() // Runs the static LevenbergMarquardt::lmdif1 on lmdif_functor; checks status, nfev, residual norm and solution.
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n), fvec(15);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmdif_functor functor;
+  DenseIndex nfev = -1; // initialize to avoid maybe-uninitialized warning
+  info = LevenbergMarquardt<lmdif_functor>::lmdif1(functor, x, &nfev);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(nfev, 26);
+
+  // check norm
+  functor(x, fvec); // re-evaluate the residuals at the solution; lmdif1 is called statically, so no solver object holds them
+  VERIFY_IS_APPROX(fvec.blueNorm(), 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.0824106, 1.1330366, 2.3436947;
+  VERIFY_IS_APPROX(x, x_ref);
+
+}
+
+void testLmdif() // Runs lm.minimize() on lmdif_functor wrapped in NumericalDiff, then checks status, nfev, norm, solution and a scaled covariance.
+{
+  const int m=15, n=3;
+  int info;
+  double fnorm, covfac;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmdif_functor functor;
+  NumericalDiff<lmdif_functor> numDiff(functor); // lmdif_functor has no df(); presumably NumericalDiff supplies it -- confirm
+  LevenbergMarquardt<NumericalDiff<lmdif_functor> > lm(numDiff);
+  info = lm.minimize(x);
+
+  // check return values
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev, 26);
+
+  // check the norm of the final residual vector kept in lm.fvec
+  fnorm = lm.fvec.blueNorm();
+  VERIFY_IS_APPROX(fnorm, 0.09063596);
+
+  // check x against the reference solution
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+
+  // check covariance: covfac = fnorm^2 / (m-n) scales the n x n top-left corner of lm.fjac
+  covfac = fnorm*fnorm/(m-n);
+  internal::covar(lm.fjac, lm.permutation.indices()); // TODO : move this as a function of lm
+
+  MatrixXd cov_ref(n,n);
+  cov_ref <<
+      0.0001531202,   0.002869942,  -0.002656662,
+      0.002869942,    0.09480937,   -0.09098997,
+      -0.002656662,   -0.09098997,    0.08778729;
+
+//  std::cout << fjac*covfac << std::endl;
+
+  MatrixXd cov;
+  cov =  covfac*lm.fjac.topLeftCorner<n,n>();
+  VERIFY_IS_APPROX( cov, cov_ref);
+  // TODO: why isn't this allowed ? :
+  // VERIFY_IS_APPROX( covfac*fjac.topLeftCorner<n,n>() , cov_ref);
+}
+
+struct chwirut2_functor : Functor<double> // Residuals and Jacobian of the model exp(-b0*x)/(b1+b2*x): 3 parameters, 54 observations.
+{
+    chwirut2_functor(void) : Functor<double>(3,54) {} // presumably (inputs, values) of the Functor base -- confirm
+    static const double m_x[54]; // predictor values, defined after the struct
+    static const double m_y[54]; // observed responses, defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // fills fvec[i] = model(m_x[i]; b) - m_y[i]; always returns 0
+    {
+        int i;
+
+        assert(b.size()==3);
+        assert(fvec.size()==54);
+        for(i=0; i<54; i++) {
+            double x = m_x[i];
+            fvec[i] = exp(-b[0]*x)/(b[1]+b[2]*x) - m_y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // fills fjac(i,j) with the partial derivative of residual i w.r.t. b[j]; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==54);
+        assert(fjac.cols()==3);
+        for(int i=0; i<54; i++) {
+            double x = m_x[i];
+            double factor = 1./(b[1]+b[2]*x); // reciprocal of the model denominator
+            double e = exp(-b[0]*x);
+            fjac(i,0) = -x*e*factor;
+            fjac(i,1) = -e*factor*factor;
+            fjac(i,2) = -x*e*factor*factor;
+        }
+        return 0;
+    }
+};
+const double chwirut2_functor::m_x[54] = { 0.500E0, 1.000E0, 1.750E0, 3.750E0, 5.750E0, 0.875E0, 2.250E0, 3.250E0, 5.250E0, 0.750E0, 1.750E0, 2.750E0, 4.750E0, 0.625E0, 1.250E0, 2.250E0, 4.250E0, .500E0, 3.000E0, .750E0, 3.000E0, 1.500E0, 6.000E0, 3.000E0, 6.000E0, 1.500E0, 3.000E0, .500E0, 2.000E0, 4.000E0, .750E0, 2.000E0, 5.000E0, .750E0, 2.250E0, 3.750E0, 5.750E0, 3.000E0, .750E0, 2.500E0, 4.000E0, .750E0, 2.500E0, 4.000E0, .750E0, 2.500E0, 4.000E0, .500E0, 6.000E0, 3.000E0, .500E0, 2.750E0, .500E0, 1.750E0};
+const double chwirut2_functor::m_y[54] = { 92.9000E0 ,57.1000E0 ,31.0500E0 ,11.5875E0 ,8.0250E0 ,63.6000E0 ,21.4000E0 ,14.2500E0 ,8.4750E0 ,63.8000E0 ,26.8000E0 ,16.4625E0 ,7.1250E0 ,67.3000E0 ,41.0000E0 ,21.1500E0 ,8.1750E0 ,81.5000E0 ,13.1200E0 ,59.9000E0 ,14.6200E0 ,32.9000E0 ,5.4400E0 ,12.5600E0 ,5.4400E0 ,32.0000E0 ,13.9500E0 ,75.8000E0 ,20.0000E0 ,10.4200E0 ,59.5000E0 ,21.6700E0 ,8.5500E0 ,62.0000E0 ,20.2000E0 ,7.7600E0 ,3.7500E0 ,11.8100E0 ,54.7000E0 ,23.7000E0 ,11.5500E0 ,61.3000E0 ,17.7000E0 ,8.7400E0 ,59.2000E0 ,16.3000E0 ,8.6200E0 ,81.0000E0 ,4.8700E0 ,14.6200E0 ,81.7000E0 ,17.1700E0 ,81.3000E0 ,28.9000E0  };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/chwirut2.shtml
+void testNistChwirut2(void) // Fits chwirut2_functor from two starting points and checks status, counters, squared norm and parameters.
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 0.1, 0.01, 0.02;
+  // do the computation
+  chwirut2_functor functor;
+  LevenbergMarquardt<chwirut2_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 10, 8); // macro defined outside this chunk; presumably checks lm's evaluation counters -- confirm
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.6657666537E-01);
+  VERIFY_IS_APPROX(x[1], 5.1653291286E-03);
+  VERIFY_IS_APPROX(x[2], 1.2150007096E-02);
+
+  /*
+   * Second try
+   */
+  x<< 0.15, 0.008, 0.010;
+  // do the computation, reusing lm with reset parameters and both tolerances overridden
+  lm.resetParameters();
+  lm.parameters.ftol = 1.E6*NumTraits<double>::epsilon();
+  lm.parameters.xtol = 1.E6*NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 7, 6);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02);
+  // check x (same expected values as the first try)
+  VERIFY_IS_APPROX(x[0], 1.6657666537E-01);
+  VERIFY_IS_APPROX(x[1], 5.1653291286E-03);
+  VERIFY_IS_APPROX(x[2], 1.2150007096E-02);
+}
+
+
+struct misra1a_functor : Functor<double> // Residuals and Jacobian of the model b0*(1-exp(-b1*x)): 2 parameters, 14 observations.
+{
+    misra1a_functor(void) : Functor<double>(2,14) {} // presumably (inputs, values) of the Functor base -- confirm
+    static const double m_x[14]; // predictor values, defined after the struct
+    static const double m_y[14]; // observed responses, defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // fills fvec[i] = model(m_x[i]; b) - m_y[i]; always returns 0
+    {
+        assert(b.size()==2);
+        assert(fvec.size()==14);
+        for(int i=0; i<14; i++) {
+            fvec[i] = b[0]*(1.-exp(-b[1]*m_x[i])) - m_y[i] ;
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // fills fjac(i,j) with the partial derivative of residual i w.r.t. b[j]; always returns 0
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==14);
+        assert(fjac.cols()==2);
+        for(int i=0; i<14; i++) {
+            fjac(i,0) = (1.-exp(-b[1]*m_x[i]));
+            fjac(i,1) = (b[0]*m_x[i]*exp(-b[1]*m_x[i]));
+        }
+        return 0;
+    }
+};
+const double misra1a_functor::m_x[14] = { 77.6E0, 114.9E0, 141.1E0, 190.8E0, 239.9E0, 289.0E0, 332.8E0, 378.4E0, 434.8E0, 477.3E0, 536.8E0, 593.1E0, 689.1E0, 760.0E0};
+const double misra1a_functor::m_y[14] = { 10.07E0, 14.73E0, 17.94E0, 23.93E0, 29.61E0, 35.18E0, 40.02E0, 44.82E0, 50.76E0, 55.05E0, 61.01E0, 66.40E0, 75.47E0, 81.78E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/misra1a.shtml
+void testNistMisra1a(void) // Fits misra1a_functor from two starting points and checks status, counters, squared norm and parameters.
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 500., 0.0001;
+  // do the computation
+  misra1a_functor functor;
+  LevenbergMarquardt<misra1a_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 19, 15); // macro defined outside this chunk; presumably checks lm's evaluation counters -- confirm
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.3894212918E+02);
+  VERIFY_IS_APPROX(x[1], 5.5015643181E-04);
+
+  /*
+   * Second try
+   */
+  x<< 250., 0.0005;
+  // do the computation, reusing the same lm object without touching its parameters
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 5, 4);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01);
+  // check x (same expected values as the first try)
+  VERIFY_IS_APPROX(x[0], 2.3894212918E+02);
+  VERIFY_IS_APPROX(x[1], 5.5015643181E-04);
+}
+
+struct hahn1_functor : Functor<double> // Residuals and Jacobian of a cubic-over-cubic rational model: 7 parameters, 236 observations.
+{
+    hahn1_functor(void) : Functor<double>(7,236) {} // presumably (inputs, values) of the Functor base -- confirm
+    static const double m_x[236]; // predictor values, defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // fills fvec[i] = model(m_x[i]; b) - m_y[i]; always returns 0
+    {
+        static const double m_y[236] = { .591E0 , 1.547E0 , 2.902E0 , 2.894E0 , 4.703E0 , 6.307E0 , 7.03E0  , 7.898E0 , 9.470E0 , 9.484E0 , 10.072E0 , 10.163E0 , 11.615E0 , 12.005E0 , 12.478E0 , 12.982E0 , 12.970E0 , 13.926E0 , 14.452E0 , 14.404E0 , 15.190E0 , 15.550E0 , 15.528E0 , 15.499E0 , 16.131E0 , 16.438E0 , 16.387E0 , 16.549E0 , 16.872E0 , 16.830E0 , 16.926E0 , 16.907E0 , 16.966E0 , 17.060E0 , 17.122E0 , 17.311E0 , 17.355E0 , 17.668E0 , 17.767E0 , 17.803E0 , 17.765E0 , 17.768E0 , 17.736E0 , 17.858E0 , 17.877E0 , 17.912E0 , 18.046E0 , 18.085E0 , 18.291E0 , 18.357E0 , 18.426E0 , 18.584E0 , 18.610E0 , 18.870E0 , 18.795E0 , 19.111E0 , .367E0 , .796E0 , 0.892E0 , 1.903E0 , 2.150E0 , 3.697E0 , 5.870E0 , 6.421E0 , 7.422E0 , 9.944E0 , 11.023E0 , 11.87E0  , 12.786E0 , 14.067E0 , 13.974E0 , 14.462E0 , 14.464E0 , 15.381E0 , 15.483E0 , 15.59E0  , 16.075E0 , 16.347E0 , 16.181E0 , 16.915E0 , 17.003E0 , 16.978E0 , 17.756E0 , 17.808E0 , 17.868E0 , 18.481E0 , 18.486E0 , 19.090E0 , 16.062E0 , 16.337E0 , 16.345E0 ,
+        16.388E0 , 17.159E0 , 17.116E0 , 17.164E0 , 17.123E0 , 17.979E0 , 17.974E0 , 18.007E0 , 17.993E0 , 18.523E0 , 18.669E0 , 18.617E0 , 19.371E0 , 19.330E0 , 0.080E0 , 0.248E0 , 1.089E0 , 1.418E0 , 2.278E0 , 3.624E0 , 4.574E0 , 5.556E0 , 7.267E0 , 7.695E0 , 9.136E0 , 9.959E0 , 9.957E0 , 11.600E0 , 13.138E0 , 13.564E0 , 13.871E0 , 13.994E0 , 14.947E0 , 15.473E0 , 15.379E0 , 15.455E0 , 15.908E0 , 16.114E0 , 17.071E0 , 17.135E0 , 17.282E0 , 17.368E0 , 17.483E0 , 17.764E0 , 18.185E0 , 18.271E0 , 18.236E0 , 18.237E0 , 18.523E0 , 18.627E0 , 18.665E0 , 19.086E0 , 0.214E0 , 0.943E0 , 1.429E0 , 2.241E0 , 2.951E0 , 3.782E0 , 4.757E0 , 5.602E0 , 7.169E0 , 8.920E0 , 10.055E0 , 12.035E0 , 12.861E0 , 13.436E0 , 14.167E0 , 14.755E0 , 15.168E0 , 15.651E0 , 15.746E0 , 16.216E0 , 16.445E0 , 16.965E0 , 17.121E0 , 17.206E0 , 17.250E0 , 17.339E0 , 17.793E0 , 18.123E0 , 18.49E0  , 18.566E0 , 18.645E0 , 18.706E0 , 18.924E0 , 19.1E0   , 0.375E0 , 0.471E0 , 1.504E0 , 2.204E0 , 2.813E0 , 4.765E0 , 9.835E0 , 10.040E0 , 11.946E0 , 12.596E0 , 
+13.303E0 , 13.922E0 , 14.440E0 , 14.951E0 , 15.627E0 , 15.639E0 , 15.814E0 , 16.315E0 , 16.334E0 , 16.430E0 , 16.423E0 , 17.024E0 , 17.009E0 , 17.165E0 , 17.134E0 , 17.349E0 , 17.576E0 , 17.848E0 , 18.090E0 , 18.276E0 , 18.404E0 , 18.519E0 , 19.133E0 , 19.074E0 , 19.239E0 , 19.280E0 , 19.101E0 , 19.398E0 , 19.252E0 , 19.89E0  , 20.007E0 , 19.929E0 , 19.268E0 , 19.324E0 , 20.049E0 , 20.107E0 , 20.062E0 , 20.065E0 , 19.286E0 , 19.972E0 , 20.088E0 , 20.743E0 , 20.83E0  , 20.935E0 , 21.035E0 , 20.93E0  , 21.074E0 , 21.085E0 , 20.935E0 };
+
+        //        int called=0; printf("call hahn1_functor with  iflag=%d, called=%d\n", iflag, called); if (iflag==1) called++;
+
+        assert(b.size()==7);
+        assert(fvec.size()==236);
+        for(int i=0; i<236; i++) {
+            double x=m_x[i], xx=x*x, xxx=xx*x;
+            fvec[i] = (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) / (1.+b[4]*x+b[5]*xx+b[6]*xxx) - m_y[i];
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &b, MatrixXd &fjac) // fills fjac(i,j) with the partial derivative of residual i w.r.t. b[j]; always returns 0
+    {
+        assert(b.size()==7);
+        assert(fjac.rows()==236);
+        assert(fjac.cols()==7);
+        for(int i=0; i<236; i++) {
+            double x=m_x[i], xx=x*x, xxx=xx*x;
+            double fact = 1./(1.+b[4]*x+b[5]*xx+b[6]*xxx); // reciprocal of the model denominator
+            fjac(i,0) = 1.*fact;
+            fjac(i,1) = x*fact;
+            fjac(i,2) = xx*fact;
+            fjac(i,3) = xxx*fact;
+            fact = - (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) * fact * fact; // reused as -numerator/denominator^2 for the denominator parameters
+            fjac(i,4) = x*fact;
+            fjac(i,5) = xx*fact;
+            fjac(i,6) = xxx*fact;
+        }
+        return 0;
+    }
+};
+const double hahn1_functor::m_x[236] = { 24.41E0 , 34.82E0 , 44.09E0 , 45.07E0 , 54.98E0 , 65.51E0 , 70.53E0 , 75.70E0 , 89.57E0 , 91.14E0 , 96.40E0 , 97.19E0 , 114.26E0 , 120.25E0 , 127.08E0 , 133.55E0 , 133.61E0 , 158.67E0 , 172.74E0 , 171.31E0 , 202.14E0 , 220.55E0 , 221.05E0 , 221.39E0 , 250.99E0 , 268.99E0 , 271.80E0 , 271.97E0 , 321.31E0 , 321.69E0 , 330.14E0 , 333.03E0 , 333.47E0 , 340.77E0 , 345.65E0 , 373.11E0 , 373.79E0 , 411.82E0 , 419.51E0 , 421.59E0 , 422.02E0 , 422.47E0 , 422.61E0 , 441.75E0 , 447.41E0 , 448.7E0  , 472.89E0 , 476.69E0 , 522.47E0 , 522.62E0 , 524.43E0 , 546.75E0 , 549.53E0 , 575.29E0 , 576.00E0 , 625.55E0 , 20.15E0 , 28.78E0 , 29.57E0 , 37.41E0 , 39.12E0 , 50.24E0 , 61.38E0 , 66.25E0 , 73.42E0 , 95.52E0 , 107.32E0 , 122.04E0 , 134.03E0 , 163.19E0 , 163.48E0 , 175.70E0 , 179.86E0 , 211.27E0 , 217.78E0 , 219.14E0 , 262.52E0 , 268.01E0 , 268.62E0 , 336.25E0 , 337.23E0 , 339.33E0 , 427.38E0 , 428.58E0 , 432.68E0 , 528.99E0 , 531.08E0 , 628.34E0 , 253.24E0 , 273.13E0 , 273.66E0 ,
+282.10E0 , 346.62E0 , 347.19E0 , 348.78E0 , 351.18E0 , 450.10E0 , 450.35E0 , 451.92E0 , 455.56E0 , 552.22E0 , 553.56E0 , 555.74E0 , 652.59E0 , 656.20E0 , 14.13E0 , 20.41E0 , 31.30E0 , 33.84E0 , 39.70E0 , 48.83E0 , 54.50E0 , 60.41E0 , 72.77E0 , 75.25E0 , 86.84E0 , 94.88E0 , 96.40E0 , 117.37E0 , 139.08E0 , 147.73E0 , 158.63E0 , 161.84E0 , 192.11E0 , 206.76E0 , 209.07E0 , 213.32E0 , 226.44E0 , 237.12E0 , 330.90E0 , 358.72E0 , 370.77E0 , 372.72E0 , 396.24E0 , 416.59E0 , 484.02E0 , 495.47E0 , 514.78E0 , 515.65E0 , 519.47E0 , 544.47E0 , 560.11E0 , 620.77E0 , 18.97E0 , 28.93E0 , 33.91E0 , 40.03E0 , 44.66E0 , 49.87E0 , 55.16E0 , 60.90E0 , 72.08E0 , 85.15E0 , 97.06E0 , 119.63E0 , 133.27E0 , 143.84E0 , 161.91E0 , 180.67E0 , 198.44E0 , 226.86E0 , 229.65E0 , 258.27E0 , 273.77E0 , 339.15E0 , 350.13E0 , 362.75E0 , 371.03E0 , 393.32E0 , 448.53E0 , 473.78E0 , 511.12E0 , 524.70E0 , 548.75E0 , 551.64E0 , 574.02E0 , 623.86E0 , 21.46E0 , 24.33E0 , 33.43E0 , 39.22E0 , 44.18E0 , 55.02E0 , 94.33E0 , 96.44E0 , 118.82E0 , 128.48E0 ,
+141.94E0 , 156.92E0 , 171.65E0 , 190.00E0 , 223.26E0 , 223.88E0 , 231.50E0 , 265.05E0 , 269.44E0 , 271.78E0 , 273.46E0 , 334.61E0 , 339.79E0 , 349.52E0 , 358.18E0 , 377.98E0 , 394.77E0 , 429.66E0 , 468.22E0 , 487.27E0 , 519.54E0 , 523.03E0 , 612.99E0 , 638.59E0 , 641.36E0 , 622.05E0 , 631.50E0 , 663.97E0 , 646.9E0  , 748.29E0 , 749.21E0 , 750.14E0 , 647.04E0 , 646.89E0 , 746.9E0  , 748.43E0 , 747.35E0 , 749.27E0 , 647.61E0 , 747.78E0 , 750.51E0 , 851.37E0 , 845.97E0 , 847.54E0 , 849.93E0 , 851.61E0 , 849.75E0 , 850.98E0 , 848.23E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/hahn1.shtml
+void testNistHahn1(void) // Fits hahn1_functor from two starting points and checks status, counters, squared norm and parameters.
+{
+  const int  n=7;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 10., -1., .05, -.00001, -.05, .001, -.000001;
+  // do the computation
+  hahn1_functor functor;
+  LevenbergMarquardt<hahn1_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 11, 10); // macro defined outside this chunk; presumably checks lm's evaluation counters -- confirm
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
+  // check x ("should be" comments record the value the author expected instead of the one asserted)
+  VERIFY_IS_APPROX(x[0], 1.0776351733E+00);
+  VERIFY_IS_APPROX(x[1],-1.2269296921E-01);
+  VERIFY_IS_APPROX(x[2], 4.0863750610E-03);
+  VERIFY_IS_APPROX(x[3],-1.426264e-06); // should be : -1.4262662514E-06
+  VERIFY_IS_APPROX(x[4],-5.7609940901E-03);
+  VERIFY_IS_APPROX(x[5], 2.4053735503E-04);
+  VERIFY_IS_APPROX(x[6],-1.2314450199E-07);
+
+  /*
+   * Second try
+   */
+  x<< .1, -.1, .005, -.000001, -.005, .0001, -.0000001;
+  // do the computation, reusing the same lm object without touching its parameters
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 11, 10);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00);
+  // check x ("should be" comments record the value the author expected instead of the one asserted)
+  VERIFY_IS_APPROX(x[0], 1.077640); // should be :  1.0776351733E+00
+  VERIFY_IS_APPROX(x[1], -0.1226933); // should be : -1.2269296921E-01
+  VERIFY_IS_APPROX(x[2], 0.004086383); // should be : 4.0863750610E-03
+  VERIFY_IS_APPROX(x[3], -1.426277e-06); // should be : -1.4262662514E-06
+  VERIFY_IS_APPROX(x[4],-5.7609940901E-03);
+  VERIFY_IS_APPROX(x[5], 0.00024053772); // should be : 2.4053735503E-04
+  VERIFY_IS_APPROX(x[6], -1.231450e-07); // should be : -1.2314450199E-07
+
+}
+
+struct misra1d_functor : Functor<double> // Residuals and Jacobian of the model b0*b1*x/(1+b1*x): 2 parameters, 14 observations.
+{
+    misra1d_functor(void) : Functor<double>(2,14) {} // presumably (inputs, values) of the Functor base -- confirm
+    static const double x[14]; // predictor values, defined after the struct
+    static const double y[14]; // observed responses, defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // fills fvec[i] = model(x[i]; b) - y[i]; always returns 0
+    {
+        assert(b.size()==2);
+        assert(fvec.size()==14);
+        for(int i=0; i<14; i++) {
+            fvec[i] = b[0]*b[1]*x[i]/(1.+b[1]*x[i]) - y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // fills fjac(i,j) with the partial derivative of residual i w.r.t. b[j]; always returns 0
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==14);
+        assert(fjac.cols()==2);
+        for(int i=0; i<14; i++) {
+            double den = 1.+b[1]*x[i]; // model denominator
+            fjac(i,0) = b[1]*x[i] / den;
+            fjac(i,1) = b[0]*x[i]*(den-b[1]*x[i])/den/den; // (den - b1*x) equals 1 algebraically; kept as written
+        }
+        return 0;
+    }
+};
+const double misra1d_functor::x[14] = { 77.6E0, 114.9E0, 141.1E0, 190.8E0, 239.9E0, 289.0E0, 332.8E0, 378.4E0, 434.8E0, 477.3E0, 536.8E0, 593.1E0, 689.1E0, 760.0E0};
+const double misra1d_functor::y[14] = { 10.07E0, 14.73E0, 17.94E0, 23.93E0, 29.61E0, 35.18E0, 40.02E0, 44.82E0, 50.76E0, 55.05E0, 61.01E0, 66.40E0, 75.47E0, 81.78E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/misra1d.shtml
+void testNistMisra1d(void) // Fits misra1d_functor from two starting points and checks status, counters, squared norm and parameters.
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 500., 0.0001;
+  // do the computation
+  misra1d_functor functor;
+  LevenbergMarquardt<misra1d_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value (this start is expected to finish with status 3, unlike the second try)
+  VERIFY_IS_EQUAL(info, 3);
+  LM_CHECK_N_ITERS(lm, 9, 7); // macro defined outside this chunk; presumably checks lm's evaluation counters -- confirm
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 4.3736970754E+02);
+  VERIFY_IS_APPROX(x[1], 3.0227324449E-04);
+
+  /*
+   * Second try
+   */
+  x<< 450., 0.0003;
+  // do the computation, reusing the same lm object without touching its parameters
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 4, 3);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02);
+  // check x (same expected values as the first try)
+  VERIFY_IS_APPROX(x[0], 4.3736970754E+02);
+  VERIFY_IS_APPROX(x[1], 3.0227324449E-04);
+}
+
+
+struct lanczos1_functor : Functor<double> // Residuals and Jacobian of a sum of three exponentials: 6 parameters, 24 observations.
+{
+    lanczos1_functor(void) : Functor<double>(6,24) {} // presumably (inputs, values) of the Functor base -- confirm
+    static const double x[24]; // predictor values, defined after the struct
+    static const double y[24]; // observed responses, defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // fills fvec[i] = model(x[i]; b) - y[i]; always returns 0
+    {
+        assert(b.size()==6);
+        assert(fvec.size()==24);
+        for(int i=0; i<24; i++)
+            fvec[i] = b[0]*exp(-b[1]*x[i]) + b[2]*exp(-b[3]*x[i]) + b[4]*exp(-b[5]*x[i])  - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // fills fjac(i,j) with the partial derivative of residual i w.r.t. b[j]; always returns 0
+    {
+        assert(b.size()==6);
+        assert(fjac.rows()==24);
+        assert(fjac.cols()==6);
+        for(int i=0; i<24; i++) {
+            fjac(i,0) = exp(-b[1]*x[i]); // even columns: derivative w.r.t. an amplitude
+            fjac(i,1) = -b[0]*x[i]*exp(-b[1]*x[i]); // odd columns: derivative w.r.t. a decay rate
+            fjac(i,2) = exp(-b[3]*x[i]);
+            fjac(i,3) = -b[2]*x[i]*exp(-b[3]*x[i]);
+            fjac(i,4) = exp(-b[5]*x[i]);
+            fjac(i,5) = -b[4]*x[i]*exp(-b[5]*x[i]);
+        }
+        return 0;
+    }
+};
+const double lanczos1_functor::x[24] = { 0.000000000000E+00, 5.000000000000E-02, 1.000000000000E-01, 1.500000000000E-01, 2.000000000000E-01, 2.500000000000E-01, 3.000000000000E-01, 3.500000000000E-01, 4.000000000000E-01, 4.500000000000E-01, 5.000000000000E-01, 5.500000000000E-01, 6.000000000000E-01, 6.500000000000E-01, 7.000000000000E-01, 7.500000000000E-01, 8.000000000000E-01, 8.500000000000E-01, 9.000000000000E-01, 9.500000000000E-01, 1.000000000000E+00, 1.050000000000E+00, 1.100000000000E+00, 1.150000000000E+00 };
+const double lanczos1_functor::y[24] = { 2.513400000000E+00 ,2.044333373291E+00 ,1.668404436564E+00 ,1.366418021208E+00 ,1.123232487372E+00 ,9.268897180037E-01 ,7.679338563728E-01 ,6.388775523106E-01 ,5.337835317402E-01 ,4.479363617347E-01 ,3.775847884350E-01 ,3.197393199326E-01 ,2.720130773746E-01 ,2.324965529032E-01 ,1.996589546065E-01 ,1.722704126914E-01 ,1.493405660168E-01 ,1.300700206922E-01 ,1.138119324644E-01 ,1.000415587559E-01 ,8.833209084540E-02 ,7.833544019350E-02 ,6.976693743449E-02 ,6.239312536719E-02 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/lanczos1.shtml
+void testNistLanczos1(void) // Fits lanczos1_functor from two starting points and checks status, counters, an upper bound on the squared norm, and parameters.
+{
+  const int n=6;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1.2, 0.3, 5.6, 5.5, 6.5, 7.6;
+  // do the computation
+  lanczos1_functor functor;
+  LevenbergMarquardt<lanczos1_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 2);
+  LM_CHECK_N_ITERS(lm, 79, 72); // macro defined outside this chunk; presumably checks lm's evaluation counters -- confirm
+  // check norm^2 (bounded from above rather than compared approximately)
+  std::cout.precision(30); // diagnostic output; the precision setting stays in effect on std::cout afterwards
+  std::cout << lm.fvec.squaredNorm() << "\n";
+  VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
+  // check x
+  VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
+  VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
+  VERIFY_IS_APPROX(x[2], 8.6070000013E-01);
+  VERIFY_IS_APPROX(x[3], 3.0000000002E+00);
+  VERIFY_IS_APPROX(x[4], 1.5575999998E+00);
+  VERIFY_IS_APPROX(x[5], 5.0000000001E+00);
+
+  /*
+   * Second try
+   */
+  x<< 0.5, 0.7, 3.6, 4.2, 4., 6.3;
+  // do the computation, reusing the same lm object without touching its parameters
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 2);
+  LM_CHECK_N_ITERS(lm, 9, 8);
+  // check norm^2 (same upper bound as the first try)
+  VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25);
+  // check x (same expected values as the first try)
+  VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
+  VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
+  VERIFY_IS_APPROX(x[2], 8.6070000013E-01);
+  VERIFY_IS_APPROX(x[3], 3.0000000002E+00);
+  VERIFY_IS_APPROX(x[4], 1.5575999998E+00);
+  VERIFY_IS_APPROX(x[5], 5.0000000001E+00);
+
+}
+
+struct rat42_functor : Functor<double> // Residuals and Jacobian of the model b0/(1+exp(b1-b2*x)): 3 parameters, 9 observations.
+{
+    rat42_functor(void) : Functor<double>(3,9) {} // presumably (inputs, values) of the Functor base -- confirm
+    static const double x[9]; // predictor values, defined after the struct
+    static const double y[9]; // observed responses, defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // fills fvec[i] = model(x[i]; b) - y[i]; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==9);
+        for(int i=0; i<9; i++) {
+            fvec[i] = b[0] / (1.+exp(b[1]-b[2]*x[i])) - y[i];
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &b, MatrixXd &fjac) // fills fjac(i,j) with the partial derivative of residual i w.r.t. b[j]; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==9);
+        assert(fjac.cols()==3);
+        for(int i=0; i<9; i++) {
+            double e = exp(b[1]-b[2]*x[i]); // exponential term shared by all three columns
+            fjac(i,0) = 1./(1.+e);
+            fjac(i,1) = -b[0]*e/(1.+e)/(1.+e);
+            fjac(i,2) = +b[0]*e*x[i]/(1.+e)/(1.+e);
+        }
+        return 0;
+    }
+};
+const double rat42_functor::x[9] = { 9.000E0, 14.000E0, 21.000E0, 28.000E0, 42.000E0, 57.000E0, 63.000E0, 70.000E0, 79.000E0 };
+const double rat42_functor::y[9] = { 8.930E0 ,10.800E0 ,18.590E0 ,22.330E0 ,39.350E0 ,56.110E0 ,61.730E0 ,64.620E0 ,67.080E0 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/ratkowsky2.shtml
+void testNistRat42(void) // Fits rat42_functor from two starting points and checks status, counters, squared norm and parameters.
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 100., 1., 0.1;
+  // do the computation
+  rat42_functor functor;
+  LevenbergMarquardt<rat42_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 10, 8); // macro defined outside this chunk; presumably checks lm's evaluation counters -- confirm
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 7.2462237576E+01);
+  VERIFY_IS_APPROX(x[1], 2.6180768402E+00);
+  VERIFY_IS_APPROX(x[2], 6.7359200066E-02);
+
+  /*
+   * Second try
+   */
+  x<< 75., 2.5, 0.07;
+  // do the computation, reusing the same lm object without touching its parameters
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 6, 5);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00);
+  // check x (same expected values as the first try)
+  VERIFY_IS_APPROX(x[0], 7.2462237576E+01);
+  VERIFY_IS_APPROX(x[1], 2.6180768402E+00);
+  VERIFY_IS_APPROX(x[2], 6.7359200066E-02);
+}
+
+struct MGH10_functor : Functor<double> // Residuals and Jacobian of the model b0*exp(b1/(x+b2)): 3 parameters, 16 observations.
+{
+    MGH10_functor(void) : Functor<double>(3,16) {} // presumably (inputs, values) of the Functor base -- confirm
+    static const double x[16]; // predictor values, defined after the struct
+    static const double y[16]; // observed responses, defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // fills fvec[i] = model(x[i]; b) - y[i]; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==16);
+        for(int i=0; i<16; i++)
+            fvec[i] =  b[0] * exp(b[1]/(x[i]+b[2])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // fills fjac(i,j) with the partial derivative of residual i w.r.t. b[j]; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==16);
+        assert(fjac.cols()==3);
+        for(int i=0; i<16; i++) {
+            double factor = 1./(x[i]+b[2]); // reciprocal of the exponent's denominator
+            double e = exp(b[1]*factor);
+            fjac(i,0) = e;
+            fjac(i,1) = b[0]*factor*e;
+            fjac(i,2) = -b[1]*b[0]*factor*factor*e;
+        }
+        return 0;
+    }
+};
+const double MGH10_functor::x[16] = { 5.000000E+01, 5.500000E+01, 6.000000E+01, 6.500000E+01, 7.000000E+01, 7.500000E+01, 8.000000E+01, 8.500000E+01, 9.000000E+01, 9.500000E+01, 1.000000E+02, 1.050000E+02, 1.100000E+02, 1.150000E+02, 1.200000E+02, 1.250000E+02 };
+const double MGH10_functor::y[16] = { 3.478000E+04, 2.861000E+04, 2.365000E+04, 1.963000E+04, 1.637000E+04, 1.372000E+04, 1.154000E+04, 9.744000E+03, 8.261000E+03, 7.030000E+03, 6.005000E+03, 5.147000E+03, 4.427000E+03, 3.820000E+03, 3.307000E+03, 2.872000E+03 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/mgh10.shtml
+void testNistMGH10(void) // Fits MGH10_functor from two starting points and checks status, counters, squared norm and parameters.
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 2., 400000., 25000.;
+  // do the computation
+  MGH10_functor functor;
+  LevenbergMarquardt<MGH10_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value (the two tries are expected to finish with different status codes: 2 here, 3 below)
+  VERIFY_IS_EQUAL(info, 2); 
+  LM_CHECK_N_ITERS(lm, 284, 249); // macro defined outside this chunk; presumably checks lm's evaluation counters -- confirm
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 5.6096364710E-03);
+  VERIFY_IS_APPROX(x[1], 6.1813463463E+03);
+  VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
+
+  /*
+   * Second try
+   */
+  x<< 0.02, 4000., 250.;
+  // do the computation, reusing the same lm object without touching its parameters
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 3);
+  LM_CHECK_N_ITERS(lm, 126, 116);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01);
+  // check x (same expected values as the first try)
+  VERIFY_IS_APPROX(x[0], 5.6096364710E-03);
+  VERIFY_IS_APPROX(x[1], 6.1813463463E+03);
+  VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
+}
+
+
+struct BoxBOD_functor : Functor<double> // Residuals and Jacobian of the model b0*(1-exp(-b1*x)): 2 parameters, 6 observations.
+{
+    BoxBOD_functor(void) : Functor<double>(2,6) {} // presumably (inputs, values) of the Functor base -- confirm
+    static const double x[6]; // predictor values, defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // fills fvec[i] = model(x[i]; b) - y[i]; always returns 0
+    {
+        static const double y[6] = { 109., 149., 149., 191., 213., 224. }; // observed responses, local to this operator
+        assert(b.size()==2);
+        assert(fvec.size()==6);
+        for(int i=0; i<6; i++)
+            fvec[i] =  b[0]*(1.-exp(-b[1]*x[i])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // fills fjac(i,j) with the partial derivative of residual i w.r.t. b[j]; always returns 0
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==6);
+        assert(fjac.cols()==2);
+        for(int i=0; i<6; i++) {
+            double e = exp(-b[1]*x[i]); // exponential term shared by both columns
+            fjac(i,0) = 1.-e;
+            fjac(i,1) = b[0]*x[i]*e;
+        }
+        return 0;
+    }
+};
+const double BoxBOD_functor::x[6] = { 1., 2., 3., 5., 7., 10. };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/boxbod.shtml
+void testNistBoxBOD(void) // Fits BoxBOD_functor from two starting points with explicitly chosen solver parameters and checks the results.
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1., 1.;
+  // do the computation with both tolerances set to 1e6 * epsilon and factor set to 10
+  BoxBOD_functor functor;
+  LevenbergMarquardt<BoxBOD_functor> lm(functor);
+  lm.parameters.ftol = 1.E6*NumTraits<double>::epsilon();
+  lm.parameters.xtol = 1.E6*NumTraits<double>::epsilon();
+  lm.parameters.factor = 10.;
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 31, 25); // macro defined outside this chunk; presumably checks lm's evaluation counters -- confirm
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.1380940889E+02);
+  VERIFY_IS_APPROX(x[1], 5.4723748542E-01);
+
+  /*
+   * Second try
+   */
+  x<< 100., 0.75;
+  // do the computation after resetting parameters, with both tolerances set to epsilon
+  lm.resetParameters();
+  lm.parameters.ftol = NumTraits<double>::epsilon();
+  lm.parameters.xtol = NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 15, 14);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03);
+  // check x (same expected values as the first try)
+  VERIFY_IS_APPROX(x[0], 2.1380940889E+02);
+  VERIFY_IS_APPROX(x[1], 5.4723748542E-01);
+}
+
+struct MGH17_functor : Functor<double> // Residuals and Jacobian of the model b0 + b1*exp(-b3*x) + b2*exp(-b4*x): 5 parameters, 33 observations.
+{
+    MGH17_functor(void) : Functor<double>(5,33) {} // presumably (inputs, values) of the Functor base -- confirm
+    static const double x[33]; // predictor values, defined after the struct
+    static const double y[33]; // observed responses, defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // fills fvec[i] = model(x[i]; b) - y[i]; always returns 0
+    {
+        assert(b.size()==5);
+        assert(fvec.size()==33);
+        for(int i=0; i<33; i++)
+            fvec[i] =  b[0] + b[1]*exp(-b[3]*x[i]) +  b[2]*exp(-b[4]*x[i]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // fills fjac(i,j) with the partial derivative of residual i w.r.t. b[j]; always returns 0
+    {
+        assert(b.size()==5);
+        assert(fjac.rows()==33);
+        assert(fjac.cols()==5);
+        for(int i=0; i<33; i++) {
+            fjac(i,0) = 1.; // constant offset term
+            fjac(i,1) = exp(-b[3]*x[i]);
+            fjac(i,2) = exp(-b[4]*x[i]);
+            fjac(i,3) = -x[i]*b[1]*exp(-b[3]*x[i]);
+            fjac(i,4) = -x[i]*b[2]*exp(-b[4]*x[i]);
+        }
+        return 0;
+    }
+};
+const double MGH17_functor::x[33] = { 0.000000E+00, 1.000000E+01, 2.000000E+01, 3.000000E+01, 4.000000E+01, 5.000000E+01, 6.000000E+01, 7.000000E+01, 8.000000E+01, 9.000000E+01, 1.000000E+02, 1.100000E+02, 1.200000E+02, 1.300000E+02, 1.400000E+02, 1.500000E+02, 1.600000E+02, 1.700000E+02, 1.800000E+02, 1.900000E+02, 2.000000E+02, 2.100000E+02, 2.200000E+02, 2.300000E+02, 2.400000E+02, 2.500000E+02, 2.600000E+02, 2.700000E+02, 2.800000E+02, 2.900000E+02, 3.000000E+02, 3.100000E+02, 3.200000E+02 };
+const double MGH17_functor::y[33] = { 8.440000E-01, 9.080000E-01, 9.320000E-01, 9.360000E-01, 9.250000E-01, 9.080000E-01, 8.810000E-01, 8.500000E-01, 8.180000E-01, 7.840000E-01, 7.510000E-01, 7.180000E-01, 6.850000E-01, 6.580000E-01, 6.280000E-01, 6.030000E-01, 5.800000E-01, 5.580000E-01, 5.380000E-01, 5.220000E-01, 5.060000E-01, 4.900000E-01, 4.780000E-01, 4.670000E-01, 4.570000E-01, 4.480000E-01, 4.380000E-01, 4.310000E-01, 4.240000E-01, 4.200000E-01, 4.140000E-01, 4.110000E-01, 4.060000E-01 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/mgh17.shtml
+void testNistMGH17(void) // Fits MGH17_functor from two starting points and checks squared norm, parameters, status and counters.
+{
+  const int n=5;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 50., 150., -100., 1., 2.;
+  // do the computation with both tolerances set to epsilon and maxfev set to 1000
+  MGH17_functor functor;
+  LevenbergMarquardt<MGH17_functor> lm(functor);
+  lm.parameters.ftol = NumTraits<double>::epsilon();
+  lm.parameters.xtol = NumTraits<double>::epsilon();
+  lm.parameters.maxfev = 1000;
+  info = lm.minimize(x);
+
+  // check norm^2 (in this try the numerical results are verified before the status)
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05);
+  // check x
+  VERIFY_IS_APPROX(x[0], 3.7541005211E-01);
+  VERIFY_IS_APPROX(x[1], 1.9358469127E+00);
+  VERIFY_IS_APPROX(x[2], -1.4646871366E+00);
+  VERIFY_IS_APPROX(x[3], 1.2867534640E-02);
+  VERIFY_IS_APPROX(x[4], 2.2122699662E-02);
+  
+  // check return value
+  VERIFY_IS_EQUAL(info, 2); 
+  LM_CHECK_N_ITERS(lm, 602, 545); // macro defined outside this chunk; presumably checks lm's evaluation counters -- confirm
+
+  /*
+   * Second try
+   */
+  x<< 0.5  ,1.5  ,-1   ,0.01 ,0.02;
+  // do the computation after resetting parameters (no overrides this time)
+  lm.resetParameters();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 18, 15);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05);
+  // check x (same expected values as the first try)
+  VERIFY_IS_APPROX(x[0], 3.7541005211E-01);
+  VERIFY_IS_APPROX(x[1], 1.9358469127E+00);
+  VERIFY_IS_APPROX(x[2], -1.4646871366E+00);
+  VERIFY_IS_APPROX(x[3], 1.2867534640E-02);
+  VERIFY_IS_APPROX(x[4], 2.2122699662E-02);
+}
+
+struct MGH09_functor : Functor<double> // Residuals and Jacobian of the model b0*(x^2+x*b1)/(x^2+x*b2+b3): 4 parameters, 11 observations.
+{
+    MGH09_functor(void) : Functor<double>(4,11) {} // presumably (inputs, values) of the Functor base -- confirm
+    static const double _x[11]; // predictor values, defined after the struct
+    static const double y[11]; // observed responses, defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // fills fvec[i] = model(_x[i]; b) - y[i]; always returns 0
+    {
+        assert(b.size()==4);
+        assert(fvec.size()==11);
+        for(int i=0; i<11; i++) {
+            double x = _x[i], xx=x*x;
+            fvec[i] = b[0]*(xx+x*b[1])/(xx+x*b[2]+b[3]) - y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // fills fjac(i,j) with the partial derivative of residual i w.r.t. b[j]; always returns 0
+    {
+        assert(b.size()==4);
+        assert(fjac.rows()==11);
+        assert(fjac.cols()==4);
+        for(int i=0; i<11; i++) {
+            double x = _x[i], xx=x*x;
+            double factor = 1./(xx+x*b[2]+b[3]); // reciprocal of the model denominator
+            fjac(i,0) = (xx+x*b[1]) * factor;
+            fjac(i,1) = b[0]*x* factor;
+            fjac(i,2) = - b[0]*(xx+x*b[1]) * x * factor * factor;
+            fjac(i,3) = - b[0]*(xx+x*b[1]) * factor * factor;
+        }
+        return 0;
+    }
+};
+const double MGH09_functor::_x[11] = { 4., 2., 1., 5.E-1 , 2.5E-01, 1.670000E-01, 1.250000E-01,  1.E-01, 8.330000E-02, 7.140000E-02, 6.250000E-02 };
+const double MGH09_functor::y[11] = { 1.957000E-01, 1.947000E-01, 1.735000E-01, 1.600000E-01, 8.440000E-02, 6.270000E-02, 4.560000E-02, 3.420000E-02, 3.230000E-02, 2.350000E-02, 2.460000E-02 };
+
+// NIST MGH09 problem (http://www.itl.nist.gov/div898/strd/nls/data/mgh09.shtml): minimizes from two starting points and checks the outcome. NOTE: its CALL_SUBTEST line in test_NonLinearOptimization is commented out, so this test does not currently run.
+void testNistMGH09(void)
+{
+  const int n=4;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 25., 39, 41.5, 39.;
+  // do the computation
+  MGH09_functor functor;
+  LevenbergMarquardt<MGH09_functor> lm(functor);
+  lm.parameters.maxfev = 1000; // maxfev: presumably the cap on function evaluations - confirm against LevenbergMarquardt
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 490, 376); // macro defined outside this view; presumably expected (nfev, njev)
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], 0.1928077089); // should be 1.9280693458E-01
+  VERIFY_IS_APPROX(x[1], 0.19126423573); // should be 1.9128232873E-01
+  VERIFY_IS_APPROX(x[2], 0.12305309914); // should be 1.2305650693E-01
+  VERIFY_IS_APPROX(x[3], 0.13605395375); // should be 1.3606233068E-01
+
+  /*
+   * Second try
+   */
+  x<< 0.25, 0.39, 0.415, 0.39;
+  // do the computation
+  lm.resetParameters(); // presumably restores lm.parameters (including the maxfev set above) to defaults
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 18, 16);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], 0.19280781); // should be 1.9280693458E-01
+  VERIFY_IS_APPROX(x[1], 0.19126265); // should be 1.9128232873E-01
+  VERIFY_IS_APPROX(x[2], 0.12305280); // should be 1.2305650693E-01
+  VERIFY_IS_APPROX(x[3], 0.13605322); // should be 1.3606233068E-01
+}
+
+
+
+struct Bennett5_functor : Functor<double> // Bennett5 model with 3 parameters b and 154 residuals; used by testNistBennett5
+{
+    Bennett5_functor(void) : Functor<double>(3,154) {}
+    static const double x[154]; // predictor values, defined right after the struct
+    static const double y[154]; // observed responses, defined right after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // residuals: fvec[i] = b0*(b1+x[i])^(-1/b2) - y[i]; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==154);
+        for(int i=0; i<154; i++)
+            fvec[i] = b[0]* pow(b[1]+x[i],-1./b[2]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // analytic Jacobian: fjac(i,k) = d fvec[i] / d b[k]; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==154);
+        assert(fjac.cols()==3);
+        for(int i=0; i<154; i++) {
+            double e = pow(b[1]+x[i],-1./b[2]); // the power term shared by all three partial derivatives
+            fjac(i,0) = e;
+            fjac(i,1) = - b[0]*e/b[2]/(b[1]+x[i]);
+            fjac(i,2) = b[0]*e*log(b[1]+x[i])/b[2]/b[2];
+        }
+        return 0;
+    }
+};
+const double Bennett5_functor::x[154] = { 7.447168E0, 8.102586E0, 8.452547E0, 8.711278E0, 8.916774E0, 9.087155E0, 9.232590E0, 9.359535E0, 9.472166E0, 9.573384E0, 9.665293E0, 9.749461E0, 9.827092E0, 9.899128E0, 9.966321E0, 10.029280E0, 10.088510E0, 10.144430E0, 10.197380E0, 10.247670E0, 10.295560E0, 10.341250E0, 10.384950E0, 10.426820E0, 10.467000E0, 10.505640E0, 10.542830E0, 10.578690E0, 10.613310E0, 10.646780E0, 10.679150E0, 10.710520E0, 10.740920E0, 10.770440E0, 10.799100E0, 10.826970E0, 10.854080E0, 10.880470E0, 10.906190E0, 10.931260E0, 10.955720E0, 10.979590E0, 11.002910E0, 11.025700E0, 11.047980E0, 11.069770E0, 11.091100E0, 11.111980E0, 11.132440E0, 11.152480E0, 11.172130E0, 11.191410E0, 11.210310E0, 11.228870E0, 11.247090E0, 11.264980E0, 11.282560E0, 11.299840E0, 11.316820E0, 11.333520E0, 11.349940E0, 11.366100E0, 11.382000E0, 11.397660E0, 11.413070E0, 11.428240E0, 11.443200E0, 11.457930E0, 11.472440E0, 11.486750E0, 11.500860E0, 11.514770E0, 11.528490E0, 11.542020E0, 11.555380E0, 11.568550E0,
+11.581560E0, 11.594420E0, 11.607121E0, 11.619640E0, 11.632000E0, 11.644210E0, 11.656280E0, 11.668200E0, 11.679980E0, 11.691620E0, 11.703130E0, 11.714510E0, 11.725760E0, 11.736880E0, 11.747890E0, 11.758780E0, 11.769550E0, 11.780200E0, 11.790730E0, 11.801160E0, 11.811480E0, 11.821700E0, 11.831810E0, 11.841820E0, 11.851730E0, 11.861550E0, 11.871270E0, 11.880890E0, 11.890420E0, 11.899870E0, 11.909220E0, 11.918490E0, 11.927680E0, 11.936780E0, 11.945790E0, 11.954730E0, 11.963590E0, 11.972370E0, 11.981070E0, 11.989700E0, 11.998260E0, 12.006740E0, 12.015150E0, 12.023490E0, 12.031760E0, 12.039970E0, 12.048100E0, 12.056170E0, 12.064180E0, 12.072120E0, 12.080010E0, 12.087820E0, 12.095580E0, 12.103280E0, 12.110920E0, 12.118500E0, 12.126030E0, 12.133500E0, 12.140910E0, 12.148270E0, 12.155570E0, 12.162830E0, 12.170030E0, 12.177170E0, 12.184270E0, 12.191320E0, 12.198320E0, 12.205270E0, 12.212170E0, 12.219030E0, 12.225840E0, 12.232600E0, 12.239320E0, 12.245990E0, 12.252620E0, 12.259200E0, 12.265750E0, 12.272240E0 };
+const double Bennett5_functor::y[154] = { -34.834702E0 ,-34.393200E0 ,-34.152901E0 ,-33.979099E0 ,-33.845901E0 ,-33.732899E0 ,-33.640301E0 ,-33.559200E0 ,-33.486801E0 ,-33.423100E0 ,-33.365101E0 ,-33.313000E0 ,-33.260899E0 ,-33.217400E0 ,-33.176899E0 ,-33.139198E0 ,-33.101601E0 ,-33.066799E0 ,-33.035000E0 ,-33.003101E0 ,-32.971298E0 ,-32.942299E0 ,-32.916302E0 ,-32.890202E0 ,-32.864101E0 ,-32.841000E0 ,-32.817799E0 ,-32.797501E0 ,-32.774300E0 ,-32.757000E0 ,-32.733799E0 ,-32.716400E0 ,-32.699100E0 ,-32.678799E0 ,-32.661400E0 ,-32.644001E0 ,-32.626701E0 ,-32.612202E0 ,-32.597698E0 ,-32.583199E0 ,-32.568699E0 ,-32.554298E0 ,-32.539799E0 ,-32.525299E0 ,-32.510799E0 ,-32.499199E0 ,-32.487598E0 ,-32.473202E0 ,-32.461601E0 ,-32.435501E0 ,-32.435501E0 ,-32.426800E0 ,-32.412300E0 ,-32.400799E0 ,-32.392101E0 ,-32.380501E0 ,-32.366001E0 ,-32.357300E0 ,-32.348598E0 ,-32.339901E0 ,-32.328400E0 ,-32.319698E0 ,-32.311001E0 ,-32.299400E0 ,-32.290699E0 ,-32.282001E0 ,-32.273300E0 ,-32.264599E0 ,-32.256001E0 ,-32.247299E0
+,-32.238602E0 ,-32.229900E0 ,-32.224098E0 ,-32.215401E0 ,-32.203800E0 ,-32.198002E0 ,-32.189400E0 ,-32.183601E0 ,-32.174900E0 ,-32.169102E0 ,-32.163300E0 ,-32.154598E0 ,-32.145901E0 ,-32.140099E0 ,-32.131401E0 ,-32.125599E0 ,-32.119801E0 ,-32.111198E0 ,-32.105400E0 ,-32.096699E0 ,-32.090900E0 ,-32.088001E0 ,-32.079300E0 ,-32.073502E0 ,-32.067699E0 ,-32.061901E0 ,-32.056099E0 ,-32.050301E0 ,-32.044498E0 ,-32.038799E0 ,-32.033001E0 ,-32.027199E0 ,-32.024300E0 ,-32.018501E0 ,-32.012699E0 ,-32.004002E0 ,-32.001099E0 ,-31.995300E0 ,-31.989500E0 ,-31.983700E0 ,-31.977900E0 ,-31.972099E0 ,-31.969299E0 ,-31.963501E0 ,-31.957701E0 ,-31.951900E0 ,-31.946100E0 ,-31.940300E0 ,-31.937401E0 ,-31.931601E0 ,-31.925800E0 ,-31.922899E0 ,-31.917101E0 ,-31.911301E0 ,-31.908400E0 ,-31.902599E0 ,-31.896900E0 ,-31.893999E0 ,-31.888201E0 ,-31.885300E0 ,-31.882401E0 ,-31.876600E0 ,-31.873699E0 ,-31.867901E0 ,-31.862101E0 ,-31.859200E0 ,-31.856300E0 ,-31.850500E0 ,-31.844700E0 ,-31.841801E0 ,-31.838900E0 ,-31.833099E0 ,-31.830200E0 ,
+-31.827299E0 ,-31.821600E0 ,-31.818701E0 ,-31.812901E0 ,-31.809999E0 ,-31.807100E0 ,-31.801300E0 ,-31.798401E0 ,-31.795500E0 ,-31.789700E0 ,-31.786800E0 };
+
+// NIST Bennett5 problem (http://www.itl.nist.gov/div898/strd/nls/data/bennett5.shtml): minimizes from two starting points and checks status, evaluation counts, squared residual norm and solution.
+void testNistBennett5(void)
+{
+  const int  n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< -2000., 50., 0.8;
+  // do the computation
+  Bennett5_functor functor;
+  LevenbergMarquardt<Bennett5_functor> lm(functor);
+  lm.parameters.maxfev = 1000; // maxfev: presumably the cap on function evaluations - confirm against LevenbergMarquardt
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 758, 744); // macro defined outside this view; presumably expected (nfev, njev)
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], -2.5235058043E+03);
+  VERIFY_IS_APPROX(x[1], 4.6736564644E+01);
+  VERIFY_IS_APPROX(x[2], 9.3218483193E-01);
+  /*
+   * Second try
+   */
+  x<< -1500., 45., 0.85;
+  // do the computation
+  lm.resetParameters(); // presumably restores lm.parameters (including the maxfev set above) to defaults
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 203, 192);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], -2523.3007865); // should be -2.5235058043E+03
+  VERIFY_IS_APPROX(x[1], 46.735705771); // should be 4.6736564644E+01
+  VERIFY_IS_APPROX(x[2], 0.93219881891); // should be 9.3218483193E-01
+}
+
+struct thurber_functor : Functor<double> // Thurber rational model with 7 parameters b and 37 residuals; used by testNistThurber
+{
+    thurber_functor(void) : Functor<double>(7,37) {}
+    static const double _x[37]; // predictor values, defined right after the struct
+    static const double _y[37]; // observed responses, defined right after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // residuals: cubic/cubic rational in x minus _y[i]; always returns 0
+    {
+        // NOTE: a commented-out debug printf used to sit here; it named hahn1_functor and an 'iflag' argument this operator does not take.
+        assert(b.size()==7);
+        assert(fvec.size()==37);
+        for(int i=0; i<37; i++) {
+            double x=_x[i], xx=x*x, xxx=xx*x;
+            fvec[i] = (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) / (1.+b[4]*x+b[5]*xx+b[6]*xxx) - _y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // analytic Jacobian: fjac(i,k) = d fvec[i] / d b[k]; always returns 0
+    {
+        assert(b.size()==7);
+        assert(fjac.rows()==37);
+        assert(fjac.cols()==7);
+        for(int i=0; i<37; i++) {
+            double x=_x[i], xx=x*x, xxx=xx*x;
+            double fact = 1./(1.+b[4]*x+b[5]*xx+b[6]*xxx); // reciprocal of the denominator
+            fjac(i,0) = 1.*fact;
+            fjac(i,1) = x*fact;
+            fjac(i,2) = xx*fact;
+            fjac(i,3) = xxx*fact;
+            fact = - (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) * fact * fact; // reused: now -numerator/denominator^2 for the denominator parameters
+            fjac(i,4) = x*fact;
+            fjac(i,5) = xx*fact;
+            fjac(i,6) = xxx*fact;
+        }
+        return 0;
+    }
+};
+const double thurber_functor::_x[37] = { -3.067E0, -2.981E0, -2.921E0, -2.912E0, -2.840E0, -2.797E0, -2.702E0, -2.699E0, -2.633E0, -2.481E0, -2.363E0, -2.322E0, -1.501E0, -1.460E0, -1.274E0, -1.212E0, -1.100E0, -1.046E0, -0.915E0, -0.714E0, -0.566E0, -0.545E0, -0.400E0, -0.309E0, -0.109E0, -0.103E0, 0.010E0, 0.119E0, 0.377E0, 0.790E0, 0.963E0, 1.006E0, 1.115E0, 1.572E0, 1.841E0, 2.047E0, 2.200E0 };
+const double thurber_functor::_y[37] = { 80.574E0, 84.248E0, 87.264E0, 87.195E0, 89.076E0, 89.608E0, 89.868E0, 90.101E0, 92.405E0, 95.854E0, 100.696E0, 101.060E0, 401.672E0, 390.724E0, 567.534E0, 635.316E0, 733.054E0, 759.087E0, 894.206E0, 990.785E0, 1090.109E0, 1080.914E0, 1122.643E0, 1178.351E0, 1260.531E0, 1273.514E0, 1288.339E0, 1327.543E0, 1353.863E0, 1414.509E0, 1425.208E0, 1421.384E0, 1442.962E0, 1464.350E0, 1468.705E0, 1447.894E0, 1457.628E0};
+
+// NIST Thurber problem (http://www.itl.nist.gov/div898/strd/nls/data/thurber.shtml): minimizes from two starting points with explicit ftol/xtol and checks status, evaluation counts, squared residual norm and solution.
+void testNistThurber(void)
+{
+  const int n=7;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1000 ,1000 ,400 ,40 ,0.7,0.3,0.0 ;
+  // do the computation
+  thurber_functor functor;
+  LevenbergMarquardt<thurber_functor> lm(functor);
+  lm.parameters.ftol = 1.E4*NumTraits<double>::epsilon(); // tolerances set to 1e4 * machine epsilon for this problem
+  lm.parameters.xtol = 1.E4*NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 39,36); // macro defined outside this view; presumably expected (nfev, njev)
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.2881396800E+03);
+  VERIFY_IS_APPROX(x[1], 1.4910792535E+03);
+  VERIFY_IS_APPROX(x[2], 5.8323836877E+02);
+  VERIFY_IS_APPROX(x[3], 7.5416644291E+01);
+  VERIFY_IS_APPROX(x[4], 9.6629502864E-01);
+  VERIFY_IS_APPROX(x[5], 3.9797285797E-01);
+  VERIFY_IS_APPROX(x[6], 4.9727297349E-02);
+
+  /*
+   * Second try
+   */
+  x<< 1300 ,1500 ,500  ,75   ,1    ,0.4  ,0.05  ;
+  // do the computation
+  lm.resetParameters(); // tolerances are re-applied right below, after the reset
+  lm.parameters.ftol = 1.E4*NumTraits<double>::epsilon();
+  lm.parameters.xtol = 1.E4*NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 29, 28);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.2881396800E+03);
+  VERIFY_IS_APPROX(x[1], 1.4910792535E+03);
+  VERIFY_IS_APPROX(x[2], 5.8323836877E+02);
+  VERIFY_IS_APPROX(x[3], 7.5416644291E+01);
+  VERIFY_IS_APPROX(x[4], 9.6629502864E-01);
+  VERIFY_IS_APPROX(x[5], 3.9797285797E-01);
+  VERIFY_IS_APPROX(x[6], 4.9727297349E-02);
+}
+
+struct rat43_functor : Functor<double> // Rat43 model with 4 parameters b and 15 residuals; used by testNistRat43
+{
+    rat43_functor(void) : Functor<double>(4,15) {}
+    static const double x[15]; // predictor values, defined right after the struct
+    static const double y[15]; // observed responses, defined right after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // residuals: fvec[i] = b0*(1+exp(b1-b2*x[i]))^(-1/b3) - y[i]; always returns 0
+    {
+        assert(b.size()==4);
+        assert(fvec.size()==15);
+        for(int i=0; i<15; i++)
+            fvec[i] = b[0] * pow(1.+exp(b[1]-b[2]*x[i]),-1./b[3]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // analytic Jacobian: fjac(i,k) = d fvec[i] / d b[k]; always returns 0
+    {
+        assert(b.size()==4);
+        assert(fjac.rows()==15);
+        assert(fjac.cols()==4);
+        for(int i=0; i<15; i++) {
+            double e = exp(b[1]-b[2]*x[i]);
+            double power = -1./b[3]; // exponent of (1+e); its derivative w.r.t. b3 is 1/b3^2 = power*power
+            fjac(i,0) = pow(1.+e, power);
+            fjac(i,1) = power*b[0]*e*pow(1.+e, power-1.);
+            fjac(i,2) = -power*b[0]*e*x[i]*pow(1.+e, power-1.);
+            fjac(i,3) = b[0]*power*power*log(1.+e)*pow(1.+e, power);
+        }
+        return 0;
+    }
+};
+const double rat43_functor::x[15] = { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15. };
+const double rat43_functor::y[15] = { 16.08, 33.83, 65.80, 97.20, 191.55, 326.20, 386.87, 520.53, 590.03, 651.92, 724.93, 699.56, 689.96, 637.56, 717.41 };
+
+// NIST Rat43 problem (http://www.itl.nist.gov/div898/strd/nls/data/ratkowsky3.shtml): minimizes from two starting points with explicit ftol/xtol and checks status, evaluation counts, squared residual norm and solution.
+void testNistRat43(void)
+{
+  const int n=4;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 100., 10., 1., 1.;
+  // do the computation
+  rat43_functor functor;
+  LevenbergMarquardt<rat43_functor> lm(functor);
+  lm.parameters.ftol = 1.E6*NumTraits<double>::epsilon(); // first start uses 1e6 * machine epsilon
+  lm.parameters.xtol = 1.E6*NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 27, 20); // macro defined outside this view; presumably expected (nfev, njev)
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 6.9964151270E+02);
+  VERIFY_IS_APPROX(x[1], 5.2771253025E+00);
+  VERIFY_IS_APPROX(x[2], 7.5962938329E-01);
+  VERIFY_IS_APPROX(x[3], 1.2792483859E+00);
+
+  /*
+   * Second try
+   */
+  x<< 700., 5., 0.75, 1.3;
+  // do the computation
+  lm.resetParameters();
+  lm.parameters.ftol = 1.E5*NumTraits<double>::epsilon(); // second start uses a tighter 1e5 * machine epsilon
+  lm.parameters.xtol = 1.E5*NumTraits<double>::epsilon();
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 9, 8);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 6.9964151270E+02);
+  VERIFY_IS_APPROX(x[1], 5.2771253025E+00);
+  VERIFY_IS_APPROX(x[2], 7.5962938329E-01);
+  VERIFY_IS_APPROX(x[3], 1.2792483859E+00);
+}
+
+
+
+struct eckerle4_functor : Functor<double> // Eckerle4 model with 3 parameters b and 35 residuals; used by testNistEckerle4
+{
+    eckerle4_functor(void) : Functor<double>(3,35) {}
+    static const double x[35]; // predictor values, defined right after the struct
+    static const double y[35]; // observed responses, defined right after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // residuals: fvec[i] = b0/b1 * exp(-0.5*(x[i]-b2)^2/b1^2) - y[i]; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==35);
+        for(int i=0; i<35; i++)
+            fvec[i] = b[0]/b[1] * exp(-0.5*(x[i]-b[2])*(x[i]-b[2])/(b[1]*b[1])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // analytic Jacobian: fjac(i,k) = d fvec[i] / d b[k]; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==35);
+        assert(fjac.cols()==3);
+        for(int i=0; i<35; i++) {
+            double b12 = b[1]*b[1]; // b1 squared; does not depend on i
+            double e = exp(-0.5*(x[i]-b[2])*(x[i]-b[2])/b12);
+            fjac(i,0) = e / b[1];
+            fjac(i,1) = ((x[i]-b[2])*(x[i]-b[2])/b12-1.) * b[0]*e/b12;
+            fjac(i,2) = (x[i]-b[2])*e*b[0]/b[1]/b12;
+        }
+        return 0;
+    }
+};
+const double eckerle4_functor::x[35] = { 400.0, 405.0, 410.0, 415.0, 420.0, 425.0, 430.0, 435.0, 436.5, 438.0, 439.5, 441.0, 442.5, 444.0, 445.5, 447.0, 448.5, 450.0, 451.5, 453.0, 454.5, 456.0, 457.5, 459.0, 460.5, 462.0, 463.5, 465.0, 470.0, 475.0, 480.0, 485.0, 490.0, 495.0, 500.0};
+const double eckerle4_functor::y[35] = { 0.0001575, 0.0001699, 0.0002350, 0.0003102, 0.0004917, 0.0008710, 0.0017418, 0.0046400, 0.0065895, 0.0097302, 0.0149002, 0.0237310, 0.0401683, 0.0712559, 0.1264458, 0.2073413, 0.2902366, 0.3445623, 0.3698049, 0.3668534, 0.3106727, 0.2078154, 0.1164354, 0.0616764, 0.0337200, 0.0194023, 0.0117831, 0.0074357, 0.0022732, 0.0008800, 0.0004579, 0.0002345, 0.0001586, 0.0001143, 0.0000710 };
+
+// NIST Eckerle4 problem (http://www.itl.nist.gov/div898/strd/nls/data/eckerle4.shtml): minimizes from two starting points with the solver's unmodified parameters and checks status, evaluation counts, squared residual norm and solution.
+void testNistEckerle4(void)
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1., 10., 500.;
+  // do the computation
+  eckerle4_functor functor;
+  LevenbergMarquardt<eckerle4_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 18, 15); // macro defined outside this view; presumably expected (nfev, njev)
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.5543827178);
+  VERIFY_IS_APPROX(x[1], 4.0888321754);
+  VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
+
+  /*
+   * Second try
+   */
+  x<< 1.5, 5., 450.;
+  // do the computation (no resetParameters() here, unlike the other NIST tests: this function never changes lm.parameters)
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  LM_CHECK_N_ITERS(lm, 7, 6);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.5543827178);
+  VERIFY_IS_APPROX(x[1], 4.0888321754);
+  VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
+}
+
+void test_NonLinearOptimization() // driver: runs the (c)minpack examples, then the NIST problems grouped by difficulty
+{
+    // Tests using the examples provided by (c)minpack
+    CALL_SUBTEST/*_1*/(testChkder());
+    CALL_SUBTEST/*_1*/(testLmder1());
+    CALL_SUBTEST/*_1*/(testLmder());
+    CALL_SUBTEST/*_2*/(testHybrj1());
+    CALL_SUBTEST/*_2*/(testHybrj());
+    CALL_SUBTEST/*_2*/(testHybrd1());
+    CALL_SUBTEST/*_2*/(testHybrd());
+    CALL_SUBTEST/*_3*/(testLmstr1());
+    CALL_SUBTEST/*_3*/(testLmstr());
+    CALL_SUBTEST/*_3*/(testLmdif1());
+    CALL_SUBTEST/*_3*/(testLmdif());
+
+    // NIST tests, level of difficulty = "Lower"
+    CALL_SUBTEST/*_4*/(testNistMisra1a());
+    CALL_SUBTEST/*_4*/(testNistChwirut2());
+
+    // NIST tests, level of difficulty = "Average"
+    CALL_SUBTEST/*_5*/(testNistHahn1());
+    CALL_SUBTEST/*_6*/(testNistMisra1d());
+    CALL_SUBTEST/*_7*/(testNistMGH17());
+    CALL_SUBTEST/*_8*/(testNistLanczos1());
+
+    // NIST tests, level of difficulty = "Higher" (testNistMGH10 and testNistMGH09 are left disabled below)
+    CALL_SUBTEST/*_9*/(testNistRat42());
+//     CALL_SUBTEST/*_10*/(testNistMGH10());
+    CALL_SUBTEST/*_11*/(testNistBoxBOD());
+//     CALL_SUBTEST/*_12*/(testNistMGH09());
+    CALL_SUBTEST/*_13*/(testNistBennett5());
+    CALL_SUBTEST/*_14*/(testNistThurber());
+    CALL_SUBTEST/*_15*/(testNistRat43());
+    CALL_SUBTEST/*_16*/(testNistEckerle4());
+}
+
+/*
+ * Can be useful for debugging...
+  printf("info, nfev : %d, %d\n", info, lm.nfev);
+  printf("info, nfev, njev : %d, %d, %d\n", info, solver.nfev, solver.njev);
+  printf("info, nfev : %d, %d\n", info, solver.nfev);
+  printf("x[0] : %.32g\n", x[0]);
+  printf("x[1] : %.32g\n", x[1]);
+  printf("x[2] : %.32g\n", x[2]);
+  printf("x[3] : %.32g\n", x[3]);
+  printf("fvec.blueNorm() : %.32g\n", solver.fvec.blueNorm());
+  printf("fvec.blueNorm() : %.32g\n", lm.fvec.blueNorm());
+
+  printf("info, nfev, njev : %d, %d, %d\n", info, lm.nfev, lm.njev);
+  printf("fvec.squaredNorm() : %.13g\n", lm.fvec.squaredNorm());
+  std::cout << x << std::endl;
+  std::cout.precision(9);
+  std::cout << x[0] << std::endl;
+  std::cout << x[1] << std::endl;
+  std::cout << x[2] << std::endl;
+  std::cout << x[3] << std::endl;
+*/
+
diff --git a/uppsrc/plugin/Eigen/unsupported/test/NumericalDiff.cpp b/uppsrc/plugin/Eigen/unsupported/test/NumericalDiff.cpp
new file mode 100644
index 000000000..27d888056
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/NumericalDiff.cpp
@@ -0,0 +1,114 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+
+#include <stdio.h>
+
+#include "main.h"
+#include <unsupported/Eigen/NumericalDiff>
+    
+// Generic functor base: stores the problem dimensions (inputs, values) and exposes the vector/Jacobian typedefs built from NX and NY
+template<typename _Scalar, int NX=Dynamic, int NY=Dynamic>
+struct Functor
+{
+  typedef _Scalar Scalar;
+  enum {
+    InputsAtCompileTime = NX,
+    ValuesAtCompileTime = NY
+  };
+  typedef Matrix<Scalar,InputsAtCompileTime,1> InputType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,1> ValueType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,InputsAtCompileTime> JacobianType;
+  
+  int m_inputs, m_values; // run-time dimensions returned by inputs() / values()
+  
+  Functor() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {} // dimensions taken from the template arguments
+  Functor(int inputs, int values) : m_inputs(inputs), m_values(values) {} // dimensions given at run time
+  
+  int inputs() const { return m_inputs; }
+  int values() const { return m_values; }
+
+};
+
+struct my_functor : Functor<double> // test problem with 3 inputs and 15 values; provides residuals plus a hand-written Jacobian for comparison
+{
+    my_functor(void): Functor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const // fvec[i] = y[i] - (x0 + tmp1/(x1*tmp2 + x2*tmp3)); always returns 0
+    {
+        double tmp1, tmp2, tmp3;
+        double y[15] = {1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1, 2.9e-1, 3.2e-1, 3.5e-1,
+            3.9e-1, 3.7e-1, 5.8e-1, 7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};
+
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1; // tmp1 for i < 8, tmp2 from i = 8 on
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+
+    int actual_df(const VectorXd &x, MatrixXd &fjac) const // hand-derived Jacobian of operator(): fjac(i,k) = d fvec[i] / d x[k]; always returns 0
+    {
+        double tmp1, tmp2, tmp3, tmp4;
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            tmp4 = (x[1]*tmp2 + x[2]*tmp3); tmp4 = tmp4*tmp4; // squared denominator of the fraction in operator()
+            fjac(i,0) = -1;
+            fjac(i,1) = tmp1*tmp2/tmp4;
+            fjac(i,2) = tmp1*tmp3/tmp4;
+        }
+        return 0;
+    }
+};
+
+void test_forward() // compares NumericalDiff's default-mode Jacobian of my_functor with the hand-written one
+{
+    VectorXd x(3);
+    MatrixXd jac(15,3);
+    MatrixXd actual_jac(15,3);
+    my_functor functor;
+
+    x << 0.082, 1.13, 2.35;
+
+    // reference Jacobian from the hand-written formula
+    functor.actual_df(x, actual_jac);
+//    std::cout << actual_jac << std::endl << std::endl;
+
+    // using NumericalDiff
+    NumericalDiff<my_functor> numDiff(functor); // no mode argument: default differencing mode (presumably Forward, per the test name)
+    numDiff.df(x, jac);
+//    std::cout << jac << std::endl;
+
+    VERIFY_IS_APPROX(jac, actual_jac);
+}
+
+void test_central() // same comparison as test_forward, with NumericalDiff instantiated in Central mode
+{
+    VectorXd x(3);
+    MatrixXd jac(15,3);
+    MatrixXd actual_jac(15,3);
+    my_functor functor;
+
+    x << 0.082, 1.13, 2.35;
+
+    // reference Jacobian from the hand-written formula
+    functor.actual_df(x, actual_jac);
+
+    // using NumericalDiff
+    NumericalDiff<my_functor,Central> numDiff(functor);
+    numDiff.df(x, jac);
+
+    VERIFY_IS_APPROX(jac, actual_jac);
+}
+
+void test_NumericalDiff() // test entry point: runs the default-mode and Central-mode Jacobian checks
+{
+    CALL_SUBTEST(test_forward());
+    CALL_SUBTEST(test_central());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/alignedvector3.cpp b/uppsrc/plugin/Eigen/unsupported/test/alignedvector3.cpp
new file mode 100644
index 000000000..252cb1d3f
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/alignedvector3.cpp
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/AlignedVector3>
+
+namespace Eigen { // overload lives in Eigen so it sits next to the other test_relative_error overloads (presumably those from main.h)
+
+template<typename T,typename Derived>
+T test_relative_error(const AlignedVector3<T> &a, const MatrixBase<Derived> &b) // AlignedVector3 vs. dense expression
+{
+  return test_relative_error(a.coeffs().template head<3>(), b); // forwards using only the first 3 stored coefficients of a
+}
+
+}
+
+template<typename Scalar> // checks that AlignedVector3<Scalar> gives the same results as a plain Matrix<Scalar,3,1>
+void alignedvector3()
+{
+  Scalar s1 = internal::random<Scalar>();
+  Scalar s2 = internal::random<Scalar>();
+  typedef Matrix<Scalar,3,1> RefType;
+  typedef Matrix<Scalar,3,3> Mat33;
+  typedef AlignedVector3<Scalar> FastType;
+  RefType  r1(RefType::Random()), r2(RefType::Random()), r3(RefType::Random()),
+           r4(RefType::Random()), r5(RefType::Random());
+  FastType f1(r1), f2(r2), f3(r3), f4(r4), f5(r5); // each f_k is built from the matching reference vector r_k
+  Mat33 m1(Mat33::Random());
+  
+  VERIFY_IS_APPROX(f1,r1);
+  VERIFY_IS_APPROX(f4,r4);
+
+  VERIFY_IS_APPROX(f4+f1,r4+r1);
+  VERIFY_IS_APPROX(f4-f1,r4-r1);
+  VERIFY_IS_APPROX(f4+f1-f2,r4+r1-r2);
+  VERIFY_IS_APPROX(f4+=f3,r4+=r3); // compound assignments mutate f4 and r4 in lockstep
+  VERIFY_IS_APPROX(f4-=f5,r4-=r5);
+  VERIFY_IS_APPROX(f4-=f5+f1,r4-=r5+r1);
+  VERIFY_IS_APPROX(f5+f1-s1*f2,r5+r1-s1*r2);
+  VERIFY_IS_APPROX(f5+f1/s2-s1*f2,r5+r1/s2-s1*r2);
+  
+  VERIFY_IS_APPROX(m1*f4,m1*r4);
+  VERIFY_IS_APPROX(f4.transpose()*m1,r4.transpose()*m1);
+  
+  VERIFY_IS_APPROX(f2.dot(f3),r2.dot(r3));
+  VERIFY_IS_APPROX(f2.cross(f3),r2.cross(r3));
+  VERIFY_IS_APPROX(f2.norm(),r2.norm());
+
+  VERIFY_IS_APPROX(f2.normalized(),r2.normalized());
+
+  VERIFY_IS_APPROX((f2+f1).normalized(),(r2+r1).normalized());
+  
+  f2.normalize();
+  r2.normalize();
+  VERIFY_IS_APPROX(f2,r2);
+  
+  {
+    FastType f6 = RefType::Zero(); // construction from a plain 3-vector expression
+    FastType f7 = FastType::Zero();
+    VERIFY_IS_APPROX(f6,f7);
+    f6 = r4+r1;
+    VERIFY_IS_APPROX(f6,r4+r1);
+    f6 -= Scalar(2)*r4; // (r4+r1) - 2*r4 == r1-r4, which is what the next check expects
+    VERIFY_IS_APPROX(f6,r1-r4);
+  }
+  
+  std::stringstream ss1, ss2;
+  ss1 << f1;
+  ss2 << r1;
+  VERIFY(ss1.str()==ss2.str()); // streamed text of both types must be identical
+}
+
+void test_alignedvector3() // test entry point: repeats the float instantiation g_repeat times
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST( alignedvector3<float>() );
+  }
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/autodiff.cpp b/uppsrc/plugin/Eigen/unsupported/test/autodiff.cpp
new file mode 100644
index 000000000..1d8c8b5fd
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/autodiff.cpp
@@ -0,0 +1,387 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/AutoDiff>
+
+template<typename Scalar> // scalar test expression mixing pow, sqrt, sin, cos and exp; evaluated with float and AutoDiffScalar in test_autodiff_scalar
+EIGEN_DONT_INLINE Scalar foo(const Scalar& x, const Scalar& y)
+{
+  using namespace std;
+//   return x+std::sin(y);
+  EIGEN_ASM_COMMENT("mybegin");
+  // pow(float, int) promotes to pow(double, double)
+  return x*2 - 1 + static_cast<Scalar>(pow(1+x,2)) + 2*sqrt(y*y+0) - 4 * sin(0+x) + 2 * cos(y+0) - exp(Scalar(-0.5)*x*x+0);
+  //return x+2*y*x;//x*2 -std::pow(x,2);//(2*y/x);// - y*2;
+  EIGEN_ASM_COMMENT("myend"); // NOTE(review): placed after the return statement, so this line is never reached
+}
+
+template<typename Vector> // vector test expression: norm(p - (-1,1)) + sum of squared coefficients + p.p
+EIGEN_DONT_INLINE typename Vector::Scalar foo(const Vector& p)
+{
+  typedef typename Vector::Scalar Scalar;
+  return (p-Vector(Scalar(-1),Scalar(1.))).norm() + (p.array() * p.array()).sum() + p.dot(p); // the two-argument Vector constructor implies a 2-vector type
+}
+
+template<typename _Scalar, int NX=Dynamic, int NY=Dynamic> // NX inputs, NY values; the (int,int) constructor supplies them at run time
+struct TestFunc1
+{
+  typedef _Scalar Scalar;
+  enum {
+    InputsAtCompileTime = NX,
+    ValuesAtCompileTime = NY
+  };
+  typedef Matrix<Scalar,InputsAtCompileTime,1> InputType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,1> ValueType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,InputsAtCompileTime> JacobianType;
+
+  int m_inputs, m_values; // run-time dimensions returned by inputs() / values()
+
+  TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
+  TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+
+  int inputs() const { return m_inputs; }
+  int values() const { return m_values; }
+
+  template<typename T> // value-only evaluation, generic in the scalar type T
+  void operator() (const Matrix<T,InputsAtCompileTime,1>& x, Matrix<T,ValuesAtCompileTime,1>* _v) const
+  {
+    Matrix<T,ValuesAtCompileTime,1>& v = *_v;
+
+    v[0] = 2 * x[0] * x[0] + x[0] * x[1];
+    v[1] = 3 * x[1] * x[0] + 0.5 * x[1] * x[1];
+    if(inputs()>2) // third input, when present, enters v[0] and v[1] linearly
+    {
+      v[0] += 0.5 * x[2];
+      v[1] += x[2];
+    }
+    if(values()>2)
+    {
+      v[2] = 3 * x[1] * x[0] * x[0];
+    }
+    if (inputs()>2 && values()>2)
+      v[2] *= x[2];
+  }
+
+  void operator() (const InputType& x, ValueType* v, JacobianType* _j) const // value plus hand-written Jacobian; _j may be null
+  {
+    (*this)(x, v);
+
+    if(_j)
+    {
+      JacobianType& j = *_j;
+
+      j(0,0) = 4 * x[0] + x[1];
+      j(1,0) = 3 * x[1];
+
+      j(0,1) = x[0];
+      j(1,1) = 3 * x[0] + 2 * 0.5 * x[1];
+
+      if (inputs()>2)
+      {
+        j(0,2) = 0.5;
+        j(1,2) = 1;
+      }
+      if(values()>2)
+      {
+        j(2,0) = 3 * x[1] * 2 * x[0];
+        j(2,1) = 3 * x[0] * x[0];
+      }
+      if (inputs()>2 && values()>2)
+      {
+        j(2,0) *= x[2];
+        j(2,1) *= x[2];
+
+        // v[2] is linear in x[2], so d v[2] / d x[2] is the factor that x[2] multiplies
+        j(2,2) = 3 * x[1] * x[0] * x[0];
+      }
+    }
+  }
+};
+
+
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+/* Test functor for the C++11 features: its call operators take an extra trailing 'dt' argument. */
+template <typename Scalar>
+struct integratorFunctor
+{
+    typedef Matrix<Scalar, 2, 1> InputType;
+    typedef Matrix<Scalar, 2, 1> ValueType;
+
+    /*
+     * Implementation starts here.
+     */
+    integratorFunctor(const Scalar gain) : _gain(gain) {}
+    integratorFunctor(const integratorFunctor& f) : _gain(f._gain) {}
+    const Scalar _gain; // const member: set once in the constructors
+
+    template <typename T1, typename T2> // value-only evaluation
+    void operator() (const T1 &input, T2 *output, const Scalar dt) const
+    {
+        T2 &o = *output;
+
+        /* Integrator to test the AD. */
+        o[0] = input[0] + input[1] * dt * _gain;
+        o[1] = input[1] * _gain;
+    }
+
+    /* Only needed for the test: same outputs plus the hand-written Jacobian (jacobian may be null) */
+    template <typename T1, typename T2, typename T3>
+    void operator() (const T1 &input, T2 *output, T3 *jacobian, const Scalar dt) const
+    {
+        T2 &o = *output;
+
+        /* Integrator to test the AD. */
+        o[0] = input[0] + input[1] * dt * _gain;
+        o[1] = input[1] * _gain;
+
+        if (jacobian)
+        {
+            T3 &j = *jacobian;
+
+            j(0, 0) = 1; // d o[0] / d input[0]
+            j(0, 1) = dt * _gain; // d o[0] / d input[1]
+            j(1, 0) = 0; // o[1] does not depend on input[0]
+            j(1, 1) = _gain; // d o[1] / d input[1]
+        }
+    }
+
+};
+
+template<typename Func> void forward_jacobian_cpp11(const Func& f) // checks AutoDiffJacobian against f's own Jacobian, forwarding the extra 'dt' argument
+{
+    typedef typename Func::ValueType::Scalar Scalar;
+    typedef typename Func::ValueType ValueType;
+    typedef typename Func::InputType InputType;
+    typedef typename AutoDiffJacobian<Func>::JacobianType JacobianType;
+
+    InputType x = InputType::Random(InputType::RowsAtCompileTime);
+    ValueType y, yref;
+    JacobianType j, jref;
+
+    const Scalar dt = internal::random<Scalar>(); // drawn in the functor's own scalar type, so no implicit conversion from double
+
+    jref.setZero();
+    yref.setZero();
+    f(x, &yref, &jref, dt); // reference value and Jacobian from the functor's 4-argument overload
+
+    //std::cerr << "y, yref, jref: " << "\n";
+    //std::cerr << y.transpose() << "\n\n";
+    //std::cerr << yref << "\n\n";
+    //std::cerr << jref << "\n\n";
+
+    AutoDiffJacobian<Func> autoj(f);
+    autoj(x, &y, &j, dt); // same point and dt, evaluated through AutoDiffJacobian
+
+    //std::cerr << "y j (via autodiff): " << "\n";
+    //std::cerr << y.transpose() << "\n\n";
+    //std::cerr << j << "\n\n";
+
+    VERIFY_IS_APPROX(y, yref);
+    VERIFY_IS_APPROX(j, jref);
+}
+#endif
+
+template<typename Func> void forward_jacobian(const Func& f) // checks AutoDiffJacobian<Func> against f's hand-written Jacobian at a random point
+{
+    typename Func::InputType x = Func::InputType::Random(f.inputs());
+    typename Func::ValueType y(f.values()), yref(f.values());
+    typename Func::JacobianType j(f.values(),f.inputs()), jref(f.values(),f.inputs());
+
+    jref.setZero();
+    yref.setZero();
+    f(x,&yref,&jref); // reference value and Jacobian from the functor's own 3-argument overload
+//     std::cerr << y.transpose() << "\n\n";;
+//     std::cerr << j << "\n\n";;
+
+    j.setZero();
+    y.setZero();
+    AutoDiffJacobian<Func> autoj(f);
+    autoj(x, &y, &j); // same point, evaluated through AutoDiffJacobian
+//     std::cerr << y.transpose() << "\n\n";;
+//     std::cerr << j << "\n\n";;
+
+    VERIFY_IS_APPROX(y, yref);
+    VERIFY_IS_APPROX(j, jref);
+}
+
+// TODO also check actual derivatives! (only the value part of the AutoDiffScalar result is verified below)
+template <int>
+void test_autodiff_scalar()
+{
+  Vector2f p = Vector2f::Random();
+  typedef AutoDiffScalar<Vector2f> AD;
+  AD ax(p.x(),Vector2f::UnitX()); // value p.x(), derivative vector UnitX
+  AD ay(p.y(),Vector2f::UnitY()); // value p.y(), derivative vector UnitY
+  AD res = foo<AD>(ax,ay);
+  VERIFY_IS_APPROX(res.value(), foo(p.x(),p.y())); // compared with the plain float evaluation of the same expression
+}
+
+
+// TODO also check actual derivatives! (only the value part of the AutoDiffScalar result is verified below)
+template <int>
+void test_autodiff_vector()
+{
+  Vector2f p = Vector2f::Random();
+  typedef AutoDiffScalar<Vector2f> AD;
+  typedef Matrix<AD,2,1> VectorAD;
+  VectorAD ap = p.cast<AD>();
+  ap.x().derivatives() = Vector2f::UnitX(); // derivative vectors are assigned explicitly after the cast
+  ap.y().derivatives() = Vector2f::UnitY();
+
+  AD res = foo<VectorAD>(ap);
+  VERIFY_IS_APPROX(res.value(), foo(p)); // compared with the plain Vector2f evaluation of the same expression
+}
+
+template <int>
+void test_autodiff_jacobian()  // Runs the Jacobian comparison over several TestFunc1 instantiations.
+{
+  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,2,2>()) ));  // sizes given as template arguments
+  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,2,3>()) ));
+  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,2>()) ));
+  CALL_SUBTEST(( forward_jacobian(TestFunc1<double,3,3>()) ));
+  CALL_SUBTEST(( forward_jacobian(TestFunc1<double>(3,3)) ));  // sizes given as constructor arguments instead
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  CALL_SUBTEST(( forward_jacobian_cpp11(integratorFunctor<double>(10)) ));  // only built when variadic templates are available
+#endif
+}
+
+
+template <int>
+void test_autodiff_hessian()  // Nests AutoDiffScalar twice and checks first and second derivatives of two expressions.
+{
+  typedef AutoDiffScalar<VectorXd> AD;
+  typedef Matrix<AD,Eigen::Dynamic,1> VectorAD;
+  typedef AutoDiffScalar<VectorAD> ADD;  // outer autodiff scalar whose derivative entries are themselves AD
+  typedef Matrix<ADD,Eigen::Dynamic,1> VectorADD;
+  VectorADD x(2);
+  double s1 = internal::random<double>(), s2 = internal::random<double>(), s3 = internal::random<double>(), s4 = internal::random<double>();
+  x(0).value()=s1;
+  x(1).value()=s2;
+
+  //set unit vectors for the derivative directions (partial derivatives of the input vector)
+  x(0).derivatives().resize(2);
+  x(0).derivatives().setZero();
+  x(0).derivatives()(0)= 1;
+  x(1).derivatives().resize(2);
+  x(1).derivatives().setZero();
+  x(1).derivatives()(1)=1;
+
+  //repeat partial derivatives for the inner AutoDiffScalar
+  x(0).value().derivatives() = VectorXd::Unit(2,0);
+  x(1).value().derivatives() = VectorXd::Unit(2,1);
+
+  //set the hessian matrix to zero
+  for(int idx=0; idx<2; idx++) {
+      x(0).derivatives()(idx).derivatives()  = VectorXd::Zero(2);
+      x(1).derivatives()(idx).derivatives()  = VectorXd::Zero(2);
+  }
+
+  ADD y = sin(AD(s3)*x(0) + AD(s4)*x(1));  // y = sin(s3*x0 + s4*x1) evaluated at (s1, s2)
+
+  VERIFY_IS_APPROX(y.value().derivatives()(0), y.derivatives()(0).value());  // inner and outer first derivatives must agree
+  VERIFY_IS_APPROX(y.value().derivatives()(1), y.derivatives()(1).value());
+  VERIFY_IS_APPROX(y.value().derivatives()(0), s3*std::cos(s1*s3+s2*s4));  // dy/dx0 = s3*cos(...)
+  VERIFY_IS_APPROX(y.value().derivatives()(1), s4*std::cos(s1*s3+s2*s4));  // dy/dx1 = s4*cos(...)
+  VERIFY_IS_APPROX(y.derivatives()(0).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s3,s4*s3));  // second derivatives involving x0
+  VERIFY_IS_APPROX(y.derivatives()(1).derivatives(),  -std::sin(s1*s3+s2*s4)*Vector2d(s3*s4,s4*s4));  // second derivatives involving x1
+
+  ADD z = x(0)*x(1);  // z = x0*x1: only the mixed second derivatives are non-zero
+  VERIFY_IS_APPROX(z.derivatives()(0).derivatives(), Vector2d(0,1));
+  VERIFY_IS_APPROX(z.derivatives()(1).derivatives(), Vector2d(1,0));
+}
+
+double bug_1222() {  // Regression check: AutoDiffScalar + double must be assignable to a const AutoDiffScalar.
+  typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
+  const double _cv1_3 = 1.0;
+  const AD chi_3 = 1.0;
+  // this line did not work, because operator+ returns ADS<DerType&>, which then cannot be converted to ADS<DerType>
+  const AD denom = chi_3 + _cv1_3;
+  return denom.value();  // returned so the expression is actually used
+}
+
+#ifdef EIGEN_TEST_PART_5
+
+double bug_1223() {  // Regression check: min() of an AutoDiffScalar quotient expression and a double.
+  using std::min;
+  typedef Eigen::AutoDiffScalar<Eigen::Vector3d> AD;
+
+  const double _cv1_3 = 1.0;
+  const AD chi_3 = 1.0;
+  const AD denom = 1.0;
+
+  // failed because implementation of min attempts to construct ADS<DerType&> via constructor AutoDiffScalar(const Real& value)
+  // without initializing m_derivatives (which is a reference in this case). NOTE(review): the empty EIGEN_TEST_SPACE macro presumably keeps a function-like min macro from expanding - confirm.
+  #define EIGEN_TEST_SPACE
+  const AD t = min EIGEN_TEST_SPACE (denom / chi_3, 1.0);
+
+  const AD t2 = min EIGEN_TEST_SPACE (denom / (chi_3 * _cv1_3), 1.0);  // same check with a nested product in the denominator
+
+  return t.value() + t2.value();
+}
+
+// regression test for some compilation issues with specializations of ScalarBinaryOpTraits
+void bug_1260() {
+  Matrix4d A = Matrix4d::Ones();
+  Vector4d v = Vector4d::Ones();
+  A*v;  // result intentionally discarded: the test is that this product expression compiles
+}
+
+// check a compilation issue with numext::max (exercised through maxCoeff()/minCoeff() on a vector of AutoDiffScalar)
+double bug_1261() {
+  typedef AutoDiffScalar<Matrix2d> AD;
+  typedef Matrix<AD,2,1> VectorAD;
+
+  VectorAD v(0.,0.);
+  const AD maxVal = v.maxCoeff();
+  const AD minVal = v.minCoeff();
+  return maxVal.value() + minVal.value();  // returned so both reductions are actually used
+}
+
+double bug_1264() {  // Compiles (AutoDiffScalar + double) * fixed-size vector of AutoDiffScalar. NOTE(review): not invoked by test_autodiff() in this file.
+  typedef AutoDiffScalar<Vector2d> AD;
+  const AD s = 0.;
+  const Matrix<AD, 3, 1> v1(0.,0.,0.);
+  const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1;  // scalar expression times vector, assigned to a plain vector
+  return v2(0).value();
+}
+
+// check with expressions on constants (temporaries built from a const AutoDiffScalar combined with a variable, in both operand orders)
+double bug_1281() {
+  int n = 2;
+  typedef AutoDiffScalar<VectorXd> AD;
+  const AD c = 1.;
+  AD x0(2,n,0);  // presumably value 2 with n derivatives and a unit entry at index 0 - confirm against the AutoDiffScalar constructor
+  AD y1 = (AD(c)+AD(c))*x0;
+  y1 = x0 * (AD(c)+AD(c));
+  AD y2 = (-AD(c))+x0;
+  y2 = x0+(-AD(c));
+  AD y3 = (AD(c)*(-AD(c))+AD(c))*x0;
+  y3 = x0 * (AD(c)*(-AD(c))+AD(c));
+  return (y1+y2+y3).value();
+}
+
+#endif
+
+void test_autodiff()  // Test entry point: randomized subtests 1-4 run g_repeat times, regression checks run once as subtest 5.
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( test_autodiff_scalar<1>() );
+    CALL_SUBTEST_2( test_autodiff_vector<1>() );
+    CALL_SUBTEST_3( test_autodiff_jacobian<1>() );
+    CALL_SUBTEST_4( test_autodiff_hessian<1>() );
+  }
+
+  CALL_SUBTEST_5( bug_1222() );  // NOTE(review): bug_1222 is defined outside the EIGEN_TEST_PART_5 guard, unlike the other bug_* functions
+  CALL_SUBTEST_5( bug_1223() );
+  CALL_SUBTEST_5( bug_1260() );
+  CALL_SUBTEST_5( bug_1261() );
+  CALL_SUBTEST_5( bug_1281() );  // NOTE(review): bug_1264() is defined above but never called here
+}
+
diff --git a/uppsrc/plugin/Eigen/unsupported/test/autodiff_scalar.cpp b/uppsrc/plugin/Eigen/unsupported/test/autodiff_scalar.cpp
new file mode 100644
index 000000000..a917ec344
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/autodiff_scalar.cpp
@@ -0,0 +1,101 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christoph Hertzberg <chtz@informatik.uni-bremen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/AutoDiff>
+
+/*
+ * In this file scalar derivatives are tested for correctness.
+ * TODO add more tests!
+ */
+
+template<typename Scalar> void check_atan2()  // Checks that atan2(r*sin(x), r*cos(x)) gives back x, in value and derivative.
+{
+  typedef Matrix<Scalar, 1, 1> Deriv1;
+  typedef AutoDiffScalar<Deriv1> AD;
+  
+  AD x(internal::random<Scalar>(-3.0, 3.0), Deriv1::UnitX());  // random angle drawn from [-3, 3]
+  
+  using std::exp;
+  Scalar r = exp(internal::random<Scalar>(-10, 10));  // positive scale factor; it should not affect the result
+  
+  AD s = sin(x), c = cos(x);
+  AD res = atan2(r*s, r*c);
+  
+  VERIFY_IS_APPROX(res.value(), x.value());
+  VERIFY_IS_APPROX(res.derivatives(), x.derivatives());
+
+  res = atan2(r*s+0, r*c+0);  // same check with the arguments wrapped in an extra "+0" expression
+  VERIFY_IS_APPROX(res.value(), x.value());
+  VERIFY_IS_APPROX(res.derivatives(), x.derivatives());
+}
+
+template<typename Scalar> void check_hyperbolic_functions()  // Checks value and derivative of tanh/sinh/cosh on an AutoDiffScalar.
+{
+  using std::sinh;
+  using std::cosh;
+  using std::tanh;
+  typedef Matrix<Scalar, 1, 1> Deriv1;
+  typedef AutoDiffScalar<Deriv1> AD;
+  Deriv1 p = Deriv1::Random();
+  AD val(p.x(),Deriv1::UnitX());
+
+  Scalar cosh_px = std::cosh(p.x());
+  AD res1 = tanh(val);
+  VERIFY_IS_APPROX(res1.value(), std::tanh(p.x()));
+  VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(1.0) / (cosh_px * cosh_px));  // d/dx tanh = 1/cosh^2
+
+  AD res2 = sinh(val);
+  VERIFY_IS_APPROX(res2.value(), std::sinh(p.x()));
+  VERIFY_IS_APPROX(res2.derivatives().x(), cosh_px);  // d/dx sinh = cosh
+
+  AD res3 = cosh(val);
+  VERIFY_IS_APPROX(res3.value(), cosh_px);
+  VERIFY_IS_APPROX(res3.derivatives().x(), std::sinh(p.x()));  // d/dx cosh = sinh
+
+  // Check constant values.
+  const Scalar sample_point = Scalar(1) / Scalar(3); // fixed input for the hard-coded expected derivatives below
+  val = AD(sample_point,Deriv1::UnitX());
+  res1 = tanh(val);
+  VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(0.896629559604914));
+
+  res2 = sinh(val);
+  VERIFY_IS_APPROX(res2.derivatives().x(), Scalar(1.056071867829939));
+
+  res3 = cosh(val);
+  VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150));
+}
+
+template <typename Scalar>
+void check_limits_specialization()  // Checks how std::numeric_limits<AutoDiffScalar<...>> relates to std::numeric_limits<Scalar>.
+{
+  typedef Eigen::Matrix<Scalar, 1, 1> Deriv;
+  typedef Eigen::AutoDiffScalar<Deriv> AD;
+
+  typedef std::numeric_limits<AD> A;
+  typedef std::numeric_limits<Scalar> B;
+
+  // workaround "unused typedef" warning:
+  VERIFY(!bool(internal::is_same<B, A>::value));
+
+#if EIGEN_HAS_CXX11
+  VERIFY(bool(std::is_base_of<B, A>::value));  // the AD specialization is expected to derive from the Scalar one
+#endif
+}
+
+void test_autodiff_scalar()  // Test entry point: runs every check g_repeat times, one subtest number per check.
+{
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1( check_atan2<float>() );
+    CALL_SUBTEST_2( check_atan2<double>() );
+    CALL_SUBTEST_3( check_hyperbolic_functions<float>() );
+    CALL_SUBTEST_4( check_hyperbolic_functions<double>() );
+    CALL_SUBTEST_5( check_limits_specialization<double>());  // uses no random input, yet is still repeated with the others
+  }
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_eventcount.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_eventcount.cpp
new file mode 100644
index 000000000..3b598bf42
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_eventcount.cpp
@@ -0,0 +1,142 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+// Visual Studio doesn't implement a rand_r() function since its
+// implementation of rand() is already thread safe. NOTE(review): the guard is #ifdef, not #if - if EIGEN_COMP_MSVC_STRICT is always defined (possibly as 0), the rand() branch is taken on every compiler; confirm.
+int rand_reentrant(unsigned int* s) {
+#ifdef EIGEN_COMP_MSVC_STRICT
+  EIGEN_UNUSED_VARIABLE(s);
+  return rand();  // the caller's seed is ignored on this branch
+#else
+  return rand_r(s);  // reentrant variant: state is kept in the caller-provided seed
+#endif
+}
+
+static void test_basic_eventcount()  // Single-threaded smoke test of the EventCount calls with one waiter.
+{
+  MaxSizeVector<EventCount::Waiter> waiters(1);
+  waiters.resize(1);
+  EventCount ec(waiters);
+  EventCount::Waiter& w = waiters[0];
+  ec.Notify(false);  // notify while nobody is waiting
+  ec.Prewait(&w);
+  ec.Notify(true);  // notify after Prewait; presumably this lets the CommitWait below return without blocking - confirm
+  ec.CommitWait(&w);
+  ec.Prewait(&w);
+  ec.CancelWait(&w);  // a Prewait that is abandoned instead of committed
+}
+
+// Fake bounded counter-based queue: only an item count in [0, kQueueSize] is tracked, no elements are stored.
+struct TestQueue {
+  std::atomic<int> val_;  // current number of "queued" items
+  static const int kQueueSize = 10;
+
+  TestQueue() : val_() {}
+
+  ~TestQueue() { VERIFY_IS_EQUAL(val_.load(), 0); }  // the counter must be back at zero on destruction
+
+  bool Push() {  // increments the counter; returns false if it is already at kQueueSize
+    int val = val_.load(std::memory_order_relaxed);
+    for (;;) {
+      VERIFY_GE(val, 0);
+      VERIFY_LE(val, kQueueSize);
+      if (val == kQueueSize) return false;
+      if (val_.compare_exchange_weak(val, val + 1, std::memory_order_relaxed))  // on failure val is refreshed and the loop retries
+        return true;
+    }
+  }
+
+  bool Pop() {  // decrements the counter; returns false if it is already at zero
+    int val = val_.load(std::memory_order_relaxed);
+    for (;;) {
+      VERIFY_GE(val, 0);
+      VERIFY_LE(val, kQueueSize);
+      if (val == 0) return false;
+      if (val_.compare_exchange_weak(val, val - 1, std::memory_order_relaxed))  // on failure val is refreshed and the loop retries
+        return true;
+    }
+  }
+
+  bool Empty() { return val_.load(std::memory_order_relaxed) == 0; }
+};
+
+const int TestQueue::kQueueSize;  // out-of-class definition of the static data member declared above
+
+// A number of producers send messages to a set of consumers using a set of
+// fake queues. Ensure that it does not crash, consumers don't deadlock and
+// number of blocked and unblocked threads match.
+static void test_stress_eventcount()
+{
+  const int kThreads = std::thread::hardware_concurrency();  // NOTE(review): may return 0, in which case no threads are started at all
+  static const int kEvents = 1 << 16;  // successful pushes per producer and successful pops per consumer
+  static const int kQueues = 10;
+
+  MaxSizeVector<EventCount::Waiter> waiters(kThreads);
+  waiters.resize(kThreads);
+  EventCount ec(waiters);
+  TestQueue queues[kQueues];
+
+  std::vector<std::unique_ptr<std::thread>> producers;
+  for (int i = 0; i < kThreads; i++) {
+    producers.emplace_back(new std::thread([&ec, &queues]() {
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));  // per-thread seed
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
+        if (queues[idx].Push()) {
+          ec.Notify(false);
+          continue;
+        }
+        EIGEN_THREAD_YIELD();
+        j--;  // the chosen queue was full: this iteration does not count, try again
+      }
+    }));
+  }
+
+  std::vector<std::unique_ptr<std::thread>> consumers;
+  for (int i = 0; i < kThreads; i++) {
+    consumers.emplace_back(new std::thread([&ec, &queues, &waiters, i]() {
+      EventCount::Waiter& w = waiters[i];  // each consumer uses its own waiter slot
+      unsigned int rnd = static_cast<unsigned int>(std::hash<std::thread::id>()(std::this_thread::get_id()));
+      for (int j = 0; j < kEvents; j++) {
+        unsigned idx = rand_reentrant(&rnd) % kQueues;
+        if (queues[idx].Pop()) continue;
+        j--;  // nothing popped: this iteration does not count
+        ec.Prewait(&w);
+        bool empty = true;  // re-check every queue after Prewait, before deciding to wait
+        for (int q = 0; q < kQueues; q++) {
+          if (!queues[q].Empty()) {
+            empty = false;
+            break;
+          }
+        }
+        if (!empty) {
+          ec.CancelWait(&w);  // some queue has work: abandon the wait and retry
+          continue;
+        }
+        ec.CommitWait(&w);
+      }
+    }));
+  }
+
+  for (int i = 0; i < kThreads; i++) {
+    producers[i]->join();
+    consumers[i]->join();
+  }
+}
+
+void test_cxx11_eventcount()  // Test entry point: runs the basic check, then the multi-threaded stress test.
+{
+  CALL_SUBTEST(test_basic_eventcount());
+  CALL_SUBTEST(test_stress_eventcount());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_meta.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_meta.cpp
new file mode 100644
index 000000000..8911c59d8
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_meta.cpp
@@ -0,0 +1,357 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <array>
+#include <Eigen/CXX11/src/util/CXX11Meta.h>
+
+using Eigen::internal::is_same;
+using Eigen::internal::type_list;
+using Eigen::internal::numeric_list;
+using Eigen::internal::gen_numeric_list;
+using Eigen::internal::gen_numeric_list_reversed;
+using Eigen::internal::gen_numeric_list_swapped_pair;
+using Eigen::internal::gen_numeric_list_repeated;
+using Eigen::internal::concat;
+using Eigen::internal::mconcat;
+using Eigen::internal::take;
+using Eigen::internal::skip;
+using Eigen::internal::slice;
+using Eigen::internal::get;
+using Eigen::internal::id_numeric;
+using Eigen::internal::id_type;
+using Eigen::internal::is_same_gf;
+using Eigen::internal::apply_op_from_left;
+using Eigen::internal::apply_op_from_right;
+using Eigen::internal::contained_in_list;
+using Eigen::internal::contained_in_list_gf;
+using Eigen::internal::arg_prod;
+using Eigen::internal::arg_sum;
+using Eigen::internal::sum_op;
+using Eigen::internal::product_op;
+using Eigen::internal::array_reverse;
+using Eigen::internal::array_sum;
+using Eigen::internal::array_prod;
+using Eigen::internal::array_reduce;
+using Eigen::internal::array_zip;
+using Eigen::internal::array_zip_and_reduce;
+using Eigen::internal::array_apply;
+using Eigen::internal::array_apply_and_reduce;
+using Eigen::internal::repeat;
+using Eigen::internal::instantiate_by_c_array;
+
+struct dummy_a {};  // empty tag types used as list elements in the tests below
+struct dummy_b {};
+struct dummy_c {};
+struct dummy_d {};
+struct dummy_e {};
+
+// dummy operation for testing apply: a lookup table mapping an ordered pair of tag types to a result tag
+template<typename A, typename B> struct dummy_op;
+template<> struct dummy_op<dummy_a, dummy_b> { typedef dummy_c type; };
+template<> struct dummy_op<dummy_b, dummy_a> { typedef dummy_d type; };
+template<> struct dummy_op<dummy_b, dummy_c> { typedef dummy_a type; };
+template<> struct dummy_op<dummy_c, dummy_b> { typedef dummy_d type; };
+template<> struct dummy_op<dummy_c, dummy_a> { typedef dummy_b type; };
+template<> struct dummy_op<dummy_a, dummy_c> { typedef dummy_d type; };
+template<> struct dummy_op<dummy_a, dummy_a> { typedef dummy_e type; };  // equal arguments always map to dummy_e
+template<> struct dummy_op<dummy_b, dummy_b> { typedef dummy_e type; };
+template<> struct dummy_op<dummy_c, dummy_c> { typedef dummy_e type; };
+
+template<typename A, typename B> struct dummy_test { constexpr static bool value = false; constexpr static int global_flags = 0; };  // default: no match, no flags
+template<> struct dummy_test<dummy_a, dummy_a>     { constexpr static bool value = true;  constexpr static int global_flags = 1; };  // each matching pair has its own flag value
+template<> struct dummy_test<dummy_b, dummy_b>     { constexpr static bool value = true;  constexpr static int global_flags = 2; };
+template<> struct dummy_test<dummy_c, dummy_c>     { constexpr static bool value = true;  constexpr static int global_flags = 4; };
+
+struct times2_op { template<typename A> static A run(A v) { return v * 2; } };  // unary functor used with array_apply: doubles its argument
+
+struct dummy_inst  // Records in c how many int arguments its constructor was called with (0 to 5).
+{
+  int c;
+
+  dummy_inst() : c(0) {}
+  explicit dummy_inst(int) : c(1) {}
+  dummy_inst(int, int) : c(2) {}
+  dummy_inst(int, int, int) : c(3) {}
+  dummy_inst(int, int, int, int) : c(4) {}
+  dummy_inst(int, int, int, int, int) : c(5) {}
+};
+
+static void test_gen_numeric_list()  // Compile-time checks of the gen_numeric_list* generators against hand-written numeric_list types.
+{
+  VERIFY((is_same<typename gen_numeric_list<int, 0>::type, numeric_list<int>>::value));  // ascending sequence starting at 0
+  VERIFY((is_same<typename gen_numeric_list<int, 1>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 2>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 5>::type, numeric_list<int, 0, 1, 2, 3, 4>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 10>::type, numeric_list<int, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list<int, 0, 42>::type, numeric_list<int>>::value));  // ascending sequence starting at 42
+  VERIFY((is_same<typename gen_numeric_list<int, 1, 42>::type, numeric_list<int, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 2, 42>::type, numeric_list<int, 42, 43>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 5, 42>::type, numeric_list<int, 42, 43, 44, 45, 46>>::value));
+  VERIFY((is_same<typename gen_numeric_list<int, 10, 42>::type, numeric_list<int, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 0>::type, numeric_list<int>>::value));  // descending sequence ending at 0
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 1>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 2>::type, numeric_list<int, 1, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 5>::type, numeric_list<int, 4, 3, 2, 1, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 10>::type, numeric_list<int, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 0, 42>::type, numeric_list<int>>::value));  // descending sequence ending at 42
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 1, 42>::type, numeric_list<int, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 2, 42>::type, numeric_list<int, 43, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 5, 42>::type, numeric_list<int, 46, 45, 44, 43, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_reversed<int, 10, 42>::type, numeric_list<int, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 0, 2, 3>::type, numeric_list<int>>::value));  // ascending from 0 with the values 2 and 3 exchanged
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 1, 2, 3>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 2, 2, 3>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 5, 2, 3>::type, numeric_list<int, 0, 1, 3, 2, 4>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 10, 2, 3>::type, numeric_list<int, 0, 1, 3, 2, 4, 5, 6, 7, 8, 9>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 0, 44, 45, 42>::type, numeric_list<int>>::value));  // ascending from 42 with the values 44 and 45 exchanged
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 1, 44, 45, 42>::type, numeric_list<int, 42>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 2, 44, 45, 42>::type, numeric_list<int, 42, 43>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 5, 44, 45, 42>::type, numeric_list<int, 42, 43, 45, 44, 46>>::value));
+  VERIFY((is_same<typename gen_numeric_list_swapped_pair<int, 10, 44, 45, 42>::type, numeric_list<int, 42, 43, 45, 44, 46, 47, 48, 49, 50, 51>>::value));
+
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 0, 0>::type, numeric_list<int>>::value));  // the value 0 repeated n times
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 1, 0>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 2, 0>::type, numeric_list<int, 0, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 5, 0>::type, numeric_list<int, 0, 0, 0, 0, 0>>::value));
+  VERIFY((is_same<typename gen_numeric_list_repeated<int, 10, 0>::type, numeric_list<int, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>>::value));
+}
+
+static void test_concat()  // Compile-time checks of concat (two lists) and mconcat (one or more lists) on type_list and numeric_list.
+{
+  VERIFY((is_same<typename concat<type_list<dummy_a, dummy_a>, type_list<>>::type, type_list<dummy_a, dummy_a>>::value));  // concat on type lists, including an empty operand
+  VERIFY((is_same<typename concat<type_list<>, type_list<dummy_a, dummy_a>>::type, type_list<dummy_a, dummy_a>>::value));
+  VERIFY((is_same<typename concat<type_list<dummy_a, dummy_a>, type_list<dummy_a, dummy_a>>::type, type_list<dummy_a, dummy_a, dummy_a, dummy_a>>::value));
+  VERIFY((is_same<typename concat<type_list<dummy_a, dummy_a>, type_list<dummy_b, dummy_c>>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename concat<type_list<dummy_a>, type_list<dummy_b, dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+
+  VERIFY((is_same<typename concat<numeric_list<int, 0, 0>, numeric_list<int>>::type, numeric_list<int, 0, 0>>::value));  // concat on numeric lists
+  VERIFY((is_same<typename concat<numeric_list<int>, numeric_list<int, 0, 0>>::type, numeric_list<int, 0, 0>>::value));
+  VERIFY((is_same<typename concat<numeric_list<int, 0, 0>, numeric_list<int, 0, 0>>::type, numeric_list<int, 0, 0, 0, 0>>::value));
+  VERIFY((is_same<typename concat<numeric_list<int, 0, 0>, numeric_list<int, 1, 2>>::type, numeric_list<int, 0, 0, 1, 2>>::value));
+  VERIFY((is_same<typename concat<numeric_list<int, 0>, numeric_list<int, 1, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>>::type, type_list<dummy_a>>::value));  // mconcat on type lists, one to three operands
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>, type_list<dummy_b>>::type, type_list<dummy_a, dummy_b>>::value));
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>, type_list<dummy_b>, type_list<dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename mconcat<type_list<dummy_a>, type_list<dummy_b, dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename mconcat<type_list<dummy_a, dummy_b>, type_list<dummy_c>>::type, type_list<dummy_a, dummy_b, dummy_c>>::value));
+
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>>::type, numeric_list<int, 0>>::value));  // mconcat on numeric lists
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>, numeric_list<int, 1>>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>, numeric_list<int, 1>, numeric_list<int, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0>, numeric_list<int, 1, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+  VERIFY((is_same<typename mconcat<numeric_list<int, 0, 1>, numeric_list<int, 2>>::type, numeric_list<int, 0, 1, 2>>::value));
+}
+
+static void test_slice()  // Compile-time checks of take, skip and slice on a six-element type list and numeric list.
+{
+  typedef type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c> tl;
+  typedef numeric_list<int, 0, 1, 2, 3, 4, 5> il;
+
+  VERIFY((is_same<typename take<0, tl>::type, type_list<>>::value));  // take<n>: the first n elements
+  VERIFY((is_same<typename take<1, tl>::type, type_list<dummy_a>>::value));
+  VERIFY((is_same<typename take<2, tl>::type, type_list<dummy_a, dummy_a>>::value));
+  VERIFY((is_same<typename take<3, tl>::type, type_list<dummy_a, dummy_a, dummy_b>>::value));
+  VERIFY((is_same<typename take<4, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b>>::value));
+  VERIFY((is_same<typename take<5, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c>>::value));
+  VERIFY((is_same<typename take<6, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c>>::value));
+
+  VERIFY((is_same<typename take<0, il>::type, numeric_list<int>>::value));
+  VERIFY((is_same<typename take<1, il>::type, numeric_list<int, 0>>::value));
+  VERIFY((is_same<typename take<2, il>::type, numeric_list<int, 0, 1>>::value));
+  VERIFY((is_same<typename take<3, il>::type, numeric_list<int, 0, 1, 2>>::value));
+  VERIFY((is_same<typename take<4, il>::type, numeric_list<int, 0, 1, 2, 3>>::value));
+  VERIFY((is_same<typename take<5, il>::type, numeric_list<int, 0, 1, 2, 3, 4>>::value));
+  VERIFY((is_same<typename take<6, il>::type, numeric_list<int, 0, 1, 2, 3, 4, 5>>::value));
+  
+  VERIFY((is_same<typename skip<0, tl>::type, type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c>>::value));  // skip<n>: everything after the first n elements
+  VERIFY((is_same<typename skip<1, tl>::type, type_list<dummy_a, dummy_b, dummy_b, dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<2, tl>::type, type_list<dummy_b, dummy_b, dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<3, tl>::type, type_list<dummy_b, dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<4, tl>::type, type_list<dummy_c, dummy_c>>::value));
+  VERIFY((is_same<typename skip<5, tl>::type, type_list<dummy_c>>::value));
+  VERIFY((is_same<typename skip<6, tl>::type, type_list<>>::value));
+
+  VERIFY((is_same<typename skip<0, il>::type, numeric_list<int, 0, 1, 2, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<1, il>::type, numeric_list<int, 1, 2, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<2, il>::type, numeric_list<int, 2, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<3, il>::type, numeric_list<int, 3, 4, 5>>::value));
+  VERIFY((is_same<typename skip<4, il>::type, numeric_list<int, 4, 5>>::value));
+  VERIFY((is_same<typename skip<5, il>::type, numeric_list<int, 5>>::value));
+  VERIFY((is_same<typename skip<6, il>::type, numeric_list<int>>::value));
+
+  VERIFY((is_same<typename slice<0, 3, tl>::type, typename take<3, tl>::type>::value));  // slice<start, count>: count elements beginning at index start
+  VERIFY((is_same<typename slice<0, 3, il>::type, typename take<3, il>::type>::value));
+  VERIFY((is_same<typename slice<1, 3, tl>::type, type_list<dummy_a, dummy_b, dummy_b>>::value));
+  VERIFY((is_same<typename slice<1, 3, il>::type, numeric_list<int, 1, 2, 3>>::value));
+}
+
+static void test_get()  // Checks indexed access with get<i, list> on a type list (::type) and a numeric list (::value).
+{
+  typedef type_list<dummy_a, dummy_a, dummy_b, dummy_b, dummy_c, dummy_c> tl;
+  typedef numeric_list<int, 4, 8, 15, 16, 23, 42> il;
+
+  VERIFY((is_same<typename get<0, tl>::type, dummy_a>::value));
+  VERIFY((is_same<typename get<1, tl>::type, dummy_a>::value));
+  VERIFY((is_same<typename get<2, tl>::type, dummy_b>::value));
+  VERIFY((is_same<typename get<3, tl>::type, dummy_b>::value));
+  VERIFY((is_same<typename get<4, tl>::type, dummy_c>::value));
+  VERIFY((is_same<typename get<5, tl>::type, dummy_c>::value));
+
+  VERIFY_IS_EQUAL(((int)get<0, il>::value), 4);  // the value is cast to int before being handed to the macro
+  VERIFY_IS_EQUAL(((int)get<1, il>::value), 8);
+  VERIFY_IS_EQUAL(((int)get<2, il>::value), 15);
+  VERIFY_IS_EQUAL(((int)get<3, il>::value), 16);
+  VERIFY_IS_EQUAL(((int)get<4, il>::value), 23);
+  VERIFY_IS_EQUAL(((int)get<5, il>::value), 42);
+}
+
+static void test_id_helper(dummy_a a, dummy_a b, dummy_a c)  // Overload target for the id_* tests: callable only with three dummy_a arguments.
+{
+  (void)a;  // the arguments are deliberately unused
+  (void)b;
+  (void)c;
+}
+
+template<int... ii>
+static void test_id_numeric()  // Compiles only if id_numeric<int, ii, dummy_a>::type is dummy_a for every pack element.
+{
+  test_id_helper(typename id_numeric<int, ii, dummy_a>::type()...);
+}
+
+template<typename... tt>
+static void test_id_type()  // Compiles only if id_type<tt, dummy_a>::type is dummy_a for every pack element.
+{
+  test_id_helper(typename id_type<tt, dummy_a>::type()...);
+}
+
+static void test_id()  // Instantiates both id helpers with three-element packs, matching test_id_helper's three parameters.
+{
+  // don't call VERIFY here, just assume it works if it compiles
+  // (otherwise it will complain that it can't find the function)
+  test_id_numeric<1, 4, 6>();
+  test_id_type<dummy_a, dummy_b, dummy_c>();
+}
+
+static void test_is_same_gf()  // Checks is_same_gf's value for different and equal types, and that its global_flags is zero in both cases.
+{
+  VERIFY((!is_same_gf<dummy_a, dummy_b>::value));
+  VERIFY((!!is_same_gf<dummy_a, dummy_a>::value));
+  VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_b>::global_flags), false);  // !! converts the flag value to bool
+  VERIFY_IS_EQUAL((!!is_same_gf<dummy_a, dummy_a>::global_flags), false);
+}
+
+static void test_apply_op()  // Checks that apply_op_from_left/right map dummy_op over a type list with dummy_a as the fixed operand.
+{
+  typedef type_list<dummy_a, dummy_b, dummy_c> tl;
+  VERIFY((!!is_same<typename apply_op_from_left<dummy_op, dummy_a, tl>::type, type_list<dummy_e, dummy_c, dummy_d>>::value));  // dummy_op<dummy_a, X> for each X in tl
+  VERIFY((!!is_same<typename apply_op_from_right<dummy_op, dummy_a, tl>::type, type_list<dummy_e, dummy_d, dummy_b>>::value));  // dummy_op<X, dummy_a> for each X in tl
+}
+
+static void test_contained_in_list()  // Checks membership tests on a type list, with and without the global_flags variant.
+{
+  typedef type_list<dummy_a, dummy_b, dummy_c> tl;
+
+  VERIFY((!!contained_in_list<is_same, dummy_a, tl>::value));  // members of tl
+  VERIFY((!!contained_in_list<is_same, dummy_b, tl>::value));
+  VERIFY((!!contained_in_list<is_same, dummy_c, tl>::value));
+  VERIFY((!contained_in_list<is_same, dummy_d, tl>::value));  // types that are not in tl
+  VERIFY((!contained_in_list<is_same, dummy_e, tl>::value));
+
+  VERIFY((!!contained_in_list_gf<dummy_test, dummy_a, tl>::value));
+  VERIFY((!!contained_in_list_gf<dummy_test, dummy_b, tl>::value));
+  VERIFY((!!contained_in_list_gf<dummy_test, dummy_c, tl>::value));
+  VERIFY((!contained_in_list_gf<dummy_test, dummy_d, tl>::value));
+  VERIFY((!contained_in_list_gf<dummy_test, dummy_e, tl>::value));
+
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_a, tl>::global_flags), 1);  // expected flags equal those of the matching dummy_test specialization
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_b, tl>::global_flags), 2);
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_c, tl>::global_flags), 4);
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_d, tl>::global_flags), 0);  // no match: flags stay 0
+  VERIFY_IS_EQUAL(((int)contained_in_list_gf<dummy_test, dummy_e, tl>::global_flags), 0);
+}
+
+static void test_arg_reductions()  // Checks arg_sum and arg_prod on integer and mixed double/int argument lists.
+{
+  VERIFY_IS_EQUAL(arg_sum(1,2,3,4), 10);
+  VERIFY_IS_EQUAL(arg_prod(1,2,3,4), 24);
+  VERIFY_IS_APPROX(arg_sum(0.5, 2, 5), 7.5);  // mixed arguments are compared approximately
+  VERIFY_IS_APPROX(arg_prod(0.5, 2, 5), 5.0);
+}
+
+static void test_array_reverse_and_reduce()  // Checks array_reverse, array_sum and array_prod on a six-element array and its mirror image.
+{
+  array<int, 6> a{{4, 8, 15, 16, 23, 42}};
+  array<int, 6> b{{42, 23, 16, 15, 8, 4}};  // a in reverse order
+
+  // there is no operator<< for std::array, so VERIFY_IS_EQUAL will
+  // not compile
+  VERIFY((array_reverse(a) == b));
+  VERIFY((array_reverse(b) == a));
+  VERIFY_IS_EQUAL((array_sum(a)), 108);
+  VERIFY_IS_EQUAL((array_sum(b)), 108);
+  VERIFY_IS_EQUAL((array_prod(a)), 7418880);
+  VERIFY_IS_EQUAL((array_prod(b)), 7418880);
+}
+
+static void test_array_zip_and_apply()  // Checks the element-wise zip/apply helpers and their reducing variants.
+{
+  array<int, 6> a{{4, 8, 15, 16, 23, 42}};
+  array<int, 6> b{{0, 1, 2, 3, 4, 5}};
+  array<int, 6> c{{4, 9, 17, 19, 27, 47}};  // a[i] + b[i]
+  array<int, 6> d{{0, 8, 30, 48, 92, 210}};  // a[i] * b[i]
+  array<int, 6> e{{0, 2, 4, 6, 8, 10}};  // 2 * b[i]
+
+  VERIFY((array_zip<sum_op>(a, b) == c));
+  VERIFY((array_zip<product_op>(a, b) == d));
+  VERIFY((array_apply<times2_op>(b) == e));
+  VERIFY_IS_EQUAL((array_apply_and_reduce<sum_op, times2_op>(a)), 216);  // sum of the doubled elements of a
+  VERIFY_IS_EQUAL((array_apply_and_reduce<sum_op, times2_op>(b)), 30);
+  VERIFY_IS_EQUAL((array_zip_and_reduce<product_op, sum_op>(a, b)), 14755932);  // product of the element-wise sums
+  VERIFY_IS_EQUAL((array_zip_and_reduce<sum_op, product_op>(a, b)), 388);  // sum of the element-wise products
+}
+
+static void test_array_misc()  // Checks repeat<n, T> and instantiate_by_c_array for zero to five arguments.
+{
+  array<int, 3> a3{{1, 1, 1}};
+  array<int, 6> a6{{2, 2, 2, 2, 2, 2}};
+  VERIFY((repeat<3, int>(1) == a3));
+  VERIFY((repeat<6, int>(2) == a6));
+
+  int data[5] = { 0, 1, 2, 3, 4 };
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 0>(data).c), 0);  // dummy_inst::c reports the arity of the constructor that was chosen
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 1>(data).c), 1);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 2>(data).c), 2);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 3>(data).c), 3);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 4>(data).c), 4);
+  VERIFY_IS_EQUAL((instantiate_by_c_array<dummy_inst, int, 5>(data).c), 5);
+}
+
+void test_cxx11_meta()  // Test entry point: runs every metaprogramming check in this file once.
+{
+  CALL_SUBTEST(test_gen_numeric_list());
+  CALL_SUBTEST(test_concat());
+  CALL_SUBTEST(test_slice());
+  CALL_SUBTEST(test_get());
+  CALL_SUBTEST(test_id());
+  CALL_SUBTEST(test_is_same_gf());
+  CALL_SUBTEST(test_apply_op());
+  CALL_SUBTEST(test_contained_in_list());
+  CALL_SUBTEST(test_arg_reductions());
+  CALL_SUBTEST(test_array_reverse_and_reduce());
+  CALL_SUBTEST(test_array_zip_and_apply());
+  CALL_SUBTEST(test_array_misc());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp
new file mode 100644
index 000000000..5f9bb938b
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp
@@ -0,0 +1,107 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include "main.h"
+#include "Eigen/CXX11/ThreadPool"
+
+static void test_create_destroy_empty_pool()
+{
+  // Just create and destroy the pool. This will wind up and tear down worker
+  // threads. Ensure there are no issues in that logic.
+  for (int i = 0; i < 16; ++i) {  // constructor argument ranges over 0..15
+    NonBlockingThreadPool tp(i);  // destroyed at the end of each iteration; nothing is ever scheduled on it
+  }
+}
+
+
+static void test_parallelism()
+{
+  // Test we never-ever fail to match available tasks with idle threads.
+  const int kThreads = 16;  // code below expects that this is a multiple of 4
+  NonBlockingThreadPool tp(kThreads);
+  VERIFY_IS_EQUAL(tp.NumThreads(), kThreads);
+  VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1);  // -1 is expected on the calling (non-worker) thread
+  for (int iter = 0; iter < 100; ++iter) {
+    std::atomic<int> running(0);  // tasks of the current wave that have started
+    std::atomic<int> done(0);     // tasks that have finished; checked against 3 * kThreads below
+    std::atomic<int> phase(0);    // gate the tasks spin on; only the main thread advances it
+    // Schedule kThreads tasks and ensure that they all are running.
+    for (int i = 0; i < kThreads; ++i) {
+      tp.Schedule([&]() {
+        const int thread_id = tp.CurrentThreadId();
+        VERIFY_GE(thread_id, 0);  // inside a task the id must lie in [0, kThreads - 1]
+        VERIFY_LE(thread_id, kThreads - 1);
+        running++;
+        while (phase < 1) {  // busy-wait until the main thread opens phase 1
+        }
+        done++;
+      });
+    }
+    while (running != kThreads) {  // spin until every first-wave task has started
+    }
+    running = 0;  // reset before releasing the wave so the next wave is counted from zero
+    phase = 1;
+    // Now, while the previous tasks exit, schedule another kThreads tasks and
+    // ensure that they are running.
+    for (int i = 0; i < kThreads; ++i) {
+      tp.Schedule([&, i]() {
+        running++;
+        while (phase < 2) {
+        }
+        // When all tasks are running, half of tasks exit, quarter of tasks
+        // continue running and quarter of tasks schedule another 2 tasks each.
+        // Concurrently main thread schedules another quarter of tasks.
+        // This gives us another kThreads tasks and we ensure that they all
+        // are running.
+        if (i < kThreads / 2) {  // first half: nothing more to do, fall through and exit
+        } else if (i < 3 * kThreads / 4) {  // third quarter: stay alive and count as a third-wave task
+          running++;
+          while (phase < 3) {
+          }
+          done++;
+        } else {  // last quarter: schedule two third-wave tasks each
+          for (int j = 0; j < 2; ++j) {
+            tp.Schedule([&]() {
+              running++;
+              while (phase < 3) {
+              }
+              done++;
+            });
+          }
+        }
+        done++;  // every second-wave task counts once here, whichever branch it took
+      });
+    }
+    while (running != kThreads) {  // spin until every second-wave task has started
+    }
+    running = 0;
+    phase = 2;
+    for (int i = 0; i < kThreads / 4; ++i) {  // the main thread contributes the remaining quarter of the third wave
+      tp.Schedule([&]() {
+        running++;
+        while (phase < 3) {
+        }
+        done++;
+      });
+    }
+    while (running != kThreads) {  // kThreads/4 continuing + kThreads/2 spawned + kThreads/4 from main
+    }
+    phase = 3;
+    while (done != 3 * kThreads) {  // wave 1: kThreads, wave 2: kThreads, wave 3: kThreads
+    }
+  }
+}
+
+void test_cxx11_non_blocking_thread_pool()  // Test driver for the NonBlockingThreadPool subtests.
+{
+  CALL_SUBTEST(test_create_destroy_empty_pool());
+  CALL_SUBTEST(test_parallelism());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_runqueue.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_runqueue.cpp
new file mode 100644
index 000000000..91f690114
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_runqueue.cpp
@@ -0,0 +1,235 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+#include <cstdlib>
+#include "main.h"
+#include <Eigen/CXX11/ThreadPool>
+
+
+// rand_r() is POSIX-only: the Windows C runtimes (Visual Studio, clang-cl, MinGW)
+// do not provide it, and their rand() is already thread safe, so rand() is used there.
+int rand_reentrant(unsigned int* s) {
+#if defined(_WIN32)  // not `#ifdef EIGEN_COMP_MSVC_STRICT`: Eigen defines that macro (0 or 1) for every compiler
+  EIGEN_UNUSED_VARIABLE(s);
+  return rand();
+#else
+  return rand_r(s);  // reentrant: all generator state lives in *s
+#endif
+}
+
+void test_basic_runqueue()  // Single-threaded walk through every RunQueue operation.
+{
+  RunQueue<int, 4> q;  // the overflow checks below rely on room for exactly 4 elements
+  // Check empty state.
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PopFront());  // popping an empty queue is expected to yield 0
+  std::vector<int> stolen;
+  VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(0u, stolen.size());
+  // Push one front, pop one front.
+  VERIFY_IS_EQUAL(0, q.PushFront(1));  // a successful push is expected to return 0
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(1, q.PopFront());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  // Push front to overflow.
+  VERIFY_IS_EQUAL(0, q.PushFront(2));
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(3));
+  VERIFY_IS_EQUAL(2u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(4));
+  VERIFY_IS_EQUAL(3u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(5));
+  VERIFY_IS_EQUAL(4u, q.Size());
+  VERIFY_IS_EQUAL(6, q.PushFront(6));  // queue is full: the rejected value is expected to be handed back
+  VERIFY_IS_EQUAL(4u, q.Size());
+  VERIFY_IS_EQUAL(5, q.PopFront());  // front pops come back newest first: 5, 4, 3, 2
+  VERIFY_IS_EQUAL(3u, q.Size());
+  VERIFY_IS_EQUAL(4, q.PopFront());
+  VERIFY_IS_EQUAL(2u, q.Size());
+  VERIFY_IS_EQUAL(3, q.PopFront());
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(2, q.PopFront());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PopFront());
+  // Push one back, pop one back.
+  VERIFY_IS_EQUAL(0, q.PushBack(7));
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));  // return value = number of elements appended to stolen
+  VERIFY_IS_EQUAL(1u, stolen.size());
+  VERIFY_IS_EQUAL(7, stolen[0]);
+  VERIFY_IS_EQUAL(0u, q.Size());
+  stolen.clear();
+  // Push back to overflow.
+  VERIFY_IS_EQUAL(0, q.PushBack(8));
+  VERIFY_IS_EQUAL(1u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(9));
+  VERIFY_IS_EQUAL(2u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(10));
+  VERIFY_IS_EQUAL(3u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushBack(11));
+  VERIFY_IS_EQUAL(4u, q.Size());
+  VERIFY_IS_EQUAL(12, q.PushBack(12));  // full again: 12 is handed back
+  VERIFY_IS_EQUAL(4u, q.Size());
+  // Pop back in halves.
+  VERIFY_IS_EQUAL(2u, q.PopBackHalf(&stolen));  // 4 elements queued -> 2 taken
+  VERIFY_IS_EQUAL(2u, stolen.size());
+  VERIFY_IS_EQUAL(10, stolen[0]);
+  VERIFY_IS_EQUAL(11, stolen[1]);
+  VERIFY_IS_EQUAL(2u, q.Size());
+  stolen.clear();
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));  // 2 elements queued -> 1 taken
+  VERIFY_IS_EQUAL(1u, stolen.size());
+  VERIFY_IS_EQUAL(9, stolen[0]);
+  VERIFY_IS_EQUAL(1u, q.Size());
+  stolen.clear();
+  VERIFY_IS_EQUAL(1u, q.PopBackHalf(&stolen));  // 1 element queued -> that element is taken
+  VERIFY_IS_EQUAL(1u, stolen.size());
+  VERIFY_IS_EQUAL(8, stolen[0]);
+  stolen.clear();
+  VERIFY_IS_EQUAL(0u, q.PopBackHalf(&stolen));
+  VERIFY_IS_EQUAL(0u, stolen.size());
+  // Empty again.
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0u, q.Size());
+  VERIFY_IS_EQUAL(0, q.PushFront(1));
+  VERIFY_IS_EQUAL(0, q.PushFront(2));
+  VERIFY_IS_EQUAL(0, q.PushFront(3));
+  VERIFY_IS_EQUAL(1, q.PopBack());  // back pops return the front-pushed values oldest first: 1, 2, 3
+  VERIFY_IS_EQUAL(2, q.PopBack());
+  VERIFY_IS_EQUAL(3, q.PopBack());
+  VERIFY(q.Empty());
+  VERIFY_IS_EQUAL(0u, q.Size());
+}
+
+// Empty tests that the queue is not claimed to be empty when it is in fact not.
+// The emptiness property is a crucial part of the thread pool blocking scheme,
+// so we go to great effort to ensure this property. We create a queue with
+// 1 element and then push 1 element (either front or back at random) and pop
+// 1 element (either front or back at random). So queue always contains at least
+// 1 element, but otherwise changes chaotically. Another thread constantly tests
+// that the queue is not claimed to be empty.
+void test_empty_runqueue()
+{
+  RunQueue<int, 4> q;
+  q.PushFront(1);  // the resident element that keeps the queue non-empty throughout
+  std::atomic<bool> done(false);
+  std::thread mutator([&q, &done]() {
+    unsigned rnd = 0;  // seed/state handed to rand_reentrant()
+    std::vector<int> stolen;
+    for (int i = 0; i < 1 << 18; i++) {  // 2^18 push-then-pop rounds
+      if (rand_reentrant(&rnd) % 2)
+        VERIFY_IS_EQUAL(0, q.PushFront(1));
+      else
+        VERIFY_IS_EQUAL(0, q.PushBack(1));
+      if (rand_reentrant(&rnd) % 2)
+        VERIFY_IS_EQUAL(1, q.PopFront());
+      else {
+        for (;;) {  // retry until PopBackHalf reports exactly one stolen element
+          if (q.PopBackHalf(&stolen) == 1) {
+            stolen.clear();
+            break;
+          }
+          VERIFY_IS_EQUAL(0u, stolen.size());  // a non-1 result must not have appended anything
+        }
+      }
+    }
+    done = true;
+  });
+  while (!done) {  // observer side: runs concurrently with the mutator thread
+    VERIFY(!q.Empty());
+    int size = q.Size();
+    VERIFY_GE(size, 1);
+    VERIFY_LE(size, 2);  // the resident element plus at most the one just pushed
+  }
+  VERIFY_IS_EQUAL(1, q.PopFront());  // remove the one element left once the mutator has finished
+  mutator.join();
+}
+
+// Stress is a chaotic random test.
+// One thread (owner) calls PushFront/PopFront, other threads call PushBack/
+// PopBackHalf. Ensure that we don't crash or deadlock, and that all sanity checks pass.
+void test_stress_runqueue()
+{
+  static const int kEvents = 1 << 18;
+  RunQueue<int, 8> q;
+  std::atomic<unsigned> total(0);  // pushed values are added, popped values subtracted; unsigned so wrap-around is well defined
+  std::vector<std::unique_ptr<std::thread>> threads;
+  threads.emplace_back(new std::thread([&q, &total]() {  // owner thread: front end only
+    unsigned sum = 0;  // unsigned: the running sums exceed INT_MAX, which would be UB for int
+    int pushed = 1;  // next value to push; values start at 1 because 0 doubles as the "nothing" return value
+    int popped = 1;
+    while (pushed < kEvents || popped < kEvents) {
+      if (pushed < kEvents) {
+        if (q.PushFront(pushed) == 0) {  // 0 = accepted; otherwise retry the same value next round
+          sum += pushed;
+          pushed++;
+        }
+      }
+      if (popped < kEvents) {
+        int v = q.PopFront();
+        if (v != 0) {  // 0 = queue was empty
+          sum -= v;
+          popped++;
+        }
+      }
+    }
+    total += sum;
+  }));
+  for (int i = 0; i < 2; i++) {  // two producer/consumer pairs working on the back end
+    threads.emplace_back(new std::thread([&q, &total]() {
+      unsigned sum = 0;  // 1 + 2 + ... + (kEvents - 1) is about 3.4e10, far beyond INT_MAX
+      for (int j = 1; j < kEvents; j++) {
+        if (q.PushBack(j) == 0) {
+          sum += j;
+          continue;
+        }
+        EIGEN_THREAD_YIELD();
+        j--;  // push was rejected: retry the same value after yielding
+      }
+      total += sum;
+    }));
+    threads.emplace_back(new std::thread([&q, &total]() {
+      unsigned sum = 0;
+      std::vector<int> stolen;
+      for (int j = 1; j < kEvents;) {  // j counts consumed elements; stop after kEvents - 1 of them
+        if (q.PopBackHalf(&stolen) == 0) {
+          EIGEN_THREAD_YIELD();
+          continue;
+        }
+        while (stolen.size() && j < kEvents) {
+          int v = stolen.back();
+          stolen.pop_back();
+          VERIFY_IS_NOT_EQUAL(v, 0);
+          sum += v;
+          j++;
+        }
+      }
+      while (stolen.size()) {  // hand surplus stolen elements back so another consumer can take them
+        int v = stolen.back();
+        stolen.pop_back();
+        VERIFY_IS_NOT_EQUAL(v, 0);
+        while ((v = q.PushBack(v)) != 0) EIGEN_THREAD_YIELD();
+      }
+      total -= sum;
+    }));
+  }
+  for (size_t i = 0; i < threads.size(); i++) threads[i]->join();
+  VERIFY(q.Empty());
+  VERIFY(total.load() == 0);  // everything pushed was popped exactly once (checked modulo 2^32)
+}
+
+void test_cxx11_runqueue()  // Test driver: each RunQueue subtest is invoked through its own CALL_SUBTEST_<n> macro.
+{
+  CALL_SUBTEST_1(test_basic_runqueue());
+  CALL_SUBTEST_2(test_empty_runqueue());
+  CALL_SUBTEST_3(test_stress_runqueue());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_argmax.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_argmax.cpp
new file mode 100644
index 000000000..037767270
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_argmax.cpp
@@ -0,0 +1,294 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@google.com>
+//                    Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+using Eigen::Tuple;
+
+template <int DataLayout>
+static void test_simple_index_tuples()  // index_tuples() must pair every coefficient with its flat index.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();  // replace each random x by log(x + 0.5)
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+  index_tuples = tensor.index_tuples();
+
+  for (DenseIndex n = 0; n < 2*3*5*7; ++n) {  // 2*3*5*7 = total number of coefficients
+    const Tuple<DenseIndex, float>& v = index_tuples.coeff(n);
+    VERIFY_IS_EQUAL(v.first, n);  // first: the flat index itself
+    VERIFY_IS_EQUAL(v.second, tensor.coeff(n));  // second: the source coefficient at that index
+  }
+}
+
+template <int DataLayout>
+static void test_index_tuples_dim()  // Same check as above, but read through operator() and bounded by tensor.size().
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();  // replace each random x by log(x + 0.5)
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+
+  index_tuples = tensor.index_tuples();
+
+  for (Eigen::DenseIndex n = 0; n < tensor.size(); ++n) {
+    const Tuple<DenseIndex, float>& v = index_tuples(n); //(i, j, k, l);
+    VERIFY_IS_EQUAL(v.first, n);  // first: the flat index
+    VERIFY_IS_EQUAL(v.second, tensor(n));  // second: the source coefficient at that index
+  }
+}
+
+template <int DataLayout>
+static void test_argmax_tuple_reducer()  // ArgMaxTupleReducer must pick the tuple whose value equals maximum().
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+  index_tuples = tensor.index_tuples();
+
+  Tensor<Tuple<DenseIndex, float>, 0, DataLayout> reduced;  // rank-0 result: everything is reduced away
+  DimensionList<DenseIndex, 4> dims;  // passed as the set of dimensions to reduce over
+  reduced = index_tuples.reduce(
+      dims, internal::ArgMaxTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 0, DataLayout> maxi = tensor.maximum();  // reference value from the plain max reduction
+
+  VERIFY_IS_EQUAL(maxi(), reduced(0).second);
+
+  array<DenseIndex, 3> reduce_dims;
+  for (int d = 0; d < 3; ++d) reduce_dims[d] = d;  // reduce dimensions 0, 1 and 2; the last one (extent 7) is kept
+  Tensor<Tuple<DenseIndex, float>, 1, DataLayout> reduced_by_dims(7);
+  reduced_by_dims = index_tuples.reduce(
+      reduce_dims, internal::ArgMaxTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 1, DataLayout> max_by_dims = tensor.maximum(reduce_dims);
+
+  for (int l = 0; l < 7; ++l) {  // compare slot by slot along the kept dimension
+    VERIFY_IS_EQUAL(max_by_dims(l), reduced_by_dims(l).second);
+  }
+}
+
+template <int DataLayout>
+static void test_argmin_tuple_reducer()  // ArgMinTupleReducer must pick the tuple whose value equals minimum().
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+
+  Tensor<Tuple<DenseIndex, float>, 4, DataLayout> index_tuples(2,3,5,7);
+  index_tuples = tensor.index_tuples();
+
+  Tensor<Tuple<DenseIndex, float>, 0, DataLayout> reduced;  // rank-0 result: everything is reduced away
+  DimensionList<DenseIndex, 4> dims;  // passed as the set of dimensions to reduce over
+  reduced = index_tuples.reduce(
+      dims, internal::ArgMinTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 0, DataLayout> mini = tensor.minimum();  // reference value from the plain min reduction
+
+  VERIFY_IS_EQUAL(mini(), reduced(0).second);
+
+  array<DenseIndex, 3> reduce_dims;
+  for (int d = 0; d < 3; ++d) reduce_dims[d] = d;  // reduce dimensions 0, 1 and 2; the last one (extent 7) is kept
+  Tensor<Tuple<DenseIndex, float>, 1, DataLayout> reduced_by_dims(7);
+  reduced_by_dims = index_tuples.reduce(
+      reduce_dims, internal::ArgMinTupleReducer<Tuple<DenseIndex, float> >());
+
+  Tensor<float, 1, DataLayout> min_by_dims = tensor.minimum(reduce_dims);
+
+  for (int l = 0; l < 7; ++l) {  // compare slot by slot along the kept dimension
+    VERIFY_IS_EQUAL(min_by_dims(l), reduced_by_dims(l).second);
+  }
+}
+
+template <int DataLayout>
+static void test_simple_argmax()  // Full-tensor argmax() must report the flat index of a planted maximum.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+  tensor(0,0,0,0) = 10.0;  // planted maximum; presumably above every log(x + 0.5) value -- TODO confirm setRandom() range
+
+  Tensor<DenseIndex, 0, DataLayout> tensor_argmax;
+
+  tensor_argmax = tensor.argmax();
+
+  VERIFY_IS_EQUAL(tensor_argmax(0), 0);  // the all-zero coordinate is flat index 0 in either layout
+
+  tensor(1,2,4,6) = 20.0;  // larger value planted at the all-max coordinate
+
+  tensor_argmax = tensor.argmax();
+
+  VERIFY_IS_EQUAL(tensor_argmax(0), 2*3*5*7 - 1);  // the last coordinate is flat index size - 1 in either layout
+}
+
+template <int DataLayout>
+static void test_simple_argmin()  // Full-tensor argmin() must report the flat index of a planted minimum.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  tensor = (tensor + tensor.constant(0.5)).log();
+  tensor(0,0,0,0) = -10.0;  // planted minimum; presumably below every log(x + 0.5) value -- TODO confirm setRandom() range
+
+  Tensor<DenseIndex, 0, DataLayout> tensor_argmin;
+
+  tensor_argmin = tensor.argmin();
+
+  VERIFY_IS_EQUAL(tensor_argmin(0), 0);  // the all-zero coordinate is flat index 0 in either layout
+
+  tensor(1,2,4,6) = -20.0;  // smaller value planted at the all-max coordinate
+
+  tensor_argmin = tensor.argmin();
+
+  VERIFY_IS_EQUAL(tensor_argmin(0), 2*3*5*7 - 1);  // the last coordinate is flat index size - 1 in either layout
+}
+
+template <int DataLayout>
+static void test_argmax_dim()  // argmax(dim) must return the planted position along each of the 4 dimensions.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  // Extents are read back through tensor.dimension(dim); no separate extent list is needed.
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_argmax;
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+            tensor(ix) = 10.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmax = tensor.argmax(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmax.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));  // one result per slice along dim
+    for (ptrdiff_t n = 0; n < tensor_argmax.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmax.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix) = 20.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmax = tensor.argmax(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmax.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));
+    for (ptrdiff_t n = 0; n < tensor_argmax.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmax.data()[n], tensor.dimension(dim) - 1);
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_argmin_dim()  // argmin(dim) must return the planted position along each of the 4 dimensions.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  // Extents are read back through tensor.dimension(dim); no separate extent list is needed.
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_argmin;
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
+            tensor(ix) = -10.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmin = tensor.argmin(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmin.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));  // one result per slice along dim
+    for (ptrdiff_t n = 0; n < tensor_argmin.size(); ++n) {
+      // Expect min to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmin.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
+            tensor(ix) = -20.0;
+          }
+        }
+      }
+    }
+
+    tensor_argmin = tensor.argmin(dim);
+
+    VERIFY_IS_EQUAL(tensor_argmin.size(),
+                    ptrdiff_t(2*3*5*7 / tensor.dimension(dim)));
+    for (ptrdiff_t n = 0; n < tensor_argmin.size(); ++n) {
+      // Expect min to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_argmin.data()[n], tensor.dimension(dim) - 1);
+    }
+  }
+}
+
+void test_cxx11_tensor_argmax()  // Test driver: runs every subtest once per storage order (RowMajor, then ColMajor).
+{
+  CALL_SUBTEST(test_simple_index_tuples<RowMajor>());
+  CALL_SUBTEST(test_simple_index_tuples<ColMajor>());
+  CALL_SUBTEST(test_index_tuples_dim<RowMajor>());
+  CALL_SUBTEST(test_index_tuples_dim<ColMajor>());
+  CALL_SUBTEST(test_argmax_tuple_reducer<RowMajor>());
+  CALL_SUBTEST(test_argmax_tuple_reducer<ColMajor>());
+  CALL_SUBTEST(test_argmin_tuple_reducer<RowMajor>());
+  CALL_SUBTEST(test_argmin_tuple_reducer<ColMajor>());
+  CALL_SUBTEST(test_simple_argmax<RowMajor>());
+  CALL_SUBTEST(test_simple_argmax<ColMajor>());
+  CALL_SUBTEST(test_simple_argmin<RowMajor>());
+  CALL_SUBTEST(test_simple_argmin<ColMajor>());
+  CALL_SUBTEST(test_argmax_dim<RowMajor>());
+  CALL_SUBTEST(test_argmax_dim<ColMajor>());
+  CALL_SUBTEST(test_argmin_dim<RowMajor>());
+  CALL_SUBTEST(test_argmin_dim<ColMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_argmax_cuda.cu b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_argmax_cuda.cu
new file mode 100644
index 000000000..3d73d491a
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -0,0 +1,251 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int Layout>
+void test_cuda_simple_argmax()  // Full-tensor argmax()/argmin() evaluated on the GPU device.
+{
+  Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
+  Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
+  Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
+  in.setRandom();
+  in *= in.constant(100.0);
+  in(0, 0, 0) = -1000.0;    // planted minimum at flat index 0
+  in(71, 52, 96) = 1000.0;  // planted maximum at the last flat index
+
+  std::size_t in_bytes = in.size() * sizeof(double);
+  std::size_t out_bytes = out_max.size() * sizeof(DenseIndex);
+
+  double* d_in;
+  DenseIndex* d_out_max;
+  DenseIndex* d_out_min;
+  cudaMalloc((void**)(&d_in), in_bytes);
+  cudaMalloc((void**)(&d_out_max), out_bytes);
+  cudaMalloc((void**)(&d_out_min), out_bytes);
+
+  cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_max(d_out_max, Eigen::array<DenseIndex, 1>(1));
+  Eigen::TensorMap<Eigen::Tensor<DenseIndex, 1, Layout>, Aligned > gpu_out_min(d_out_min, Eigen::array<DenseIndex, 1>(1));
+
+  gpu_out_max.device(gpu_device) = gpu_in.argmax();
+  gpu_out_min.device(gpu_device) = gpu_in.argmin();
+
+  VERIFY(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);  // VERIFY, not assert: the copy must run even with NDEBUG
+  VERIFY(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);  // wait for both copies before reading the host tensors
+
+  VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
+  VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
+
+  cudaFree(d_in);
+  cudaFree(d_out_max);
+  cudaFree(d_out_min);
+}
+
+template <int DataLayout>
+void test_cuda_argmax_dim()  // GPU argmax(dim) must return the planted position along each of the 4 dimensions.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  std::vector<int> dims;  // extents, used to derive the shape of the reduced output
+  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    array<DenseIndex, 3> out_shape;
+    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];  // input shape with extent `dim` removed
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0
+            tensor(ix) = 10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(float);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    float* d_in;
+    DenseIndex* d_out;
+    cudaMalloc((void**)(&d_in), in_bytes);
+    cudaMalloc((void**)(&d_out), out_bytes);
+
+    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+
+    Eigen::CudaStreamDevice stream;
+    Eigen::GpuDevice gpu_device(&stream);
+
+    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
+
+    gpu_out.device(gpu_device) = gpu_in.argmax(dim);
+
+    VERIFY(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);  // VERIFY, not assert: the copy must run even with NDEBUG
+    VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    VERIFY_IS_EQUAL(tensor_arg.size(),
+                    2*3*5*7 / tensor.dimension(dim));  // same form as the argmin twin; no signed/unsigned mix
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0
+            tensor(ix) = 20.0;
+          }
+        }
+      }
+    }
+
+    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);  // re-upload the modified input
+
+    gpu_out.device(gpu_device) = gpu_in.argmax(dim);
+
+    VERIFY(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+    VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect max to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+
+    cudaFree(d_in);
+    cudaFree(d_out);
+  }
+}
+
+template <int DataLayout>
+void test_cuda_argmin_dim()  // GPU argmin(dim) must return the planted position along each of the 4 dimensions.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  std::vector<int> dims;  // extents, used to derive the shape of the reduced output
+  dims.push_back(2); dims.push_back(3); dims.push_back(5); dims.push_back(7);
+
+  for (int dim = 0; dim < 4; ++dim) {
+    tensor.setRandom();
+    tensor = (tensor + tensor.constant(0.5)).log();
+
+    array<DenseIndex, 3> out_shape;
+    for (int d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1];  // input shape with extent `dim` removed
+
+    Tensor<DenseIndex, 3, DataLayout> tensor_arg(out_shape);
+
+    array<DenseIndex, 4> ix;
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != 0) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0
+            tensor(ix) = -10.0;
+          }
+        }
+      }
+    }
+
+    std::size_t in_bytes = tensor.size() * sizeof(float);
+    std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex);
+
+    float* d_in;
+    DenseIndex* d_out;
+    cudaMalloc((void**)(&d_in), in_bytes);
+    cudaMalloc((void**)(&d_out), out_bytes);
+
+    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+
+    Eigen::CudaStreamDevice stream;
+    Eigen::GpuDevice gpu_device(&stream);
+
+    Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
+    Eigen::TensorMap<Eigen::Tensor<DenseIndex, 3, DataLayout>, Aligned > gpu_out(d_out, out_shape);
+
+    gpu_out.device(gpu_device) = gpu_in.argmin(dim);
+
+    VERIFY(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);  // VERIFY, not assert: the copy must run even with NDEBUG
+    VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    VERIFY_IS_EQUAL(tensor_arg.size(),
+                    2*3*5*7 / tensor.dimension(dim));
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect min to be in the first index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], 0);
+    }
+
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        for (int k = 0; k < 5; ++k) {
+          for (int l = 0; l < 7; ++l) {
+            ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l;
+            if (ix[dim] != tensor.dimension(dim) - 1) continue;
+            // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0
+            tensor(ix) = -20.0;
+          }
+        }
+      }
+    }
+
+    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);  // re-upload the modified input
+
+    gpu_out.device(gpu_device) = gpu_in.argmin(dim);
+
+    VERIFY(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+    VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
+      // Expect min to be in the last index of the reduced dimension
+      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
+    }
+
+    cudaFree(d_in);
+    cudaFree(d_out);
+  }
+}
+
+void test_cxx11_tensor_cuda()  // Test driver; the name matches the EIGEN_TEST_FUNC value (cxx11_tensor_cuda) set at the top of this file.
+{
+  CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>());
+  CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>());
+  CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>());
+  CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>());
+  CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_assign.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_assign.cpp
new file mode 100644
index 000000000..8fe85d83c
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_assign.cpp
@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_1d()  // Rank-1: copy Tensor -> TensorMap -> Tensor and check every element survives.
+{
+  Tensor<int, 1> vec1(6);
+  Tensor<int, 1, RowMajor> vec2(6);
+  vec1(0) = 4;  vec2(0) = 0;
+  vec1(1) = 8;  vec2(1) = 1;
+  vec1(2) = 15; vec2(2) = 2;
+  vec1(3) = 16; vec2(3) = 3;
+  vec1(4) = 23; vec2(4) = 4;
+  vec1(5) = 42; vec2(5) = 5;
+
+  int col_major[6];
+  int row_major[6];
+  memset(col_major, 0, 6*sizeof(int));  // start from zeroed buffers so stale data cannot satisfy the checks
+  memset(row_major, 0, 6*sizeof(int));
+  TensorMap<Tensor<int, 1> > vec3(col_major, 6);
+  TensorMap<Tensor<int, 1, RowMajor> > vec4(row_major, 6);
+
+  vec3 = vec1;  // Tensor -> TensorMap assignment
+  vec4 = vec2;
+
+  VERIFY_IS_EQUAL(vec3(0), 4);
+  VERIFY_IS_EQUAL(vec3(1), 8);
+  VERIFY_IS_EQUAL(vec3(2), 15);
+  VERIFY_IS_EQUAL(vec3(3), 16);
+  VERIFY_IS_EQUAL(vec3(4), 23);
+  VERIFY_IS_EQUAL(vec3(5), 42);
+
+  VERIFY_IS_EQUAL(vec4(0), 0);
+  VERIFY_IS_EQUAL(vec4(1), 1);
+  VERIFY_IS_EQUAL(vec4(2), 2);
+  VERIFY_IS_EQUAL(vec4(3), 3);
+  VERIFY_IS_EQUAL(vec4(4), 4);
+  VERIFY_IS_EQUAL(vec4(5), 5);
+
+  vec1.setZero();  // clear the tensors so the copy back from the maps is observable
+  vec2.setZero();
+  vec1 = vec3;  // TensorMap -> Tensor assignment
+  vec2 = vec4;
+
+  VERIFY_IS_EQUAL(vec1(0), 4);
+  VERIFY_IS_EQUAL(vec1(1), 8);
+  VERIFY_IS_EQUAL(vec1(2), 15);
+  VERIFY_IS_EQUAL(vec1(3), 16);
+  VERIFY_IS_EQUAL(vec1(4), 23);
+  VERIFY_IS_EQUAL(vec1(5), 42);
+
+  VERIFY_IS_EQUAL(vec2(0), 0);
+  VERIFY_IS_EQUAL(vec2(1), 1);
+  VERIFY_IS_EQUAL(vec2(2), 2);
+  VERIFY_IS_EQUAL(vec2(3), 3);
+  VERIFY_IS_EQUAL(vec2(4), 4);
+  VERIFY_IS_EQUAL(vec2(5), 5);
+}
+
+static void test_2d()  // Rank-2 (2x3): copy Tensor -> TensorMap -> Tensor and check every coefficient survives.
+{
+  Tensor<int, 2> mat1(2,3);
+  Tensor<int, 2, RowMajor> mat2(2,3);
+
+  mat1(0,0) = 0;
+  mat1(0,1) = 1;
+  mat1(0,2) = 2;
+  mat1(1,0) = 3;
+  mat1(1,1) = 4;
+  mat1(1,2) = 5;
+
+  mat2(0,0) = 0;
+  mat2(0,1) = 1;
+  mat2(0,2) = 2;
+  mat2(1,0) = 3;
+  mat2(1,1) = 4;
+  mat2(1,2) = 5;
+
+  int col_major[6];
+  int row_major[6];
+  memset(col_major, 0, 6*sizeof(int));  // start from zeroed buffers so stale data cannot satisfy the checks
+  memset(row_major, 0, 6*sizeof(int));
+  TensorMap<Tensor<int, 2> > mat3(col_major, 2, 3);  // buffer name now matches the map's layout, as in test_1d/test_3d
+  TensorMap<Tensor<int, 2, RowMajor> > mat4(row_major, 2, 3);
+
+  mat3 = mat1;  // Tensor -> TensorMap assignment
+  mat4 = mat2;
+
+  VERIFY_IS_EQUAL(mat3(0,0), 0);
+  VERIFY_IS_EQUAL(mat3(0,1), 1);
+  VERIFY_IS_EQUAL(mat3(0,2), 2);
+  VERIFY_IS_EQUAL(mat3(1,0), 3);
+  VERIFY_IS_EQUAL(mat3(1,1), 4);
+  VERIFY_IS_EQUAL(mat3(1,2), 5);
+
+  VERIFY_IS_EQUAL(mat4(0,0), 0);
+  VERIFY_IS_EQUAL(mat4(0,1), 1);
+  VERIFY_IS_EQUAL(mat4(0,2), 2);
+  VERIFY_IS_EQUAL(mat4(1,0), 3);
+  VERIFY_IS_EQUAL(mat4(1,1), 4);
+  VERIFY_IS_EQUAL(mat4(1,2), 5);
+
+  mat1.setZero();  // clear the tensors so the copy back from the maps is observable
+  mat2.setZero();
+  mat1 = mat3;  // TensorMap -> Tensor assignment
+  mat2 = mat4;
+
+  VERIFY_IS_EQUAL(mat1(0,0), 0);
+  VERIFY_IS_EQUAL(mat1(0,1), 1);
+  VERIFY_IS_EQUAL(mat1(0,2), 2);
+  VERIFY_IS_EQUAL(mat1(1,0), 3);
+  VERIFY_IS_EQUAL(mat1(1,1), 4);
+  VERIFY_IS_EQUAL(mat1(1,2), 5);
+
+  VERIFY_IS_EQUAL(mat2(0,0), 0);
+  VERIFY_IS_EQUAL(mat2(0,1), 1);
+  VERIFY_IS_EQUAL(mat2(0,2), 2);
+  VERIFY_IS_EQUAL(mat2(1,0), 3);
+  VERIFY_IS_EQUAL(mat2(1,1), 4);
+  VERIFY_IS_EQUAL(mat2(1,2), 5);
+}
+
+static void test_3d()  // Rank-3 (2x3x7): copy Tensor -> TensorMap -> Tensor and check every coefficient survives.
+{
+  Tensor<int, 3> mat1(2,3,7);
+  Tensor<int, 3, RowMajor> mat2(2,3,7);
+
+  int val = 0;  // running counter: coefficient (i,j,k) gets its visiting order as value
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  int col_major[2*3*7];
+  int row_major[2*3*7];
+  memset(col_major, 0, 2*3*7*sizeof(int));  // start from zeroed buffers so stale data cannot satisfy the checks
+  memset(row_major, 0, 2*3*7*sizeof(int));
+  TensorMap<Tensor<int, 3> > mat3(col_major, 2, 3, 7);
+  TensorMap<Tensor<int, 3, RowMajor> > mat4(row_major, 2, 3, 7);
+
+  mat3 = mat1;  // Tensor -> TensorMap assignment
+  mat4 = mat2;
+
+  val = 0;  // replay the same visiting order to regenerate the expected values
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat3(i,j,k), val);
+        VERIFY_IS_EQUAL(mat4(i,j,k), val);
+        val++;
+      }
+    }
+  }
+
+  mat1.setZero();  // clear the tensors so the copy back from the maps is observable
+  mat2.setZero();
+  mat1 = mat3;  // TensorMap -> Tensor assignment
+  mat2 = mat4;
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat1(i,j,k), val);
+        VERIFY_IS_EQUAL(mat2(i,j,k), val);
+        val++;
+      }
+    }
+  }
+}
+
+static void test_same_type()  // Same-type assignment must copy the values and leave both data() pointers unchanged.
+{
+  Tensor<int, 1> orig_tensor(5);
+  Tensor<int, 1> dest_tensor(5);
+  orig_tensor.setRandom();
+  dest_tensor.setRandom();
+  int* orig_data = orig_tensor.data();  // remember the storage addresses before assigning
+  int* dest_data = dest_tensor.data();
+  dest_tensor = orig_tensor;
+  VERIFY_IS_EQUAL(orig_tensor.data(), orig_data);
+  VERIFY_IS_EQUAL(dest_tensor.data(), dest_data);
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_EQUAL(dest_tensor(i), orig_tensor(i));
+  }
+
+  TensorFixedSize<int, Sizes<5> > orig_array;  // same checks for fixed-size tensors
+  TensorFixedSize<int, Sizes<5> > dest_array;
+  orig_array.setRandom();
+  dest_array.setRandom();
+  orig_data = orig_array.data();
+  dest_data = dest_array.data();
+  dest_array = orig_array;
+  VERIFY_IS_EQUAL(orig_array.data(), orig_data);
+  VERIFY_IS_EQUAL(dest_array.data(), dest_data);
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_EQUAL(dest_array(i), orig_array(i));
+  }
+
+  int orig[5] = {1, 2, 3, 4, 5};  // same checks for maps over caller-owned arrays
+  int dest[5] = {6, 7, 8, 9, 10};
+  TensorMap<Tensor<int, 1> > orig_map(orig, 5);
+  TensorMap<Tensor<int, 1> > dest_map(dest, 5);
+  orig_data = orig_map.data();
+  dest_data = dest_map.data();
+  dest_map = orig_map;
+  VERIFY_IS_EQUAL(orig_map.data(), orig_data);
+  VERIFY_IS_EQUAL(dest_map.data(), dest_data);
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_EQUAL(dest[i], i+1);  // the mapped array itself must now hold orig's values 1..5
+  }
+}
+
+static void test_auto_resize()  // Assigning into tensors of other sizes must give them the source's size and values.
+{
+  Tensor<int, 1> tensor1;  // default-constructed, no size given
+  Tensor<int, 1> tensor2(3);  // smaller than the source
+  Tensor<int, 1> tensor3(5);  // same size as the source
+  Tensor<int, 1> tensor4(7);  // larger than the source
+
+  Tensor<int, 1> new_tensor(5);
+  new_tensor.setRandom();
+
+  tensor1 = tensor2 = tensor3 = tensor4 = new_tensor;  // chained assignment, evaluated right to left
+
+  VERIFY_IS_EQUAL(tensor1.dimension(0), new_tensor.dimension(0));
+  VERIFY_IS_EQUAL(tensor2.dimension(0), new_tensor.dimension(0));
+  VERIFY_IS_EQUAL(tensor3.dimension(0), new_tensor.dimension(0));
+  VERIFY_IS_EQUAL(tensor4.dimension(0), new_tensor.dimension(0));
+  for (int i = 0; i < new_tensor.dimension(0); ++i) {
+    VERIFY_IS_EQUAL(tensor1(i), new_tensor(i));
+    VERIFY_IS_EQUAL(tensor2(i), new_tensor(i));
+    VERIFY_IS_EQUAL(tensor3(i), new_tensor(i));
+    VERIFY_IS_EQUAL(tensor4(i), new_tensor(i));
+  }
+}
+
+
+static void test_compound_assign()  // Checks +=, -=, *= and /= against the matching element-wise scalar operation.
+{
+  Tensor<int, 1> start_tensor(10);
+  Tensor<int, 1> offset_tensor(10);
+  start_tensor.setRandom();
+  offset_tensor.setRandom();  // NOTE(review): a random 0 here would make the /= check divide by zero - confirm range
+
+  Tensor<int, 1> tensor = start_tensor;
+  tensor += offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) + offset_tensor(i));
+  }
+
+  tensor = start_tensor;  // reset to the starting values before each operator
+  tensor -= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) - offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor *= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) * offset_tensor(i));
+  }
+
+  tensor = start_tensor;
+  tensor /= offset_tensor;
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_EQUAL(tensor(i), start_tensor(i) / offset_tensor(i));
+  }
+}
+
+static void test_std_initializers_tensor() {  // setValues() with nested initializer lists for ranks 1, 2 and 3.
+#if EIGEN_HAS_VARIADIC_TEMPLATES
+  Tensor<int, 1> a(3);
+  a.setValues({0, 1, 2});
+  VERIFY_IS_EQUAL(a(0), 0);
+  VERIFY_IS_EQUAL(a(1), 1);
+  VERIFY_IS_EQUAL(a(2), 2);
+
+  // It fills the top-left slice.
+  a.setValues({10, 20});
+  VERIFY_IS_EQUAL(a(0), 10);
+  VERIFY_IS_EQUAL(a(1), 20);
+  VERIFY_IS_EQUAL(a(2), 2);  // not covered by the shorter list, so the earlier value must remain
+
+  // Chaining.
+  Tensor<int, 1> a2(3);
+  a2 = a.setValues({100, 200, 300});  // the result of setValues() is assignable to another tensor
+  VERIFY_IS_EQUAL(a(0), 100);
+  VERIFY_IS_EQUAL(a(1), 200);
+  VERIFY_IS_EQUAL(a(2), 300);
+  VERIFY_IS_EQUAL(a2(0), 100);
+  VERIFY_IS_EQUAL(a2(1), 200);
+  VERIFY_IS_EQUAL(a2(2), 300);
+
+  Tensor<int, 2> b(2, 3);
+  b.setValues({{0, 1, 2}, {3, 4, 5}});  // each inner list is one row: b(row, col)
+  VERIFY_IS_EQUAL(b(0, 0), 0);
+  VERIFY_IS_EQUAL(b(0, 1), 1);
+  VERIFY_IS_EQUAL(b(0, 2), 2);
+  VERIFY_IS_EQUAL(b(1, 0), 3);
+  VERIFY_IS_EQUAL(b(1, 1), 4);
+  VERIFY_IS_EQUAL(b(1, 2), 5);
+
+  // It fills the top-left slice.
+  b.setValues({{10, 20}, {30}});  // ragged lists: only the listed coefficients are overwritten
+  VERIFY_IS_EQUAL(b(0, 0), 10);
+  VERIFY_IS_EQUAL(b(0, 1), 20);
+  VERIFY_IS_EQUAL(b(0, 2), 2);
+  VERIFY_IS_EQUAL(b(1, 0), 30);
+  VERIFY_IS_EQUAL(b(1, 1), 4);
+  VERIFY_IS_EQUAL(b(1, 2), 5);
+
+  Eigen::Tensor<int, 3> c(3, 2, 4);
+  c.setValues({{{0, 1, 2, 3}, {4, 5, 6, 7}},
+               {{10, 11, 12, 13}, {14, 15, 16, 17}},
+               {{20, 21, 22, 23}, {24, 25, 26, 27}}});
+  VERIFY_IS_EQUAL(c(0, 0, 0), 0);
+  VERIFY_IS_EQUAL(c(0, 0, 1), 1);
+  VERIFY_IS_EQUAL(c(0, 0, 2), 2);
+  VERIFY_IS_EQUAL(c(0, 0, 3), 3);
+  VERIFY_IS_EQUAL(c(0, 1, 0), 4);
+  VERIFY_IS_EQUAL(c(0, 1, 1), 5);
+  VERIFY_IS_EQUAL(c(0, 1, 2), 6);
+  VERIFY_IS_EQUAL(c(0, 1, 3), 7);
+  VERIFY_IS_EQUAL(c(1, 0, 0), 10);
+  VERIFY_IS_EQUAL(c(1, 0, 1), 11);
+  VERIFY_IS_EQUAL(c(1, 0, 2), 12);
+  VERIFY_IS_EQUAL(c(1, 0, 3), 13);
+  VERIFY_IS_EQUAL(c(1, 1, 0), 14);
+  VERIFY_IS_EQUAL(c(1, 1, 1), 15);
+  VERIFY_IS_EQUAL(c(1, 1, 2), 16);
+  VERIFY_IS_EQUAL(c(1, 1, 3), 17);
+  VERIFY_IS_EQUAL(c(2, 0, 0), 20);
+  VERIFY_IS_EQUAL(c(2, 0, 1), 21);
+  VERIFY_IS_EQUAL(c(2, 0, 2), 22);
+  VERIFY_IS_EQUAL(c(2, 0, 3), 23);
+  VERIFY_IS_EQUAL(c(2, 1, 0), 24);
+  VERIFY_IS_EQUAL(c(2, 1, 1), 25);
+  VERIFY_IS_EQUAL(c(2, 1, 2), 26);
+  VERIFY_IS_EQUAL(c(2, 1, 3), 27);
+#endif  // EIGEN_HAS_VARIADIC_TEMPLATES
+}
+
+void test_cxx11_tensor_assign()  // Entry point: runs every tensor-assignment subtest defined in this file.
+{
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_same_type());
+  CALL_SUBTEST(test_auto_resize());
+  CALL_SUBTEST(test_compound_assign());
+  CALL_SUBTEST(test_std_initializers_tensor());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
new file mode 100644
index 000000000..7201bfe37
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp
@@ -0,0 +1,74 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_broadcast_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+static void test_broadcast_sycl(const Eigen::SyclDevice &sycl_device){  // Runs broadcast() on sycl_device, checks it on the host.
+
+  // BROADCAST test:
+  array<int, 4> in_range   = {{2, 3, 5, 7}};
+  array<int, 4> broadcasts = {{2, 3, 1, 4}};
+  array<int, 4> out_range;  // = in_range * broadcasts
+  for (size_t i = 0; i < out_range.size(); ++i)
+    out_range[i] = in_range[i] * broadcasts[i];
+
+  Tensor<float, 4>  input(in_range);
+  Tensor<float, 4> out(out_range);
+
+  for (size_t i = 0; i < in_range.size(); ++i)
+    VERIFY_IS_EQUAL(out.dimension(i), out_range[i]);
+
+
+  for (int i = 0; i < input.size(); ++i)
+    input(i) = static_cast<float>(i);  // linear index as value, so every input coefficient is distinct
+
+  float * gpu_in_data  = static_cast<float*>(sycl_device.allocate(input.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_out_data  = static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+
+  TensorMap<Tensor<float, 4>>  gpu_in(gpu_in_data, in_range);
+  TensorMap<Tensor<float, 4>> gpu_out(gpu_out_data, out_range);
+  sycl_device.memcpyHostToDevice(gpu_in_data, input.data(),(input.dimensions().TotalSize())*sizeof(float));
+  gpu_out.device(sycl_device) = gpu_in.broadcast(broadcasts);  // the expression under test
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+
+  for (int i = 0; i < 4; ++i) {  // loop bounds are the out_range extents 4, 9, 5, 28
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 28; ++l) {
+          VERIFY_IS_APPROX(input(i%2,j%3,k%5,l%7), out(i,j,k,l));  // output index modulo input extent = source
+        }
+      }
+    }
+  }
+  printf("Broadcast Test Passed\n");
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+void test_cxx11_tensor_broadcast_sycl() {  // Entry point: builds a SyclDevice from a gpu_selector and runs the subtest.
+  cl::sycl::gpu_selector s;
+  Eigen::SyclDevice sycl_device(s);
+  CALL_SUBTEST(test_broadcast_sycl(sycl_device));
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_broadcasting.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_broadcasting.cpp
new file mode 100644
index 000000000..5c0ea5889
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_broadcasting.cpp
@@ -0,0 +1,194 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_simple_broadcasting()  // Rank-4 broadcast: all-ones factors first, then factors (2,3,1,4).
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> broadcasts;
+  broadcasts[0] = 1;
+  broadcasts[1] = 1;
+  broadcasts[2] = 1;
+  broadcasts[3] = 1;
+
+  Tensor<float, 4, DataLayout> no_broadcast;
+  no_broadcast = tensor.broadcast(broadcasts);  // factors of 1: result must equal the input
+
+  VERIFY_IS_EQUAL(no_broadcast.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_broadcast.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_broadcast.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_broadcast(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  broadcasts[0] = 2;
+  broadcasts[1] = 3;
+  broadcasts[2] = 1;
+  broadcasts[3] = 4;
+  Tensor<float, 4, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 4);  // each extent is input extent * factor: 2*2, 3*3, 5*1, 7*4
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 5);
+  VERIFY_IS_EQUAL(broadcast.dimension(3), 28);
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 28; ++l) {
+          VERIFY_IS_EQUAL(tensor(i%2,j%3,k%5,l%7), broadcast(i,j,k,l));  // output index modulo input extent = source
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_vectorized_broadcasting()  // Rank-3 broadcast by (2,3,4) with first extent 8, then again with 11.
+{
+  Tensor<float, 3, DataLayout> tensor(8,3,5);
+  tensor.setRandom();
+  array<ptrdiff_t, 3> broadcasts;
+  broadcasts[0] = 2;
+  broadcasts[1] = 3;
+  broadcasts[2] = 4;
+
+  Tensor<float, 3, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 16);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 16; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+
+  tensor.resize(11,3,5);  // NOTE(review): presumably 11 exercises a non-packet-multiple extent - confirm
+  tensor.setRandom();
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 22);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 22; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_static_broadcasting()  // Same checks as the (2,3,4) runtime case, with compile-time factors if available.
+{
+  Tensor<float, 3, DataLayout> tensor(8,3,5);
+  tensor.setRandom();
+
+#if EIGEN_HAS_CONSTEXPR
+  Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> broadcasts;
+#else
+  Eigen::array<int, 3> broadcasts;  // runtime array carrying the same factors 2, 3, 4
+  broadcasts[0] = 2;
+  broadcasts[1] = 3;
+  broadcasts[2] = 4;
+#endif
+
+  Tensor<float, 3, DataLayout> broadcast;
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 16);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 16; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%8,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+
+  tensor.resize(11,3,5);  // repeat with a first extent of 11 instead of 8
+  tensor.setRandom();
+  broadcast = tensor.broadcast(broadcasts);
+
+  VERIFY_IS_EQUAL(broadcast.dimension(0), 22);
+  VERIFY_IS_EQUAL(broadcast.dimension(1), 9);
+  VERIFY_IS_EQUAL(broadcast.dimension(2), 20);
+
+  for (int i = 0; i < 22; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 20; ++k) {
+        VERIFY_IS_EQUAL(tensor(i%11,j%3,k%5), broadcast(i,j,k));
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_fixed_size_broadcasting()  // Body is compiled out (#if 0), so this subtest currently checks nothing.
+{
+  // Need to add a [] operator to the Size class for this to work
+#if 0
+  Tensor<float, 1, DataLayout> t1(10);
+  t1.setRandom();
+  TensorFixedSize<float, Sizes<1>, DataLayout> t2;
+  t2 = t2.constant(20.0f);
+
+  Tensor<float, 1, DataLayout> t3 = t1 + t2.broadcast(Eigen::array<int, 1>{{10}});
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_APPROX(t3(i), t1(i) + t2(0));
+  }
+
+  TensorMap<TensorFixedSize<float, Sizes<1>, DataLayout> > t4(t2.data(), {{1}});
+  Tensor<float, 1, DataLayout> t5 = t1 + t4.broadcast(Eigen::array<int, 1>{{10}});
+  for (int i = 0; i < 10; ++i) {
+    VERIFY_IS_APPROX(t5(i), t1(i) + t2(0));
+  }
+#endif
+}
+
+
+void test_cxx11_tensor_broadcasting()  // Entry point: runs each broadcasting subtest for ColMajor and RowMajor.
+{
+  CALL_SUBTEST(test_simple_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_simple_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_vectorized_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_vectorized_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_static_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_static_broadcasting<RowMajor>());
+  CALL_SUBTEST(test_fixed_size_broadcasting<ColMajor>());
+  CALL_SUBTEST(test_fixed_size_broadcasting<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
new file mode 100644
index 000000000..816e03220
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_conversion() {  // Casts float -> Eigen::half -> float through gpu_device and compares with the input.
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+
+  Tensor<float, 1> floats(num_elem);
+  floats.setRandom();
+
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+      d_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+      d_conv, num_elem);
+
+  gpu_device.memcpyHostToDevice(d_float, floats.data(), num_elem*sizeof(float));
+
+  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();  // float -> half
+  gpu_conv.device(gpu_device) = gpu_half.cast<float>();  // half -> float
+
+  Tensor<float, 1> initial(num_elem);
+  Tensor<float, 1> final(num_elem);
+  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+  gpu_device.synchronize();  // called before the host reads initial/final below
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(initial(i), final(i));  // approximate comparison rather than exact equality
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_half);
+  gpu_device.deallocate(d_conv);
+}
+
+
+void test_fallback_conversion() {  // Same float -> half -> float round trip, evaluated without a device object.
+  int num_elem = 101;
+  Tensor<float, 1> floats(num_elem);
+  floats.setRandom();
+
+  Eigen::Tensor<Eigen::half, 1> halfs = floats.cast<Eigen::half>();
+  Eigen::Tensor<float, 1> conv = halfs.cast<float>();
+
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(floats(i), conv(i));  // approximate comparison rather than exact equality
+  }
+}
+
+
+void test_cxx11_tensor_cast_float16_cuda()  // Entry point: runs the device and the host half-conversion subtests.
+{
+  CALL_SUBTEST(test_cuda_conversion());
+  CALL_SUBTEST(test_fallback_conversion());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_casts.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_casts.cpp
new file mode 100644
index 000000000..3c6d0d2ff
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_casts.cpp
@@ -0,0 +1,115 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+static void test_simple_cast()  // cast<char>() and cast<std::complex<float>>() must match static_cast per coefficient.
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;
+  Tensor<char, 2> chartensor(20,30);
+  chartensor.setRandom();  // pre-filled with unrelated values, then overwritten by the cast below
+  Tensor<std::complex<float>, 2> cplextensor(20,30);
+  cplextensor.setRandom();
+
+  chartensor = ftensor.cast<char>();
+  cplextensor = ftensor.cast<std::complex<float> >();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(chartensor(i,j), static_cast<char>(ftensor(i,j)));
+      VERIFY_IS_EQUAL(cplextensor(i,j), static_cast<std::complex<float> >(ftensor(i,j)));
+    }
+  }
+}
+
+
+static void test_vectorized_cast()  // Casts an int tensor to float and to double and cross-checks the results.
+{
+  Tensor<int, 2> itensor(20,30);
+  itensor = itensor.random() / 1000;  // NOTE(review): presumably keeps values exactly representable as float - confirm
+  Tensor<float, 2> ftensor(20,30);
+  ftensor.setRandom();  // pre-filled with unrelated values, then overwritten by the cast below
+  Tensor<double, 2> dtensor(20,30);
+  dtensor.setRandom();
+
+  ftensor = itensor.cast<float>();
+  dtensor = itensor.cast<double>();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(itensor(i,j), static_cast<int>(ftensor(i,j)));
+      VERIFY_IS_EQUAL(dtensor(i,j), static_cast<double>(ftensor(i,j)));  // exact match: both derive from the same int
+    }
+  }
+}
+
+
+static void test_float_to_int_cast()  // cast<int>() from float and from double must match static_cast<int>.
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 1000.0f;
+  Tensor<double, 2> dtensor(20,30);
+  dtensor = dtensor.random() * 1000.0;
+
+  Tensor<int, 2> i1tensor = ftensor.cast<int>();  // from float
+  Tensor<int, 2> i2tensor = dtensor.cast<int>();  // from double
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(i1tensor(i,j), static_cast<int>(ftensor(i,j)));
+      VERIFY_IS_EQUAL(i2tensor(i,j), static_cast<int>(dtensor(i,j)));
+    }
+  }
+}
+
+
+static void test_big_to_small_type_cast()  // double -> float cast; compared approximately against the source.
+{
+  Tensor<double, 2> dtensor(20, 30);
+  dtensor.setRandom();
+  Tensor<float, 2> ftensor(20, 30);
+  ftensor = dtensor.cast<float>();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_APPROX(dtensor(i,j), static_cast<double>(ftensor(i,j)));  // approximate, not exact, comparison
+    }
+  }
+}
+
+
+static void test_small_to_big_type_cast()  // float -> double cast; compared approximately against the source.
+{
+  Tensor<float, 2> ftensor(20, 30);
+  ftensor.setRandom();
+  Tensor<double, 2> dtensor(20, 30);
+  dtensor = ftensor.cast<double>();
+
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_APPROX(dtensor(i,j), static_cast<double>(ftensor(i,j)));
+    }
+  }
+}
+
+
+void test_cxx11_tensor_casts()  // Entry point: runs every tensor cast subtest defined in this file.
+{
+   CALL_SUBTEST(test_simple_cast());
+   CALL_SUBTEST(test_vectorized_cast());
+   CALL_SUBTEST(test_float_to_int_cast());
+   CALL_SUBTEST(test_big_to_small_type_cast());
+   CALL_SUBTEST(test_small_to_big_type_cast());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_chipping.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_chipping.cpp
new file mode 100644
index 000000000..1832dec8b
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_chipping.cpp
@@ -0,0 +1,425 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_chip()  // chip<Dim>(offset) on a 2x3x5x7x11 tensor: checks the extents and every coefficient.
+{
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Tensor<float, 4, DataLayout> chip1;
+  chip1 = tensor.template chip<0>(1);  // fix index 1 along dimension 0
+
+  VERIFY_IS_EQUAL(chip1.dimension(0), 3);
+  VERIFY_IS_EQUAL(chip1.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip1.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip1.dimension(3), 11);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip2 = tensor.template chip<1>(1);  // fix index 1 along dimension 1
+  VERIFY_IS_EQUAL(chip2.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip2.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip2.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip2.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {  // j spans chip2.dimension(1) == 5, so the whole chip is verified
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip3 = tensor.template chip<2>(2);  // fix index 2 along dimension 2
+  VERIFY_IS_EQUAL(chip3.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip3.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip3.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip3.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip4(tensor.template chip<3>(5));  // fix index 5 along dimension 3
+  VERIFY_IS_EQUAL(chip4.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip4.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip4.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip4.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 11; ++l) {  // l spans chip4.dimension(3) == 11, so the whole chip is verified
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip5(tensor.template chip<4>(7));  // fix index 7 along dimension 4
+  VERIFY_IS_EQUAL(chip5.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip5.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip5.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip5.dimension(3), 7);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7));
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_dynamic_chip()  // chip(offset, dim) with a runtime dimension: checks the extents and every coefficient.
+{
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Tensor<float, 4, DataLayout> chip1;
+  chip1 = tensor.chip(1, 0);  // arguments are (offset, dimension): fix index 1 along dimension 0
+  VERIFY_IS_EQUAL(chip1.dimension(0), 3);
+  VERIFY_IS_EQUAL(chip1.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip1.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip1.dimension(3), 11);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip1(i,j,k,l), tensor(1,i,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip2 = tensor.chip(1, 1);
+  VERIFY_IS_EQUAL(chip2.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip2.dimension(1), 5);
+  VERIFY_IS_EQUAL(chip2.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip2.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {  // j spans chip2.dimension(1) == 5, so the whole chip is verified
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip2(i,j,k,l), tensor(i,1,j,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip3 = tensor.chip(2, 2);
+  VERIFY_IS_EQUAL(chip3.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip3.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip3.dimension(2), 7);
+  VERIFY_IS_EQUAL(chip3.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(chip3(i,j,k,l), tensor(i,j,2,k,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip4(tensor.chip(5, 3));
+  VERIFY_IS_EQUAL(chip4.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip4.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip4.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip4.dimension(3), 11);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 11; ++l) {  // l spans chip4.dimension(3) == 11, so the whole chip is verified
+          VERIFY_IS_EQUAL(chip4(i,j,k,l), tensor(i,j,k,5,l));
+        }
+      }
+    }
+  }
+
+  Tensor<float, 4, DataLayout> chip5(tensor.chip(7, 4));
+  VERIFY_IS_EQUAL(chip5.dimension(0), 2);
+  VERIFY_IS_EQUAL(chip5.dimension(1), 3);
+  VERIFY_IS_EQUAL(chip5.dimension(2), 5);
+  VERIFY_IS_EQUAL(chip5.dimension(3), 7);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(chip5(i,j,k,l), tensor(i,j,k,l,7));
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_chip_in_expr() {  // Uses a chip, and a chip of a chip, as an operand of an addition.
+  Tensor<float, 5, DataLayout> input1(2,3,5,7,11);
+  input1.setRandom();
+  Tensor<float, 4, DataLayout> input2(3,5,7,11);
+  input2.setRandom();
+
+  Tensor<float, 4, DataLayout> result = input1.template chip<0>(0) + input2;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          float expected = input1(0,i,j,k,l) + input2(i,j,k,l);
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected);
+        }
+      }
+    }
+  }
+
+  Tensor<float, 3, DataLayout> input3(3,7,11);
+  input3.setRandom();
+  Tensor<float, 3, DataLayout> result2 = input1.template chip<0>(0).template chip<1>(2) + input3;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        float expected = input1(0,i,2,j,k) + input3(i,j,k);  // second chip's dim 1 is the original dim 2
+        VERIFY_IS_EQUAL(result2(i,j,k), expected);
+      }
+    }
+  }
+}
+
+// Checks that a chip can be assigned to: only the chipped slice may change.
+template<int DataLayout>
+static void test_chip_as_lvalue() {
+  Tensor<float, 5, DataLayout> original(2,3,5,7,11);
+  original.setRandom();
+  // Overwrite slice 1 of dimension 0.
+  Tensor<float, 4, DataLayout> slice0(3,5,7,11);
+  slice0.setRandom();
+  Tensor<float, 5, DataLayout> target = original;
+  target.template chip<0>(1) = slice0;
+  for (int a = 0; a < 2; ++a) {
+    for (int b = 0; b < 3; ++b) {
+      for (int c = 0; c < 5; ++c) {
+        for (int d = 0; d < 7; ++d) {
+          for (int e = 0; e < 11; ++e) {
+            if (a == 1) {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), slice0(b,c,d,e));
+            } else {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), original(a,b,c,d,e));
+            }
+          }
+        }
+      }
+    }
+  }
+  // Overwrite slice 1 of dimension 1.
+  Tensor<float, 4, DataLayout> slice1(2,5,7,11);
+  slice1.setRandom();
+  target = original;
+  target.template chip<1>(1) = slice1;
+  for (int a = 0; a < 2; ++a) {
+    for (int b = 0; b < 3; ++b) {
+      for (int c = 0; c < 5; ++c) {
+        for (int d = 0; d < 7; ++d) {
+          for (int e = 0; e < 11; ++e) {
+            if (b == 1) {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), slice1(a,c,d,e));
+            } else {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), original(a,b,c,d,e));
+            }
+          }
+        }
+      }
+    }
+  }
+  // Overwrite slice 3 of dimension 2.
+  Tensor<float, 4, DataLayout> slice2(2,3,7,11);
+  slice2.setRandom();
+  target = original;
+  target.template chip<2>(3) = slice2;
+  for (int a = 0; a < 2; ++a) {
+    for (int b = 0; b < 3; ++b) {
+      for (int c = 0; c < 5; ++c) {
+        for (int d = 0; d < 7; ++d) {
+          for (int e = 0; e < 11; ++e) {
+            if (c == 3) {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), slice2(a,b,d,e));
+            } else {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), original(a,b,c,d,e));
+            }
+          }
+        }
+      }
+    }
+  }
+  // Overwrite slice 4 of dimension 3.
+  Tensor<float, 4, DataLayout> slice3(2,3,5,11);
+  slice3.setRandom();
+  target = original;
+  target.template chip<3>(4) = slice3;
+  for (int a = 0; a < 2; ++a) {
+    for (int b = 0; b < 3; ++b) {
+      for (int c = 0; c < 5; ++c) {
+        for (int d = 0; d < 7; ++d) {
+          for (int e = 0; e < 11; ++e) {
+            if (d == 4) {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), slice3(a,b,c,e));
+            } else {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), original(a,b,c,d,e));
+            }
+          }
+        }
+      }
+    }
+  }
+  // Overwrite slice 5 of dimension 4.
+  Tensor<float, 4, DataLayout> slice4(2,3,5,7);
+  slice4.setRandom();
+  target = original;
+  target.template chip<4>(5) = slice4;
+  for (int a = 0; a < 2; ++a) {
+    for (int b = 0; b < 3; ++b) {
+      for (int c = 0; c < 5; ++c) {
+        for (int d = 0; d < 7; ++d) {
+          for (int e = 0; e < 11; ++e) {
+            if (e == 5) {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), slice4(a,b,c,d));
+            } else {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), original(a,b,c,d,e));
+            }
+          }
+        }
+      }
+    }
+  }
+  // Chip-to-chip assignment using the runtime-dimension chip(offset, dim) overload.
+  Tensor<float, 5, DataLayout> donor(2,3,5,7,11);
+  donor.setRandom();
+  target = original;
+  target.chip(0, 0) = donor.chip(0, 0);
+  for (int a = 0; a < 2; ++a) {
+    for (int b = 0; b < 3; ++b) {
+      for (int c = 0; c < 5; ++c) {
+        for (int d = 0; d < 7; ++d) {
+          for (int e = 0; e < 11; ++e) {
+            if (a == 0) {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), donor(a,b,c,d,e));
+            } else {
+              VERIFY_IS_EQUAL(target(a,b,c,d,e), original(a,b,c,d,e));
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Checks what the chip evaluator's data() returns for each chipped dimension (ColMajor).
+static void test_chip_raw_data_col_major() {
+  Tensor<float, 5, ColMajor> src(2,3,5,7,11);
+  src.setRandom();
+  // Chip of dimension 4: data() is read directly at the ColMajor offset of the other four indices.
+  typedef TensorEvaluator<decltype(src.chip<4>(3)), DefaultDevice> Dim4Eval;
+  auto dim4_chip = Dim4Eval(src.chip<4>(3), DefaultDevice());
+  for (int a = 0; a < 2; ++a) {
+    for (int b = 0; b < 3; ++b) {
+      for (int c = 0; c < 5; ++c) {
+        for (int d = 0; d < 7; ++d) {
+          const int offset = a + 2 * b + 6 * c + 30 * d;
+          VERIFY_IS_EQUAL(dim4_chip.data()[offset], src(a,b,c,d,3));
+        }
+      }
+    }
+  }
+  // Chips of dimensions 0 to 3 are expected to report a null data() pointer.
+  typedef TensorEvaluator<decltype(src.chip<0>(0)), DefaultDevice> Dim0Eval;
+  auto dim0_chip = Dim0Eval(src.chip<0>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(dim0_chip.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(src.chip<1>(0)), DefaultDevice> Dim1Eval;
+  auto dim1_chip = Dim1Eval(src.chip<1>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(dim1_chip.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(src.chip<2>(0)), DefaultDevice> Dim2Eval;
+  auto dim2_chip = Dim2Eval(src.chip<2>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(dim2_chip.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(src.chip<3>(0)), DefaultDevice> Dim3Eval;
+  auto dim3_chip = Dim3Eval(src.chip<3>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(dim3_chip.data(), static_cast<float*>(0));
+}
+
+// Checks what the chip evaluator's data() returns for each chipped dimension (RowMajor).
+static void test_chip_raw_data_row_major() {
+  Tensor<float, 5, RowMajor> src(11,7,5,3,2);
+  src.setRandom();
+  // Chip of dimension 0: data() is read directly at the RowMajor offset of the other four indices.
+  typedef TensorEvaluator<decltype(src.chip<0>(3)), DefaultDevice> Dim0Eval;
+  auto dim0_chip = Dim0Eval(src.chip<0>(3), DefaultDevice());
+  for (int a = 0; a < 7; ++a) {
+    for (int b = 0; b < 5; ++b) {
+      for (int c = 0; c < 3; ++c) {
+        for (int d = 0; d < 2; ++d) {
+          const int offset = d + 2 * c + 6 * b + 30 * a;
+          VERIFY_IS_EQUAL(dim0_chip.data()[offset], src(3,a,b,c,d));
+        }
+      }
+    }
+  }
+  // Chips of dimensions 1 to 4 are expected to report a null data() pointer.
+  typedef TensorEvaluator<decltype(src.chip<1>(0)), DefaultDevice> Dim1Eval;
+  auto dim1_chip = Dim1Eval(src.chip<1>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(dim1_chip.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(src.chip<2>(0)), DefaultDevice> Dim2Eval;
+  auto dim2_chip = Dim2Eval(src.chip<2>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(dim2_chip.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(src.chip<3>(0)), DefaultDevice> Dim3Eval;
+  auto dim3_chip = Dim3Eval(src.chip<3>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(dim3_chip.data(), static_cast<float*>(0));
+
+  typedef TensorEvaluator<decltype(src.chip<4>(0)), DefaultDevice> Dim4Eval;
+  auto dim4_chip = Dim4Eval(src.chip<4>(0), DefaultDevice());
+  VERIFY_IS_EQUAL(dim4_chip.data(), static_cast<float*>(0));
+}
+
+// Test entry point: runs the chipping subtests, the templated ones for both layouts.
+void test_cxx11_tensor_chipping() {
+  CALL_SUBTEST(test_simple_chip<ColMajor>());
+  CALL_SUBTEST(test_simple_chip<RowMajor>());
+  CALL_SUBTEST(test_dynamic_chip<ColMajor>());
+  CALL_SUBTEST(test_dynamic_chip<RowMajor>());
+  CALL_SUBTEST(test_chip_in_expr<ColMajor>());
+  CALL_SUBTEST(test_chip_in_expr<RowMajor>());
+  CALL_SUBTEST(test_chip_as_lvalue<ColMajor>());
+  CALL_SUBTEST(test_chip_as_lvalue<RowMajor>());
+  CALL_SUBTEST(test_chip_raw_data_col_major());
+  CALL_SUBTEST(test_chip_raw_data_row_major());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_comparisons.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_comparisons.cpp
new file mode 100644
index 000000000..b1ff8aecb
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_comparisons.cpp
@@ -0,0 +1,84 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+// Checks the coefficient-wise <, <=, > and >= tensor operators against scalar comparisons.
+static void test_orderings() {
+  Tensor<float, 3> lhs(2,3,7);
+  Tensor<float, 3> rhs(2,3,7);
+  Tensor<bool, 3> is_lt(2,3,7);
+  Tensor<bool, 3> is_le(2,3,7);
+  Tensor<bool, 3> is_gt(2,3,7);
+  Tensor<bool, 3> is_ge(2,3,7);
+
+  lhs.setRandom();
+  rhs.setRandom();
+
+  is_lt = lhs < rhs;
+  is_le = lhs <= rhs;
+  is_gt = lhs > rhs;
+  is_ge = lhs >= rhs;
+  // Each boolean coefficient must equal the comparison of the matching scalars.
+  for (int x = 0; x < 2; ++x) {
+    for (int y = 0; y < 3; ++y) {
+      for (int z = 0; z < 7; ++z) {
+        VERIFY_IS_EQUAL(is_lt(x,y,z), lhs(x,y,z) < rhs(x,y,z));
+        VERIFY_IS_EQUAL(is_le(x,y,z), lhs(x,y,z) <= rhs(x,y,z));
+        VERIFY_IS_EQUAL(is_gt(x,y,z), lhs(x,y,z) > rhs(x,y,z));
+        VERIFY_IS_EQUAL(is_ge(x,y,z), lhs(x,y,z) >= rhs(x,y,z));
+      }
+    }
+  }
+}
+
+
+// Checks the coefficient-wise == and != tensor operators against scalar comparisons.
+static void test_equality() {
+  Tensor<float, 3> lhs(2,3,7);
+  Tensor<float, 3> rhs(2,3,7);
+  // Start from random data, then copy randomly chosen coefficients so some pairs are equal.
+  lhs.setRandom();
+  rhs.setRandom();
+  for (int x = 0; x < 2; ++x) {
+    for (int y = 0; y < 3; ++y) {
+      for (int z = 0; z < 7; ++z) {
+        if (internal::random<bool>()) {
+          rhs(x,y,z) = lhs(x,y,z);
+        }
+      }
+    }
+  }
+
+  Tensor<bool, 3> is_eq(2,3,7);
+  Tensor<bool, 3> is_ne(2,3,7);
+  is_eq = (lhs == rhs);
+  is_ne = (lhs != rhs);
+
+  for (int x = 0; x < 2; ++x) {
+    for (int y = 0; y < 3; ++y) {
+      for (int z = 0; z < 7; ++z) {
+        VERIFY_IS_EQUAL(is_eq(x,y,z), lhs(x,y,z) == rhs(x,y,z));
+        VERIFY_IS_EQUAL(is_ne(x,y,z), lhs(x,y,z) != rhs(x,y,z));
+      }
+    }
+  }
+}
+
+
+// Test entry point: runs the ordering and the equality comparison subtests.
+void test_cxx11_tensor_comparisons() {
+  CALL_SUBTEST(test_orderings());
+  CALL_SUBTEST(test_equality());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_complex_cuda.cu b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_complex_cuda.cu
new file mode 100644
index 000000000..916f12a84
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_complex_cuda.cu
@@ -0,0 +1,150 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_complex
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Fills a complex tensor with a constant and takes abs() of another on the GPU,
+// then compares the results copied back to the host with the expected values.
+void test_cuda_nullary() {
+  Tensor<std::complex<float>, 1, 0, int> in1(2);
+  Tensor<std::complex<float>, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t float_bytes = in1.size() * sizeof(float);
+  std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>);
+
+  std::complex<float>* d_in1;
+  std::complex<float>* d_in2;
+  float* d_out2;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out2), float_bytes);
+  cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  // Wrap the raw device buffers so they can be used in tensor expressions.
+  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2(d_in2, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2(d_out2, 2);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f));
+  gpu_out2.device(gpu_device) = gpu_in2.abs();
+
+  Tensor<std::complex<float>, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+
+  // Use VERIFY rather than assert(): an assert() argument is not evaluated when
+  // NDEBUG is defined, which would silently drop the copies and the synchronization.
+  VERIFY(cudaMemcpyAsync(new1.data(), d_in1, complex_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaMemcpyAsync(new2.data(), d_out2, float_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f));
+    VERIFY_IS_APPROX(new2(i), std::abs(in2(i)));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out2);
+}
+
+
+// Compares a full sum reduction of a random complex matrix on the GPU with the host result.
+static void test_cuda_sum_reductions() {
+  Eigen::CudaStreamDevice cuda_stream;
+  Eigen::GpuDevice device(&cuda_stream);
+
+  const int rows = internal::random<int>(1024, 5*1024);
+  const int cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<std::complex<float>, 2> host_in(rows, cols);
+  host_in.setRandom();
+  // Reference value computed on the host.
+  Tensor<std::complex<float>, 0> host_sum;
+  host_sum = host_in.sum();
+
+  std::size_t input_bytes = host_in.size() * sizeof(std::complex<float>);
+  std::size_t output_bytes = host_sum.size() * sizeof(std::complex<float>);
+  std::complex<float>* dev_in = static_cast<std::complex<float>*>(device.allocate(input_bytes));
+  std::complex<float>* dev_out = static_cast<std::complex<float>*>(device.allocate(output_bytes));
+  device.memcpyHostToDevice(dev_in, host_in.data(), input_bytes);
+
+  TensorMap<Tensor<std::complex<float>, 2> > dev_in_map(dev_in, rows, cols);
+  TensorMap<Tensor<std::complex<float>, 0> > dev_out_map(dev_out);
+
+  dev_out_map.device(device) = dev_in_map.sum();
+
+  Tensor<std::complex<float>, 0> device_sum;
+  device.memcpyDeviceToHost(device_sum.data(), dev_out, output_bytes);
+  device.synchronize();
+
+  // The host and the device reductions must agree.
+  VERIFY_IS_APPROX(host_sum(), device_sum());
+
+  device.deallocate(dev_in);
+  device.deallocate(dev_out);
+}
+
+
+// Compares a full product reduction of a random complex matrix on the GPU with the host result.
+static void test_cuda_product_reductions() {
+  Eigen::CudaStreamDevice cuda_stream;
+  Eigen::GpuDevice device(&cuda_stream);
+
+  const int rows = internal::random<int>(1024, 5*1024);
+  const int cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<std::complex<float>, 2> host_in(rows, cols);
+  host_in.setRandom();
+  // Reference value computed on the host.
+  Tensor<std::complex<float>, 0> host_prod;
+  host_prod = host_in.prod();
+
+  std::size_t input_bytes = host_in.size() * sizeof(std::complex<float>);
+  std::size_t output_bytes = host_prod.size() * sizeof(std::complex<float>);
+  std::complex<float>* dev_in = static_cast<std::complex<float>*>(device.allocate(input_bytes));
+  std::complex<float>* dev_out = static_cast<std::complex<float>*>(device.allocate(output_bytes));
+  device.memcpyHostToDevice(dev_in, host_in.data(), input_bytes);
+
+  TensorMap<Tensor<std::complex<float>, 2> > dev_in_map(dev_in, rows, cols);
+  TensorMap<Tensor<std::complex<float>, 0> > dev_out_map(dev_out);
+
+  dev_out_map.device(device) = dev_in_map.prod();
+
+  Tensor<std::complex<float>, 0> device_prod;
+  device.memcpyDeviceToHost(device_prod.data(), dev_out, output_bytes);
+  device.synchronize();
+
+  // The host and the device reductions must agree.
+  VERIFY_IS_APPROX(host_prod(), device_prod());
+
+  device.deallocate(dev_in);
+  device.deallocate(dev_out);
+}
+
+
+// Test entry point: runs the complex-valued nullary and reduction subtests.
+void test_cxx11_tensor_complex() {
+  CALL_SUBTEST(test_cuda_nullary());
+  CALL_SUBTEST(test_cuda_sum_reductions());
+  CALL_SUBTEST(test_cuda_product_reductions());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
new file mode 100644
index 000000000..aac780905
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu
@@ -0,0 +1,94 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_complex_cwise_ops
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Runs +, -, * and / on constant complex<T> tensors on the GPU and compares each
+// result, copied back to the host, with the same operation on the scalar operands.
+template<typename T>
+void test_cuda_complex_cwise_ops() {
+  const int kNumItems = 2;
+  std::size_t complex_bytes = kNumItems * sizeof(std::complex<T>);
+
+  std::complex<T>* d_in1;
+  std::complex<T>* d_in2;
+  std::complex<T>* d_out;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out), complex_bytes);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1(d_in1, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in2(d_in2, kNumItems);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_out(d_out, kNumItems);
+
+  const std::complex<T> a(3.14f, 2.7f);
+  const std::complex<T> b(-10.6f, 1.4f);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(a);
+  gpu_in2.device(gpu_device) = gpu_in2.constant(b);
+
+  enum CwiseOp {
+    Add = 0,
+    Sub,
+    Mul,
+    Div
+  };
+
+  Tensor<std::complex<T>, 1, 0, int> actual(kNumItems);
+  for (int op = Add; op <= Div; op++) {
+    std::complex<T> expected;
+    switch (static_cast<CwiseOp>(op)) {
+      case Add:
+        gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+        expected = a + b;
+        break;
+      case Sub:
+        gpu_out.device(gpu_device) = gpu_in1 - gpu_in2;
+        expected = a - b;
+        break;
+      case Mul:
+        gpu_out.device(gpu_device) = gpu_in1 * gpu_in2;
+        expected = a * b;
+        break;
+      case Div:
+        gpu_out.device(gpu_device) = gpu_in1 / gpu_in2;
+        expected = a / b;
+        break;
+    }
+    // VERIFY, not assert(): an assert() argument is not evaluated when NDEBUG is defined.
+    VERIFY(cudaMemcpyAsync(actual.data(), d_out, complex_bytes, cudaMemcpyDeviceToHost,
+                           gpu_device.stream()) == cudaSuccess);
+    VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+    for (int i = 0; i < kNumItems; ++i) {
+      VERIFY_IS_APPROX(actual(i), expected);
+    }
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out);
+}
+
+
+// Test entry point: runs the coefficient-wise complex subtest for float and double.
+void test_cxx11_tensor_complex_cwise_ops() {
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<float>());
+  CALL_SUBTEST(test_cuda_complex_cwise_ops<double>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_concatenation.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_concatenation.cpp
new file mode 100644
index 000000000..03ef12e63
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_concatenation.cpp
@@ -0,0 +1,137 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Checks that concatenating along an invalid or mismatching axis raises an assertion.
+template<int DataLayout>
+static void test_dimension_failures() {
+  Tensor<int, 3, DataLayout> lhs(2, 3, 1);
+  Tensor<int, 3, DataLayout> rhs(3, 3, 1);
+  lhs.setRandom();
+  rhs.setRandom();
+
+  // Valid: the operands only differ along the concatenation axis.
+  Tensor<int, 3, DataLayout> joined = lhs.concatenate(rhs, 0);
+
+  // Invalid: the sizes along dimension 0 differ.
+  VERIFY_RAISES_ASSERT(joined = lhs.concatenate(rhs, 1));
+  VERIFY_RAISES_ASSERT(joined = lhs.concatenate(rhs, 2));
+
+  // Invalid: Axis >= NumDims or < 0.
+  VERIFY_RAISES_ASSERT(joined = lhs.concatenate(rhs, 3));
+  VERIFY_RAISES_ASSERT(joined = lhs.concatenate(rhs, -1));
+}
+
+// Shows how operands of different rank can be concatenated after a reshape.
+template<int DataLayout>
+static void test_static_dimension_failure() {
+  Tensor<int, 2, DataLayout> rank2(2, 3);
+  Tensor<int, 3, DataLayout> rank3(2, 3, 1);
+
+#ifdef CXX11_TENSOR_CONCATENATION_STATIC_DIMENSION_FAILURE
+  // The shapes are compatible, but a static assertion requires both operands
+  // to have the same NumDims.
+  Tensor<int, 3, DataLayout> joined3 = rank2.concatenate(rank3, 0);
+#endif
+
+  // Workaround: reshape one operand so that both have the same rank.
+  Tensor<int, 3, DataLayout> joined3 = rank2
+      .reshape(Tensor<int, 3>::Dimensions(2, 3, 1))
+      .concatenate(rank3, 0);
+  Tensor<int, 2, DataLayout> joined2 = rank2
+      .concatenate(rank3.reshape(Tensor<int, 2>::Dimensions{{{2, 3}}}), 0);
+}
+
+// Checks the shape and the coefficients of a concatenation along each of the three axes.
+template<int DataLayout>
+static void test_simple_concatenation() {
+  Tensor<int, 3, DataLayout> lhs(2, 3, 1);
+  Tensor<int, 3, DataLayout> rhs(2, 3, 1);
+  lhs.setRandom();
+  rhs.setRandom();
+  // Axis 0: the rows of rhs follow the rows of lhs.
+  Tensor<int, 3, DataLayout> joined = lhs.concatenate(rhs, 0);
+  VERIFY_IS_EQUAL(joined.dimension(0), 4);
+  VERIFY_IS_EQUAL(joined.dimension(1), 3);
+  VERIFY_IS_EQUAL(joined.dimension(2), 1);
+  for (int c = 0; c < 3; ++c) {
+    for (int r = 0; r < 2; ++r) {
+      VERIFY_IS_EQUAL(joined(r, c, 0), lhs(r, c, 0));
+    }
+    for (int r = 0; r < 2; ++r) {
+      VERIFY_IS_EQUAL(joined(r + 2, c, 0), rhs(r, c, 0));
+    }
+  }
+  // Axis 1: the columns of rhs follow the columns of lhs.
+  joined = lhs.concatenate(rhs, 1);
+  VERIFY_IS_EQUAL(joined.dimension(0), 2);
+  VERIFY_IS_EQUAL(joined.dimension(1), 6);
+  VERIFY_IS_EQUAL(joined.dimension(2), 1);
+  for (int r = 0; r < 2; ++r) {
+    for (int c = 0; c < 3; ++c) {
+      VERIFY_IS_EQUAL(joined(r, c, 0), lhs(r, c, 0));
+    }
+    for (int c = 0; c < 3; ++c) {
+      VERIFY_IS_EQUAL(joined(r, c + 3, 0), rhs(r, c, 0));
+    }
+  }
+  // Axis 2: lhs fills index 0 and rhs fills index 1 of the last dimension.
+  joined = lhs.concatenate(rhs, 2);
+  VERIFY_IS_EQUAL(joined.dimension(0), 2);
+  VERIFY_IS_EQUAL(joined.dimension(1), 3);
+  VERIFY_IS_EQUAL(joined.dimension(2), 2);
+  for (int r = 0; r < 2; ++r) {
+    for (int c = 0; c < 3; ++c) {
+      VERIFY_IS_EQUAL(joined(r, c, 0), lhs(r, c, 0));
+      VERIFY_IS_EQUAL(joined(r, c, 1), rhs(r, c, 0));
+    }
+  }
+}
+
+
+// TODO(phli): Add test once we have a real vectorized implementation.
+// static void test_vectorized_concatenation() {}
+
+// Checks that assigning to a concatenation writes through to both operands.
+static void test_concatenation_as_lvalue() {
+  Tensor<int, 2> top(2, 3);
+  Tensor<int, 2> bottom(2, 3);
+  top.setRandom();
+  bottom.setRandom();
+
+  Tensor<int, 2> source(4, 3);
+  source.setRandom();
+  top.concatenate(bottom, 0) = source;
+  // Rows 0-1 of the source must land in `top`, rows 2-3 in `bottom`.
+  for (int r = 0; r < 2; ++r) {
+    for (int c = 0; c < 3; ++c) {
+      VERIFY_IS_EQUAL(top(r, c), source(r, c));
+      VERIFY_IS_EQUAL(bottom(r, c), source(r+2, c));
+    }
+  }
+}
+
+
+// Test entry point: runs the concatenation subtests; the layout-templated ones run
+// for both ColMajor and RowMajor.
+void test_cxx11_tensor_concatenation() {
+  CALL_SUBTEST(test_dimension_failures<ColMajor>());
+  CALL_SUBTEST(test_dimension_failures<RowMajor>());
+  CALL_SUBTEST(test_static_dimension_failure<ColMajor>());
+  CALL_SUBTEST(test_static_dimension_failure<RowMajor>());
+  CALL_SUBTEST(test_simple_concatenation<ColMajor>());
+  CALL_SUBTEST(test_simple_concatenation<RowMajor>());
+  // CALL_SUBTEST(test_vectorized_concatenation());
+  CALL_SUBTEST(test_concatenation_as_lvalue());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_const.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_const.cpp
new file mode 100644
index 000000000..ad9c9da39
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_const.cpp
@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+using Eigen::Tensor;
+
+
+// Checks that a TensorMap over const scalars can be assigned to a regular tensor.
+static void test_simple_assign() {
+  Tensor<int, 3> source(2,3,7);
+  source.setRandom();
+
+  TensorMap<Tensor<const int, 3> > const_view(source.data(), 2, 3, 7);
+  Tensor<int, 3> copy(2,3,7);
+  copy = const_view;
+
+  for (int x = 0; x < 2; ++x) {
+    for (int y = 0; y < 3; ++y) {
+      for (int z = 0; z < 7; ++z) {
+        VERIFY_IS_EQUAL(copy(x,y,z), source(x,y,z));
+      }
+    }
+  }
+}
+
+
+// Checks that expressions built on the three flavours of const TensorMap can be evaluated.
+static void test_assign_of_const_tensor() {
+  Tensor<int, 3> source(2,3,7);
+  source.setRandom();
+  // Const scalar type, const tensor type, and const map object respectively.
+  TensorMap<Tensor<const int, 3> > const_scalar_view(source.data(), 2, 3, 7);
+  TensorMap<const Tensor<int, 3> > const_tensor_view(source.data(), 2, 3, 7);
+  const TensorMap<Tensor<int, 3> > const_map(source.data(), 2, 3, 7);
+
+  Tensor<int, 2> slice_a = const_scalar_view.chip(0, 2);
+  Tensor<int, 2> slice_b = const_tensor_view.chip(0, 2);
+  Tensor<int, 2> slice_c = const_map.chip(0, 2);
+
+  for (int x = 0; x < 2; ++x) {
+    for (int y = 0; y < 3; ++y) {
+      VERIFY_IS_EQUAL(slice_a(x,y), source(x,y,0));
+      VERIFY_IS_EQUAL(slice_b(x,y), source(x,y,0));
+      VERIFY_IS_EQUAL(slice_c(x,y), source(x,y,0));
+    }
+  }
+}
+
+
+// Test entry point: runs the const-tensor assignment subtests.
+void test_cxx11_tensor_const() {
+  CALL_SUBTEST(test_simple_assign());
+  CALL_SUBTEST(test_assign_of_const_tensor());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_contract_cuda.cu b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_contract_cuda.cu
new file mode 100644
index 000000000..e821ccf0c
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -0,0 +1,213 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+// Compares an (m_size x k_size) by (k_size x n_size) contraction on the GPU with the host result.
+template<int DataLayout>
+void test_cuda_contraction(int m_size, int k_size, int n_size) {
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  // The GPU result is checked coefficient by coefficient against the same
+  // contraction evaluated on the host: values must agree within an absolute
+  // error of 1e-4 or pass isApprox() with precision 1e-4.
+  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+  Tensor<float, 2, DataLayout> t_result(m_size, n_size);
+  Tensor<float, 2, DataLayout> t_result_gpu(m_size, n_size);
+  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_left(d_t_left, Eigen::array<int, 2>(m_size, k_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_right(d_t_right, Eigen::array<int, 2>(k_size, n_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 2>(m_size, n_size));
+
+  // Evaluate on the GPU first, then on the host as the reference.
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+  t_result = t_left.contract(t_right, dims);
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  for (DenseIndex i = 0; i < t_result.size(); i++) {
+    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+              << " vs " <<  t_result_gpu(i) << std::endl;
+    VERIFY(false);  // unlike assert(false), this still fails when NDEBUG is defined
+  }
+
+  cudaFree((void*)d_t_left);
+  cudaFree((void*)d_t_right);
+  cudaFree((void*)d_t_result);
+}
+
+
+// Compares a contraction over both dimensions (rank-0 result) on the GPU with the host result.
+template<int DataLayout>
+void test_scalar(int m_size, int k_size, int n_size) {
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  // Dimension 0 of the left operand is paired with dimension 0 of the right one
+  // and dimension 1 with dimension 1, so the paired sizes have to match:
+  // m_size with k_size, and k_size with n_size.
+  Tensor<float, 2, DataLayout> t_left(m_size, k_size);
+  Tensor<float, 2, DataLayout> t_right(k_size, n_size);
+  Tensor<float, 0, DataLayout> t_result;
+  Tensor<float, 0, DataLayout> t_result_gpu;
+  Eigen::array<DimPair, 2> dims(DimPair(0, 0), DimPair(1, 1));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_left(d_t_left, m_size, k_size);
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
+      gpu_t_right(d_t_right, k_size, n_size);
+  Eigen::TensorMap<Eigen::Tensor<float, 0, DataLayout> >
+      gpu_t_result(d_t_result);
+
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+  t_result = t_left.contract(t_right, dims);
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
+      !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
+    std::cout << "mismatch detected: " << t_result()
+              << " vs " <<  t_result_gpu() << std::endl;
+    VERIFY(false);  // unlike assert(false), this still fails when NDEBUG is defined
+  }
+
+  cudaFree((void*)d_t_left);
+  cudaFree((void*)d_t_right);
+  cudaFree((void*)d_t_result);
+}
+
+
+// Sweeps m over [32, 256) with k = n = 128, using the layout given by DataLayout only.
+template<int DataLayout>
+void test_cuda_contraction_m() {
+  for (int m = 32; m < 256; m++) {
+    test_cuda_contraction<DataLayout>(m, 128, 128);
+  }
+}
+
+// Sweeps k over [32, 256) with m = n = 128, using the layout given by DataLayout only.
+template<int DataLayout>
+void test_cuda_contraction_k() {
+  for (int k = 32; k < 256; k++) {
+    test_cuda_contraction<DataLayout>(128, k, 128);
+  }
+}
+
+// Sweeps n over [32, 256) with m = k = 128, using the layout given by DataLayout only.
+template<int DataLayout>
+void test_cuda_contraction_n() {
+  for (int n = 32; n < 256; n++) {
+    test_cuda_contraction<DataLayout>(128, 128, n);
+  }
+}
+
+
+// Runs the contraction test for every combination of the m, n and k sizes listed
+// below; the values sit on and around powers of two.
+template<int DataLayout>
+void test_cuda_contraction_sizes() {
+  int m_sizes[] = { 31,  39,   63,   64,   65,
+                   127, 129,  255,  257 , 511,
+                   512, 513, 1023, 1024, 1025};
+  int n_sizes[] = { 31,  39,   63,   64,   65,
+                   127, 129,  255,  257,  511,
+                   512, 513, 1023, 1024, 1025};
+  int k_sizes[] = {  31,   39,  63,  64,   65,
+                     95,   96, 127, 129,  255,
+                    257,  511, 512, 513, 1023,
+                   1024, 1025};
+  // test_cuda_contraction() takes (m, k, n): the k size goes second, the n size third.
+  for (int i = 0; i < 15; i++) {
+    for (int j = 0; j < 15; j++) {
+      for (int k = 0; k < 17; k++) {
+        test_cuda_contraction<DataLayout>(m_sizes[i], k_sizes[k], n_sizes[j]);
+      }
+    }
+  }
+}
+
+// Test entry point: the numbered CALL_SUBTEST_N macros group the contraction subtests.
+void test_cxx11_tensor_cuda() {
+  CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+  // Contraction over both dimensions, producing a rank-0 result.
+  CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
+  // Sweep of the m dimension.
+  CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
+  // Sweep of the k dimension.
+  CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
+  CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
+  // Sweep of the n dimension.
+  CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
+  CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
+  // Fixed tables of sizes on and around powers of two.
+  CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
+  CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_contraction.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_contraction.cpp
new file mode 100644
index 000000000..ace97057f
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_contraction.cpp
@@ -0,0 +1,545 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::DefaultDevice;
+using Eigen::Tensor;
+
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>
+static void test_evals()
+{
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(2, 3);
+  Tensor<float, 2, DataLayout> mat3(3, 2);
+  // Evaluates three rank-2 contractions through TensorEvaluator::evalTo and checks every coefficient.
+  mat1.setRandom();
+  mat2.setRandom();
+  mat3.setRandom();
+  // Case 1: pair dimension 0 of mat1 with dimension 0 of mat2; the result is 3x3.
+  Tensor<float, 2, DataLayout> mat4(3,3);
+  mat4.setZero();
+  Eigen::array<DimPair, 1> dims3 = {{DimPair(0, 0)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat2, dims3)), DefaultDevice> Evaluator;
+  Evaluator eval(mat1.contract(mat2, dims3), DefaultDevice());
+  eval.evalTo(mat4.data());
+  EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval.dimensions()[0], 3);
+  VERIFY_IS_EQUAL(eval.dimensions()[1], 3);
+  // mat4(i,j) = sum over k of mat1(k,i) * mat2(k,j).
+  VERIFY_IS_APPROX(mat4(0,0), mat1(0,0)*mat2(0,0) + mat1(1,0)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(0,1), mat1(0,0)*mat2(0,1) + mat1(1,0)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(0,2), mat1(0,0)*mat2(0,2) + mat1(1,0)*mat2(1,2));
+  VERIFY_IS_APPROX(mat4(1,0), mat1(0,1)*mat2(0,0) + mat1(1,1)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(1,1), mat1(0,1)*mat2(0,1) + mat1(1,1)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(1,2), mat1(0,1)*mat2(0,2) + mat1(1,1)*mat2(1,2));
+  VERIFY_IS_APPROX(mat4(2,0), mat1(0,2)*mat2(0,0) + mat1(1,2)*mat2(1,0));
+  VERIFY_IS_APPROX(mat4(2,1), mat1(0,2)*mat2(0,1) + mat1(1,2)*mat2(1,1));
+  VERIFY_IS_APPROX(mat4(2,2), mat1(0,2)*mat2(0,2) + mat1(1,2)*mat2(1,2));
+  // Case 2: pair dimension 1 of mat1 with dimension 1 of mat2; the result is 2x2.
+  Tensor<float, 2, DataLayout> mat5(2,2);
+  mat5.setZero();
+  Eigen::array<DimPair, 1> dims4 = {{DimPair(1, 1)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat2, dims4)), DefaultDevice> Evaluator2;
+  Evaluator2 eval2(mat1.contract(mat2, dims4), DefaultDevice());
+  eval2.evalTo(mat5.data());
+  EIGEN_STATIC_ASSERT(Evaluator2::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval2.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval2.dimensions()[1], 2);
+  // mat5(i,j) = sum over k of mat1(i,k) * mat2(j,k).
+  VERIFY_IS_APPROX(mat5(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(0,1) + mat1(0,2)*mat2(0,2));
+  VERIFY_IS_APPROX(mat5(0,1), mat1(0,0)*mat2(1,0) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(1,2));
+  VERIFY_IS_APPROX(mat5(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(0,1) + mat1(1,2)*mat2(0,2));
+  VERIFY_IS_APPROX(mat5(1,1), mat1(1,0)*mat2(1,0) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(1,2));
+  // Case 3: pair dimension 1 of mat1 with dimension 0 of mat3; the result is 2x2.
+  Tensor<float, 2, DataLayout> mat6(2,2);
+  mat6.setZero();
+  Eigen::array<DimPair, 1> dims6 = {{DimPair(1, 0)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat3, dims6)), DefaultDevice> Evaluator3;
+  Evaluator3 eval3(mat1.contract(mat3, dims6), DefaultDevice());
+  eval3.evalTo(mat6.data());
+  EIGEN_STATIC_ASSERT(Evaluator3::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval3.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval3.dimensions()[1], 2);
+  // mat6(i,j) = sum over k of mat1(i,k) * mat3(k,j), i.e. an ordinary matrix product.
+  VERIFY_IS_APPROX(mat6(0,0), mat1(0,0)*mat3(0,0) + mat1(0,1)*mat3(1,0) + mat1(0,2)*mat3(2,0));
+  VERIFY_IS_APPROX(mat6(0,1), mat1(0,0)*mat3(0,1) + mat1(0,1)*mat3(1,1) + mat1(0,2)*mat3(2,1));
+  VERIFY_IS_APPROX(mat6(1,0), mat1(1,0)*mat3(0,0) + mat1(1,1)*mat3(1,0) + mat1(1,2)*mat3(2,0));
+  VERIFY_IS_APPROX(mat6(1,1), mat1(1,0)*mat3(0,1) + mat1(1,1)*mat3(1,1) + mat1(1,2)*mat3(2,1));
+}
+
+template<int DataLayout>
+static void test_scalar()
+{
+  // Contracting two rank-1 tensors over their only dimension must equal their dot product.
+  Tensor<float, 1, DataLayout> lhs({6});
+  lhs.setRandom();
+  Tensor<float, 1, DataLayout> rhs({6});
+  rhs.setRandom();
+
+  Eigen::array<DimPair, 1> contracted = {{DimPair(0, 0)}};
+  Tensor<float, 0, DataLayout> scalar = lhs.contract(rhs, contracted);
+
+  // Hand-rolled reference, accumulated in ascending index order.
+  float expected = 0.0f;
+  for (int idx = 0; idx != 6; ++idx)
+    expected += lhs(idx) * rhs(idx);
+  VERIFY_IS_APPROX(scalar(), expected);
+}
+
+template<int DataLayout>
+static void test_multidims()
+{
+  Tensor<float, 3, DataLayout> mat1(2, 2, 2);
+  Tensor<float, 4, DataLayout> mat2(2, 2, 2, 2);
+  // Contractions over two dimension pairs at once, evaluated through TensorEvaluator::evalTo.
+  mat1.setRandom();
+  mat2.setRandom();
+  // Case 1: pair mat1 dimensions (1, 2) with mat2 dimensions (2, 3); the result has rank 3.
+  Tensor<float, 3, DataLayout> mat3(2, 2, 2);
+  mat3.setZero();
+  Eigen::array<DimPair, 2> dims = {{DimPair(1, 2), DimPair(2, 3)}};
+  typedef TensorEvaluator<decltype(mat1.contract(mat2, dims)), DefaultDevice> Evaluator;
+  Evaluator eval(mat1.contract(mat2, dims), DefaultDevice());
+  eval.evalTo(mat3.data());
+  EIGEN_STATIC_ASSERT(Evaluator::NumDims==3ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval.dimensions()[1], 2);
+  VERIFY_IS_EQUAL(eval.dimensions()[2], 2);
+  // mat3(a,b,c) = sum over i,j of mat1(a,i,j) * mat2(b,c,i,j).
+  VERIFY_IS_APPROX(mat3(0,0,0), mat1(0,0,0)*mat2(0,0,0,0) + mat1(0,1,0)*mat2(0,0,1,0) +
+                                mat1(0,0,1)*mat2(0,0,0,1) + mat1(0,1,1)*mat2(0,0,1,1));
+  VERIFY_IS_APPROX(mat3(0,0,1), mat1(0,0,0)*mat2(0,1,0,0) + mat1(0,1,0)*mat2(0,1,1,0) +
+                                mat1(0,0,1)*mat2(0,1,0,1) + mat1(0,1,1)*mat2(0,1,1,1));
+  VERIFY_IS_APPROX(mat3(0,1,0), mat1(0,0,0)*mat2(1,0,0,0) + mat1(0,1,0)*mat2(1,0,1,0) +
+                                mat1(0,0,1)*mat2(1,0,0,1) + mat1(0,1,1)*mat2(1,0,1,1));
+  VERIFY_IS_APPROX(mat3(0,1,1), mat1(0,0,0)*mat2(1,1,0,0) + mat1(0,1,0)*mat2(1,1,1,0) +
+                                mat1(0,0,1)*mat2(1,1,0,1) + mat1(0,1,1)*mat2(1,1,1,1));
+  VERIFY_IS_APPROX(mat3(1,0,0), mat1(1,0,0)*mat2(0,0,0,0) + mat1(1,1,0)*mat2(0,0,1,0) +
+                                mat1(1,0,1)*mat2(0,0,0,1) + mat1(1,1,1)*mat2(0,0,1,1));
+  VERIFY_IS_APPROX(mat3(1,0,1), mat1(1,0,0)*mat2(0,1,0,0) + mat1(1,1,0)*mat2(0,1,1,0) +
+                                mat1(1,0,1)*mat2(0,1,0,1) + mat1(1,1,1)*mat2(0,1,1,1));
+  VERIFY_IS_APPROX(mat3(1,1,0), mat1(1,0,0)*mat2(1,0,0,0) + mat1(1,1,0)*mat2(1,0,1,0) +
+                                mat1(1,0,1)*mat2(1,0,0,1) + mat1(1,1,1)*mat2(1,0,1,1));
+  VERIFY_IS_APPROX(mat3(1,1,1), mat1(1,0,0)*mat2(1,1,0,0) + mat1(1,1,0)*mat2(1,1,1,0) +
+                                mat1(1,0,1)*mat2(1,1,0,1) + mat1(1,1,1)*mat2(1,1,1,1));
+  // Case 2: a rank-2 tensor contracted with a rank-3 tensor over two pairs leaves rank 1.
+  Tensor<float, 2, DataLayout> mat4(2, 2);
+  Tensor<float, 3, DataLayout> mat5(2, 2, 2);
+
+  mat4.setRandom();
+  mat5.setRandom();
+
+  Tensor<float, 1, DataLayout> mat6(2);
+  mat6.setZero();
+  Eigen::array<DimPair, 2> dims2({{DimPair(0, 1), DimPair(1, 0)}});
+  typedef TensorEvaluator<decltype(mat4.contract(mat5, dims2)), DefaultDevice> Evaluator2;
+  Evaluator2 eval2(mat4.contract(mat5, dims2), DefaultDevice());
+  eval2.evalTo(mat6.data());
+  EIGEN_STATIC_ASSERT(Evaluator2::NumDims==1ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval2.dimensions()[0], 2);
+  // mat6(c) = sum over i,j of mat4(i,j) * mat5(j,i,c).
+  VERIFY_IS_APPROX(mat6(0), mat4(0,0)*mat5(0,0,0) + mat4(1,0)*mat5(0,1,0) +
+                   mat4(0,1)*mat5(1,0,0) + mat4(1,1)*mat5(1,1,0));
+  VERIFY_IS_APPROX(mat6(1), mat4(0,0)*mat5(0,0,1) + mat4(1,0)*mat5(0,1,1) +
+                   mat4(0,1)*mat5(1,0,1) + mat4(1,1)*mat5(1,1,1));
+}
+
+template<int DataLayout>
+static void test_holes() {
+  Tensor<float, 4, DataLayout> t1(2, 5, 7, 3);
+  Tensor<float, 5, DataLayout> t2(2, 7, 11, 13, 3);
+  t1.setRandom();
+  t2.setRandom();
+  // Contract the non-adjacent dimensions 0 and 3 of t1 with dimensions 0 and 4 of t2.
+  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(3, 4)}};
+  Tensor<float, 5, DataLayout> result = t1.contract(t2, dims);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  VERIFY_IS_EQUAL(result.dimension(2), 7);
+  VERIFY_IS_EQUAL(result.dimension(3), 11);
+  VERIFY_IS_EQUAL(result.dimension(4), 13);
+  // Walk the full extent of every result dimension, not only the leading 5^5 corner.
+  for (int i = 0; i < result.dimension(0); ++i) {
+    for (int j = 0; j < result.dimension(1); ++j) {
+      for (int k = 0; k < result.dimension(2); ++k) {
+        for (int l = 0; l < result.dimension(3); ++l) {
+          for (int m = 0; m < result.dimension(4); ++m) {
+            VERIFY_IS_APPROX(result(i, j, k, l, m),
+                             t1(0, i, j, 0) * t2(0, k, l, m, 0) +
+                             t1(1, i, j, 0) * t2(1, k, l, m, 0) +
+                             t1(0, i, j, 1) * t2(0, k, l, m, 1) +
+                             t1(1, i, j, 1) * t2(1, k, l, m, 1) +
+                             t1(0, i, j, 2) * t2(0, k, l, m, 2) +
+                             t1(1, i, j, 2) * t2(1, k, l, m, 2));
+          }
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_full_redux()
+{
+  Tensor<float, 2, DataLayout> t1(2, 2);
+  Tensor<float, 3, DataLayout> t2(2, 2, 2);
+  t1.setRandom();
+  t2.setRandom();
+  // Contract both dimensions of the rank-2 tensor t1 against t2, leaving one free dimension of t2.
+  Eigen::array<DimPair, 2> dims = {{DimPair(0, 0), DimPair(1, 1)}};
+  Tensor<float, 1, DataLayout> result = t1.contract(t2, dims);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) +  t1(1, 0) * t2(1, 0, 0)
+                            + t1(0, 1) * t2(0, 1, 0) +  t1(1, 1) * t2(1, 1, 0));
+  VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(0, 0, 1) +  t1(1, 0) * t2(1, 0, 1)
+                            + t1(0, 1) * t2(0, 1, 1) +  t1(1, 1) * t2(1, 1, 1));
+  // Operands swapped: dimensions (1, 2) of t2 are now paired with dimensions (0, 1) of t1.
+  dims[0] = DimPair(1, 0);
+  dims[1] = DimPair(2, 1);
+  result = t2.contract(t1, dims);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), t1(0, 0) * t2(0, 0, 0) +  t1(1, 0) * t2(0, 1, 0)
+                            + t1(0, 1) * t2(0, 0, 1) +  t1(1, 1) * t2(0, 1, 1));
+  VERIFY_IS_APPROX(result(1), t1(0, 0) * t2(1, 0, 0) +  t1(1, 0) * t2(1, 1, 0)
+                            + t1(0, 1) * t2(1, 0, 1) +  t1(1, 1) * t2(1, 1, 1));
+}
+
+template<int DataLayout>
+static void test_contraction_of_contraction()
+{
+  Tensor<float, 2, DataLayout> t1(2, 2);
+  Tensor<float, 2, DataLayout> t2(2, 2);
+  Tensor<float, 2, DataLayout> t3(2, 2);
+  Tensor<float, 2, DataLayout> t4(2, 2);
+  t1.setRandom();
+  t2.setRandom();
+  t3.setRandom();
+  t4.setRandom();
+  // Both operands of the outer contraction are unevaluated expressions that contain contractions.
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  auto contract1 = t1.contract(t2, dims);
+  auto diff = t3 - contract1;
+  auto contract2 = t1.contract(t4, dims);
+  Tensor<float, 2, DataLayout> result = contract2.contract(diff, dims);
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 2);
+  // Reference: the same computation with dense matrices mapped over the tensors' buffers.
+  Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>>
+      m1(t1.data(), 2, 2), m2(t2.data(), 2, 2), m3(t3.data(), 2, 2),
+      m4(t4.data(), 2, 2);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>
+      expected = (m1 * m4) * (m3 - m1 * m2);
+
+  VERIFY_IS_APPROX(result(0, 0), expected(0, 0));
+  VERIFY_IS_APPROX(result(0, 1), expected(0, 1));
+  VERIFY_IS_APPROX(result(1, 0), expected(1, 0));
+  VERIFY_IS_APPROX(result(1, 1), expected(1, 1));
+}
+
+template<int DataLayout>
+static void test_expr()
+{
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  Tensor<float, 2, DataLayout> mat2(3, 2);
+  mat1.setRandom();
+  mat2.setRandom();
+  // Assigns a contraction expression directly to a tensor instead of driving an evaluator by hand.
+  Tensor<float, 2, DataLayout> mat3(2,2);
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  mat3 = mat1.contract(mat2, dims);
+  // mat3(i,j) = sum over k of mat1(i,k) * mat2(k,j).
+  VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1));
+  VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
+}
+
+template<int DataLayout>
+static void test_out_of_order_contraction()
+{
+  Tensor<float, 3, DataLayout> mat1(2, 2, 2);
+  Tensor<float, 3, DataLayout> mat2(2, 2, 2);
+  // Contraction pairs listed in non-ascending order must be handled, whichever order they come in.
+  mat1.setRandom();
+  mat2.setRandom();
+
+  Tensor<float, 2, DataLayout> mat3(2, 2);
+
+  Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(0, 2)}};
+  mat3 = mat1.contract(mat2, dims);
+  // mat3(b,e) = sum over a,c of mat1(a,b,c) * mat2(c,e,a).
+  VERIFY_IS_APPROX(mat3(0, 0),
+                   mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) +
+                   mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(1, 0),
+                   mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) +
+                   mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(0, 1),
+                   mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) +
+                   mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1));
+  VERIFY_IS_APPROX(mat3(1, 1),
+                   mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) +
+                   mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1));
+  // The same two pairs in the opposite order; the expected values below are unchanged.
+  Eigen::array<DimPair, 2> dims2 = {{DimPair(0, 2), DimPair(2, 0)}};
+  mat3 = mat1.contract(mat2, dims2);
+
+  VERIFY_IS_APPROX(mat3(0, 0),
+                   mat1(0,0,0)*mat2(0,0,0) + mat1(1,0,0)*mat2(0,0,1) +
+                   mat1(0,0,1)*mat2(1,0,0) + mat1(1,0,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(1, 0),
+                   mat1(0,1,0)*mat2(0,0,0) + mat1(1,1,0)*mat2(0,0,1) +
+                   mat1(0,1,1)*mat2(1,0,0) + mat1(1,1,1)*mat2(1,0,1));
+  VERIFY_IS_APPROX(mat3(0, 1),
+                   mat1(0,0,0)*mat2(0,1,0) + mat1(1,0,0)*mat2(0,1,1) +
+                   mat1(0,0,1)*mat2(1,1,0) + mat1(1,0,1)*mat2(1,1,1));
+  VERIFY_IS_APPROX(mat3(1, 1),
+                   mat1(0,1,0)*mat2(0,1,0) + mat1(1,1,0)*mat2(0,1,1) +
+                   mat1(0,1,1)*mat2(1,1,0) + mat1(1,1,1)*mat2(1,1,1));
+
+}
+
+template<int DataLayout>
+static void test_consistency()
+{
+  // this does something like testing (A*B)^T = (B^T * A^T)
+
+  Tensor<float, 3, DataLayout> mat1(4, 3, 5);
+  Tensor<float, 5, DataLayout> mat2(3, 2, 1, 5, 4);
+  mat1.setRandom();
+  mat2.setRandom();
+  // mat3 is 5 x (2*1*5) and mat4 is (2*1*5) x 5 when each is viewed as a 2-D array.
+  Tensor<float, 4, DataLayout> mat3(5, 2, 1, 5);
+  Tensor<float, 4, DataLayout> mat4(2, 1, 5, 5);
+
+  // contract on dimensions of size 4 and 3
+  Eigen::array<DimPair, 2> dims1 = {{DimPair(0, 4), DimPair(1, 0)}};
+  Eigen::array<DimPair, 2> dims2 = {{DimPair(4, 0), DimPair(0, 1)}};
+
+  mat3 = mat1.contract(mat2, dims1);
+  mat4 = mat2.contract(mat1, dims2);
+
+  // check that these are equal except for ordering of dimensions
+  if (DataLayout == ColMajor) {
+    for (size_t i = 0; i < 5; i++) {
+      for (size_t j = 0; j < 10; j++) {
+        VERIFY_IS_APPROX(mat3.data()[i + 5 * j], mat4.data()[j + 10 * i]);
+      }
+    }
+  } else {
+    // Row major
+    for (size_t i = 0; i < 5; i++) {
+      for (size_t j = 0; j < 10; j++) {
+        VERIFY_IS_APPROX(mat3.data()[10 * i + j], mat4.data()[i + 5 * j]);
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+static void test_large_contraction()
+{
+  Tensor<float, 4, DataLayout> t_left(30, 50, 8, 31);
+  Tensor<float, 5, DataLayout> t_right(8, 31, 7, 20, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 7, 20, 10);
+  // Compares a two-pair contraction with a (30*50 x 8*31) * (8*31 x 7*20*10) matrix product.
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 248);
+  MapXf m_right(t_right.data(), 248, 1400);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(3, 1)}};
+
+  // compute results by separate methods
+  t_result = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+  // The two results must live in distinct buffers and agree coefficient by coefficient.
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+template<int DataLayout>
+static void test_matrix_vector()
+{
+  Tensor<float, 2, DataLayout> t_left(30, 50);
+  Tensor<float, 1, DataLayout> t_right(50);
+  Tensor<float, 1, DataLayout> t_result(30);
+  // Contracts a rank-2 tensor with a rank-1 tensor and compares with a 30x50 * 50x1 matrix product.
+  t_left.setRandom();
+  t_right.setRandom();
+
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 30, 50);
+  MapXf m_right(t_right.data(), 50, 1);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(30, 1);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 1> dims{{DimPair(1, 0)}};
+
+  // compute results by separate methods
+  t_result = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+  // NOTE(review): the third isApprox argument is 1, presumably a very loose tolerance; confirm it is intended.
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1));
+  }
+}
+
+
+template<int DataLayout>
+static void test_tensor_vector()
+{
+  Tensor<float, 3, DataLayout> t_left(7, 13, 17);
+  Tensor<float, 2, DataLayout> t_right(1, 7);
+  // Contracts dimension 0 of a rank-3 tensor with dimension 1 of a 1x7 tensor.
+  t_left.setRandom();
+  t_right.setRandom();
+
+  typedef typename Tensor<float, 1, DataLayout>::DimensionPair DimensionPair;
+  Eigen::array<DimensionPair, 1> dim_pair01{{{0, 1}}};
+  Tensor<float, 3, DataLayout> t_result = t_left.contract(t_right, dim_pair01);
+  // Reference: view t_left as a 7 x (13*17) matrix and multiply its transpose by t_right's transpose.
+  typedef Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 7, 13*17);
+  MapXf m_right(t_right.data(), 1, 7);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left.transpose() * m_right.transpose();
+  // NOTE(review): t_result has rank 3 but is read with one index, presumably a linear index; confirm.
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY(internal::isApprox(t_result(i), m_result(i, 0), 1));
+  }
+}
+
+
+template<int DataLayout>
+static void test_small_blocking_factors()
+{
+  Tensor<float, 4, DataLayout> t_left(30, 5, 3, 31);
+  Tensor<float, 5, DataLayout> t_right(3, 31, 7, 20, 1);
+  t_left.setRandom();
+  t_right.setRandom();
+  // Compares a contraction with a plain matrix product after forcing small CPU cache sizes.
+  // Add a little offset so that the results won't be close to zero.
+  t_left += t_left.constant(1.0f);
+  t_right += t_right.constant(1.0f);
+
+  // Force the cache sizes, which results in smaller blocking factors.
+  Eigen::setCpuCacheSizes(896, 1920, 2944);
+  // NOTE(review): the sizes are not restored here, so later subtests presumably inherit them; confirm.
+  // this contraction should be equivalent to a single matrix multiplication
+  Eigen::array<DimPair, 2> dims = {{DimPair(2, 0), DimPair(3, 1)}};
+  Tensor<float, 5, DataLayout> t_result;
+  t_result = t_left.contract(t_right, dims);
+
+  // compute result using a simple eigen matrix product
+  Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_left(t_left.data(), 150, 93);
+  Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout>> m_right(t_right.data(), 93, 140);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result = m_left * m_right;
+  // Compare the two raw buffers coefficient by coefficient.
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);
+  }
+}
+
+template<int DataLayout>
+static void test_tensor_product()
+{
+  // With an empty list of contraction pairs the result keeps every dimension of both operands.
+  Tensor<float, 2, DataLayout> mat1(2, 3);
+  mat1.setRandom();
+  Tensor<float, 2, DataLayout> mat2(4, 1);
+  mat2.setRandom();
+  Eigen::array<DimPair, 0> no_pairs{{}};
+  Tensor<float, 4, DataLayout> result = mat1.contract(mat2, no_pairs);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 3);
+  VERIFY_IS_EQUAL(result.dimension(2), 4);
+  VERIFY_IS_EQUAL(result.dimension(3), 1);
+  for (int i = 0; i < result.dimension(0); ++i) {
+    for (int j = 0; j < result.dimension(1); ++j) {
+      for (int k = 0; k < result.dimension(2); ++k) {
+        for (int l = 0; l < result.dimension(3); ++l) {
+          VERIFY_IS_APPROX(result(i, j, k, l), mat1(i, j) * mat2(k, l));
+        }
+      }
+    }
+  }
+}
+
+
+template<int DataLayout>
+static void test_const_inputs()
+{
+  Tensor<float, 2, DataLayout> in1(2, 3);
+  Tensor<float, 2, DataLayout> in2(3, 2);
+  in1.setRandom();
+  in2.setRandom();
+  // Both operands are TensorMaps over const float, so the contraction reads through const views.
+  TensorMap<Tensor<const float, 2, DataLayout> > mat1(in1.data(), 2, 3);
+  TensorMap<Tensor<const float, 2, DataLayout> > mat2(in2.data(), 3, 2);
+  Tensor<float, 2, DataLayout> mat3(2,2);
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
+  mat3 = mat1.contract(mat2, dims);
+  // mat3(i,j) = sum over k of mat1(i,k) * mat2(k,j).
+  VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1));
+  VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
+}
+
+void test_cxx11_tensor_contraction()  // Entry point: runs every contraction test for ColMajor and RowMajor.
+{
+  CALL_SUBTEST(test_evals<ColMajor>());
+  CALL_SUBTEST(test_evals<RowMajor>());
+  CALL_SUBTEST(test_scalar<ColMajor>());
+  CALL_SUBTEST(test_scalar<RowMajor>());
+  CALL_SUBTEST(test_multidims<ColMajor>());
+  CALL_SUBTEST(test_multidims<RowMajor>());
+  CALL_SUBTEST(test_holes<ColMajor>());
+  CALL_SUBTEST(test_holes<RowMajor>());
+  CALL_SUBTEST(test_full_redux<ColMajor>());
+  CALL_SUBTEST(test_full_redux<RowMajor>());
+  CALL_SUBTEST(test_contraction_of_contraction<ColMajor>());
+  CALL_SUBTEST(test_contraction_of_contraction<RowMajor>());
+  CALL_SUBTEST(test_expr<ColMajor>());
+  CALL_SUBTEST(test_expr<RowMajor>());
+  CALL_SUBTEST(test_out_of_order_contraction<ColMajor>());
+  CALL_SUBTEST(test_out_of_order_contraction<RowMajor>());
+  CALL_SUBTEST(test_consistency<ColMajor>());
+  CALL_SUBTEST(test_consistency<RowMajor>());
+  CALL_SUBTEST(test_large_contraction<ColMajor>());
+  CALL_SUBTEST(test_large_contraction<RowMajor>());
+  CALL_SUBTEST(test_matrix_vector<ColMajor>());
+  CALL_SUBTEST(test_matrix_vector<RowMajor>());
+  CALL_SUBTEST(test_tensor_vector<ColMajor>());
+  CALL_SUBTEST(test_tensor_vector<RowMajor>());
+  CALL_SUBTEST(test_small_blocking_factors<ColMajor>());
+  CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
+  CALL_SUBTEST(test_tensor_product<ColMajor>());
+  CALL_SUBTEST(test_tensor_product<RowMajor>());
+  CALL_SUBTEST(test_const_inputs<ColMajor>());
+  CALL_SUBTEST(test_const_inputs<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_convolution.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_convolution.cpp
new file mode 100644
index 000000000..e3d4675eb
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_convolution.cpp
@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::DefaultDevice;
+
+template <int DataLayout>
+static void test_evals()
+{
+  Tensor<float, 2, DataLayout> input(3, 3);
+  Tensor<float, 1, DataLayout> kernel(2);
+  // Convolves a 3x3 input with a length-2 kernel along dimension 0 via TensorEvaluator::evalTo.
+  input.setRandom();
+  kernel.setRandom();
+
+  Tensor<float, 2, DataLayout> result(2,3);
+  result.setZero();
+  Eigen::array<Tensor<float, 2>::Index, 1> dims3{{0}};
+
+  typedef TensorEvaluator<decltype(input.convolve(kernel, dims3)), DefaultDevice> Evaluator;
+  Evaluator eval(input.convolve(kernel, dims3), DefaultDevice());
+  eval.evalTo(result.data());
+  EIGEN_STATIC_ASSERT(Evaluator::NumDims==2ul, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  VERIFY_IS_EQUAL(eval.dimensions()[0], 2);
+  VERIFY_IS_EQUAL(eval.dimensions()[1], 3);
+  // result(i,j) = input(i,j) * kernel(0) + input(i+1,j) * kernel(1).
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0) + input(1,0)*kernel(1));  // index 0
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0) + input(1,1)*kernel(1));  // index 2
+  VERIFY_IS_APPROX(result(0,2), input(0,2)*kernel(0) + input(1,2)*kernel(1));  // index 4
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0) + input(2,0)*kernel(1));  // index 1
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0) + input(2,1)*kernel(1));  // index 3
+  VERIFY_IS_APPROX(result(1,2), input(1,2)*kernel(0) + input(2,2)*kernel(1));  // index 5
+}
+
+template <int DataLayout>
+static void test_expr()
+{
+  Tensor<float, 2, DataLayout> input(3, 3);
+  Tensor<float, 2, DataLayout> kernel(2, 2);
+  input.setRandom();
+  kernel.setRandom();
+  // Convolution over both dimensions, assigned as an expression: 3x3 input, 2x2 kernel, 2x2 result.
+  Tensor<float, 2, DataLayout> result(2,2);
+  Eigen::array<ptrdiff_t, 2> dims;
+  dims[0] = 0;
+  dims[1] = 1;
+  result = input.convolve(kernel, dims);
+  // result(i,j) = sum over a,b of input(i+a, j+b) * kernel(a,b).
+  VERIFY_IS_APPROX(result(0,0), input(0,0)*kernel(0,0) + input(0,1)*kernel(0,1) +
+                                input(1,0)*kernel(1,0) + input(1,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(0,1), input(0,1)*kernel(0,0) + input(0,2)*kernel(0,1) +
+                                input(1,1)*kernel(1,0) + input(1,2)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,0), input(1,0)*kernel(0,0) + input(1,1)*kernel(0,1) +
+                                input(2,0)*kernel(1,0) + input(2,1)*kernel(1,1));
+  VERIFY_IS_APPROX(result(1,1), input(1,1)*kernel(0,0) + input(1,2)*kernel(0,1) +
+                                input(2,1)*kernel(1,0) + input(2,2)*kernel(1,1));
+}
+
+template <int DataLayout>
+static void test_modes() {
+  Tensor<float, 1, DataLayout> input(3);
+  Tensor<float, 1, DataLayout> kernel(3);
+  input(0) = 1.0f;
+  input(1) = 2.0f;
+  input(2) = 3.0f;
+  kernel(0) = 0.5f;
+  kernel(1) = 1.0f;
+  kernel(2) = 0.0f;
+  // Each mode below is obtained by zero-padding the input by a different amount before convolving.
+  Eigen::array<ptrdiff_t, 1> dims;
+  dims[0] = 0;
+  Eigen::array<std::pair<ptrdiff_t, ptrdiff_t>, 1> padding;
+
+  // Emulate VALID mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(0, 0);
+  Tensor<float, 1, DataLayout> valid(1);
+  valid = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(valid.dimension(0), 1);
+  VERIFY_IS_APPROX(valid(0), 2.5f);  // 1*0.5 + 2*1.0 + 3*0.0
+
+  // Emulate SAME mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(1, 1);
+  Tensor<float, 1, DataLayout> same(3);
+  same = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(same.dimension(0), 3);
+  VERIFY_IS_APPROX(same(0), 1.0f);
+  VERIFY_IS_APPROX(same(1), 2.5f);
+  VERIFY_IS_APPROX(same(2), 4.0f);
+
+  // Emulate FULL mode (as defined in
+  // http://docs.scipy.org/doc/numpy/reference/generated/numpy.convolve.html).
+  padding[0] = std::make_pair(2, 2);
+  Tensor<float, 1, DataLayout> full(5);
+  full = input.pad(padding).convolve(kernel, dims);
+  VERIFY_IS_EQUAL(full.dimension(0), 5);
+  VERIFY_IS_APPROX(full(0), 0.0f);
+  VERIFY_IS_APPROX(full(1), 1.0f);
+  VERIFY_IS_APPROX(full(2), 2.5f);
+  VERIFY_IS_APPROX(full(3), 4.0f);
+  VERIFY_IS_APPROX(full(4), 1.5f);
+}
+
+template <int DataLayout>
+static void test_strides() {
+  Tensor<float, 1, DataLayout> input(13);
+  Tensor<float, 1, DataLayout> kernel(3);
+  input.setRandom();
+  kernel.setRandom();
+  // Convolution applied to a strided view of the input, with the output strided again.
+  Eigen::array<ptrdiff_t, 1> dims;
+  dims[0] = 0;
+  Eigen::array<ptrdiff_t, 1> stride_of_3;
+  stride_of_3[0] = 3;
+  Eigen::array<ptrdiff_t, 1> stride_of_2;
+  stride_of_2[0] = 2;
+
+  Tensor<float, 1, DataLayout> result;
+  result = input.stride(stride_of_3).convolve(kernel, dims).stride(stride_of_2);
+  // The checks below read inputs 0,3,6,9,12 only; the two outputs start at inputs 0 and 6.
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_APPROX(result(0), (input(0)*kernel(0) + input(3)*kernel(1) +
+                               input(6)*kernel(2)));
+  VERIFY_IS_APPROX(result(1), (input(6)*kernel(0) + input(9)*kernel(1) +
+                               input(12)*kernel(2)));
+}
+
+void test_cxx11_tensor_convolution()  // Entry point: runs every convolution test for ColMajor and RowMajor.
+{
+  CALL_SUBTEST(test_evals<ColMajor>());
+  CALL_SUBTEST(test_evals<RowMajor>());
+  CALL_SUBTEST(test_expr<ColMajor>());
+  CALL_SUBTEST(test_expr<RowMajor>());
+  CALL_SUBTEST(test_modes<ColMajor>());
+  CALL_SUBTEST(test_modes<RowMajor>());
+  CALL_SUBTEST(test_strides<ColMajor>());
+  CALL_SUBTEST(test_strides<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_cuda.cu b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_cuda.cu
new file mode 100644
index 000000000..9584a539f
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_cuda.cu
@@ -0,0 +1,1284 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Fills one device tensor with a constant and another with random values, then checks both on the host.
+void test_cuda_nullary() {
+  Tensor<float, 1, 0, int> in1(2);
+  Tensor<float, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t tensor_bytes = in1.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  cudaMalloc((void**)(&d_in1), tensor_bytes);
+  cudaMalloc((void**)(&d_in2), tensor_bytes);
+  cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(d_in2, 2);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
+  gpu_in2.device(gpu_device) = gpu_in2.random();
+
+  Tensor<float, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), 3.14f);
+    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
+// Adds two 2-element tensors on the GPU and compares the sum with the same
+// addition performed on the host.
+void test_cuda_elementwise_small() {
+  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
+
+  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
+
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(
+        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
+        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out);
+}
+
+// Evaluates in1 + in2 * in3 element-wise on the GPU for 72x53x97 tensors and compares with the host.
+void test_cuda_elementwise() {
+  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  in1.setRandom();
+  in2.setRandom();
+  in3.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t in3_bytes = in3.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  float* d_in3;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_in3), in3_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+
+  gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      for (int k = 0; k < 97; ++k) {
+        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
+      }
+    }
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_in3);
+  cudaFree(d_out);
+}
+
+// Evaluates isnan() element-wise on the GPU and compares each flag with std::isnan on the host.
+void test_cuda_props() {
+  Tensor<float, 1> in1(200);
+  Tensor<bool, 1> out(200);
+  in1.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(bool);
+
+  float* d_in1;
+  bool* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(d_in1, 200);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_out(d_out, 200);
+
+  gpu_out.device(gpu_device) = (gpu_in1.isnan)();
+
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 200; ++i) {
+    VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i)));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_out);
+}
+
+// Max-reduces a 72x53x97x113 tensor over dimensions 1 and 3 on the GPU and compares with a host loop.
+void test_cuda_reduction() {
+  Tensor<float, 4> in1(72,53,97,113);
+  Tensor<float, 2> out(72,97);
+  in1.setRandom();
+
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+  array<Eigen::DenseIndex, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+
+  gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis);
+
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      float expected = 0;  // NOTE(review): assumes all inputs are >= 0 - confirm setRandom()'s range
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = std::max<float>(expected, in1(i, k, j, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i,j), expected);
+    }
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_out);
+}
+
+// Contracts a rank-4 with a rank-5 tensor on the GPU and compares with a 300x93 * 93x140 host matrix product.
+template<int DataLayout>
+void test_cuda_contraction() {
+  // with these dimensions, the output has 300 * 140 elements, which is
+  // more than 30 * 1024, which is the number of threads in blocks on
+  // a 15 SM GK110 GPU
+  Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
+  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
+  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  std::size_t t_left_bytes = t_left.size()  * sizeof(float);
+  std::size_t t_right_bytes = t_right.size() * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_left;
+  float* d_t_right;
+  float* d_t_result;
+
+  cudaMalloc((void**)(&d_t_left), t_left_bytes);
+  cudaMalloc((void**)(&d_t_right), t_right_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31);
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_right(d_t_right, 3, 31, 7, 20, 1);
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_t_result(d_t_result, 6, 50, 7, 20, 1);
+
+  typedef Eigen::Map<Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> > MapXf;
+  MapXf m_left(t_left.data(), 300, 93);
+  MapXf m_right(t_right.data(), 93, 140);
+  Eigen::Matrix<float, Dynamic, Dynamic, DataLayout> m_result(300, 140);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims;
+  dims[0] = DimPair(2, 0);
+  dims[1] = DimPair(3, 1);
+
+  m_result = m_left * m_right;
+  gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
+
+  cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+
+  for (DenseIndex i = 0; i < t_result.size(); i++) {
+    if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected at index " << i << ": " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      VERIFY(false);  // assert(false) would be a no-op under NDEBUG and let the mismatch pass
+    }
+  }
+
+  cudaFree(d_t_left);
+  cudaFree(d_t_right);
+  cudaFree(d_t_result);
+}
+
+// Convolves dimension 1 of a rank-4 tensor with a 4-tap kernel on the GPU and compares with a host sum.
+template<int DataLayout>
+void test_cuda_convolution_1d() {
+  Tensor<float, 4, DataLayout> input(74,37,11,137);
+  Tensor<float, 1, DataLayout> kernel(4);
+  Tensor<float, 4, DataLayout> out(74,34,11,137);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
+
+  Eigen::array<Eigen::DenseIndex, 1> dims(1);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 34; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        for (int l = 0; l < 137; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i,j+0,k,l) * kernel(0) + input(i,j+1,k,l) * kernel(1) +
+                                 input(i,j+2,k,l) * kernel(2) + input(i,j+3,k,l) * kernel(3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
+}
+
+// Convolves dimension 0 of a ColMajor rank-4 tensor with a 4-tap kernel on the GPU and compares with a host sum.
+void test_cuda_convolution_inner_dim_col_major_1d() {
+  Tensor<float, 4, ColMajor> input(74,9,11,7);
+  Tensor<float, 1, ColMajor> kernel(4);
+  Tensor<float, 4, ColMajor> out(71,9,11,7);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
+
+  Eigen::array<Eigen::DenseIndex, 1> dims(0);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 71; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i+0,j,k,l) * kernel(0) + input(i+1,j,k,l) * kernel(1) +
+                                 input(i+2,j,k,l) * kernel(2) + input(i+3,j,k,l) * kernel(3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
+}
+
+// Convolves dimension 3 of a RowMajor rank-4 tensor with a 4-tap kernel on the GPU and compares with a host sum.
+void test_cuda_convolution_inner_dim_row_major_1d() {
+  Tensor<float, 4, RowMajor> input(7,9,11,74);
+  Tensor<float, 1, RowMajor> kernel(4);
+  Tensor<float, 4, RowMajor> out(7,9,11,71);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
+
+  Eigen::array<Eigen::DenseIndex, 1> dims(3);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 7; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        for (int l = 0; l < 71; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i,j,k,l+0) * kernel(0) + input(i,j,k,l+1) * kernel(1) +
+                                 input(i,j,k,l+2) * kernel(2) + input(i,j,k,l+3) * kernel(3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
+}
+
+// Convolves dimensions 1 and 2 of a rank-4 tensor with a 3x4 kernel on the GPU and compares with a host sum.
+template<int DataLayout>
+void test_cuda_convolution_2d() {
+  Tensor<float, 4, DataLayout> input(74,37,11,137);
+  Tensor<float, 2, DataLayout> kernel(3,4);
+  Tensor<float, 4, DataLayout> out(74,35,8,137);
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137);
+  Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
+  Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
+
+  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 35; ++j) {
+      for (int k = 0; k < 8; ++k) {
+        for (int l = 0; l < 137; ++l) {
+          const float result = out(i,j,k,l);
+          const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
+                                 input(i,j+1,k+0,l) * kernel(1,0) +
+                                 input(i,j+2,k+0,l) * kernel(2,0) +
+                                 input(i,j+0,k+1,l) * kernel(0,1) +
+                                 input(i,j+1,k+1,l) * kernel(1,1) +
+                                 input(i,j+2,k+1,l) * kernel(2,1) +
+                                 input(i,j+0,k+2,l) * kernel(0,2) +
+                                 input(i,j+1,k+2,l) * kernel(1,2) +
+                                 input(i,j+2,k+2,l) * kernel(2,2) +
+                                 input(i,j+0,k+3,l) * kernel(0,3) +
+                                 input(i,j+1,k+3,l) * kernel(1,3) +
+                                 input(i,j+2,k+3,l) * kernel(2,3);
+          VERIFY_IS_APPROX(result, expected);
+        }
+      }
+    }
+  }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
+}
+
+// Convolves dimensions 1, 2 and 3 of a rank-5 tensor with a 3x4x2 kernel on the GPU and compares with a host sum.
+template<int DataLayout>
+void test_cuda_convolution_3d() {
+  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
+  Tensor<float, 3, DataLayout> kernel(3,4,2);
+  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
+  input = input.constant(10.0f) + input.random();
+  kernel = kernel.constant(7.0f) + kernel.random();
+
+  std::size_t input_bytes = input.size() * sizeof(float);
+  std::size_t kernel_bytes = kernel.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_input;
+  float* d_kernel;
+  float* d_out;
+  cudaMalloc((void**)(&d_input), input_bytes);
+  cudaMalloc((void**)(&d_kernel), kernel_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+
+  cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17);
+  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
+  Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
+
+  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
+  gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 74; ++i) {
+    for (int j = 0; j < 35; ++j) {
+      for (int k = 0; k < 8; ++k) {
+        for (int l = 0; l < 136; ++l) {
+          for (int m = 0; m < 17; ++m) {
+            const float result = out(i,j,k,l,m);
+            const float expected = input(i,j+0,k+0,l+0,m) * kernel(0,0,0) +
+                                   input(i,j+1,k+0,l+0,m) * kernel(1,0,0) +
+                                   input(i,j+2,k+0,l+0,m) * kernel(2,0,0) +
+                                   input(i,j+0,k+1,l+0,m) * kernel(0,1,0) +
+                                   input(i,j+1,k+1,l+0,m) * kernel(1,1,0) +
+                                   input(i,j+2,k+1,l+0,m) * kernel(2,1,0) +
+                                   input(i,j+0,k+2,l+0,m) * kernel(0,2,0) +
+                                   input(i,j+1,k+2,l+0,m) * kernel(1,2,0) +
+                                   input(i,j+2,k+2,l+0,m) * kernel(2,2,0) +
+                                   input(i,j+0,k+3,l+0,m) * kernel(0,3,0) +
+                                   input(i,j+1,k+3,l+0,m) * kernel(1,3,0) +
+                                   input(i,j+2,k+3,l+0,m) * kernel(2,3,0) +
+                                   input(i,j+0,k+0,l+1,m) * kernel(0,0,1) +
+                                   input(i,j+1,k+0,l+1,m) * kernel(1,0,1) +
+                                   input(i,j+2,k+0,l+1,m) * kernel(2,0,1) +
+                                   input(i,j+0,k+1,l+1,m) * kernel(0,1,1) +
+                                   input(i,j+1,k+1,l+1,m) * kernel(1,1,1) +
+                                   input(i,j+2,k+1,l+1,m) * kernel(2,1,1) +
+                                   input(i,j+0,k+2,l+1,m) * kernel(0,2,1) +
+                                   input(i,j+1,k+2,l+1,m) * kernel(1,2,1) +
+                                   input(i,j+2,k+2,l+1,m) * kernel(2,2,1) +
+                                   input(i,j+0,k+3,l+1,m) * kernel(0,3,1) +
+                                   input(i,j+1,k+3,l+1,m) * kernel(1,3,1) +
+                                   input(i,j+2,k+3,l+1,m) * kernel(2,3,1);
+            VERIFY_IS_APPROX(result, expected);
+          }
+        }
+      }
+    }
+  }
+
+  cudaFree(d_input);
+  cudaFree(d_kernel);
+  cudaFree(d_out);
+}
+
+
+// Evaluates lgamma() element-wise on the GPU for random inputs scaled by stddev and compares with std::lgamma.
+template <typename Scalar>
+void test_cuda_lgamma(const Scalar stddev) {
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+  gpu_out.device(gpu_device) = gpu_in.lgamma();
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      VERIFY_IS_APPROX(out(i,j), (std::lgamma)(in(i,j)));
+    }
+  }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
+}
+
+// Evaluates digamma() on the GPU for seven fixed inputs and compares with hard-coded reference values.
+template <typename Scalar>
+void test_cuda_digamma() {
+  Tensor<Scalar, 1> in(7);
+  Tensor<Scalar, 1> out(7);
+  Tensor<Scalar, 1> expected_out(7);
+  out.setZero();
+
+  in(0) = Scalar(1);
+  in(1) = Scalar(1.5);
+  in(2) = Scalar(4);
+  in(3) = Scalar(-10.5);
+  in(4) = Scalar(10000.5);
+  in(5) = Scalar(0);
+  in(6) = Scalar(-1);
+
+  expected_out(0) = Scalar(-0.5772156649015329);
+  expected_out(1) = Scalar(0.03648997397857645);
+  expected_out(2) = Scalar(1.2561176684318);
+  expected_out(3) = Scalar(2.398239129535781);
+  expected_out(4) = Scalar(9.210340372392849);
+  expected_out(5) = std::numeric_limits<Scalar>::infinity();
+  expected_out(6) = std::numeric_limits<Scalar>::infinity();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
+
+  gpu_out.device(gpu_device) = gpu_in.digamma();
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+  for (int i = 5; i < 7; ++i) {
+    VERIFY_IS_EQUAL(out(i), expected_out(i));  // infinite expectations are compared exactly
+  }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
+}
+
+// Evaluates gpu_in_x.zeta(gpu_in_q) on the GPU for six fixed input pairs and compares with reference values.
+template <typename Scalar>
+void test_cuda_zeta() {
+  Tensor<Scalar, 1> in_x(6);
+  Tensor<Scalar, 1> in_q(6);
+  Tensor<Scalar, 1> out(6);
+  Tensor<Scalar, 1> expected_out(6);
+  out.setZero();
+
+  in_x(0) = Scalar(1);
+  in_x(1) = Scalar(1.5);
+  in_x(2) = Scalar(4);
+  in_x(3) = Scalar(-10.5);
+  in_x(4) = Scalar(10000.5);
+  in_x(5) = Scalar(3);
+
+  in_q(0) = Scalar(1.2345);
+  in_q(1) = Scalar(2);
+  in_q(2) = Scalar(1.5);
+  in_q(3) = Scalar(3);
+  in_q(4) = Scalar(1.0001);
+  in_q(5) = Scalar(-2.5);
+
+  expected_out(0) = std::numeric_limits<Scalar>::infinity();
+  expected_out(1) = Scalar(1.61237534869);
+  expected_out(2) = Scalar(0.234848505667);
+  expected_out(3) = Scalar(1.03086757337e-5);  // never compared: out(3) is only checked for NaN below
+  expected_out(4) = Scalar(0.367879440865);
+  expected_out(5) = Scalar(0.054102025820864097);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_in_q;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in_x), bytes);
+  cudaMalloc((void**)(&d_in_q), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_q(d_in_q, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 6);
+
+  gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q);
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  VERIFY_IS_EQUAL(out(0), expected_out(0));
+  VERIFY((std::isnan)(out(3)));
+
+  for (int i = 1; i < 6; ++i) {
+    if (i != 3) {
+      VERIFY_IS_APPROX(out(i), expected_out(i));
+    }
+  }
+
+  cudaFree(d_in_x);
+  cudaFree(d_in_q);
+  cudaFree(d_out);
+}
+
+// Evaluates gpu_in_n.polygamma(gpu_in_x) on the GPU for seven fixed (n, x) pairs and compares with reference values.
+template <typename Scalar>
+void test_cuda_polygamma() {
+  Tensor<Scalar, 1> in_x(7);
+  Tensor<Scalar, 1> in_n(7);
+  Tensor<Scalar, 1> out(7);
+  Tensor<Scalar, 1> expected_out(7);
+  out.setZero();
+
+  in_n(0) = Scalar(1);
+  in_n(1) = Scalar(1);
+  in_n(2) = Scalar(1);
+  in_n(3) = Scalar(17);
+  in_n(4) = Scalar(31);
+  in_n(5) = Scalar(28);
+  in_n(6) = Scalar(8);
+
+  in_x(0) = Scalar(2);
+  in_x(1) = Scalar(3);
+  in_x(2) = Scalar(25.5);
+  in_x(3) = Scalar(4.7);
+  in_x(4) = Scalar(11.8);
+  in_x(5) = Scalar(17.7);
+  in_x(6) = Scalar(30.2);
+
+  expected_out(0) = Scalar(0.644934066848);
+  expected_out(1) = Scalar(0.394934066848);
+  expected_out(2) = Scalar(0.0399946696496);
+  expected_out(3) = Scalar(293.334565435);
+  expected_out(4) = Scalar(0.445487887616);
+  expected_out(5) = Scalar(-2.47810300902e-07);
+  expected_out(6) = Scalar(-8.29668781082e-09);
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_in_n;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in_x), bytes);
+  cudaMalloc((void**)(&d_in_n), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_n(d_in_n, 7);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 7);
+
+  gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x);
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 7; ++i) {
+    VERIFY_IS_APPROX(out(i), expected_out(i));
+  }
+
+  cudaFree(d_in_x);
+  cudaFree(d_in_n);
+  cudaFree(d_out);
+}
+
+// Evaluates gpu_a.igamma(gpu_x) on the GPU over a 6x6 grid of (a, x) pairs and compares with reference values.
+template <typename Scalar>
+void test_cuda_igamma() {
+  Tensor<Scalar, 2> a(6, 6);
+  Tensor<Scalar, 2> x(6, 6);
+  Tensor<Scalar, 2> out(6, 6);
+  out.setZero();
+
+  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      a(i, j) = a_s[i];
+      x(i, j) = x_s[j];
+    }
+  }
+
+  // Reference values: row i is a = a_s[i], column j is x = x_s[j]; nan marks entries expected to be NaN.
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
+                          {0.0, 0.6321205588285578, 0.7768698398515702,
+                           0.9816843611112658, 9.999500016666262e-05, 1.0},
+                          {0.0, 0.4275932955291202, 0.608374823728911,
+                           0.9539882943107686, 7.522076445089201e-07, 1.0},
+                          {0.0, 0.01898815687615381, 0.06564245437845008,
+                           0.5665298796332909, 4.166333347221828e-18, 1.0},
+                          {0.0, 0.9999780593618628, 0.9999899967080838,
+                           0.9999996219837988, 0.9991370418689945, 1.0},
+                          {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+
+  std::size_t bytes = a.size() * sizeof(Scalar);
+
+  Scalar* d_a;
+  Scalar* d_x;
+  Scalar* d_out;
+  // Not assert(): its argument, and with it the call itself, vanishes when NDEBUG is defined.
+  VERIFY(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess);
+  VERIFY(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess);
+  VERIFY(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+
+  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+  gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x);
+
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      if ((std::isnan)(igamma_s[i][j])) {
+        VERIFY((std::isnan)(out(i, j)));
+      } else {
+        VERIFY_IS_APPROX(out(i, j), igamma_s[i][j]);
+      }
+    }
+  }
+
+  cudaFree(d_a);
+  cudaFree(d_x);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_igammac()  // Checks igammac(a, x) evaluated on the GPU against the reference table igammac_s.
+{
+  Tensor<Scalar, 2> a(6, 6);
+  Tensor<Scalar, 2> x(6, 6);
+  Tensor<Scalar, 2> out(6, 6);
+  out.setZero();
+
+  Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+  Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      a(i, j) = a_s[i];  // row i holds a single value of a
+      x(i, j) = x_s[j];  // column j holds a single value of x
+    }
+  }
+  // Expected results: igammac_s[i][j] corresponds to a = a_s[i], x = x_s[j].
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+  Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
+                           {1.0, 0.36787944117144233, 0.22313016014842982,
+                            0.018315638888734182, 0.9999000049998333, 0.0},
+                           {1.0, 0.5724067044708798, 0.3916251762710878,
+                            0.04601170568923136, 0.9999992477923555, 0.0},
+                           {1.0, 0.9810118431238462, 0.9343575456215499,
+                            0.4334701203667089, 1.0, 0.0},
+                           {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+                            3.7801620118431334e-07, 0.0008629581310054535,
+                            0.0},
+                           {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+
+  std::size_t bytes = a.size() * sizeof(Scalar);
+
+  Scalar* d_a;
+  Scalar* d_x;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_a), bytes);  // NOTE(review): return codes of these allocations are not checked
+  cudaMalloc((void**)(&d_x), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  // Wrap the raw device pointers as 6x6 tensors so the expression below can use them.
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_x(d_x, 6, 6);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 6, 6);
+
+  gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x);
+  // Copy the result back on the device's stream and wait for the stream before reading out.
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 6; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      if ((std::isnan)(igammac_s[i][j])) {  // NaN expectations are checked with isnan
+        VERIFY((std::isnan)(out(i, j)));
+      } else {
+        VERIFY_IS_APPROX(out(i, j), igammac_s[i][j]);
+      }
+    }
+  }
+
+  cudaFree(d_a);
+  cudaFree(d_x);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_erf(const Scalar stddev)  // Compares erf() evaluated on the GPU with std::erf on the host.
+{
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);  // scale the random input by the caller-supplied factor
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess);  // NOTE(review): only runs when assert() is active
+  assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess);
+
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  // Wrap the raw device pointers as 72x97 tensors so the expression below can use them.
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+  gpu_out.device(gpu_device) = gpu_in.erf();
+  // Copy the result back on the device's stream and wait for the stream before reading out.
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      VERIFY_IS_APPROX(out(i,j), (std::erf)(in(i,j)));  // host std::erf is the reference
+    }
+  }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_erfc(const Scalar stddev)  // Compares erfc() evaluated on the GPU with std::erfc on the host.
+{
+  Tensor<Scalar, 2> in(72,97);
+  in.setRandom();
+  in *= in.constant(stddev);  // scale the random input by the caller-supplied factor
+  Tensor<Scalar, 2> out(72,97);
+  out.setZero();
+
+  std::size_t bytes = in.size() * sizeof(Scalar);
+
+  Scalar* d_in;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in), bytes);  // NOTE(review): return codes of these allocations are not checked
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  // Wrap the raw device pointers as 72x97 tensors so the expression below can use them.
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_out(d_out, 72, 97);
+
+  gpu_out.device(gpu_device) = gpu_in.erfc();
+  // Copy the result back on the device's stream and wait for the stream before reading out.
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      VERIFY_IS_APPROX(out(i,j), (std::erfc)(in(i,j)));  // host std::erfc is the reference
+    }
+  }
+
+  cudaFree(d_in);
+  cudaFree(d_out);
+}
+
+template <typename Scalar>
+void test_cuda_betainc()  // Checks betainc(a, b, x) evaluated on the GPU against the reference values in v.
+{
+  Tensor<Scalar, 1> in_x(125);
+  Tensor<Scalar, 1> in_a(125);
+  Tensor<Scalar, 1> in_b(125);
+  Tensor<Scalar, 1> out(125);
+  Tensor<Scalar, 1> expected_out(125);
+  out.setZero();
+
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+
+  Array<Scalar, 1, Dynamic> x(125);
+  Array<Scalar, 1, Dynamic> a(125);
+  Array<Scalar, 1, Dynamic> b(125);
+  Array<Scalar, 1, Dynamic> v(125);
+  // Inputs: 5 values of a x 5 values of b x 5 values of x (125 combinations); v holds the expected results.
+  a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+      0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+      999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999;
+
+  b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0,
+      0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+      0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+      0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+      31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999,
+      999.999, 999.999, 999.999;
+
+  x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+      0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+      -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+      1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+      0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+      0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1;
+
+  v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+      nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+      nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+      0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+      0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+      0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan,
+      nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+      0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+      0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+      0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+      0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+      1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan,
+      nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444,
+      nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229,
+      nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan,
+      nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306,
+      2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302,
+      1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252,
+      2.9303043666183996e-60, nan, nan, 2.248913486879199e-196,
+      0.5000000000004947, 0.9999999999999999, nan;
+
+  for (int i = 0; i < 125; ++i) {
+    in_x(i) = x(i);
+    in_a(i) = a(i);
+    in_b(i) = b(i);
+    expected_out(i) = v(i);
+  }
+
+  std::size_t bytes = in_x.size() * sizeof(Scalar);
+
+  Scalar* d_in_x;
+  Scalar* d_in_a;
+  Scalar* d_in_b;
+  Scalar* d_out;
+  cudaMalloc((void**)(&d_in_x), bytes);  // NOTE(review): return codes of these allocations are not checked
+  cudaMalloc((void**)(&d_in_a), bytes);
+  cudaMalloc((void**)(&d_in_b), bytes);
+  cudaMalloc((void**)(&d_out), bytes);
+
+  cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  // Wrap the raw device pointers as length-125 tensors so the expression below can use them.
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125);
+  Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_out(d_out, 125);
+
+  gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x);
+  // Copy the result back on the device's stream and wait for the stream before reading out.
+  assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  // Check every element, starting at index 0; NaN expectations are tested with isnan.
+  for (int i = 0; i < 125; ++i) {
+    if ((std::isnan)(expected_out(i))) {
+      VERIFY((std::isnan)(out(i)));
+    } else {
+      VERIFY_IS_APPROX(out(i), expected_out(i));
+    }
+  }
+
+  cudaFree(d_in_x);
+  cudaFree(d_in_a);
+  cudaFree(d_in_b);
+  cudaFree(d_out);
+}
+
+
+void test_cxx11_tensor_cuda()  // Runs the subtests listed below through the CALL_SUBTEST_<n> macros.
+{
+  CALL_SUBTEST_1(test_cuda_nullary());
+  CALL_SUBTEST_1(test_cuda_elementwise_small());
+  CALL_SUBTEST_1(test_cuda_elementwise());
+  CALL_SUBTEST_1(test_cuda_props());
+  CALL_SUBTEST_1(test_cuda_reduction());
+  CALL_SUBTEST_2(test_cuda_contraction<ColMajor>());
+  CALL_SUBTEST_2(test_cuda_contraction<RowMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d());
+  CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d());
+  CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>());
+  CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>());
+
+#if __cplusplus > 199711L
+  // std::erf, std::erfc, and so on were only added in C++11. We use them
+  // as a golden reference to validate the results produced by Eigen. Therefore
+  // we can only run these tests if we use a C++11 compiler.
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f));
+
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001));
+
+  CALL_SUBTEST_4(test_cuda_erf<float>(1.0f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(100.0f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_erf<float>(0.001f));
+
+  CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f));
+  // CALL_SUBTEST(test_cuda_erfc<float>(100.0f));
+  CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f));
+  CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f));
+
+  CALL_SUBTEST_4(test_cuda_erf<double>(1.0));
+  CALL_SUBTEST_4(test_cuda_erf<double>(100.0));
+  CALL_SUBTEST_4(test_cuda_erf<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_erf<double>(0.001));
+
+  CALL_SUBTEST_4(test_cuda_erfc<double>(1.0));
+  // CALL_SUBTEST(test_cuda_erfc<double>(100.0));
+  CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs
+  CALL_SUBTEST_4(test_cuda_erfc<double>(0.01));
+  CALL_SUBTEST_4(test_cuda_erfc<double>(0.001));
+
+  CALL_SUBTEST_5(test_cuda_digamma<float>());
+  CALL_SUBTEST_5(test_cuda_digamma<double>());
+
+  CALL_SUBTEST_5(test_cuda_polygamma<float>());
+  CALL_SUBTEST_5(test_cuda_polygamma<double>());
+
+  CALL_SUBTEST_5(test_cuda_zeta<float>());
+  CALL_SUBTEST_5(test_cuda_zeta<double>());
+
+  CALL_SUBTEST_5(test_cuda_igamma<float>());
+  CALL_SUBTEST_5(test_cuda_igammac<float>());
+
+  CALL_SUBTEST_5(test_cuda_igamma<double>());
+  CALL_SUBTEST_5(test_cuda_igammac<double>());
+
+  CALL_SUBTEST_6(test_cuda_betainc<float>());
+  CALL_SUBTEST_6(test_cuda_betainc<double>());
+#endif
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_custom_index.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_custom_index.cpp
new file mode 100644
index 000000000..4528cc176
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_custom_index.cpp
@@ -0,0 +1,100 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <map>
+
+#include <Eigen/Dense>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+
+template <int DataLayout>
+static void test_map_as_index()  // A std::map used as an index must select the same element as a DSizes index.
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  using NormalIndex = DSizes<ptrdiff_t, 4>;
+  using CustomIndex = std::map<ptrdiff_t, ptrdiff_t>;
+  CustomIndex coeffC;  // key k holds the coordinate along dimension k
+  coeffC[0] = 1;
+  coeffC[1] = 2;
+  coeffC[2] = 4;
+  coeffC[3] = 1;
+  NormalIndex coeff(1,2,4,1);  // same coordinates as coeffC
+
+  VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff));
+#endif
+}
+
+
+template <int DataLayout>
+static void test_matrix_as_index()  // A 4x1 Matrix used as an index must select the same element as a DSizes index.
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  using NormalIndex = DSizes<ptrdiff_t, 4>;
+  using CustomIndex = Matrix<unsigned int, 4, 1>;
+  CustomIndex coeffC(1,2,4,1);  // same coordinates as coeff, different container and scalar type
+  NormalIndex coeff(1,2,4,1);
+
+  VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff));
+#endif
+}
+
+
+template <int DataLayout>
+static void test_varlist_as_index()  // A braced list {1,2,4,1} used as an index must match the DSizes index.
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  DSizes<ptrdiff_t, 4> coeff(1,2,4,1);  // reference index with the same coordinates as the braced lists below
+
+  VERIFY_IS_EQUAL(tensor.coeff({1,2,4,1}), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef({1,2,4,1}), tensor.coeffRef(coeff));
+#endif
+}
+
+
+template <int DataLayout>
+static void test_sizes_as_index()  // A Sizes<1,2,4,1> object used as an index must match the DSizes index.
+{
+#ifdef EIGEN_HAS_SFINAE
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+
+  DSizes<ptrdiff_t, 4> coeff(1,2,4,1);
+  Sizes<1,2,4,1> coeffC;  // the coordinates are carried in the template arguments
+
+  VERIFY_IS_EQUAL(tensor.coeff(coeffC), tensor.coeff(coeff));
+  VERIFY_IS_EQUAL(tensor.coeffRef(coeffC), tensor.coeffRef(coeff));
+#endif
+}
+
+
+void test_cxx11_tensor_custom_index() {  // Runs each custom-index check once per storage layout.
+  test_map_as_index<ColMajor>();
+  test_map_as_index<RowMajor>();
+  test_matrix_as_index<ColMajor>();
+  test_matrix_as_index<RowMajor>();
+  test_varlist_as_index<ColMajor>();
+  test_varlist_as_index<RowMajor>();
+  test_sizes_as_index<ColMajor>();
+  test_sizes_as_index<RowMajor>();
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_custom_op.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_custom_op.cpp
new file mode 100644
index 000000000..8baa477cc
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_custom_op.cpp
@@ -0,0 +1,111 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+
+struct InsertZeros {  // Custom unary op: output is twice the input size in each dimension.
+  DSizes<DenseIndex, 2> dimensions(const Tensor<float, 2>& input) const {  // output shape for a given input
+    DSizes<DenseIndex, 2> result;
+    result[0] = input.dimension(0) * 2;
+    result[1] = input.dimension(1) * 2;
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const Tensor<float, 2>& input, Output& output, const Device& device) const
+  {
+    array<DenseIndex, 2> strides;
+    strides[0] = 2;
+    strides[1] = 2;
+    output.stride(strides).device(device) = input;  // copy input to the (even, even) positions of output
+    // Zero the (odd, odd) positions; positions with exactly one odd coordinate are not assigned here.
+    Eigen::DSizes<DenseIndex, 2> offsets(1,1);
+    Eigen::DSizes<DenseIndex, 2> extents(output.dimension(0)-1, output.dimension(1)-1);
+    output.slice(offsets, extents).stride(strides).device(device) = input.constant(0.0f);
+  }
+};
+
+static void test_custom_unary_op()  // Applies InsertZeros through customOp() and checks the result's shape and contents.
+{
+  Tensor<float, 2> tensor(3,5);
+  tensor.setRandom();
+
+  Tensor<float, 2> result = tensor.customOp(InsertZeros());
+  VERIFY_IS_EQUAL(result.dimension(0), 6);
+  VERIFY_IS_EQUAL(result.dimension(1), 10);
+  // (even, even) positions must hold the original values.
+  for (int i = 0; i < 6; i+=2) {
+    for (int j = 0; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(result(i, j), tensor(i/2, j/2));
+    }
+  }
+  for (int i = 1; i < 6; i+=2) {  // (odd, odd) positions must be zero
+    for (int j = 1; j < 10; j+=2) {
+      VERIFY_IS_EQUAL(result(i, j), 0);
+    }
+  }
+}
+
+
+struct BatchMatMul {  // Custom binary op: contracts matching slices of two rank-3 tensors along their last dimension.
+  DSizes<DenseIndex, 3> dimensions(const Tensor<float, 3>& input1, const Tensor<float, 3>& input2) const {
+    DSizes<DenseIndex, 3> result;
+    result[0] = input1.dimension(0);
+    result[1] = input2.dimension(1);
+    result[2] = input2.dimension(2);
+    return result;
+  }
+
+  template <typename Output, typename Device>
+  void eval(const Tensor<float, 3>& input1, const Tensor<float, 3>& input2,
+            Output& output, const Device& device) const
+  {
+    typedef Tensor<float, 3>::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);  // contract dimension 1 of the left slice with dimension 0 of the right slice
+    for (int i = 0; i < output.dimension(2); ++i) {  // one contraction per index along dimension 2
+      output.template chip<2>(i).device(device) = input1.chip<2>(i).contract(input2.chip<2>(i), dims);
+    }
+  }
+};
+
+
+static void test_custom_binary_op()  // Applies BatchMatMul through customOp() and compares each slice with a direct contraction.
+{
+  Tensor<float, 3> tensor1(2,3,5);
+  tensor1.setRandom();
+  Tensor<float, 3> tensor2(3,7,5);
+  tensor2.setRandom();
+
+  Tensor<float, 3> result = tensor1.customOp(tensor2, BatchMatMul());
+  for (int i = 0; i < 5; ++i) {  // one slice per index along dimension 2
+    typedef Tensor<float, 3>::DimensionPair DimPair;
+    array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    Tensor<float, 2> reference = tensor1.chip<2>(i).contract(tensor2.chip<2>(i), dims);  // expected 2x7 slice
+    TensorRef<Tensor<float, 2> > val = result.chip<2>(i);
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(val(j, k), reference(j, k));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_custom_op()  // Runs the unary and binary custom-op subtests.
+{
+  CALL_SUBTEST(test_custom_unary_op());
+  CALL_SUBTEST(test_custom_binary_op());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_device.cu b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_device.cu
new file mode 100644
index 000000000..cbb43e210
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_device.cu
@@ -0,0 +1,387 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_device
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+// Context for evaluation on the CPU: holds references to the inputs/output, the convolution kernels and a DefaultDevice.
+struct CPUContext {
+  CPUContext(const Eigen::Tensor<float, 3>& in1, Eigen::Tensor<float, 3>& in2, Eigen::Tensor<float, 3>& out) : in1_(in1), in2_(in2), out_(out), kernel_1d_(2), kernel_2d_(2,2), kernel_3d_(2,2,2) {
+    kernel_1d_(0) = 3.14f;
+    kernel_1d_(1) = 2.7f;
+
+    kernel_2d_(0,0) = 3.14f;
+    kernel_2d_(1,0) = 2.7f;
+    kernel_2d_(0,1) = 0.2f;
+    kernel_2d_(1,1) = 7.0f;
+
+    kernel_3d_(0,0,0) = 3.14f;
+    kernel_3d_(0,1,0) = 2.7f;
+    kernel_3d_(0,0,1) = 0.2f;
+    kernel_3d_(0,1,1) = 7.0f;
+    kernel_3d_(1,0,0) = -1.0f;
+    kernel_3d_(1,1,0) = -0.3f;
+    kernel_3d_(1,0,1) = -0.7f;
+    kernel_3d_(1,1,1) = -0.5f;
+  }
+
+  const Eigen::DefaultDevice& device() const { return cpu_device_; }
+
+  const Eigen::Tensor<float, 3>& in1() const { return in1_; }
+  const Eigen::Tensor<float, 3>& in2() const { return in2_; }
+  Eigen::Tensor<float, 3>& out() { return out_; }
+  const Eigen::Tensor<float, 1>& kernel1d() const { return kernel_1d_; }
+  const Eigen::Tensor<float, 2>& kernel2d() const { return kernel_2d_; }
+  const Eigen::Tensor<float, 3>& kernel3d() const { return kernel_3d_; }
+
+ private:
+  const Eigen::Tensor<float, 3>& in1_;  // stored by reference: the caller's tensors must outlive this context
+  const Eigen::Tensor<float, 3>& in2_;
+  Eigen::Tensor<float, 3>& out_;
+
+  Eigen::Tensor<float, 1> kernel_1d_;
+  Eigen::Tensor<float, 2> kernel_2d_;
+  Eigen::Tensor<float, 3> kernel_3d_;
+
+  Eigen::DefaultDevice cpu_device_;
+};
+
+
+// Context for evaluation on the GPU: holds references to the device tensor maps, device-side kernels and a GpuDevice.
+struct GPUContext {
+  GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) {
+    assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess);  // NOTE(review): CUDA calls inside assert() only run when assert() is active
+    float kernel_1d_val[] = {3.14f, 2.7f};
+    assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+
+    assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess);
+    float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f};
+    assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+
+    assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess);
+    float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f};  // presumably CPUContext's 3-D kernel in column-major order - confirm
+    assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess);
+  }
+  ~GPUContext() {
+    assert(cudaFree(kernel_1d_) == cudaSuccess);
+    assert(cudaFree(kernel_2d_) == cudaSuccess);
+    assert(cudaFree(kernel_3d_) == cudaSuccess);
+  }
+
+  const Eigen::GpuDevice& device() const { return gpu_device_; }
+
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1() const { return in1_; }
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2() const { return in2_; }
+  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out() { return out_; }
+  Eigen::TensorMap<Eigen::Tensor<float, 1> > kernel1d() const { return Eigen::TensorMap<Eigen::Tensor<float, 1> >(kernel_1d_, 2); }
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > kernel2d() const { return Eigen::TensorMap<Eigen::Tensor<float, 2> >(kernel_2d_, 2, 2); }
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > kernel3d() const { return Eigen::TensorMap<Eigen::Tensor<float, 3> >(kernel_3d_, 2, 2, 2); }
+
+ private:
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1_;  // stored by reference: the caller's maps must outlive this context
+  const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2_;
+  Eigen::TensorMap<Eigen::Tensor<float, 3> >& out_;
+
+  float* kernel_1d_;  // buffers obtained from cudaMalloc in the constructor, released in the destructor
+  float* kernel_2d_;
+  float* kernel_3d_;
+
+  Eigen::CudaStreamDevice stream_;  // declared before gpu_device_, which is constructed from &stream_
+  Eigen::GpuDevice gpu_device_;
+};
+
+
+// Evaluates out = in1 + in2 * 3.14 + 2.718 on whichever device the context provides.
+template <typename Context>
+void test_contextual_eval(Context* context)
+{
+  context->out().device(context->device()) = context->in1() + context->in2() * 3.14f + context->in1().constant(2.718f);
+}
+
+template <typename Context>
+void test_forced_contextual_eval(Context* context)  // Evaluates out = (in1 + in2).eval() * 3.14 + 2.718 on the context's device.
+{
+  context->out().device(context->device()) = (context->in1() + context->in2()).eval() * 3.14f + context->in1().constant(2.718f);
+}
+
+template <typename Context>
+void test_compound_assignment(Context* context)  // Builds the same value as test_contextual_eval in two steps, using += on the device.
+{
+  context->out().device(context->device()) = context->in1().constant(2.718f);
+  context->out().device(context->device()) += context->in1() + context->in2() * 3.14f;
+}
+
+
+template <typename Context>
+void test_contraction(Context* context)  // Contracts in1 with in2 over dimensions 1 and 2 and stores the 40x40 result in out.
+{
+  Eigen::array<std::pair<int, int>, 2> dims;
+  dims[0] = std::make_pair(1, 1);
+  dims[1] = std::make_pair(2, 2);
+
+  Eigen::array<int, 2> shape(40, 50*70);
+  // out is viewed as a 40 x (50*70) matrix; only its leading 40x40 slice receives the result.
+  Eigen::DSizes<int, 2> indices(0,0);
+  Eigen::DSizes<int, 2> sizes(40,40);
+
+  context->out().reshape(shape).slice(indices, sizes).device(context->device()) = context->in1().contract(context->in2(), dims);
+}
+
+
+template <typename Context>
+void test_1d_convolution(Context* context)  // Convolves in1 with the context's 1-D kernel along dimension 1.
+{
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(40,49,70);  // the result is written to the leading 40x49x70 slice of out
+
+  Eigen::array<int, 1> dims(1);
+  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel1d(), dims);
+}
+
+template <typename Context>
+void test_2d_convolution(Context* context)  // Convolves in1 with the context's 2-D kernel along dimensions 1 and 2.
+{
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(40,49,69);  // the result is written to the leading 40x49x69 slice of out
+
+  Eigen::array<int, 2> dims(1,2);
+  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel2d(), dims);
+}
+
+template <typename Context>
+void test_3d_convolution(Context* context)  // Convolves in1 with the context's 3-D kernel along all three dimensions.
+{
+  Eigen::DSizes<int, 3> indices(0,0,0);
+  Eigen::DSizes<int, 3> sizes(39,49,69);  // the result is written to the leading 39x49x69 slice of out
+
+  Eigen::array<int, 3> dims(0,1,2);
+  context->out().slice(indices, sizes).device(context->device()) = context->in1().convolve(context->kernel3d(), dims);
+}
+
+
+void test_cpu() {  // Runs each expression helper with a CPUContext and checks out against values recomputed element by element.
+  Eigen::Tensor<float, 3> in1(40,50,70);
+  Eigen::Tensor<float, 3> in2(40,50,70);
+  Eigen::Tensor<float, 3> out(40,50,70);
+  // Inputs are random values shifted by 10.
+  in1 = in1.random() + in1.constant(10.0f);
+  in2 = in2.random() + in2.constant(10.0f);
+
+  CPUContext context(in1, in2, out);
+  test_contextual_eval(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+  // Here the sum is parenthesised, matching the (in1 + in2).eval() in the helper.
+  test_forced_contextual_eval(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_compound_assignment(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_contraction(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 40; ++j) {
+      const float result = out(i,j,0);  // NOTE(review): assumes the reshaped 40x40 slice maps to out(i, j, 0), i.e. column-major layout - confirm
+      float expected = 0;
+      for (int k = 0; k < 50; ++k) {
+        for (int l = 0; l < 70; ++l) {
+          expected += in1(i, k, l) * in2(j, k, l);
+        }
+      }
+      VERIFY_IS_APPROX(expected, result);
+    }
+  }
+  // 1-D kernel {3.14, 2.7} along dimension 1: each output combines elements j and j+1.
+  test_1d_convolution(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
+      }
+    }
+  }
+
+  test_2d_convolution(&context);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f) +
+                               (in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
+        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {  // both values are tiny: skip the comparison
+          continue;
+        }
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+
+  test_3d_convolution(&context);
+  for (int i = 0; i < 39; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f) +
+                               (in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+        if (fabs(expected) < 1e-4f && fabs(result) < 1e-4f) {  // both values are tiny: skip the comparison
+          continue;
+        }
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+}
+
+void test_gpu() {  // Evaluates each expression on the GPU via GPUContext, copies `out` back and verifies it on the host.
+  Eigen::Tensor<float, 3> in1(40,50,70);
+  Eigen::Tensor<float, 3> in2(40,50,70);
+  Eigen::Tensor<float, 3> out(40,50,70);
+  in1 = in1.random() + in1.constant(10.0f);
+  in2 = in2.random() + in2.constant(10.0f);
+  // Allocate device buffers matching the host tensors, upload both inputs and wrap the raw pointers as TensorMaps.
+  std::size_t in1_bytes = in1.size() * sizeof(float);
+  std::size_t in2_bytes = in2.size() * sizeof(float);
+  std::size_t out_bytes = out.size() * sizeof(float);
+  float* d_in1;
+  float* d_in2;
+  float* d_out;
+  cudaMalloc((void**)(&d_in1), in1_bytes);
+  cudaMalloc((void**)(&d_in2), in2_bytes);
+  cudaMalloc((void**)(&d_out), out_bytes);
+  cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70);
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, 40,50,70);
+  // Copies back to the host go through VERIFY_IS_EQUAL rather than assert() so they still run when NDEBUG is defined.
+  GPUContext context(gpu_in1, gpu_in2, gpu_out);
+  test_contextual_eval(&context);
+  VERIFY_IS_EQUAL(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost), cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_forced_contextual_eval(&context);
+  VERIFY_IS_EQUAL(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost), cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) + in2(i,j,k)) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_compound_assignment(&context);
+  VERIFY_IS_EQUAL(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost), cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 50; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f + 2.718f);
+      }
+    }
+  }
+
+  test_contraction(&context);
+  VERIFY_IS_EQUAL(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost), cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 40; ++j) {
+      const float result = out(i,j,0);
+      float expected = 0;
+      for (int k = 0; k < 50; ++k) {
+        for (int l = 0; l < 70; ++l) {
+          expected += in1(i, k, l) * in2(j, k, l);
+        }
+      }
+      VERIFY_IS_APPROX(expected, result);
+    }
+  }
+
+  test_1d_convolution(&context);
+  VERIFY_IS_EQUAL(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()), cudaSuccess);
+  VERIFY_IS_EQUAL(cudaStreamSynchronize(context.device().stream()), cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 70; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f));
+      }
+    }
+  }
+
+  test_2d_convolution(&context);
+  VERIFY_IS_EQUAL(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()), cudaSuccess);
+  VERIFY_IS_EQUAL(cudaStreamSynchronize(context.device().stream()), cudaSuccess);
+  for (int i = 0; i < 40; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f);
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+
+  test_3d_convolution(&context);
+  VERIFY_IS_EQUAL(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()), cudaSuccess);
+  VERIFY_IS_EQUAL(cudaStreamSynchronize(context.device().stream()), cudaSuccess);
+  for (int i = 0; i < 39; ++i) {
+    for (int j = 0; j < 49; ++j) {
+      for (int k = 0; k < 69; ++k) {
+        const float result = out(i,j,k);
+        const float expected = (in1(i,j,k) * 3.14f + in1(i,j+1,k) * 2.7f +
+                                in1(i,j,k+1) * 0.2f + in1(i,j+1,k+1) * 7.0f +
+                                in1(i+1,j,k) * -1.0f + in1(i+1,j+1,k) * -0.3f +
+                                in1(i+1,j,k+1) * -0.7f + in1(i+1,j+1,k+1) * -0.5f);
+        VERIFY_IS_APPROX(expected, result);
+      }
+    }
+  }
+  cudaFree(d_in1);  // Release the device buffers allocated above.
+  cudaFree(d_in2);
+  cudaFree(d_out);
+}
+
+
+void test_cxx11_tensor_device()  // Test entry point: runs test_cpu() as subtest 1 and test_gpu() as subtest 2.
+{
+  CALL_SUBTEST_1(test_cpu());
+  CALL_SUBTEST_2(test_gpu());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_device_sycl.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_device_sycl.cpp
new file mode 100644
index 000000000..7f79753c5
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_device_sycl.cpp
@@ -0,0 +1,31 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_device_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+void test_device_sycl(const Eigen::SyclDevice &sycl_device) {  // Prints the name of the device behind sycl_device's queue.
+  std::cout <<"Hello from ComputeCpp: the requested device exists and the device name is : "
+    << sycl_device.m_queue.get_device(). template get_info<cl::sycl::info::device::name>() <<std::endl;
+}
+void test_cxx11_tensor_device_sycl() {  // Test entry point: builds a SyclDevice from a gpu_selector and runs the device check.
+  cl::sycl::gpu_selector s;
+  Eigen::SyclDevice sycl_device(s);
+  CALL_SUBTEST(test_device_sycl(sycl_device));
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_dimension.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_dimension.cpp
new file mode 100644
index 000000000..16f168ed4
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_dimension.cpp
@@ -0,0 +1,69 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+
+static void test_dynamic_size()  // Checks a DSizes<int, 3> built from (2,3,7) via array_get<>, TotalSize() and operator[].
+{
+  Eigen::DSizes<int, 3> dimensions(2,3,7);
+
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
+  VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7);
+  VERIFY_IS_EQUAL((int)dimensions[0], 2);
+  VERIFY_IS_EQUAL((int)dimensions[1], 3);
+  VERIFY_IS_EQUAL((int)dimensions[2], 7);
+}
+
+static void test_fixed_size()  // Checks the compile-time Sizes<2,3,7> via array_get<> and TotalSize().
+{
+  Eigen::Sizes<2,3,7> dimensions;
+
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3);
+  VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7);
+  VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7);
+}
+
+static void test_match()  // Checks dimensions_match() for equal extents and for differing ranks.
+{
+  Eigen::DSizes<unsigned int, 3> dyn((unsigned int)2,(unsigned int)3,(unsigned int)7);
+  Eigen::Sizes<2,3,7> stat;
+  VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true);
+  // A rank-3 and a rank-2 DSizes are expected not to match, even though their leading extents agree.
+  Eigen::DSizes<int, 3> dyn1(2,3,7);
+  Eigen::DSizes<int, 2> dyn2(2,3);
+  VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn1, dyn2), false);
+}
+
+static void test_rank_zero()  // Checks that rank-0 Sizes<> and DSizes<ptrdiff_t, 0> report rank 0 and total size 1.
+{
+  Eigen::Sizes<> scalar;
+  VERIFY_IS_EQUAL((int)scalar.TotalSize(), 1);
+  VERIFY_IS_EQUAL((int)scalar.rank(), 0);
+  VERIFY_IS_EQUAL((int)internal::array_prod(scalar), 1);
+  // Same expectations for the dynamically sized variant.
+  Eigen::DSizes<ptrdiff_t, 0> dscalar;
+  VERIFY_IS_EQUAL((int)dscalar.TotalSize(), 1);
+  VERIFY_IS_EQUAL((int)dscalar.rank(), 0);
+}
+
+void test_cxx11_tensor_dimension()  // Test entry point: runs the four dimension subtests defined in this file.
+{
+  CALL_SUBTEST(test_dynamic_size());
+  CALL_SUBTEST(test_fixed_size());
+  CALL_SUBTEST(test_match());
+  CALL_SUBTEST(test_rank_zero());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_empty.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_empty.cpp
new file mode 100644
index 000000000..d7eea42d7
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_empty.cpp
@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_empty_tensor()  // Copy-constructs and assigns default-constructed Tensor<float, 2> objects; no VERIFY checks.
+{
+  Tensor<float, 2> source;
+  Tensor<float, 2> tgt1 = source;
+  Tensor<float, 2> tgt2(source);
+  Tensor<float, 2> tgt3;
+  tgt3 = tgt1;
+  tgt3 = tgt2;
+}
+
+static void test_empty_fixed_size_tensor()  // Same copy/assign sequence for TensorFixedSize<float, Sizes<0> >; no VERIFY checks.
+{
+  TensorFixedSize<float, Sizes<0> > source;
+  TensorFixedSize<float, Sizes<0> > tgt1 = source;
+  TensorFixedSize<float, Sizes<0> > tgt2(source);
+  TensorFixedSize<float, Sizes<0> > tgt3;
+  tgt3 = tgt1;
+  tgt3 = tgt2;
+}
+
+
+void test_cxx11_tensor_empty()  // Test entry point: runs the dynamic and fixed-size empty-tensor subtests.
+{
+   CALL_SUBTEST(test_empty_tensor());
+   CALL_SUBTEST(test_empty_fixed_size_tensor());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_expr.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_expr.cpp
new file mode 100644
index 000000000..77e24cb67
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_expr.cpp
@@ -0,0 +1,314 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_1d()  // Checks sqrt(), square(), cube() and operator+ on rank-1 tensors of six floats.
+{
+  Tensor<float, 1> vec1(6);
+  Tensor<float, 1, RowMajor> vec2(6);
+  // vec1 uses the default layout and vec2 is RowMajor; both are filled by hand.
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+  // Results are written through TensorMaps that view plain local float arrays.
+  float data3[6];
+  TensorMap<Tensor<float, 1>> vec3(data3, 6);
+  vec3 = vec1.sqrt();
+  float data4[6];
+  TensorMap<Tensor<float, 1, RowMajor>> vec4(data4, 6);
+  vec4 = vec2.square();
+  float data5[6];
+  TensorMap<Tensor<float, 1, RowMajor>> vec5(data5, 6);
+  vec5 = vec2.cube();
+  // vec3 holds sqrt(vec1).
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
+  // vec4 holds vec2 squared.
+  VERIFY_IS_APPROX(vec4(0), 0.0f);
+  VERIFY_IS_APPROX(vec4(1), 1.0f);
+  VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f);
+  VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f);
+  VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f);
+  VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f);
+  // vec5 holds vec2 cubed.
+  VERIFY_IS_APPROX(vec5(0), 0.0f);
+  VERIFY_IS_APPROX(vec5(1), 1.0f);
+  VERIFY_IS_APPROX(vec5(2), 2.0f * 2.0f * 2.0f);
+  VERIFY_IS_APPROX(vec5(3), 3.0f * 3.0f * 3.0f);
+  VERIFY_IS_APPROX(vec5(4), 4.0f * 4.0f * 4.0f);
+  VERIFY_IS_APPROX(vec5(5), 5.0f * 5.0f * 5.0f);
+  // vec3 is reused for the element-wise sum of vec1 and the RowMajor vec2.
+  vec3 = vec1 + vec2;
+  VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f);
+  VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f);
+  VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f);
+  VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f);
+  VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f);
+  VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f);
+}
+
+static void test_2d()  // Checks abs() on 2x3 TensorMaps, one with the default layout and one RowMajor.
+{
+  float data1[6];
+  TensorMap<Tensor<float, 2>> mat1(data1, 2, 3);
+  float data2[6];
+  TensorMap<Tensor<float, 2, RowMajor>> mat2(data2, 2, 3);
+  // mat1 holds 0..5.
+  mat1(0,0) = 0.0;
+  mat1(0,1) = 1.0;
+  mat1(0,2) = 2.0;
+  mat1(1,0) = 3.0;
+  mat1(1,1) = 4.0;
+  mat1(1,2) = 5.0;
+  // mat2 holds the negated values of mat1.
+  mat2(0,0) = -0.0;
+  mat2(0,1) = -1.0;
+  mat2(0,2) = -2.0;
+  mat2(1,0) = -3.0;
+  mat2(1,1) = -4.0;
+  mat2(1,2) = -5.0;
+
+  Tensor<float, 2> mat3(2,3);
+  Tensor<float, 2, RowMajor> mat4(2,3);
+  mat3 = mat1.abs();
+  mat4 = mat2.abs();
+  // Both results are expected to equal 0..5.
+  VERIFY_IS_APPROX(mat3(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat3(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat3(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat3(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat3(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat3(1,2), 5.0f);
+
+  VERIFY_IS_APPROX(mat4(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat4(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat4(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat4(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat4(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat4(1,2), 5.0f);
+}
+
+static void test_3d()  // Checks a series of coefficient-wise expressions on 2x3x7 tensors in both layouts.
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3, RowMajor> mat2(2,3,7);
+  // Fill both inputs with the same running value 1, 2, 3, ... in (i,j,k) loop order.
+  float val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+  // Evaluate each expression under test into its own result tensor.
+  Tensor<float, 3> mat3(2,3,7);
+  mat3 = mat1 + mat1;
+  Tensor<float, 3, RowMajor> mat4(2,3,7);
+  mat4 = mat2 * 3.14f;
+  Tensor<float, 3> mat5(2,3,7);
+  mat5 = mat1.inverse().log();
+  Tensor<float, 3, RowMajor> mat6(2,3,7);
+  mat6 = mat2.pow(0.5f) * 3.14f;
+  Tensor<float, 3> mat7(2,3,7);
+  mat7 = mat1.cwiseMax(mat5 * 2.0f).exp();
+  Tensor<float, 3, RowMajor> mat8(2,3,7);
+  mat8 = (-mat2).exp() * 3.14f;
+  Tensor<float, 3, RowMajor> mat9(2,3,7);
+  mat9 = mat2 + 3.14f;
+  Tensor<float, 3, RowMajor> mat10(2,3,7);
+  mat10 = mat2 - 3.14f;
+  Tensor<float, 3, RowMajor> mat11(2,3,7);
+  mat11 = mat2 / 3.14f;
+  // Replay the same running value and compare each result with the scalar computation.
+  val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), val + val);
+        VERIFY_IS_APPROX(mat4(i,j,k), val * 3.14f);
+        VERIFY_IS_APPROX(mat5(i,j,k), logf(1.0f/val));
+        VERIFY_IS_APPROX(mat6(i,j,k), sqrtf(val) * 3.14f);
+        VERIFY_IS_APPROX(mat7(i,j,k), expf((std::max)(val, mat5(i,j,k) * 2.0f)));
+        VERIFY_IS_APPROX(mat8(i,j,k), expf(-val) * 3.14f);
+        VERIFY_IS_APPROX(mat9(i,j,k), val + 3.14f);
+        VERIFY_IS_APPROX(mat10(i,j,k), val - 3.14f);
+        VERIFY_IS_APPROX(mat11(i,j,k), val / 3.14f);
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+static void test_constants()  // Checks constant() and cwiseMax() with a scalar argument on a 2x3x7 tensor.
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> mat3(2,3,7);
+  // Fill mat1 with the running value 1, 2, 3, ... in (i,j,k) loop order.
+  float val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+  mat2 = mat1.constant(3.14f);
+  mat3 = mat1.cwiseMax(7.3f).exp();
+  // Replay the running value: mat2 must be 3.14 everywhere, mat3 must be exp(max(val, 7.3)).
+  val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat2(i,j,k), 3.14f);
+        VERIFY_IS_APPROX(mat3(i,j,k), expf((std::max)(val, 7.3f)));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+static void test_boolean()  // Checks || and && on comparison and cast expressions over the int tensor {0,...,5}.
+{
+  Tensor<int, 1> vec(6);
+  std::copy_n(std::begin({0, 1, 2, 3, 4, 5}), 6, vec.data());
+
+  // Test ||.
+  Tensor<bool, 1> bool1 = vec < vec.constant(1) || vec > vec.constant(4);
+  VERIFY_IS_EQUAL(bool1[0], true);
+  VERIFY_IS_EQUAL(bool1[1], false);
+  VERIFY_IS_EQUAL(bool1[2], false);
+  VERIFY_IS_EQUAL(bool1[3], false);
+  VERIFY_IS_EQUAL(bool1[4], false);
+  VERIFY_IS_EQUAL(bool1[5], true);
+
+  // Test &&, including cast of operand vec.
+  Tensor<bool, 1> bool2 = vec.cast<bool>() && vec < vec.constant(4);
+  VERIFY_IS_EQUAL(bool2[0], false);
+  VERIFY_IS_EQUAL(bool2[1], true);
+  VERIFY_IS_EQUAL(bool2[2], true);
+  VERIFY_IS_EQUAL(bool2[3], true);
+  VERIFY_IS_EQUAL(bool2[4], false);
+  VERIFY_IS_EQUAL(bool2[5], false);
+
+  // Compilation tests:
+  // Test Tensor<bool> against results of cast or comparison; verifies that
+  // CoeffReturnType is set to match Op return type of bool for Unary and Binary
+  // Ops.
+  Tensor<bool, 1> bool3 = vec.cast<bool>() && bool2;
+  bool3 = vec < vec.constant(4) && bool2;
+}
+
+static void test_functors()  // Checks unaryExpr() with the C functions asinf and tanhf passed by pointer.
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> mat3(2,3,7);
+  // Fill mat1 with the running value 1, 2, 3, ... in (i,j,k) loop order.
+  float val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+  mat2 = mat1.inverse().unaryExpr(&asinf);
+  mat3 = mat1.unaryExpr(&tanhf);
+  // Expected values are recomputed from mat1 itself; val is reset and advanced below but not read by the checks.
+  val = 1.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat2(i,j,k), asinf(1.0f / mat1(i,j,k)));
+        VERIFY_IS_APPROX(mat3(i,j,k), tanhf(mat1(i,j,k)));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+static void test_type_casting()  // Checks cast<double>() from bool and from float tensors with random contents.
+{
+  Tensor<bool, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<double, 3> mat3(2,3,7);
+  mat1.setRandom();
+  mat2.setRandom();
+  // bool -> double: true is expected to become 1.0 and false 0.0.
+  mat3 = mat1.cast<double>();
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) ? 1.0 : 0.0);
+      }
+    }
+  }
+  // float -> double: the result is expected to match a plain static_cast of each coefficient.
+  mat3 = mat2.cast<double>();
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), static_cast<double>(mat2(i,j,k)));
+      }
+    }
+  }
+}
+
+static void test_select()  // Checks select(): picks mat1 where selector > 0.5 and mat2 elsewhere.
+{
+  Tensor<float, 3> selector(2,3,7);
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> result(2,3,7);
+  // All three inputs are random; the condition is evaluated coefficient-wise.
+  selector.setRandom();
+  mat1.setRandom();
+  mat2.setRandom();
+  result = (selector > selector.constant(0.5f)).select(mat1, mat2);
+  // Compare each coefficient with the equivalent scalar ternary expression.
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(result(i,j,k), (selector(i,j,k) > 0.5f) ? mat1(i,j,k) : mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_expr()  // Test entry point: runs every expression subtest defined in this file.
+{
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_constants());
+  CALL_SUBTEST(test_boolean());
+  CALL_SUBTEST(test_functors());
+  CALL_SUBTEST(test_type_casting());
+  CALL_SUBTEST(test_select());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_fft.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_fft.cpp
new file mode 100644
index 000000000..2f14ebc62
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_fft.cpp
@@ -0,0 +1,273 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Jianwei Cui <thucjw@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_fft_2D_golden() {  // Compares the forward FFT of a fixed 2x3 real input against hard-coded golden values.
+  Tensor<float, 2, DataLayout> input(2, 3);
+  input(0, 0) = 1;
+  input(0, 1) = 2;
+  input(0, 2) = 3;
+  input(1, 0) = 4;
+  input(1, 1) = 5;
+  input(1, 2) = 6;
+
+  array<ptrdiff_t, 2> fft;
+  fft[0] = 0;
+  fft[1] = 1;
+  // Dimensions 0 and 1 are both passed to fft(); the complex result keeps real and imaginary parts.
+  Tensor<std::complex<float>, 2, DataLayout> output = input.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+
+  std::complex<float> output_golden[6]; // in ColMajor order
+  output_golden[0] = std::complex<float>(21, 0);
+  output_golden[1] = std::complex<float>(-9, 0);
+  output_golden[2] = std::complex<float>(-3, 1.73205);
+  output_golden[3] = std::complex<float>( 0, 0);
+  output_golden[4] = std::complex<float>(-3, -1.73205);
+  output_golden[5] = std::complex<float>(0 ,0);
+  // c_offset is added to both sides of every comparison; presumably so zero-valued entries compare well - confirm.
+  std::complex<float> c_offset = std::complex<float>(1.0, 1.0);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_APPROX(output(0) + c_offset, output_golden[0] + c_offset);
+    VERIFY_IS_APPROX(output(1) + c_offset, output_golden[1] + c_offset);
+    VERIFY_IS_APPROX(output(2) + c_offset, output_golden[2] + c_offset);
+    VERIFY_IS_APPROX(output(3) + c_offset, output_golden[3] + c_offset);
+    VERIFY_IS_APPROX(output(4) + c_offset, output_golden[4] + c_offset);
+    VERIFY_IS_APPROX(output(5) + c_offset, output_golden[5] + c_offset);
+  }
+  else {  // Other layout: linear output indices are remapped onto the ColMajor-ordered golden array.
+    VERIFY_IS_APPROX(output(0)+ c_offset, output_golden[0]+ c_offset);
+    VERIFY_IS_APPROX(output(1)+ c_offset, output_golden[2]+ c_offset);
+    VERIFY_IS_APPROX(output(2)+ c_offset, output_golden[4]+ c_offset);
+    VERIFY_IS_APPROX(output(3)+ c_offset, output_golden[1]+ c_offset);
+    VERIFY_IS_APPROX(output(4)+ c_offset, output_golden[3]+ c_offset);
+    VERIFY_IS_APPROX(output(5)+ c_offset, output_golden[5]+ c_offset);
+  }
+}
+
+static void test_fft_complex_input_golden() {  // Compares forward/reverse 1-D FFTs of a fixed complex input against golden values.
+  Tensor<std::complex<float>, 1, ColMajor> input(5);
+  input(0) = std::complex<float>(1, 1);
+  input(1) = std::complex<float>(2, 2);
+  input(2) = std::complex<float>(3, 3);
+  input(3) = std::complex<float>(4, 4);
+  input(4) = std::complex<float>(5, 5);
+
+  array<ptrdiff_t, 1> fft;
+  fft[0] = 0;
+  // Each direction is evaluated three times: both parts, real part only, imaginary part only.
+  Tensor<std::complex<float>, 1, ColMajor> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft);
+  Tensor<std::complex<float>, 1, ColMajor> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft);
+  // Every output must keep the input's length.
+  VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_real_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_real_part.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_imag_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_imag_part.dimension(0), input.dimension(0));
+  // Hard-coded expected spectra for the forward and reverse transforms.
+  std::complex<float> forward_golden_result[5];
+  std::complex<float> reverse_golden_result[5];
+
+  forward_golden_result[0] = std::complex<float>(15.000000000000000,+15.000000000000000);
+  forward_golden_result[1] = std::complex<float>(-5.940954801177935, +0.940954801177934);
+  forward_golden_result[2] = std::complex<float>(-3.312299240582266, -1.687700759417735);
+  forward_golden_result[3] = std::complex<float>(-1.687700759417735, -3.312299240582266);
+  forward_golden_result[4] = std::complex<float>( 0.940954801177934, -5.940954801177935);
+
+  reverse_golden_result[0] = std::complex<float>( 3.000000000000000, + 3.000000000000000);
+  reverse_golden_result[1] = std::complex<float>( 0.188190960235587, - 1.188190960235587);
+  reverse_golden_result[2] = std::complex<float>(-0.337540151883547, - 0.662459848116453);
+  reverse_golden_result[3] = std::complex<float>(-0.662459848116453, - 0.337540151883547);
+  reverse_golden_result[4] = std::complex<float>(-1.188190960235587, + 0.188190960235587);
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(forward_output_both_parts(i), forward_golden_result[i]);
+    VERIFY_IS_APPROX(forward_output_real_part(i), forward_golden_result[i].real());
+    VERIFY_IS_APPROX(forward_output_imag_part(i), forward_golden_result[i].imag());
+  }
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(reverse_output_both_parts(i), reverse_golden_result[i]);
+    VERIFY_IS_APPROX(reverse_output_real_part(i), reverse_golden_result[i].real());
+    VERIFY_IS_APPROX(reverse_output_imag_part(i), reverse_golden_result[i].imag());
+  }
+}
+
+static void test_fft_real_input_golden() {  // Compares forward/reverse 1-D FFTs of the real input {1,...,5} against golden values.
+  Tensor<float, 1, ColMajor> input(5);
+  input(0) = 1.0;
+  input(1) = 2.0;
+  input(2) = 3.0;
+  input(3) = 4.0;
+  input(4) = 5.0;
+
+  array<ptrdiff_t, 1> fft;
+  fft[0] = 0;
+  // Each direction is evaluated three times: both parts, real part only, imaginary part only.
+  Tensor<std::complex<float>, 1, ColMajor> forward_output_both_parts = input.fft<BothParts, FFT_FORWARD>(fft);
+  Tensor<std::complex<float>, 1, ColMajor> reverse_output_both_parts = input.fft<BothParts, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_real_part = input.fft<RealPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_real_part = input.fft<RealPart, FFT_REVERSE>(fft);
+
+  Tensor<float, 1, ColMajor> forward_output_imag_part = input.fft<ImagPart, FFT_FORWARD>(fft);
+  Tensor<float, 1, ColMajor> reverse_output_imag_part = input.fft<ImagPart, FFT_REVERSE>(fft);
+  // Every output must keep the input's length.
+  VERIFY_IS_EQUAL(forward_output_both_parts.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_both_parts.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_real_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_real_part.dimension(0), input.dimension(0));
+
+  VERIFY_IS_EQUAL(forward_output_imag_part.dimension(0), input.dimension(0));
+  VERIFY_IS_EQUAL(reverse_output_imag_part.dimension(0), input.dimension(0));
+  // Hard-coded expected spectra for the forward and reverse transforms.
+  std::complex<float> forward_golden_result[5];
+  std::complex<float> reverse_golden_result[5];
+
+
+  forward_golden_result[0] = std::complex<float>(  15, 0);
+  forward_golden_result[1] = std::complex<float>(-2.5, +3.44095480117793);
+  forward_golden_result[2] = std::complex<float>(-2.5, +0.81229924058227);
+  forward_golden_result[3] = std::complex<float>(-2.5, -0.81229924058227);
+  forward_golden_result[4] = std::complex<float>(-2.5, -3.44095480117793);
+
+  reverse_golden_result[0] = std::complex<float>( 3.0, 0);
+  reverse_golden_result[1] = std::complex<float>(-0.5, -0.688190960235587);
+  reverse_golden_result[2] = std::complex<float>(-0.5, -0.162459848116453);
+  reverse_golden_result[3] = std::complex<float>(-0.5, +0.162459848116453);
+  reverse_golden_result[4] = std::complex<float>(-0.5, +0.688190960235587);
+  // The offsets are added to both sides of every comparison; presumably so zero-valued entries compare well - confirm.
+  std::complex<float> c_offset(1.0, 1.0);
+  float r_offset = 1.0;
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(forward_output_both_parts(i) + c_offset, forward_golden_result[i] + c_offset);
+    VERIFY_IS_APPROX(forward_output_real_part(i)  + r_offset, forward_golden_result[i].real() + r_offset);
+    VERIFY_IS_APPROX(forward_output_imag_part(i)  + r_offset, forward_golden_result[i].imag() + r_offset);
+  }
+
+  for(int i = 0; i < 5; ++i) {
+    VERIFY_IS_APPROX(reverse_output_both_parts(i) + c_offset, reverse_golden_result[i] + c_offset);
+    VERIFY_IS_APPROX(reverse_output_real_part(i)  + r_offset, reverse_golden_result[i].real() + r_offset);
+    VERIFY_IS_APPROX(reverse_output_imag_part(i)  + r_offset, reverse_golden_result[i].imag() + r_offset);
+  }
+}
+
+
+template <int DataLayout, typename RealScalar, bool isComplexInput, int FFTResultType, int FFTDirection, int TensorRank>
+static void test_fft_real_input_energy() {  // Compares the summed abs2 of a random input with that of its FFT, scaled by the element count.
+  // Each extent is drawn from rand() in the range 1..20; total_size is their product.
+  Eigen::DSizes<ptrdiff_t, TensorRank> dimensions;
+  ptrdiff_t total_size = 1;
+  for (int i = 0; i < TensorRank; ++i) {
+    dimensions[i] = rand() % 20 + 1;
+    total_size *= dimensions[i];
+  }
+  const DSizes<ptrdiff_t, TensorRank> arr = dimensions;
+  // Despite the function name, isComplexInput selects a complex input scalar type.
+  typedef typename internal::conditional<isComplexInput == true, std::complex<RealScalar>, RealScalar>::type InputScalar;
+
+  Tensor<InputScalar, TensorRank, DataLayout> input;
+  input.resize(arr);
+  input.setRandom();
+  // Every dimension index 0..TensorRank-1 is passed to fft().
+  array<ptrdiff_t, TensorRank> fft;
+  for (int i = 0; i < TensorRank; ++i) {
+    fft[i] = i;
+  }
+
+  typedef typename internal::conditional<FFTResultType == Eigen::BothParts, std::complex<RealScalar>, RealScalar>::type OutputScalar;
+  Tensor<OutputScalar, TensorRank, DataLayout> output;
+  output = input.template fft<FFTResultType, FFTDirection>(fft);
+  // The output must have the same extents as the input.
+  for (int i = 0; i < TensorRank; ++i) {
+    VERIFY_IS_EQUAL(output.dimension(i), input.dimension(i));
+  }
+
+  RealScalar energy_original = 0.0;
+  RealScalar energy_after_fft = 0.0;
+
+  for (int i = 0; i < total_size; ++i) {
+    energy_original += numext::abs2(input(i));
+  }
+
+  for (int i = 0; i < total_size; ++i) {
+    energy_after_fft += numext::abs2(output(i));
+  }
+  // Forward: output energy divided by total_size must match the input; otherwise it is multiplied by total_size.
+  if(FFTDirection == FFT_FORWARD) {
+    VERIFY_IS_APPROX(energy_original, energy_after_fft / total_size);
+  }
+  else {
+    VERIFY_IS_APPROX(energy_original, energy_after_fft * total_size);
+  }
+}
+
+void test_cxx11_tensor_fft() {  // Test entry point: golden-value checks followed by the energy checks.
+    test_fft_complex_input_golden();
+    test_fft_real_input_golden();
+    // 2-D golden test in both storage layouts.
+    test_fft_2D_golden<ColMajor>();
+    test_fft_2D_golden<RowMajor>();
+    // Energy checks, ColMajor: ranks 1-4, float/double, complex (true) and real (false) input; all use BothParts and FFT_FORWARD.
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 1>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 2>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 3>();
+
+    test_fft_real_input_energy<ColMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<ColMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 4>();
+    // The same combinations for RowMajor.
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 1>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 1>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 2>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 2>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 3>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 3>();
+
+    test_fft_real_input_energy<RowMajor, float,  true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, double, true,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, float,  false,  Eigen::BothParts, FFT_FORWARD, 4>();
+    test_fft_real_input_energy<RowMajor, double, false,  Eigen::BothParts, FFT_FORWARD, 4>();
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_fixed_size.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_fixed_size.cpp
new file mode 100644
index 000000000..4c660de65
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_fixed_size.cpp
@@ -0,0 +1,261 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+
+static void test_0d()  // Rank-0 (scalar) TensorFixedSize, with and without the RowMajor option.
+{
+  TensorFixedSize<float, Sizes<> > scalar1;
+  TensorFixedSize<float, Sizes<>, RowMajor> scalar2;
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+  VERIFY_IS_EQUAL(array_prod(scalar1.dimensions()), 1);
+
+  scalar1() = 7.0;
+  scalar2() = 13.0;
+
+  // Copy construction and copy assignment must both deep-copy: distinct storage, equal value.
+  TensorFixedSize<float, Sizes<> > copy = scalar1;
+  VERIFY_IS_NOT_EQUAL(scalar1.data(), copy.data());
+  VERIFY_IS_APPROX(scalar1(), copy());
+  copy = scalar1;
+  VERIFY_IS_NOT_EQUAL(scalar1.data(), copy.data());
+  VERIFY_IS_APPROX(scalar1(), copy());
+
+  TensorFixedSize<float, Sizes<> > scalar3 = scalar1.sqrt();
+  TensorFixedSize<float, Sizes<>, RowMajor> scalar4 = scalar2.sqrt();
+  VERIFY_IS_EQUAL(scalar3.rank(), 0);
+  VERIFY_IS_APPROX(scalar3(), sqrtf(7.0));
+  VERIFY_IS_APPROX(scalar4(), sqrtf(13.0));
+
+  scalar3 = scalar1 + scalar2;  // operands differ in layout option
+  VERIFY_IS_APPROX(scalar3(), 7.0f + 13.0f);
+}
+
+static void test_1d()  // 6-element TensorFixedSize: size/rank, deep copy, element-wise sqrt and sum.
+{
+  TensorFixedSize<float, Sizes<6> > vec1;
+  TensorFixedSize<float, Sizes<6>, RowMajor> vec2;
+
+  VERIFY_IS_EQUAL((vec1.size()), 6);
+  //  VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6);
+  //  VERIFY_IS_EQUAL((vec1.dimension(0)), 6);
+
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+
+  // Copy construction and copy assignment must both deep-copy: distinct storage, equal values.
+  TensorFixedSize<float, Sizes<6> > copy = vec1;
+  VERIFY_IS_NOT_EQUAL(vec1.data(), copy.data());
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec1(i), copy(i));
+  }
+  copy = vec1;
+  VERIFY_IS_NOT_EQUAL(vec1.data(), copy.data());
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec1(i), copy(i));
+  }
+
+  TensorFixedSize<float, Sizes<6> > vec3 = vec1.sqrt();
+  TensorFixedSize<float, Sizes<6>, RowMajor> vec4 = vec2.sqrt();
+
+  VERIFY_IS_EQUAL((vec3.size()), 6);
+  VERIFY_IS_EQUAL(vec3.rank(), 1);
+  //  VERIFY_IS_EQUAL((vec3.dimensions()[0]), 6);
+  //  VERIFY_IS_EQUAL((vec3.dimension(0)), 6);
+
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
+
+  VERIFY_IS_APPROX(vec4(0), sqrtf(0.0));
+  VERIFY_IS_APPROX(vec4(1), sqrtf(1.0));
+  VERIFY_IS_APPROX(vec4(2), sqrtf(2.0));
+  VERIFY_IS_APPROX(vec4(3), sqrtf(3.0));
+  VERIFY_IS_APPROX(vec4(4), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec4(5), sqrtf(5.0));
+
+  vec3 = vec1 + vec2;  // operands differ in layout option
+  VERIFY_IS_APPROX(vec3(0), 4.0f + 0.0f);
+  VERIFY_IS_APPROX(vec3(1), 8.0f + 1.0f);
+  VERIFY_IS_APPROX(vec3(2), 15.0f + 2.0f);
+  VERIFY_IS_APPROX(vec3(3), 16.0f + 3.0f);
+  VERIFY_IS_APPROX(vec3(4), 23.0f + 4.0f);
+  VERIFY_IS_APPROX(vec3(5), 42.0f + 5.0f);
+}
+
+static void test_tensor_map()  // Assigns an expression through a TensorMap over a caller-owned float array.
+{
+  TensorFixedSize<float, Sizes<6> > vec1;
+  TensorFixedSize<float, Sizes<6>, RowMajor> vec2;
+
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+
+  float data3[6];  // left uninitialized; every element is written by the assignment below
+  TensorMap<TensorFixedSize<float, Sizes<6> > > vec3(data3, 6);
+  vec3 = vec1.sqrt() + vec2;
+
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));  // vec2(0) is 0, so there is no addend
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0) + 1.0f);
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0) + 2.0f);
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0) + 3.0f);
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0) + 4.0f);
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0) + 5.0f);
+}
+
+static void test_2d()  // abs() of 2x3 TensorMaps over raw arrays, with and without the RowMajor option.
+{
+  float data1[6];
+  TensorMap<TensorFixedSize<float, Sizes<2, 3> > > mat1(data1,2,3);
+  float data2[6];
+  TensorMap<TensorFixedSize<float, Sizes<2, 3>, RowMajor> > mat2(data2,2,3);
+
+  VERIFY_IS_EQUAL((mat1.size()), 2*3);
+  VERIFY_IS_EQUAL(mat1.rank(), 2);
+  //  VERIFY_IS_EQUAL((mat1.dimension(0)), 2);
+  //  VERIFY_IS_EQUAL((mat1.dimension(1)), 3);
+
+  mat1(0,0) = 0.0;
+  mat1(0,1) = 1.0;
+  mat1(0,2) = 2.0;
+  mat1(1,0) = 3.0;
+  mat1(1,1) = 4.0;
+  mat1(1,2) = 5.0;
+
+  mat2(0,0) = -0.0;  // mat2 holds the negated values of mat1, so both abs() results should match
+  mat2(0,1) = -1.0;
+  mat2(0,2) = -2.0;
+  mat2(1,0) = -3.0;
+  mat2(1,1) = -4.0;
+  mat2(1,2) = -5.0;
+
+  TensorFixedSize<float, Sizes<2, 3> > mat3;
+  TensorFixedSize<float, Sizes<2, 3>, RowMajor> mat4;
+  mat3 = mat1.abs();
+  mat4 = mat2.abs();
+
+  VERIFY_IS_EQUAL((mat3.size()), 2*3);
+    //  VERIFY_IS_EQUAL((mat3.dimension(0)), 2);
+    //  VERIFY_IS_EQUAL((mat3.dimension(1)), 3);
+
+  VERIFY_IS_APPROX(mat3(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat3(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat3(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat3(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat3(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat3(1,2), 5.0f);
+
+  VERIFY_IS_APPROX(mat4(0,0), 0.0f);
+  VERIFY_IS_APPROX(mat4(0,1), 1.0f);
+  VERIFY_IS_APPROX(mat4(0,2), 2.0f);
+  VERIFY_IS_APPROX(mat4(1,0), 3.0f);
+  VERIFY_IS_APPROX(mat4(1,1), 4.0f);
+  VERIFY_IS_APPROX(mat4(1,2), 5.0f);
+}
+
+static void test_3d()  // Element-wise sqrt() on 2x3x7 TensorFixedSize, with and without the RowMajor option.
+{
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
+  TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat2;
+
+  VERIFY_IS_EQUAL((mat1.size()), 2*3*7);
+  VERIFY_IS_EQUAL(mat1.rank(), 3);
+  //  VERIFY_IS_EQUAL((mat1.dimension(0)), 2);
+  //  VERIFY_IS_EQUAL((mat1.dimension(1)), 3);
+  //  VERIFY_IS_EQUAL((mat1.dimension(2)), 7);
+
+  float val = 0.0f;  // running counter: both tensors get the same value at the same (i,j,k)
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
+  mat3 = mat1.sqrt();
+  TensorFixedSize<float, Sizes<2, 3, 7>, RowMajor> mat4;
+  mat4 = mat2.sqrt();
+
+  VERIFY_IS_EQUAL((mat3.size()), 2*3*7);
+  //  VERIFY_IS_EQUAL((mat3.dimension(0)), 2);
+  //  VERIFY_IS_EQUAL((mat3.dimension(1)), 3);
+  //  VERIFY_IS_EQUAL((mat3.dimension(2)), 7);
+
+
+  val = 0.0f;  // replay the fill sequence to regenerate the expected inputs
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), sqrtf(val));
+        VERIFY_IS_APPROX(mat4(i,j,k), sqrtf(val));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+
+static void test_array()  // Element-wise pow(3.5f) on a 2x3x7 TensorFixedSize, checked against powf.
+{
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat1;
+  float val = 0.0f;  // running counter used as the fill value
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        val += 1.0f;
+      }
+    }
+  }
+
+  TensorFixedSize<float, Sizes<2, 3, 7> > mat3;
+  mat3 = mat1.pow(3.5f);
+
+  val = 0.0f;  // replay the fill sequence to regenerate the expected inputs
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), powf(val, 3.5f));
+        val += 1.0f;
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_fixed_size()  // Test entry point: runs every fixed-size tensor subtest in this file.
+{
+  CALL_SUBTEST(test_0d());
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_tensor_map());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_array());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_forced_eval.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_forced_eval.cpp
new file mode 100644
index 000000000..45d7345e9
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_forced_eval.cpp
@@ -0,0 +1,79 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/Core>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::MatrixXf;
+using Eigen::Tensor;
+
+static void test_simple()  // A contraction forced with eval() is assigned back onto its own left operand.
+{
+  MatrixXf m1(3,3);
+  MatrixXf m2(3,3);
+  m1.setRandom();
+  m2.setRandom();
+
+  TensorMap<Tensor<float, 2> > mat1(m1.data(), 3,3);
+  TensorMap<Tensor<float, 2> > mat2(m2.data(), 3,3);
+
+  Tensor<float, 2> mat3(3,3);
+  mat3 = mat1;
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims;
+  dims[0] = DimPair(1, 0);  // pair dimension 1 of the left operand with dimension 0 of the right
+
+  mat3 = mat3.contract(mat2, dims).eval();  // mat3 is both operand and destination
+  const MatrixXf expected = m1 * m2;  // reference product, evaluated once for all nine checks
+  VERIFY_IS_APPROX(mat3(0, 0), expected(0,0));
+  VERIFY_IS_APPROX(mat3(0, 1), expected(0,1));
+  VERIFY_IS_APPROX(mat3(0, 2), expected(0,2));
+  VERIFY_IS_APPROX(mat3(1, 0), expected(1,0));
+  VERIFY_IS_APPROX(mat3(1, 1), expected(1,1));
+  VERIFY_IS_APPROX(mat3(1, 2), expected(1,2));
+  VERIFY_IS_APPROX(mat3(2, 0), expected(2,0));
+  VERIFY_IS_APPROX(mat3(2, 1), expected(2,1));
+  VERIFY_IS_APPROX(mat3(2, 2), expected(2,2));
+}
+
+
+static void test_const()  // Forced eval() inside an expression built on a const TensorMap, checked against matrix code.
+{
+  MatrixXf input(3,3);
+  input.setRandom();
+  MatrixXf output = input;
+  output.rowwise() -= input.colwise().maxCoeff();  // reference result computed with the matrix API
+
+  Eigen::array<int, 1> depth_dim;
+  depth_dim[0] = 0;  // dimension passed to maximum()
+  Tensor<float, 2>::Dimensions dims2d;
+  dims2d[0] = 1;
+  dims2d[1] = 3;
+  Eigen::array<int, 2> bcast;
+  bcast[0] = 3;
+  bcast[1] = 1;
+  const TensorMap<Tensor<const float, 2> > input_tensor(input.data(), 3, 3);
+  Tensor<float, 2> output_tensor= (input_tensor - input_tensor.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(output(i, j), output_tensor(i, j));
+    }
+  }
+}
+
+
+void test_cxx11_tensor_forced_eval()  // Test entry point: runs both forced-evaluation subtests in this file.
+{
+  CALL_SUBTEST(test_simple());
+  CALL_SUBTEST(test_const());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
new file mode 100644
index 000000000..5690da723
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp
@@ -0,0 +1,70 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_forced_eval_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) {  // Evaluates c = (a + b).eval() * b through sycl_device and checks it on the host.
+
+  int sizeDim1 = 100;
+  int sizeDim2 = 200;
+  int sizeDim3 = 200;
+  Eigen::array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Eigen::Tensor<float, 3> in1(tensorRange);
+  Eigen::Tensor<float, 3> in2(tensorRange);
+  Eigen::Tensor<float, 3> out(tensorRange);
+
+  float * gpu_in1_data  = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_in2_data  = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_out_data =  static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+
+  in1 = in1.random() + in1.constant(10.0f);
+  in2 = in2.random() + in2.constant(10.0f);
+
+  // Wrap the buffers obtained from sycl_device.allocate() in TensorMaps.
+  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
+  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
+  Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
+  sycl_device.memcpyHostToDevice(gpu_in1_data, in1.data(),(in1.dimensions().TotalSize())*sizeof(float));
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(float));
+  /// c=(a+b)*b
+  gpu_out.device(sycl_device) =(gpu_in1 + gpu_in2).eval() * gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k),
+                         (in1(i, j, k) + in2(i, j, k)) * in2(i, j, k));
+      }
+    }
+  }
+  printf("(a+b)*b Test Passed\n");
+  sycl_device.deallocate(gpu_in1_data);
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_out_data);
+
+}
+
+void test_cxx11_tensor_forced_eval_sycl() {  // Test entry point: builds a SyclDevice from a gpu_selector and runs the subtest.
+  cl::sycl::gpu_selector s;
+  Eigen::SyclDevice sycl_device(s);
+  CALL_SUBTEST(test_forced_eval_sycl(sycl_device));
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_generator.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_generator.cpp
new file mode 100644
index 000000000..dcb928714
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_generator.cpp
@@ -0,0 +1,91 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+struct Generator1D {  // Generator functor: each element's value is its own index.
+  Generator1D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 1>& coordinates) const {
+    return coordinates[0];  // index implicitly converted to float
+  }
+};
+
+template <int DataLayout>
+static void test_1D()  // generate() on a 6-element tensor of the requested layout must yield 0..5.
+{
+  Tensor<float, 1, DataLayout> vec(6);
+  Tensor<float, 1, DataLayout> result = vec.generate(Generator1D());
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(result(i), i);
+  }
+}
+
+
+struct Generator2D {  // Generator functor: value is 3 * (first index) + 11 * (second index).
+  Generator2D() { }
+
+  float operator()(const array<Eigen::DenseIndex, 2>& coordinates) const {
+    return 3 * coordinates[0] + 11 * coordinates[1];  // integer result implicitly converted to float
+  }
+};
+
+template <int DataLayout>
+static void test_2D()  // generate() on a 5x7 tensor of the requested layout; every element is checked.
+{
+  Tensor<float, 2, DataLayout> matrix(5, 7);
+  Tensor<float, 2, DataLayout> result = matrix.generate(Generator2D());
+
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {  // second dimension has 7 entries, not 5
+      VERIFY_IS_EQUAL(result(i, j), 3*i + 11*j);
+    }
+  }
+}
+
+
+template <int DataLayout>
+static void test_gaussian()  // internal::GaussianGenerator on a 32x48 tensor of the requested layout.
+{
+  int rows = 32;
+  int cols = 48;
+  array<float, 2> means;
+  means[0] = rows / 2.0f;
+  means[1] = cols / 2.0f;
+  array<float, 2> std_devs;
+  std_devs[0] = 3.14f;
+  std_devs[1] = 2.7f;
+  internal::GaussianGenerator<float, Eigen::DenseIndex, 2> gaussian_gen(means, std_devs);
+
+  Tensor<float, 2, DataLayout> matrix(rows, cols);
+  Tensor<float, 2, DataLayout> result = matrix.generate(gaussian_gen);
+
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      float g_rows = powf(rows/2.0f - i, 2) / (3.14f * 3.14f) * 0.5f;  // (mean - i)^2 / (2 * sigma^2)
+      float g_cols = powf(cols/2.0f - j, 2) / (2.7f * 2.7f) * 0.5f;
+      float gaussian = expf(-g_rows - g_cols);
+      VERIFY_IS_EQUAL(result(i, j), gaussian);
+    }
+  }
+}
+
+
+void test_cxx11_tensor_generator()  // Test entry point: runs each generator subtest for ColMajor and RowMajor.
+{
+  CALL_SUBTEST(test_1D<ColMajor>());
+  CALL_SUBTEST(test_1D<RowMajor>());
+  CALL_SUBTEST(test_2D<ColMajor>());
+  CALL_SUBTEST(test_2D<RowMajor>());
+  CALL_SUBTEST(test_gaussian<ColMajor>());
+  CALL_SUBTEST(test_gaussian<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_ifft.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_ifft.cpp
new file mode 100644
index 000000000..5fd88fa6c
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_ifft.cpp
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Jianwei Cui <thucjw@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <complex>
+#include <cmath>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout>
+static void test_1D_fft_ifft_invariant(int sequence_length) {  // FFT_FORWARD then FFT_REVERSE must reproduce a random real 1-D input.
+  Tensor<double, 1, DataLayout> tensor(sequence_length);
+  tensor.setRandom();
+
+  array<int, 1> fft;  // dimensions to transform
+  fft[0] = 0;
+
+  Tensor<std::complex<double>, 1, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 1, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), sequence_length);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), sequence_length);
+
+  for (int i = 0; i < sequence_length; ++i) {  // real parts are compared after narrowing both sides to float
+    VERIFY_IS_APPROX(static_cast<float>(tensor(i)), static_cast<float>(std::real(tensor_after_fft_ifft(i))));
+  }
+}
+
+template <int DataLayout>
+static void test_2D_fft_ifft_invariant(int dim0, int dim1) {  // FFT_FORWARD then FFT_REVERSE over both dimensions must reproduce the input.
+  Tensor<double, 2, DataLayout> tensor(dim0, dim1);
+  tensor.setRandom();
+
+  array<int, 2> fft;  // dimensions to transform
+  fft[0] = 0;
+  fft[1] = 1;
+
+  Tensor<std::complex<double>, 2, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 2, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+
+  for (int i = 0; i < dim0; ++i) {  // real parts are compared after narrowing both sides to float
+    for (int j = 0; j < dim1; ++j) {
+      //std::cout << "[" << i << "][" << j << "]" <<  "  Original data: " << tensor(i,j) << " Transformed data:" << tensor_after_fft_ifft(i,j) << std::endl;
+      VERIFY_IS_APPROX(static_cast<float>(tensor(i,j)), static_cast<float>(std::real(tensor_after_fft_ifft(i,j))));
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_3D_fft_ifft_invariant(int dim0, int dim1, int dim2) {  // FFT_FORWARD then FFT_REVERSE over all three dimensions must reproduce the input.
+  Tensor<double, 3, DataLayout> tensor(dim0, dim1, dim2);
+  tensor.setRandom();
+
+  array<int, 3> fft;  // dimensions to transform
+  fft[0] = 0;
+  fft[1] = 1;
+  fft[2] = 2;
+
+  Tensor<std::complex<double>, 3, DataLayout> tensor_after_fft;
+  Tensor<std::complex<double>, 3, DataLayout> tensor_after_fft_ifft;
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::BothParts, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(2), dim2);
+
+  for (int i = 0; i < dim0; ++i) {  // real parts are compared after narrowing both sides to float
+    for (int j = 0; j < dim1; ++j) {
+      for (int k = 0; k < dim2; ++k) {
+        VERIFY_IS_APPROX(static_cast<float>(tensor(i,j,k)), static_cast<float>(std::real(tensor_after_fft_ifft(i,j,k))));
+      }
+    }
+  }
+}
+
+template <int DataLayout>
+static void test_sub_fft_ifft_invariant(int dim0, int dim1, int dim2, int dim3) {  // Round trip over a subset of dimensions (2 and 0) of a 4-D tensor.
+  Tensor<double, 4, DataLayout> tensor(dim0, dim1, dim2, dim3);
+  tensor.setRandom();
+
+  array<int, 2> fft;  // dimensions to transform, listed as 2 then 0
+  fft[0] = 2;
+  fft[1] = 0;
+
+  Tensor<std::complex<double>, 4, DataLayout> tensor_after_fft;
+  Tensor<double, 4, DataLayout> tensor_after_fft_ifft;  // real-valued: the reverse transform below requests RealPart
+
+  tensor_after_fft = tensor.template fft<Eigen::BothParts, Eigen::FFT_FORWARD>(fft);
+  tensor_after_fft_ifft = tensor_after_fft.template fft<Eigen::RealPart, Eigen::FFT_REVERSE>(fft);
+
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft.dimension(3), dim3);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(0), dim0);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(1), dim1);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(2), dim2);
+  VERIFY_IS_EQUAL(tensor_after_fft_ifft.dimension(3), dim3);
+
+  for (int i = 0; i < dim0; ++i) {  // values are compared after narrowing both sides to float
+    for (int j = 0; j < dim1; ++j) {
+      for (int k = 0; k < dim2; ++k) {
+        for (int l = 0; l < dim3; ++l) {
+          VERIFY_IS_APPROX(static_cast<float>(tensor(i,j,k,l)), static_cast<float>(tensor_after_fft_ifft(i,j,k,l)));
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_ifft() {  // Test entry point; only the ColMajor instantiations are exercised here.
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(4));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(16));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(32));
+  CALL_SUBTEST(test_1D_fft_ifft_invariant<ColMajor>(1024*1024));
+
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(4,4));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(8,16));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(16,32));
+  CALL_SUBTEST(test_2D_fft_ifft_invariant<ColMajor>(1024,1024));
+
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(4,4,4));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(8,16,32));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(16,4,8));
+  CALL_SUBTEST(test_3D_fft_ifft_invariant<ColMajor>(256,256,256));
+
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(4,4,4,4));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(8,16,32,64));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(16,4,8,12));
+  CALL_SUBTEST(test_sub_fft_ifft_invariant<ColMajor>(64,64,64,64));
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_image_patch.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_image_patch.cpp
new file mode 100644
index 000000000..475c59651
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_image_patch.cpp
@@ -0,0 +1,757 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_simple_patch()  // extract_image_patches with 1x1, 3x5 and 2x2 patches on a 2x3x5x7 tensor, in both layouts.
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  // Single pixel patch: ColMajor
+  Tensor<float, 5> single_pixel_patch;
+  single_pixel_patch = tensor.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(4), 7);
+
+  // Single pixel patch: RowMajor
+  Tensor<float, 5, RowMajor> single_pixel_patch_row_major;
+  single_pixel_patch_row_major = tensor_row_major.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(4), 2);
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    // ColMajor
+    if (tensor.data()[i] != single_pixel_patch.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor.data()[i] << " vs " << single_pixel_patch.data()[i]
+           << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_pixel_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor_row_major.data()[i] << " vs "
+           << single_pixel_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i],
+                    single_pixel_patch_row_major.data()[i]);
+  }
+
+  // Entire image patch: ColMajor
+  Tensor<float, 5> entire_image_patch;
+  entire_image_patch = tensor.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(4), 7);
+
+  // Entire image patch: RowMajor
+  Tensor<float, 5, RowMajor> entire_image_patch_row_major;
+  entire_image_patch_row_major = tensor_row_major.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(4), 2);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;
+      for (int r = 0; r < 3; ++r) {
+        for (int c = 0; c < 5; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            for (int b = 0; b < 7; ++b) {
+              float expected = 0.0f;  // stays 0 when the patch element falls outside the image
+              float expected_row_major = 0.0f;
+              if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {
+                expected = tensor(d, r-1+i, c-2+j, b);
+                expected_row_major = tensor_row_major(b, c-2+j, r-1+i, d);
+              }
+              // ColMajor
+              if (entire_image_patch(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (entire_image_patch_row_major(b, patchId, c, r, d) !=
+                  expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(entire_image_patch_row_major(b, patchId, c, r, d),
+                              expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  Tensor<float, 5> twod_patch;
+  twod_patch = tensor.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5);
+  VERIFY_IS_EQUAL(twod_patch.dimension(4), 7);
+
+  // 2D patch: RowMajor
+  Tensor<float, 5, RowMajor> twod_patch_row_major;
+  twod_patch_row_major = tensor_row_major.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(4), 2);
+
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  int row_padding = 0;
+  int col_padding = 0;
+  int stride = 1;
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;
+      for (int r = 0; r < 2; ++r) {
+        for (int c = 0; c < 2; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            for (int b = 0; b < 7; ++b) {
+              float expected = 0.0f;  // stays 0 when the patch element falls outside the image
+              float expected_row_major = 0.0f;
+              int row_offset = r*stride + i - row_padding;
+              int col_offset = c*stride + j - col_padding;
+              // ColMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) {
+                expected = tensor(d, row_offset, col_offset, b);
+              }
+              if (twod_patch(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId, b), expected);
+
+              // RowMajor
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(2) && col_offset < tensor_row_major.dimension(1)) {
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+
+              }
+              if (twod_patch_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(twod_patch_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies VALID padding (no padding) with incrementing values, visiting every patch that fits inside the input.
+void test_patch_padding_valid()
+{
+  int input_depth = 3;
+  int input_rows = 3;
+  int input_cols = 3;
+  int input_batches = 1;
+  int ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  // ColMajor
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 1);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major = tensor_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out, so a patch with top-left corner (i, j) exists iff it fits entirely inside the input.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  for (int i = 0; i + ksize <= input_rows; i += stride) {  // top input row of each patch (the old guard was false at i=0, so nothing was checked)
+    for (int j = 0; j + ksize <= input_cols; j += stride) {  // left input col of each patch
+      int patchId = i/stride + ((input_rows-ksize)/stride + 1)*(j/stride);  // patch-grid index, row index varying fastest
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              int row_offset = r + i - row_padding;
+              int col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies VALID padding (no padding) with the same value, visiting all four extracted patches.
+void test_patch_padding_valid_same_value()
+{
+  int input_depth = 1;
+  int input_rows = 5;
+  int input_cols = 5;
+  int input_batches = 2;
+  int ksize = 3;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  // ColMajor
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  tensor = tensor.constant(11.0f);
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 4);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major = tensor_row_major.extract_image_patches(ksize, ksize, stride, stride, 1, 1, PADDING_VALID);
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+
+  // No padding is carried out, so a patch with top-left corner (i, j) exists iff it fits entirely inside the input.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  for (int i = 0; i + ksize <= input_rows; i += stride) {  // top input row of each patch (the old guard stopped after i=0)
+    for (int j = 0; j + ksize <= input_cols; j += stride) {  // left input col of each patch (the old guard stopped after j=0)
+      int patchId = i/stride + ((input_rows-ksize)/stride + 1)*(j/stride);  // patch-grid index in 0..3, row index varying fastest
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              int row_offset = r + i - row_padding;
+              int col_offset = c + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+// Verifies SAME padding with incrementing values, comparing the ColMajor and RowMajor results.
+void test_patch_padding_same()
+{
+  int input_depth = 3;
+  int input_rows = 4;
+  int input_cols = 2;
+  int input_batches = 1;
+  int ksize = 2;  // Corresponds to the Rows and Cols for tensor.extract_image_patches<>.
+  int stride = 2;  // Only same stride is supported.
+  // ColMajor
+  Tensor<float, 4> tensor(input_depth, input_rows, input_cols, input_batches);
+  // Initializes tensor with incrementing numbers.
+  for (int i = 0; i < tensor.size(); ++i) {
+    tensor.data()[i] = i + 1;
+  }
+  Tensor<float, 5> result = tensor.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);  // NOTE(review): PADDING_SAME is the 5th argument here, while the VALID tests pass 1, 1 before the padding type - confirm which parameter it binds to.
+
+  VERIFY_IS_EQUAL(result.dimension(0), input_depth);  // depth
+  VERIFY_IS_EQUAL(result.dimension(1), ksize);  // kernel rows
+  VERIFY_IS_EQUAL(result.dimension(2), ksize);  // kernel cols
+  VERIFY_IS_EQUAL(result.dimension(3), 2);  // number of patches
+  VERIFY_IS_EQUAL(result.dimension(4), input_batches);  // number of batches
+
+  // RowMajor
+  Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(3));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(3), tensor_row_major.dimension(0));
+
+  Tensor<float, 5, RowMajor> result_row_major = tensor_row_major.extract_image_patches(ksize, ksize, stride, stride, PADDING_SAME);
+  VERIFY_IS_EQUAL(result.dimension(0), result_row_major.dimension(4));
+  VERIFY_IS_EQUAL(result.dimension(1), result_row_major.dimension(3));
+  VERIFY_IS_EQUAL(result.dimension(2), result_row_major.dimension(2));
+  VERIFY_IS_EQUAL(result.dimension(3), result_row_major.dimension(1));
+  VERIFY_IS_EQUAL(result.dimension(4), result_row_major.dimension(0));
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  // NOTE(review): at j=0 the inner guard is 0+stride+ksize-1 = 3 <= input_cols = 2, which is false, so the value checks below never execute.
+  int row_padding = 0;
+  int col_padding = 0;
+
+  for (int i = 0; (i+stride+ksize-1) <= input_rows; i += stride) {  // input rows
+    for (int j = 0; (j+stride+ksize-1) <= input_cols; j += stride) {  // input cols
+      int patchId = i+input_rows*j;
+      for (int r = 0; r < ksize; ++r) {  // patch rows
+        for (int c = 0; c < ksize; ++c) {  // patch cols
+          for (int d = 0; d < input_depth; ++d) {  // depth
+            for (int b = 0; b < input_batches; ++b) {  // batch
+              float expected = 0.0f;
+              float expected_row_major = 0.0f;
+              int row_offset = r*stride + i - row_padding;  // NOTE(review): scales r by stride, unlike the VALID tests (r + i) - confirm this is intended before enabling the loop.
+              int col_offset = c*stride + j - col_padding;
+              if (row_offset >= 0 && col_offset >= 0 && row_offset < input_rows && col_offset < input_cols) {
+                expected = tensor(d, row_offset, col_offset, b);
+                expected_row_major = tensor_row_major(b, col_offset, row_offset, d);
+              }
+              // ColMajor
+              if (result(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (result_row_major(b, patchId, c, r, d) != expected_row_major) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(result_row_major(b, patchId, c, r, d), expected_row_major);
+              // Check that ColMajor and RowMajor agree.
+              VERIFY_IS_EQUAL(expected, expected_row_major);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_patch_no_extra_dim()  // Patch extraction on a rank-3 (depth, rows, cols) input, i.e. without a trailing batch dimension.
+{
+  Tensor<float, 3> tensor(2,3,5);
+  tensor.setRandom();
+  Tensor<float, 3, RowMajor> tensor_row_major = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor_row_major.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor_row_major.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor_row_major.dimension(0));
+
+  // Single pixel patch: ColMajor
+  Tensor<float, 4> single_pixel_patch;
+  single_pixel_patch = tensor.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch.dimension(3), 3*5);
+
+  // Single pixel patch: RowMajor
+  Tensor<float, 4, RowMajor> single_pixel_patch_row_major;
+  single_pixel_patch_row_major = tensor_row_major.extract_image_patches(1, 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_pixel_patch_row_major.dimension(3), 2);
+
+  for (int i = 0; i < tensor.size(); ++i) {  // 1x1 patches must reproduce the input buffer element for element
+    // ColMajor
+    if (tensor.data()[i] != single_pixel_patch.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : " << tensor.data()[i] << " vs " << single_pixel_patch.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i], tensor.data()[i]);
+    // RowMajor
+    if (tensor_row_major.data()[i] != single_pixel_patch_row_major.data()[i]) {
+      std::cout << "Mismatch detected at index " << i << " : "
+           << tensor_row_major.data()[i] << " vs "  // report the operand that was actually compared
+           << single_pixel_patch_row_major.data()[i] << std::endl;
+    }
+    VERIFY_IS_EQUAL(single_pixel_patch_row_major.data()[i],
+                    tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor.data()[i], tensor_row_major.data()[i]);
+    VERIFY_IS_EQUAL(single_pixel_patch.data()[i],
+                    single_pixel_patch_row_major.data()[i]);
+  }
+
+  // Entire image patch: ColMajor
+  Tensor<float, 4> entire_image_patch;
+  entire_image_patch = tensor.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(1), 3);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(2), 5);
+  VERIFY_IS_EQUAL(entire_image_patch.dimension(3), 3*5);
+
+  // Entire image patch: RowMajor
+  Tensor<float, 4, RowMajor> entire_image_patch_row_major;
+  entire_image_patch_row_major = tensor_row_major.extract_image_patches(3, 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(1), 5);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(entire_image_patch_row_major.dimension(3), 2);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;  // one patch per input pixel, row index (i) varying fastest
+      for (int r = 0; r < 3; ++r) {
+        for (int c = 0; c < 5; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            float expected = 0.0f;
+            float expected_row_major = 0.0f;
+            if (r-1+i >= 0 && c-2+j >= 0 && r-1+i < 3 && c-2+j < 5) {  // 3x5 window centred on (i, j); taps outside the image keep the 0 default
+              expected = tensor(d, r-1+i, c-2+j);
+              expected_row_major = tensor_row_major(c-2+j, r-1+i, d);
+            }
+            // ColMajor
+            if (entire_image_patch(d, r, c, patchId) != expected) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch(d, r, c, patchId), expected);
+            // RowMajor
+            if (entire_image_patch_row_major(patchId, c, r, d) !=
+                expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(entire_image_patch_row_major(patchId, c, r, d),
+                            expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+
+  // 2D patch: ColMajor
+  Tensor<float, 4> twod_patch;
+  twod_patch = tensor.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(0), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch.dimension(3), 3*5);
+
+  // 2D patch: RowMajor
+  Tensor<float, 4, RowMajor> twod_patch_row_major;
+  twod_patch_row_major = tensor_row_major.extract_image_patches(2, 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(0), 3*5);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(1), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(2), 2);
+  VERIFY_IS_EQUAL(twod_patch_row_major.dimension(3), 2);
+
+  // Based on the calculation described in TensorTraits.h, padding happens to be 0.
+  int row_padding = 0;
+  int col_padding = 0;
+  int stride = 1;
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int patchId = i+3*j;  // one patch per input pixel, row index (i) varying fastest
+      for (int r = 0; r < 2; ++r) {
+        for (int c = 0; c < 2; ++c) {
+          for (int d = 0; d < 2; ++d) {
+            float expected = 0.0f;
+            float expected_row_major = 0.0f;
+            int row_offset = r*stride + i - row_padding;
+            int col_offset = c*stride + j - col_padding;
+            // ColMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor.dimension(1) && col_offset < tensor.dimension(2)) {
+              expected = tensor(d, row_offset, col_offset);
+            }
+            if (twod_patch(d, r, c, patchId) != expected) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch(d, r, c, patchId), expected);
+            // RowMajor
+            if (row_offset >= 0 && col_offset >= 0 && row_offset < tensor_row_major.dimension(1) && col_offset < tensor_row_major.dimension(0)) {
+              expected_row_major = tensor_row_major(col_offset, row_offset, d);
+            }
+            if (twod_patch_row_major(patchId, c, r, d) != expected_row_major) {
+              std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << std::endl;
+            }
+            VERIFY_IS_EQUAL(twod_patch_row_major(patchId, c, r, d), expected_row_major);
+            // Check that ColMajor and RowMajor agree.
+            VERIFY_IS_EQUAL(expected, expected_row_major);
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_imagenet_patches()  // Patch extraction on four larger input shapes, each checked element-wise in ColMajor and RowMajor.
+{
+  // Test the code on typical configurations used by the 'imagenet' benchmarks at
+  // https://github.com/soumith/convnet-benchmarks
+  // ColMajor
+  Tensor<float, 4> l_in(3, 128, 128, 16);  // indexed below as (depth, rows, cols, batch)
+  l_in.setRandom();
+  Tensor<float, 5> l_out = l_in.extract_image_patches(11, 11);  // only the patch size is given; every other argument is left at its default
+  VERIFY_IS_EQUAL(l_out.dimension(0), 3);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 11);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 128*128);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 16);
+
+  // RowMajor
+  Tensor<float, 5, RowMajor> l_out_row_major = l_in.swap_layout().extract_image_patches(11, 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 16);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 128*128);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 11);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 3);
+
+  for (int b = 0; b < 16; ++b) {
+    for (int i = 0; i < 128; ++i) {
+      for (int j = 0; j < 128; ++j) {
+        int patchId = i+128*j;  // one patch per input pixel, row index (i) varying fastest
+        for (int c = 0; c < 11; ++c) {
+          for (int r = 0; r < 11; ++r) {
+            for (int d = 0; d < 3; ++d) {
+              float expected = 0.0f;
+              if (r-5+i >= 0 && c-5+j >= 0 && r-5+i < 128 && c-5+j < 128) {  // 11x11 window centred on (i, j); taps outside the image keep the 0 default
+                expected = l_in(d, r-5+i, c-5+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) !=
+                  expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j
+                     << " r=" << r << " c=" << c << " d=" << d << " b=" << b
+                     << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d),
+                              expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  l_in.resize(16, 64, 64, 32);
+  l_in.setRandom();
+  l_out = l_in.extract_image_patches(9, 9);
+  VERIFY_IS_EQUAL(l_out.dimension(0), 16);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 9);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 64*64);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 32);
+
+  // RowMajor
+  l_out_row_major = l_in.swap_layout().extract_image_patches(9, 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 64*64);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 9);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 16);
+
+  for (int b = 0; b < 32; ++b) {
+    for (int i = 0; i < 64; ++i) {
+      for (int j = 0; j < 64; ++j) {
+        int patchId = i+64*j;  // one patch per input pixel, row index (i) varying fastest
+        for (int c = 0; c < 9; ++c) {
+          for (int r = 0; r < 9; ++r) {
+            for (int d = 0; d < 16; ++d) {
+              float expected = 0.0f;
+              if (r-4+i >= 0 && c-4+j >= 0 && r-4+i < 64 && c-4+j < 64) {  // 9x9 window centred on (i, j); taps outside the image keep the 0 default
+                expected = l_in(d, r-4+i, c-4+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  l_in.resize(32, 16, 16, 32);
+  l_in.setRandom();
+  l_out = l_in.extract_image_patches(7, 7);
+  VERIFY_IS_EQUAL(l_out.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 7);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 16*16);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 32);
+
+  // RowMajor
+  l_out_row_major = l_in.swap_layout().extract_image_patches(7, 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 16*16);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 7);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 32);
+
+  for (int b = 0; b < 32; ++b) {
+    for (int i = 0; i < 16; ++i) {
+      for (int j = 0; j < 16; ++j) {
+        int patchId = i+16*j;  // one patch per input pixel, row index (i) varying fastest
+        for (int c = 0; c < 7; ++c) {
+          for (int r = 0; r < 7; ++r) {
+            for (int d = 0; d < 32; ++d) {
+              float expected = 0.0f;
+              if (r-3+i >= 0 && c-3+j >= 0 && r-3+i < 16 && c-3+j < 16) {  // 7x7 window centred on (i, j); taps outside the image keep the 0 default
+                expected = l_in(d, r-3+i, c-3+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // ColMajor
+  l_in.resize(64, 13, 13, 32);
+  l_in.setRandom();
+  l_out = l_in.extract_image_patches(3, 3);
+  VERIFY_IS_EQUAL(l_out.dimension(0), 64);
+  VERIFY_IS_EQUAL(l_out.dimension(1), 3);
+  VERIFY_IS_EQUAL(l_out.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out.dimension(3), 13*13);
+  VERIFY_IS_EQUAL(l_out.dimension(4), 32);
+
+  // RowMajor
+  l_out_row_major = l_in.swap_layout().extract_image_patches(3, 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(0), 32);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(1), 13*13);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(2), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(3), 3);
+  VERIFY_IS_EQUAL(l_out_row_major.dimension(4), 64);
+
+  for (int b = 0; b < 32; ++b) {
+    for (int i = 0; i < 13; ++i) {
+      for (int j = 0; j < 13; ++j) {
+        int patchId = i+13*j;  // one patch per input pixel, row index (i) varying fastest
+        for (int c = 0; c < 3; ++c) {
+          for (int r = 0; r < 3; ++r) {
+            for (int d = 0; d < 64; ++d) {
+              float expected = 0.0f;
+              if (r-1+i >= 0 && c-1+j >= 0 && r-1+i < 13 && c-1+j < 13) {  // 3x3 window centred on (i, j); taps outside the image keep the 0 default
+                expected = l_in(d, r-1+i, c-1+j, b);
+              }
+              // ColMajor
+              if (l_out(d, r, c, patchId, b) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out(d, r, c, patchId, b), expected);
+              // RowMajor
+              if (l_out_row_major(b, patchId, c, r, d) != expected) {
+                std::cout << "Mismatch detected at index i=" << i << " j=" << j << " r=" << r << " c=" << c << " d=" << d << " b=" << b << std::endl;
+              }
+              VERIFY_IS_EQUAL(l_out_row_major(b, patchId, c, r, d), expected);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_image_patch()  // Test entry point: invokes each image-patch subtest through its own numbered CALL_SUBTEST_N macro.
+{
+  CALL_SUBTEST_1(test_simple_patch());
+  CALL_SUBTEST_2(test_patch_no_extra_dim());
+  CALL_SUBTEST_3(test_patch_padding_valid());
+  CALL_SUBTEST_4(test_patch_padding_valid_same_value());
+  CALL_SUBTEST_5(test_patch_padding_same());
+  CALL_SUBTEST_6(test_imagenet_patches());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_index_list.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_index_list.cpp
new file mode 100644
index 000000000..4cf5df666
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_index_list.cpp
@@ -0,0 +1,386 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+#ifdef EIGEN_HAS_INDEX_LIST
+
+static void test_static_index_list()  // Sums a rank-4 tensor over a constexpr index list naming its first three dimensions.
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  constexpr auto reduction_axis = make_index_list(0, 1, 2);  // constexpr so the entries can be used in the static asserts below
+  VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0);
+  VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
+  VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
+
+  EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_axis) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::array_get<1>(reduction_axis) == 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_axis) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  Tensor<float, 1> result = tensor.sum(reduction_axis);
+  for (int i = 0; i < result.size(); ++i) {  // i indexes the one dimension left after the reduction
+    float expected = 0.0f;
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          expected += tensor(j,k,l,i);
+        }
+      }
+    }
+    VERIFY_IS_APPROX(result(i), expected);  // approximate compare of the float sums
+  }
+}
+
+
+static void test_type2index_list()  // Reduces a rank-5 tensor over index lists built purely from type2index<> entries (Dims0..Dims4).
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+  tensor += tensor.constant(10.0f);  // presumably shifts values away from zero so the approximate sum checks are meaningful - TODO confirm
+
+  typedef Eigen::IndexList<Eigen::type2index<0>> Dims0;  // DimsN lists the compile-time indices 0..N
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>> Dims1;
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>> Dims2;
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>> Dims3;
+  typedef Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1>, Eigen::type2index<2>, Eigen::type2index<3>, Eigen::type2index<4>> Dims4;
+
+#if 0  // these five checks are compiled out
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims0>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims1>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims2>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims3>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<Dims4>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#endif
+
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, ColMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims0, 1, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims1, 2, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims2, 3, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims3, 4, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::are_inner_most_dims<Dims4, 5, RowMajor>::value == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  const Dims0 reduction_axis0;  // default-constructed: the indices live in the type
+  Tensor<float, 4> result0 = tensor.sum(reduction_axis0);
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      for (int k = 0; k < 5; ++k) {
+        for (int j = 0; j < 3; ++j) {
+          float expected = 0.0f;
+          for (int i = 0; i < 2; ++i) {  // reference sum over dimension 0 only
+            expected += tensor(i,j,k,l,m);
+          }
+          VERIFY_IS_APPROX(result0(j,k,l,m), expected);
+        }
+      }
+    }
+  }
+
+  const Dims1 reduction_axis1;
+  Tensor<float, 3> result1 = tensor.sum(reduction_axis1);
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      for (int k = 0; k < 5; ++k) {
+        float expected = 0.0f;
+        for (int j = 0; j < 3; ++j) {  // reference sum over dimensions 0 and 1
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+        VERIFY_IS_APPROX(result1(k,l,m), expected);
+      }
+    }
+  }
+
+  const Dims2 reduction_axis2;
+  Tensor<float, 2> result2 = tensor.sum(reduction_axis2);
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      float expected = 0.0f;
+      for (int k = 0; k < 5; ++k) {  // reference sum over dimensions 0, 1 and 2
+        for (int j = 0; j < 3; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+      }
+      VERIFY_IS_APPROX(result2(l,m), expected);
+    }
+  }
+
+  const Dims3 reduction_axis3;
+  Tensor<float, 1> result3 = tensor.sum(reduction_axis3);
+  for (int m = 0; m < 11; ++m) {
+    float expected = 0.0f;
+    for (int l = 0; l < 7; ++l) {  // reference sum over dimensions 0 through 3
+      for (int k = 0; k < 5; ++k) {
+        for (int j = 0; j < 3; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+      }
+    }
+    VERIFY_IS_APPROX(result3(m), expected);
+  }
+
+  const Dims4 reduction_axis4;
+  Tensor<float, 0> result4 = tensor.sum(reduction_axis4);  // full reduction yields a rank-0 tensor, read back with result4()
+  float expected = 0.0f;
+  for (int m = 0; m < 11; ++m) {
+    for (int l = 0; l < 7; ++l) {
+      for (int k = 0; k < 5; ++k) {
+        for (int j = 0; j < 3; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            expected += tensor(i,j,k,l,m);
+          }
+        }
+      }
+    }
+  }
+  VERIFY_IS_APPROX(result4(), expected);
+}
+
+
+static void test_type2indexpair_list()  // Exercises IndexPairList built from compile-time (type2indexpair) and run-time (IndexPair) entries.
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+  tensor += tensor.constant(10.0f);  // NOTE(review): tensor is not read again in this function
+
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>> Dims0;  // a single compile-time pair
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::type2indexpair<1,11>, Eigen::type2indexpair<2,12>> Dims2_a;  // all three pairs compile-time
+  typedef Eigen::IndexPairList<Eigen::type2indexpair<0,10>, Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<2,12>> Dims2_b;  // middle pair is run-time
+  typedef Eigen::IndexPairList<Eigen::IndexPair<DenseIndex>, Eigen::type2indexpair<1,11>, Eigen::IndexPair<DenseIndex>> Dims2_c;  // only the middle pair is compile-time
+
+  Dims0 d0;
+  Dims2_a d2_a;
+
+  Dims2_b d2_b;
+  d2_b.set(1, Eigen::IndexPair<DenseIndex>(1,11));  // fill the single run-time slot
+
+  Dims2_c d2_c;
+  d2_c.set(0, Eigen::IndexPair<DenseIndex>(Eigen::IndexPair<DenseIndex>(0,10)));
+  d2_c.set(1, Eigen::IndexPair<DenseIndex>(1,11));  // setting type2indexpair to correct value.
+  d2_c.set(2, Eigen::IndexPair<DenseIndex>(2,12));
+
+  VERIFY_IS_EQUAL(d2_a[0].first, 0);  // all three lists are expected to hold (0,10), (1,11), (2,12)
+  VERIFY_IS_EQUAL(d2_a[0].second, 10);
+  VERIFY_IS_EQUAL(d2_a[1].first, 1);
+  VERIFY_IS_EQUAL(d2_a[1].second, 11);
+  VERIFY_IS_EQUAL(d2_a[2].first, 2);
+  VERIFY_IS_EQUAL(d2_a[2].second, 12);
+
+  VERIFY_IS_EQUAL(d2_b[0].first, 0);
+  VERIFY_IS_EQUAL(d2_b[0].second, 10);
+  VERIFY_IS_EQUAL(d2_b[1].first, 1);
+  VERIFY_IS_EQUAL(d2_b[1].second, 11);
+  VERIFY_IS_EQUAL(d2_b[2].first, 2);
+  VERIFY_IS_EQUAL(d2_b[2].second, 12);
+
+  VERIFY_IS_EQUAL(d2_c[0].first, 0);
+  VERIFY_IS_EQUAL(d2_c[0].second, 10);
+  VERIFY_IS_EQUAL(d2_c[1].first, 1);
+  VERIFY_IS_EQUAL(d2_c[1].second, 11);
+  VERIFY_IS_EQUAL(d2_c[2].first, 2);
+  VERIFY_IS_EQUAL(d2_c[2].second, 12);
+
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);  // expected true only for type2indexpair slots
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_a.value_known_statically(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_b.value_known_statically(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(0) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((d2_c.value_known_statically(2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims0>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);  // checks on the .first member of each pair
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims0>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_a>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(1, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);  // run-time slot: expected false even though the value set above is 1
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_b>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(0, 0) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(0, 1) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(1, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(2, 2) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_first_statically_eq<Dims2_c>(2, 3) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims0>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);  // same checks on the .second member of each pair
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims0>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(1, 11) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(2, 12) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_a>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(0, 10) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(1, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(2, 12) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_b>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(0, 10) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(0, 11) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(1, 11) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(1, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(2, 12) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((Eigen::internal::index_pair_second_statically_eq<Dims2_c>(2, 13) == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+}
+
+
+static void test_dynamic_index_list()  // Reduces a rank-4 tensor over an index list built only from run-time ints.
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  int dim1 = 2;
+  int dim2 = 1;
+  int dim3 = 0;
+
+  auto reduction_axis = make_index_list(dim1, dim2, dim3);  // axes 2, 1, 0 in that order
+
+  VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 2);  // values must be readable via array_get<> ...
+  VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
+  VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 0);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 2);  // ... and via operator[]
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 0);
+
+  Tensor<float, 1> result = tensor.sum(reduction_axis);
+  for (int i = 0; i < result.size(); ++i) {  // i walks the one remaining (last) dimension
+    float expected = 0.0f;
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          expected += tensor(j,k,l,i);  // reference sum over the three reduced dimensions
+        }
+      }
+    }
+    VERIFY_IS_APPROX(result(i), expected);
+  }
+}
+
+static void test_mixed_index_list()  // Full reduction of a rank-4 tensor via three differently built index lists.
+{
+  Tensor<float, 4> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  int dim2 = 1;
+  int dim4 = 3;
+
+  auto reduction_axis = make_index_list(0, dim2, 2, dim4);  // axes 0..3, mixing literals and int variables
+
+  VERIFY_IS_EQUAL(internal::array_get<0>(reduction_axis), 0);
+  VERIFY_IS_EQUAL(internal::array_get<1>(reduction_axis), 1);
+  VERIFY_IS_EQUAL(internal::array_get<2>(reduction_axis), 2);
+  VERIFY_IS_EQUAL(internal::array_get<3>(reduction_axis), 3);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[0]), 0);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[1]), 1);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[2]), 2);
+  VERIFY_IS_EQUAL(static_cast<DenseIndex>(reduction_axis[3]), 3);
+
+  typedef IndexList<type2index<0>, int, type2index<2>, int> ReductionIndices;  // slots 0 and 2 compile-time, slots 1 and 3 run-time
+  ReductionIndices reduction_indices;
+  reduction_indices.set(1, 1);  // fill the two run-time slots
+  reduction_indices.set(3, 3);
+  EIGEN_STATIC_ASSERT((internal::array_get<0>(reduction_indices) == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);  // compile-time slots must be usable in constant expressions
+  EIGEN_STATIC_ASSERT((internal::array_get<2>(reduction_indices) == 2), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>(0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_known_statically<ReductionIndices>(2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionIndices>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#if 0  // the two assertions below are compiled out
+  EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionIndices>() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionIndices>() == false), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#endif
+
+  typedef IndexList<type2index<0>, type2index<1>, type2index<2>, type2index<3>> ReductionList;  // every slot compile-time
+  ReductionList reduction_list;
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(0, 0) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(1, 1) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(2, 2) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::index_statically_eq<ReductionList>(3, 3) == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#if 0  // the two assertions below are compiled out
+  EIGEN_STATIC_ASSERT((internal::all_indices_known_statically<ReductionList>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  EIGEN_STATIC_ASSERT((internal::indices_statically_known_to_increase<ReductionList>() == true), YOU_MADE_A_PROGRAMMING_MISTAKE);
+#endif
+
+  Tensor<float, 0> result1 = tensor.sum(reduction_axis);  // each list names axes 0..3, so every result is rank 0
+  Tensor<float, 0> result2 = tensor.sum(reduction_indices);
+  Tensor<float, 0> result3 = tensor.sum(reduction_list);
+
+  float expected = 0.0f;  // reference: plain sum of every coefficient
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          expected += tensor(i,j,k,l);
+        }
+      }
+    }
+  }
+  VERIFY_IS_APPROX(result1(), expected);
+  VERIFY_IS_APPROX(result2(), expected);
+  VERIFY_IS_APPROX(result3(), expected);
+}
+
+
+static void test_dim_check()  // Two identically built mixed index lists must satisfy dimensions_match.
+{
+  Eigen::IndexList<Eigen::type2index<1>, int> dim1;  // slot 0 is the compile-time value 1
+  dim1.set(1, 2);  // slot 1 is set at run time to 2
+  Eigen::IndexList<Eigen::type2index<1>, int> dim2;
+  dim2.set(1, 2);
+  VERIFY(dimensions_match(dim1, dim2));
+}
+
+
+#endif
+
+void test_cxx11_tensor_index_list()  // Test entry point: runs every index-list subtest.
+{
+#ifdef EIGEN_HAS_INDEX_LIST  // without this macro the function body is empty
+  CALL_SUBTEST(test_static_index_list());
+  CALL_SUBTEST(test_type2index_list());
+  CALL_SUBTEST(test_type2indexpair_list());
+  CALL_SUBTEST(test_dynamic_index_list());
+  CALL_SUBTEST(test_mixed_index_list());
+  CALL_SUBTEST(test_dim_check());
+#endif
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_inflation.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_inflation.cpp
new file mode 100644
index 000000000..4997935e9
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_inflation.cpp
@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Ke Yang <yangke@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>  // DataLayout is the storage order passed to every Tensor in the test
+static void test_simple_inflation()  // Checks Tensor::inflate with unit strides and with strides (2,4,2,3).
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> strides;
+
+  strides[0] = 1;  // unit strides: the result is expected to equal the input
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+
+  Tensor<float, 4, DataLayout> no_stride;
+  no_stride = tensor.inflate(strides);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+
+  strides[0] = 2;  // second case: a different stride per dimension
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+  Tensor<float, 4, DataLayout> inflated;
+  inflated = tensor.inflate(strides);
+
+  VERIFY_IS_EQUAL(inflated.dimension(0), 3);  // each expected extent equals (dim - 1) * stride + 1
+  VERIFY_IS_EQUAL(inflated.dimension(1), 9);
+  VERIFY_IS_EQUAL(inflated.dimension(2), 9);
+  VERIFY_IS_EQUAL(inflated.dimension(3), 19);
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 9; ++j) {
+      for (int k = 0; k < 9; ++k) {
+        for (int l = 0; l < 19; ++l) {
+          if (i % 2 == 0 &&
+              j % 4 == 0 &&
+              k % 2 == 0 &&
+              l % 3 == 0) {  // positions that are stride multiples in every dimension carry input values
+            VERIFY_IS_EQUAL(inflated(i,j,k,l),
+                            tensor(i/2, j/4, k/2, l/3));
+          } else {
+            VERIFY_IS_EQUAL(0, inflated(i,j,k,l));  // every other position is expected to be zero
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_inflation()  // Test entry point: runs the inflation test for both storage orders.
+{
+  CALL_SUBTEST(test_simple_inflation<ColMajor>());
+  CALL_SUBTEST(test_simple_inflation<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_intdiv.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_intdiv.cpp
new file mode 100644
index 000000000..8e2b70b75
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_intdiv.cpp
@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+void test_signed_32bit()  // Compares TensorIntDivisor<int32_t, ...> against built-in '/' for small operands.
+{
+  // Divide by one
+  const Eigen::internal::TensorIntDivisor<int32_t, false> div_by_one(1);
+
+  for (int32_t j = 0; j < 25000; ++j) {
+    const int32_t fast_div = j / div_by_one;  // division through the divisor object
+    const int32_t slow_div = j / 1;  // reference result
+    VERIFY_IS_EQUAL(fast_div, slow_div);
+  }
+
+  // Standard divide by 2 or more
+  for (int32_t i = 2; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int32_t, false> div(i);  // second template argument false
+
+    for (int32_t j = 0; j < 25000; ++j) {
+      const int32_t fast_div = j / div;
+      const int32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+
+  // Optimized divide by 2 or more
+  for (int32_t i = 2; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int32_t, true> div(i);  // second template argument true
+
+    for (int32_t j = 0; j < 25000; ++j) {
+      const int32_t fast_div = j / div;
+      const int32_t slow_div = j / i;
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+void test_unsigned_32bit()  // Compares TensorIntDivisor<uint32_t> against built-in '/' for divisors 1..24999.
+{
+  for (uint32_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<uint32_t> div(i);
+
+    for (uint32_t j = 0; j < 25000; ++j) {  // numerators 0..24999
+      const uint32_t fast_div = j / div;
+      const uint32_t slow_div = j / i;  // reference result
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+void test_signed_64bit()  // Compares TensorIntDivisor<int64_t> against built-in '/' for divisors 1..24999.
+{
+  for (int64_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<int64_t> div(i);
+
+    for (int64_t j = 0; j < 25000; ++j) {  // numerators 0..24999
+      const int64_t fast_div = j / div;
+      const int64_t slow_div = j / i;  // reference result
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+
+void test_unsigned_64bit()  // Compares TensorIntDivisor<uint64_t> against built-in '/' for divisors 1..24999.
+{
+  for (uint64_t i = 1; i < 25000; ++i) {
+    const Eigen::internal::TensorIntDivisor<uint64_t> div(i);
+
+    for (uint64_t j = 0; j < 25000; ++j) {  // numerators 0..24999
+      const uint64_t fast_div = j / div;
+      const uint64_t slow_div = j / i;  // reference result
+      VERIFY_IS_EQUAL(fast_div, slow_div);
+    }
+  }
+}
+
+void test_powers_32bit() {  // Compares TensorIntDivisor<int32_t>::divide with built-in '/' for power-of-two divisors.
+  for (int expon = 1; expon < 31; expon++) {
+    int32_t div = (1 << expon);  // divisors 2^1 .. 2^30
+    for (int num_expon = 0; num_expon < 31; num_expon++) {  // stop at 2^30: 1 << 31 overflows a signed 32-bit int (undefined behaviour)
+      int32_t start_num = (1 << num_expon) - 100;  // numerators in a +/-100 window around 2^num_expon
+      int32_t end_num = (1 << num_expon) + 100;
+      if (start_num < 0)
+        start_num = 0;  // only non-negative numerators are exercised
+      for (int32_t num = start_num; num < end_num; num++) {
+        Eigen::internal::TensorIntDivisor<int32_t> divider =
+          Eigen::internal::TensorIntDivisor<int32_t>(div);
+        int32_t result = num/div;  // reference result
+        int32_t result_op = divider.divide(num);
+        VERIFY_IS_EQUAL(result_op, result);
+      }
+    }
+  }
+}
+
+void test_powers_64bit() {  // Compares TensorIntDivisor<int64_t>::divide with built-in '/' for power-of-two divisors.
+  for (int expon = 0; expon < 63; expon++) {
+    int64_t div = (1ull << expon);  // divisors 2^0 .. 2^62
+    for (int num_expon = 0; num_expon < 63; num_expon++) {
+      int64_t start_num = (1ull << num_expon) - 10;  // numerators in a +/-10 window around 2^num_expon
+      int64_t end_num = (1ull << num_expon) + 10;
+      if (start_num < 0)
+        start_num = 0;  // only non-negative numerators are exercised
+      for (int64_t num = start_num; num < end_num; num++) {
+        Eigen::internal::TensorIntDivisor<int64_t> divider(div);
+        int64_t result = num/div;  // reference result
+        int64_t result_op = divider.divide(num);
+        VERIFY_IS_EQUAL(result_op, result);
+      }
+    }
+  }
+}
+
+void test_specific() {  // Regression check for one fixed 64-bit numerator/divisor pair.
+  // A particular combination that was previously failing
+  int64_t div = 209715200;
+  int64_t num = 3238002688ll;
+  Eigen::internal::TensorIntDivisor<int64_t> divider(div);
+  int64_t result = num/div;  // reference result
+  int64_t result_op = divider.divide(num);
+  VERIFY_IS_EQUAL(result, result_op);
+}
+
+void test_cxx11_tensor_intdiv()  // Test entry point: each integer-division test is registered under its own numbered subtest macro.
+{
+  CALL_SUBTEST_1(test_signed_32bit());
+  CALL_SUBTEST_2(test_unsigned_32bit());
+  CALL_SUBTEST_3(test_signed_64bit());
+  CALL_SUBTEST_4(test_unsigned_64bit());
+  CALL_SUBTEST_5(test_powers_32bit());
+  CALL_SUBTEST_6(test_powers_64bit());
+  CALL_SUBTEST_7(test_specific());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_io.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_io.cpp
new file mode 100644
index 000000000..489960529
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_io.cpp
@@ -0,0 +1,136 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <sstream>
+#include <string>
+#include <Eigen/CXX11/Tensor>
+
+
+template<int DataLayout>  // DataLayout is the storage order of the tensor under test
+static void test_output_0d()  // Streams a rank-0 tensor and checks the text produced.
+{
+  Tensor<int, 0, DataLayout> tensor;
+  tensor() = 123;
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("123");  // just the value, no padding or newline
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>  // DataLayout is the storage order of the tensors under test
+static void test_output_1d()  // Streams a rank-1 tensor and an empty rank-1 tensor and checks the text produced.
+{
+  Tensor<int, 1, DataLayout> tensor(5);
+  for (int i = 0; i < 5; ++i) {
+    tensor(i) = i;
+  }
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("0\n1\n2\n3\n4");  // one coefficient per line, no trailing newline
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+
+  Eigen::Tensor<double,1,DataLayout> empty_tensor(0);  // zero-sized tensor
+  std::stringstream empty_os;
+  empty_os << empty_tensor;
+  std::string empty_string;
+  VERIFY_IS_EQUAL(std::string(empty_os.str()), empty_string);  // must print nothing
+}
+
+
+template<int DataLayout>  // DataLayout is the storage order of the tensor under test
+static void test_output_2d()  // Streams a 5x3 tensor and checks the text produced.
+{
+  Tensor<int, 2, DataLayout> tensor(5, 3);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      tensor(i, j) = i*j;
+    }
+  }
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("0  0  0\n0  1  2\n0  2  4\n0  3  6\n0  4  8");  // one line per first index, two spaces between columns; same text for both layouts
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>  // DataLayout is the storage order of the tensors under test
+static void test_output_expr()  // Streams an unevaluated sum expression and checks the text produced.
+{
+  Tensor<int, 1, DataLayout> tensor1(5);
+  Tensor<int, 1, DataLayout> tensor2(5);
+  for (int i = 0; i < 5; ++i) {
+    tensor1(i) = i;
+    tensor2(i) = 7;
+  }
+
+  std::stringstream os;
+  os << tensor1 + tensor2;  // the expression itself is streamed, not a named Tensor
+
+  std::string expected(" 7\n 8\n 9\n10\n11");  // single-digit values are padded to the width of the two-digit ones
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>  // DataLayout is the storage order of the tensor under test
+static void test_output_string()  // Streams a 5x3 tensor of std::string and checks the text produced.
+{
+  Tensor<std::string, 2, DataLayout> tensor(5, 3);
+  tensor.setConstant(std::string("foo"));
+
+  std::cout << tensor << std::endl;  // also prints the tensor to stdout; this output is not verified
+
+  std::stringstream os;
+  os << tensor;
+
+  std::string expected("foo  foo  foo\nfoo  foo  foo\nfoo  foo  foo\nfoo  foo  foo\nfoo  foo  foo");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+template<int DataLayout>  // DataLayout is the storage order of the tensor and the map
+static void test_output_const()  // Streams a TensorMap over const int data and checks the text produced.
+{
+  Tensor<int, 1, DataLayout> tensor(5);
+  for (int i = 0; i < 5; ++i) {
+    tensor(i) = i;
+  }
+
+  TensorMap<Tensor<const int, 1, DataLayout> > tensor_map(tensor.data(), 5);  // const view over the tensor's buffer
+
+  std::stringstream os;
+  os << tensor_map;
+
+  std::string expected("0\n1\n2\n3\n4");
+  VERIFY_IS_EQUAL(std::string(os.str()), expected);
+}
+
+
+void test_cxx11_tensor_io()  // Test entry point: runs every output test for both storage orders.
+{
+  CALL_SUBTEST(test_output_0d<ColMajor>());
+  CALL_SUBTEST(test_output_0d<RowMajor>());
+  CALL_SUBTEST(test_output_1d<ColMajor>());
+  CALL_SUBTEST(test_output_1d<RowMajor>());
+  CALL_SUBTEST(test_output_2d<ColMajor>());
+  CALL_SUBTEST(test_output_2d<RowMajor>());
+  CALL_SUBTEST(test_output_expr<ColMajor>());
+  CALL_SUBTEST(test_output_expr<RowMajor>());
+  CALL_SUBTEST(test_output_string<ColMajor>());
+  CALL_SUBTEST(test_output_string<RowMajor>());
+  CALL_SUBTEST(test_output_const<ColMajor>());
+  CALL_SUBTEST(test_output_const<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_layout_swap.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_layout_swap.cpp
new file mode 100644
index 000000000..ae297a9da
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_layout_swap.cpp
@@ -0,0 +1,61 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+static void test_simple_swap()  // Assigns swap_layout() of a ColMajor tensor to a RowMajor tensor and checks the result.
+{
+  Tensor<float, 3, ColMajor> tensor(2,3,7);
+  tensor.setRandom();
+
+  Tensor<float, 3, RowMajor> tensor2 = tensor.swap_layout();
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2));  // dimension order is expected to be reversed
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0));
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i));  // same coefficient, indices reversed
+      }
+    }
+  }
+}
+
+
+static void test_swap_as_lvalue()  // Uses swap_layout() on the left-hand side of an assignment.
+{
+  Tensor<float, 3, ColMajor> tensor(2,3,7);
+  tensor.setRandom();
+
+  Tensor<float, 3, RowMajor> tensor2(7,3,2);  // allocated up front with the reversed dimensions
+  tensor2.swap_layout() = tensor;  // writes into tensor2 through the swapped view
+  VERIFY_IS_EQUAL(tensor.dimension(0), tensor2.dimension(2));
+  VERIFY_IS_EQUAL(tensor.dimension(1), tensor2.dimension(1));
+  VERIFY_IS_EQUAL(tensor.dimension(2), tensor2.dimension(0));
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor(i,j,k), tensor2(k,j,i));  // same coefficient, indices reversed
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_layout_swap()  // Test entry point: runs both layout-swap subtests.
+{
+  CALL_SUBTEST(test_simple_swap());
+  CALL_SUBTEST(test_swap_as_lvalue());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_lvalue.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_lvalue.cpp
new file mode 100644
index 000000000..071f5b406
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_lvalue.cpp
@@ -0,0 +1,42 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+
+static void test_compound_assignment()  // Checks that operator+= on a Tensor adds coefficient-wise.
+{
+  Tensor<float, 3> mat1(2,3,7);
+  Tensor<float, 3> mat2(2,3,7);
+  Tensor<float, 3> mat3(2,3,7);
+
+  mat1.setRandom();
+  mat2.setRandom();
+  mat3 = mat1;  // start from a copy of mat1 ...
+  mat3 += mat2;  // ... then add mat2 in place
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(mat3(i,j,k), mat1(i,j,k) + mat2(i,j,k));
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_lvalue()  // Test entry point: runs the compound-assignment subtest.
+{
+  CALL_SUBTEST(test_compound_assignment());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_map.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_map.cpp
new file mode 100644
index 000000000..3db0ee7c0
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_map.cpp
@@ -0,0 +1,277 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_0d()  // Maps rank-0 tensors through const TensorMaps and reads the values back.
+{
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+
+  TensorMap<Tensor<const int, 0> > scalar3(scalar1.data());  // maps are built before the values are written
+  TensorMap<Tensor<const int, 0, RowMajor> > scalar4(scalar2.data());
+
+  scalar1() = 7;
+  scalar2() = 13;
+
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+
+  VERIFY_IS_EQUAL(scalar3(), 7);  // the maps must observe the later writes
+  VERIFY_IS_EQUAL(scalar4(), 13);
+}
+
+static void test_1d()  // Maps rank-1 tensors through const TensorMaps and reads the values back.
+{
+  Tensor<int, 1> vec1(6);
+  Tensor<int, 1, RowMajor> vec2(6);
+
+  TensorMap<Tensor<const int, 1> > vec3(vec1.data(), 6);  // maps are built before the values are written
+  TensorMap<Tensor<const int, 1, RowMajor> > vec4(vec2.data(), 6);
+
+  vec1(0) = 4;  vec2(0) = 0;
+  vec1(1) = 8;  vec2(1) = 1;
+  vec1(2) = 15; vec2(2) = 2;
+  vec1(3) = 16; vec2(3) = 3;
+  vec1(4) = 23; vec2(4) = 4;
+  vec1(5) = 42; vec2(5) = 5;
+
+  VERIFY_IS_EQUAL(vec1.rank(), 1);
+  VERIFY_IS_EQUAL(vec1.size(), 6);
+  VERIFY_IS_EQUAL(vec1.dimension(0), 6);
+
+  VERIFY_IS_EQUAL(vec3(0), 4);  // vec3 views vec1
+  VERIFY_IS_EQUAL(vec3(1), 8);
+  VERIFY_IS_EQUAL(vec3(2), 15);
+  VERIFY_IS_EQUAL(vec3(3), 16);
+  VERIFY_IS_EQUAL(vec3(4), 23);
+  VERIFY_IS_EQUAL(vec3(5), 42);
+
+  VERIFY_IS_EQUAL(vec4(0), 0);  // vec4 views vec2
+  VERIFY_IS_EQUAL(vec4(1), 1);
+  VERIFY_IS_EQUAL(vec4(2), 2);
+  VERIFY_IS_EQUAL(vec4(3), 3);
+  VERIFY_IS_EQUAL(vec4(4), 4);
+  VERIFY_IS_EQUAL(vec4(5), 5);
+}
+
+static void test_2d()  // Maps 2x3 tensors of both storage orders through const TensorMaps.
+{
+  Tensor<int, 2> mat1(2,3);
+  Tensor<int, 2, RowMajor> mat2(2,3);
+
+  mat1(0,0) = 0;  // both tensors get the same value at each (row, col)
+  mat1(0,1) = 1;
+  mat1(0,2) = 2;
+  mat1(1,0) = 3;
+  mat1(1,1) = 4;
+  mat1(1,2) = 5;
+
+  mat2(0,0) = 0;
+  mat2(0,1) = 1;
+  mat2(0,2) = 2;
+  mat2(1,0) = 3;
+  mat2(1,1) = 4;
+  mat2(1,2) = 5;
+
+  TensorMap<Tensor<const int, 2> > mat3(mat1.data(), 2, 3);  // each map uses the storage order of its source tensor
+  TensorMap<Tensor<const int, 2, RowMajor> > mat4(mat2.data(), 2, 3);
+
+  VERIFY_IS_EQUAL(mat3.rank(), 2);
+  VERIFY_IS_EQUAL(mat3.size(), 6);
+  VERIFY_IS_EQUAL(mat3.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat3.dimension(1), 3);
+
+  VERIFY_IS_EQUAL(mat4.rank(), 2);
+  VERIFY_IS_EQUAL(mat4.size(), 6);
+  VERIFY_IS_EQUAL(mat4.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat4.dimension(1), 3);
+
+  VERIFY_IS_EQUAL(mat3(0,0), 0);  // indexed reads through the maps must match what was written
+  VERIFY_IS_EQUAL(mat3(0,1), 1);
+  VERIFY_IS_EQUAL(mat3(0,2), 2);
+  VERIFY_IS_EQUAL(mat3(1,0), 3);
+  VERIFY_IS_EQUAL(mat3(1,1), 4);
+  VERIFY_IS_EQUAL(mat3(1,2), 5);
+
+  VERIFY_IS_EQUAL(mat4(0,0), 0);
+  VERIFY_IS_EQUAL(mat4(0,1), 1);
+  VERIFY_IS_EQUAL(mat4(0,2), 2);
+  VERIFY_IS_EQUAL(mat4(1,0), 3);
+  VERIFY_IS_EQUAL(mat4(1,1), 4);
+  VERIFY_IS_EQUAL(mat4(1,2), 5);
+}
+
+static void test_3d()  // Maps 2x3x7 tensors of both storage orders through const TensorMaps.
+{
+  Tensor<int, 3> mat1(2,3,7);
+  Tensor<int, 3, RowMajor> mat2(2,3,7);
+
+  int val = 0;  // running counter written in (i,j,k) loop order
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<Tensor<const int, 3> > mat3(mat1.data(), 2, 3, 7);  // each map uses the storage order of its source tensor
+  TensorMap<Tensor<const int, 3, RowMajor> > mat4(mat2.data(), 2, 3, 7);
+
+  VERIFY_IS_EQUAL(mat3.rank(), 3);
+  VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat3.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat3.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat3.dimension(2), 7);
+
+  VERIFY_IS_EQUAL(mat4.rank(), 3);
+  VERIFY_IS_EQUAL(mat4.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat4.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat4.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat4.dimension(2), 7);
+
+  val = 0;  // replay the same counter sequence for verification
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat3(i,j,k), val);
+        VERIFY_IS_EQUAL(mat4(i,j,k), val);
+        val++;
+      }
+    }
+  }
+}
+
+
+static void test_from_tensor()  // Builds TensorMaps directly from Tensor and TensorFixedSize objects.
+{
+  Tensor<int, 3> mat1(2,3,7);
+  Tensor<int, 3, RowMajor> mat2(2,3,7);
+
+  int val = 0;  // running counter written in (i,j,k) loop order
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        mat1(i,j,k) = val;
+        mat2(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<Tensor<int, 3> > mat3(mat1);  // constructed from the tensor itself, no pointer or sizes given
+  TensorMap<Tensor<int, 3, RowMajor> > mat4(mat2);
+
+  VERIFY_IS_EQUAL(mat3.rank(), 3);
+  VERIFY_IS_EQUAL(mat3.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat3.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat3.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat3.dimension(2), 7);
+
+  VERIFY_IS_EQUAL(mat4.rank(), 3);
+  VERIFY_IS_EQUAL(mat4.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat4.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat4.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat4.dimension(2), 7);
+
+  val = 0;  // replay the same counter sequence for verification
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat3(i,j,k), val);
+        VERIFY_IS_EQUAL(mat4(i,j,k), val);
+        val++;
+      }
+    }
+  }
+
+  TensorFixedSize<int, Sizes<2,3,7> > mat5;  // same check for a fixed-size tensor
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        array<ptrdiff_t, 3> coords;  // this tensor is written through the array-of-indices accessor
+        coords[0] = i;
+        coords[1] = j;
+        coords[2] = k;
+        mat5(coords) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<TensorFixedSize<int, Sizes<2,3,7> > > mat6(mat5);
+
+  VERIFY_IS_EQUAL(mat6.rank(), 3);
+  VERIFY_IS_EQUAL(mat6.size(), 2*3*7);
+  VERIFY_IS_EQUAL(mat6.dimension(0), 2);
+  VERIFY_IS_EQUAL(mat6.dimension(1), 3);
+  VERIFY_IS_EQUAL(mat6.dimension(2), 7);
+
+  val = 0;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(mat6(i,j,k), val);  // read back through the variadic accessor
+        val++;
+      }
+    }
+  }
+}
+
+
+static int f(const TensorMap<Tensor<int, 3> >& tensor) {  // Returns tensor.sum() reduced to a single int.
+  //  Size<0> empty;
+  EIGEN_STATIC_ASSERT((internal::array_size<Sizes<> >::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);  // empty size lists must report size 0
+  EIGEN_STATIC_ASSERT((internal::array_size<DSizes<int, 0> >::value == 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  Tensor<int, 0> result = tensor.sum();  // sum() with no axis argument, stored as a rank-0 tensor
+  return result();
+}
+
+static void test_casting()  // Calls f() with a TensorMap and with a plain Tensor and compares the sums.
+{
+  Tensor<int, 3> tensor(2,3,7);
+
+  int val = 0;  // fills the tensor with 0..41
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        tensor(i,j,k) = val;
+        val++;
+      }
+    }
+  }
+
+  TensorMap<Tensor<int, 3> > map(tensor);
+  int sum1 = f(map);
+  int sum2 = f(tensor);  // a Tensor is passed where f expects a TensorMap, so a conversion is required
+
+  VERIFY_IS_EQUAL(sum1, sum2);
+  VERIFY_IS_EQUAL(sum1, 861);  // 0 + 1 + ... + 41
+}
+
+void test_cxx11_tensor_map()  // Test entry point: runs every TensorMap subtest.
+{
+  CALL_SUBTEST(test_0d());
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+
+  CALL_SUBTEST(test_from_tensor());
+  CALL_SUBTEST(test_casting());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_math.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_math.cpp
new file mode 100644
index 000000000..61c742a16
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_math.cpp
@@ -0,0 +1,46 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_tanh()  // Compares Tensor::tanh() coefficient by coefficient against tanhf().
+{
+  Tensor<float, 1> vec1(6);
+  vec1.setRandom();
+
+  Tensor<float, 1> vec2 = vec1.tanh();
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec2(i), tanhf(vec1(i)));
+  }
+}
+
+static void test_sigmoid()  // Compares Tensor::sigmoid() against the closed form 1 / (1 + exp(-x)).
+{
+  Tensor<float, 1> vec1(6);
+  vec1.setRandom();
+
+  Tensor<float, 1> vec2 = vec1.sigmoid();
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_APPROX(vec2(i), 1.0f / (1.0f + std::exp(-vec1(i))));
+  }
+}
+
+
+void test_cxx11_tensor_math()  // Test entry point: runs the tanh and sigmoid sub-tests.
+{
+  CALL_SUBTEST(test_tanh());
+  CALL_SUBTEST(test_sigmoid());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_mixed_indices.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_mixed_indices.cpp
new file mode 100644
index 000000000..4fba6fdd1
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_mixed_indices.cpp
@@ -0,0 +1,53 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_simple()  // Evaluates sqrt()/square() on tensors declared with and without an explicit `int` 4th template argument.
+{
+  Tensor<float, 1, ColMajor> vec1(6);
+  Tensor<float, 1, ColMajor, int> vec2(6);  // NOTE(review): the trailing `int` presumably selects the index type (cf. file name) - confirm
+
+  vec1(0) = 4.0;  vec2(0) = 0.0;
+  vec1(1) = 8.0;  vec2(1) = 1.0;
+  vec1(2) = 15.0; vec2(2) = 2.0;
+  vec1(3) = 16.0; vec2(3) = 3.0;
+  vec1(4) = 23.0; vec2(4) = 4.0;
+  vec1(5) = 42.0; vec2(5) = 5.0;
+
+  float data3[6];
+  TensorMap<Tensor<float, 1, ColMajor>> vec3(data3, 6);  // results are written into the caller-owned array data3
+  vec3 = vec1.sqrt();
+  float data4[6];
+  TensorMap<Tensor<float, 1, ColMajor, int>> vec4(data4, 6);  // results are written into the caller-owned array data4
+  vec4 = vec2.square();
+
+  VERIFY_IS_APPROX(vec3(0), sqrtf(4.0));
+  VERIFY_IS_APPROX(vec3(1), sqrtf(8.0));
+  VERIFY_IS_APPROX(vec3(2), sqrtf(15.0));
+  VERIFY_IS_APPROX(vec3(3), sqrtf(16.0));
+  VERIFY_IS_APPROX(vec3(4), sqrtf(23.0));
+  VERIFY_IS_APPROX(vec3(5), sqrtf(42.0));
+
+  VERIFY_IS_APPROX(vec4(0), 0.0f);
+  VERIFY_IS_APPROX(vec4(1), 1.0f);
+  VERIFY_IS_APPROX(vec4(2), 2.0f * 2.0f);
+  VERIFY_IS_APPROX(vec4(3), 3.0f * 3.0f);
+  VERIFY_IS_APPROX(vec4(4), 4.0f * 4.0f);
+  VERIFY_IS_APPROX(vec4(5), 5.0f * 5.0f);
+}
+
+
+void test_cxx11_tensor_mixed_indices()
+{
+  CALL_SUBTEST(test_simple());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_morphing.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_morphing.cpp
new file mode 100644
index 000000000..f7de43110
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_morphing.cpp
@@ -0,0 +1,485 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename>  // unnamed parameter: the function is instantiated as test_simple_reshape<void>()
+static void test_simple_reshape()  // Reshapes a 2x3x1x7x1 tensor to 2x3x7, 6x7 and 2x21 and compares coefficients.
+{
+  Tensor<float, 5> tensor1(2,3,1,7,1);
+  tensor1.setRandom();
+
+  Tensor<float, 3> tensor2(2,3,7);
+  Tensor<float, 2> tensor3(6,7);
+  Tensor<float, 2> tensor4(2,21);
+
+  Tensor<float, 3>::Dimensions dim1(2,3,7);
+  tensor2 = tensor1.reshape(dim1);
+  Tensor<float, 2>::Dimensions dim2(6,7);
+  tensor3 = tensor1.reshape(dim2);
+  Tensor<float, 2>::Dimensions dim3(2,21);
+  tensor4 = tensor1.reshape(dim1).reshape(dim3);  // two chained reshapes
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor2(i,j,k));
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor3(i+2*j,k));  // (i,j) of the 2x3 pair is merged as i + 2*j
+        VERIFY_IS_EQUAL(tensor1(i,j,0,k,0), tensor4(i,j+3*k));  // (j,k) of the 3x7 pair is merged as j + 3*k
+      }
+    }
+  }
+}
+
+template<typename>  // unnamed parameter: the function is instantiated as test_reshape_in_expr<void>()
+static void test_reshape_in_expr() {  // Uses reshape() inside a contraction and compares with a plain matrix product.
+  MatrixXf m1(2,3*5*7*11);
+  MatrixXf m2(3*5*7*11,13);
+  m1.setRandom();
+  m2.setRandom();
+  MatrixXf m3 = m1 * m2;  // reference result
+
+  TensorMap<Tensor<float, 5>> tensor1(m1.data(), 2,3,5,7,11);   // views m1's buffer as a rank-5 tensor
+  TensorMap<Tensor<float, 5>> tensor2(m2.data(), 3,5,7,11,13);  // views m2's buffer as a rank-5 tensor
+  Tensor<float, 2>::Dimensions newDims1(2,3*5*7*11);
+  Tensor<float, 2>::Dimensions newDims2(3*5*7*11,13);
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  array<DimPair, 1> contract_along{{DimPair(1, 0)}};  // pair dimension 1 of the left operand with dimension 0 of the right
+  Tensor<float, 2> tensor3(2,13);
+  tensor3 = tensor1.reshape(newDims1).contract(tensor2.reshape(newDims2), contract_along);
+
+  Map<MatrixXf> res(tensor3.data(), 2, 13);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 13; ++j) {
+      VERIFY_IS_APPROX(res(i,j), m3(i,j));
+    }
+  }
+}
+
+template<typename>  // unnamed parameter: the function is instantiated as test_reshape_as_lvalue<void>()
+static void test_reshape_as_lvalue()  // Assigns through a reshape() expression used on the left-hand side.
+{
+  Tensor<float, 3> tensor(2,3,7);
+  tensor.setRandom();
+
+  Tensor<float, 2> tensor2d(6,7);
+  Tensor<float, 3>::Dimensions dim(2,3,7);
+  tensor2d.reshape(dim) = tensor;  // write into the 6x7 tensor viewed as 2x3x7
+
+  float scratch[2*3*1*7*1];
+  TensorMap<Tensor<float, 5>> tensor5d(scratch, 2,3,1,7,1);
+  tensor5d.reshape(dim).device(Eigen::DefaultDevice()) = tensor;  // same assignment, routed through an explicit device
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(tensor2d(i+2*j,k), tensor(i,j,k));
+        VERIFY_IS_EQUAL(tensor5d(i,j,0,k,0), tensor(i,j,k));
+      }
+    }
+  }
+}
+
+template<int DataLayout>  // instantiated with ColMajor and RowMajor by the entry point
+static void test_simple_slice()  // Extracts a 1-element and a 1x1x2x2x3 slice and compares them with the source tensor.
+{
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  tensor.setRandom();
+
+  Tensor<float, 5, DataLayout> slice1(1,1,1,1,1);
+  Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);  // start offsets of the slice
+  Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);    // extent of the slice along each dimension
+  slice1 = tensor.slice(indices, sizes);
+  VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+
+  Tensor<float, 5, DataLayout> slice2(1,1,2,2,3);
+  Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
+  slice2 = tensor.slice(indices2, sizes2);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+}
+
+template<typename=void>  // defaulted parameter: callable as test_const_slice()
+static void test_const_slice()  // Slices a TensorMap over const data and reads the result through a TensorRef.
+{
+  const float b[1] = {42};
+  TensorMap<Tensor<const float, 1> > m(b, 1);
+  DSizes<DenseIndex, 1> offsets;
+  offsets[0] = 0;
+  TensorRef<Tensor<const float, 1> > slice_ref(m.slice(offsets, m.dimensions()));  // slice covering the whole map
+  VERIFY_IS_EQUAL(slice_ref(0), 42);
+}
+
+template<int DataLayout>  // instantiated with ColMajor and RowMajor by the entry point
+static void test_slice_in_expr() {  // Uses slice() inside larger expressions and compares with Matrix::block().
+  typedef Matrix<float, Dynamic, Dynamic, DataLayout> Mtx;
+  Mtx m1(7,7);
+  Mtx m2(3,3);
+  m1.setRandom();
+  m2.setRandom();
+
+  Mtx m3 = m1.block(1, 2, 3, 3) * m2.block(0, 2, 3, 1);  // reference result computed with matrix blocks
+
+  TensorMap<Tensor<float, 2, DataLayout>> tensor1(m1.data(), 7, 7);
+  TensorMap<Tensor<float, 2, DataLayout>> tensor2(m2.data(), 3, 3);
+  Tensor<float, 2, DataLayout> tensor3(3,1);
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  array<DimPair, 1> contract_along{{DimPair(1, 0)}};  // pair dimension 1 of the left operand with dimension 0 of the right
+
+  Eigen::DSizes<ptrdiff_t, 2> indices1(1,2);  // same offsets/extents as m1.block(1, 2, 3, 3)
+  Eigen::DSizes<ptrdiff_t, 2> sizes1(3,3);
+  Eigen::DSizes<ptrdiff_t, 2> indices2(0,2);  // same offsets/extents as m2.block(0, 2, 3, 1)
+  Eigen::DSizes<ptrdiff_t, 2> sizes2(3,1);
+  tensor3 = tensor1.slice(indices1, sizes1).contract(tensor2.slice(indices2, sizes2), contract_along);
+
+  Map<Mtx> res(tensor3.data(), 3, 1);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 1; ++j) {
+      VERIFY_IS_APPROX(res(i,j), m3(i,j));
+    }
+  }
+
+  // Take an arbitrary slice of an arbitrarily sized tensor.
+  TensorMap<Tensor<const float, 2, DataLayout>> tensor4(m1.data(), 7, 7);
+  Tensor<float, 1, DataLayout> tensor6 = tensor4.reshape(DSizes<ptrdiff_t, 1>(7*7)).exp().slice(DSizes<ptrdiff_t, 1>(0), DSizes<ptrdiff_t, 1>(35));
+  for (int i = 0; i < 35; ++i) {
+    VERIFY_IS_APPROX(tensor6(i), expf(tensor4.data()[i]));  // first 35 coefficients of the flattened buffer
+  }
+}
+
+template<int DataLayout>  // instantiated with ColMajor and RowMajor by the entry point
+static void test_slice_as_lvalue()  // Fills a 4x5x7 tensor piecewise by assigning to slice() expressions.
+{
+  Tensor<float, 3, DataLayout> tensor1(2,2,7);
+  tensor1.setRandom();
+  Tensor<float, 3, DataLayout> tensor2(2,2,7);
+  tensor2.setRandom();
+  Tensor<float, 3, DataLayout> tensor3(4,3,5);
+  tensor3.setRandom();
+  Tensor<float, 3, DataLayout> tensor4(4,3,2);
+  tensor4.setRandom();
+  Tensor<float, 3, DataLayout> tensor5(10,13,12);
+  tensor5.setRandom();
+
+  Tensor<float, 3, DataLayout> result(4,5,7);
+  Eigen::DSizes<ptrdiff_t, 3> sizes12(2,2,7);
+  Eigen::DSizes<ptrdiff_t, 3> first_slice(0,0,0);
+  result.slice(first_slice, sizes12) = tensor1;
+  Eigen::DSizes<ptrdiff_t, 3> second_slice(2,0,0);
+  result.slice(second_slice, sizes12).device(Eigen::DefaultDevice()) = tensor2;  // same kind of write, through an explicit device
+
+  Eigen::DSizes<ptrdiff_t, 3> sizes3(4,3,5);
+  Eigen::DSizes<ptrdiff_t, 3> third_slice(0,2,0);
+  result.slice(third_slice, sizes3) = tensor3;
+
+  Eigen::DSizes<ptrdiff_t, 3> sizes4(4,3,2);
+  Eigen::DSizes<ptrdiff_t, 3> fourth_slice(0,2,5);
+  result.slice(fourth_slice, sizes4) = tensor4;
+
+  for (int j = 0; j < 2; ++j) {
+    for (int k = 0; k < 7; ++k) {
+      for (int i = 0; i < 2; ++i) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor1(i,j,k));
+        VERIFY_IS_EQUAL(result(i+2,j,k), tensor2(i,j,k));
+      }
+    }
+  }
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 2; j < 5; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor3(i,j-2,k));
+      }
+      for (int k = 5; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor4(i,j-2,k-5));
+      }
+    }
+  }
+
+  Eigen::DSizes<ptrdiff_t, 3> sizes5(4,5,7);
+  Eigen::DSizes<ptrdiff_t, 3> fifth_slice(0,0,0);
+  result.slice(fifth_slice, sizes5) = tensor5.slice(fifth_slice, sizes5);  // overwrites all of result (4x5x7)
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 5; ++j) {  // check every row: the assignment above covers j = 0..4, not only 2..4
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), tensor5(i,j,k));
+      }
+    }
+  }
+}
+
+template<int DataLayout>  // instantiated with ColMajor and RowMajor by the entry point
+static void test_slice_raw_data()  // Checks what data() returns on evaluators built for various slices.
+{
+  Tensor<float, 4, DataLayout> tensor(3,5,7,11);
+  tensor.setRandom();
+
+  Eigen::DSizes<ptrdiff_t, 4> offsets(1,2,3,4);
+  Eigen::DSizes<ptrdiff_t, 4> extents(1,1,1,1);
+  typedef TensorEvaluator<decltype(tensor.slice(offsets, extents)), DefaultDevice> SliceEvaluator;
+  auto slice1 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+  VERIFY_IS_EQUAL(slice1.dimensions().TotalSize(), 1);
+  VERIFY_IS_EQUAL(slice1.data()[0], tensor(1,2,3,4));
+
+  if (DataLayout == ColMajor) {
+    extents = Eigen::DSizes<ptrdiff_t, 4>(2,1,1,1);  // two coefficients along the first dimension
+    auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2);
+    VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4));
+    VERIFY_IS_EQUAL(slice2.data()[1], tensor(2,2,3,4));
+  } else {
+    extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,1,2);  // two coefficients along the last dimension
+    auto slice2 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice2.dimensions().TotalSize(), 2);
+    VERIFY_IS_EQUAL(slice2.data()[0], tensor(1,2,3,4));
+    VERIFY_IS_EQUAL(slice2.data()[1], tensor(1,2,3,5));
+  }
+
+  extents = Eigen::DSizes<ptrdiff_t, 4>(1,2,1,1);  // two coefficients along the second dimension
+  auto slice3 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+  VERIFY_IS_EQUAL(slice3.dimensions().TotalSize(), 2);
+  VERIFY_IS_EQUAL(slice3.data(), static_cast<float*>(0));  // for this slice data() is expected to be a null pointer
+
+  if (DataLayout == ColMajor) {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(0,2,3,4);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(3,2,1,1);  // full first dimension, two entries of the second
+    auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 6);
+    for (int i = 0; i < 3; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        VERIFY_IS_EQUAL(slice4.data()[i+3*j], tensor(i,2+j,3,4));
+      }
+    }
+  } else {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(1,2,3,0);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(1,1,2,11);  // full last dimension, two entries of the third
+    auto slice4 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice4.dimensions().TotalSize(), 22);
+    for (int l = 0; l < 11; ++l) {
+      for (int k = 0; k < 2; ++k) {
+        VERIFY_IS_EQUAL(slice4.data()[l+11*k], tensor(1,2,3+k,l));
+      }
+    }
+  }
+
+  if (DataLayout == ColMajor) {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,4);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,2);  // first three dimensions in full, two entries of the last
+    auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 210);
+    for (int i = 0; i < 3; ++i) {
+      for (int j = 0; j < 5; ++j) {
+        for (int k = 0; k < 7; ++k) {
+          for (int l = 0; l < 2; ++l) {
+            int slice_index = i + 3 * (j + 5 * (k + 7 * l));  // linear index with the first dimension varying fastest
+            VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i,j,k,l+4));
+          }
+        }
+      }
+    }
+  } else {
+    offsets = Eigen::DSizes<ptrdiff_t, 4>(1,0,0,0);
+    extents = Eigen::DSizes<ptrdiff_t, 4>(2,5,7,11);  // last three dimensions in full, two entries of the first
+    auto slice5 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+    VERIFY_IS_EQUAL(slice5.dimensions().TotalSize(), 770);
+    for (int l = 0; l < 11; ++l) {
+      for (int k = 0; k < 7; ++k) {
+        for (int j = 0; j < 5; ++j) {
+          for (int i = 0; i < 2; ++i) {
+            int slice_index = l + 11 * (k + 7 * (j + 5 * i));  // linear index with the last dimension varying fastest
+            VERIFY_IS_EQUAL(slice5.data()[slice_index], tensor(i+1,j,k,l));
+          }
+        }
+      }
+    }
+
+  }
+
+  offsets = Eigen::DSizes<ptrdiff_t, 4>(0,0,0,0);
+  extents = Eigen::DSizes<ptrdiff_t, 4>(3,5,7,11);  // slice covering the whole tensor
+  auto slice6 = SliceEvaluator(tensor.slice(offsets, extents), DefaultDevice());
+  VERIFY_IS_EQUAL(slice6.dimensions().TotalSize(), 3*5*7*11);
+  VERIFY_IS_EQUAL(slice6.data(), tensor.data());  // data() must be the tensor's own buffer
+}
+
+
+template<int DataLayout>  // instantiated with ColMajor and RowMajor by the entry point
+static void test_strided_slice()  // Reads through stridedSlice() with positive, negative and out-of-range bounds.
+{
+  typedef Tensor<float, 5, DataLayout> Tensor5f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 5> Index5;
+  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
+  Tensor<float, 5, DataLayout> tensor(2,3,5,7,11);
+  Tensor<float, 2, DataLayout> tensor2(7,11);
+  tensor.setRandom();
+  tensor2.setRandom();
+
+  if (true) {  // negative strides in both dimensions
+    Tensor2f slice(2,3);
+    Index2 strides(-2,-1);
+    Index2 indicesStart(5,7);
+    Index2 indicesStop(0,4);
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice(j,k), tensor2(5-2*j,7-k));
+      }
+    }
+  }
+
+  if(true) {  // start == stop along dimension 0; nothing is verified beyond the assignment itself
+    Tensor2f slice(0,1);
+    Index2 strides(1,1);
+    Index2 indicesStart(5,4);
+    Index2 indicesStop(5,5);
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+  }
+
+  if(true) { // test clamped degenerate intervals
+    Tensor2f slice(7,11);
+    Index2 strides(1,-1);
+    Index2 indicesStart(-3,20); // should become 0,10
+    Index2 indicesStop(20,-11); // should become 7,-1 (tensor2 is 7x11)
+    slice = tensor2.stridedSlice(indicesStart, indicesStop, strides);
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 11; ++k) {
+        VERIFY_IS_EQUAL(slice(j,k), tensor2(j,10-k));
+      }
+    }
+  }
+
+  if(true) {  // unit strides, single coefficient
+    Tensor5f slice1(1,1,1,1,1);
+    Eigen::DSizes<Eigen::DenseIndex, 5> indicesStart(1, 2, 3, 4, 5);
+    Eigen::DSizes<Eigen::DenseIndex, 5> indicesStop(2, 3, 4, 5, 6);
+    Eigen::DSizes<Eigen::DenseIndex, 5> strides(1, 1, 1, 1, 1);
+    slice1 = tensor.stridedSlice(indicesStart, indicesStop, strides);
+    VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5));
+  }
+
+  if(true) {  // unit strides, 1x1x2x2x3 block
+    Tensor5f slice(1,1,2,2,3);
+    Index5 start(1, 1, 3, 4, 5);
+    Index5 stop(2, 2, 5, 6, 8);
+    Index5 strides(1, 1, 1, 1, 1);
+    slice = tensor.stridedSlice(start, stop, strides);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 3; ++k) {
+          VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+        }
+      }
+    }
+  }
+
+  if(true) {  // mixed positive and negative strides
+    Tensor5f slice(1,1,2,2,3);
+    Index5 strides3(1, 1, -2, 1, -1);
+    Index5 indices3Start(1, 1, 4, 4, 7);
+    Index5 indices3Stop(2, 2, 0, 6, 4);
+    slice = tensor.stridedSlice(indices3Start, indices3Stop, strides3);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 3; ++k) {
+          VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,4-2*i,4+j,7-k));
+        }
+      }
+    }
+  }
+
+  if(false) { // tests degenerate interval (disabled: this branch never executes)
+    Tensor5f slice(1,1,2,2,3);
+    Index5 strides3(1, 1, 2, 1, 1);
+    Index5 indices3Start(1, 1, 4, 4, 7);
+    Index5 indices3Stop(2, 2, 0, 6, 4);
+    slice = tensor.stridedSlice(indices3Start, indices3Stop, strides3);
+  }
+}
+
+template<int DataLayout>  // instantiated with ColMajor and RowMajor by the entry point
+static void test_strided_slice_write()  // Writing through stridedSlice() must match writing through slice().
+{
+  typedef Tensor<float, 2, DataLayout> Tensor2f;
+  typedef Eigen::DSizes<Eigen::DenseIndex, 2> Index2;
+
+  Tensor<float, 2, DataLayout> tensor(7,11),tensor2(7,11);
+  tensor.setRandom();
+  tensor2=tensor;  // both tensors start from identical contents
+  Tensor2f slice(2,3);
+
+  slice.setRandom();
+
+  Index2 strides(1,1);
+  Index2 indicesStart(3,4);
+  Index2 indicesStop(5,7);  // equals indicesStart + lengths
+  Index2 lengths(2,3);
+
+  tensor.slice(indicesStart,lengths)=slice;
+  tensor2.stridedSlice(indicesStart,indicesStop,strides)=slice;
+
+  for(int i=0;i<7;i++) for(int j=0;j<11;j++){
+    VERIFY_IS_EQUAL(tensor(i,j), tensor2(i,j));
+  }
+}
+
+
+template<int DataLayout>  // instantiated with ColMajor and RowMajor by the entry point
+static void test_composition()  // Chains slice() and reshape(): row 2 of a 7x11 matrix becomes a 1x1x11 tensor.
+{
+  Eigen::Tensor<float, 2, DataLayout> matrix(7, 11);
+  matrix.setRandom();
+
+  const DSizes<ptrdiff_t, 3> newDims(1, 1, 11);
+  Eigen::Tensor<float, 3, DataLayout> tensor =
+      matrix.slice(DSizes<ptrdiff_t, 2>(2, 0), DSizes<ptrdiff_t, 2>(1, 11)).reshape(newDims);
+
+  VERIFY_IS_EQUAL(tensor.dimensions().TotalSize(), 11);
+  VERIFY_IS_EQUAL(tensor.dimension(0), 1);
+  VERIFY_IS_EQUAL(tensor.dimension(1), 1);
+  VERIFY_IS_EQUAL(tensor.dimension(2), 11);
+  for (int i = 0; i < 11; ++i) {
+    VERIFY_IS_EQUAL(tensor(0,0,i), matrix(2,i));
+  }
+}
+
+
+void test_cxx11_tensor_morphing()  // Test entry point: reshape, slice, strided-slice and composition sub-tests.
+{
+  CALL_SUBTEST_1(test_simple_reshape<void>());
+  CALL_SUBTEST_1(test_reshape_in_expr<void>());
+  CALL_SUBTEST_1(test_reshape_as_lvalue<void>());
+
+  CALL_SUBTEST_1(test_simple_slice<ColMajor>());
+  CALL_SUBTEST_1(test_simple_slice<RowMajor>());
+  CALL_SUBTEST_1(test_const_slice());
+  CALL_SUBTEST_2(test_slice_in_expr<ColMajor>());
+  CALL_SUBTEST_3(test_slice_in_expr<RowMajor>());
+  CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>());
+  CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>());
+  CALL_SUBTEST_5(test_slice_raw_data<ColMajor>());
+  CALL_SUBTEST_5(test_slice_raw_data<RowMajor>());
+
+  CALL_SUBTEST_6(test_strided_slice_write<ColMajor>());
+  CALL_SUBTEST_6(test_strided_slice<ColMajor>());
+  CALL_SUBTEST_6(test_strided_slice_write<RowMajor>());
+  CALL_SUBTEST_6(test_strided_slice<RowMajor>());
+
+  CALL_SUBTEST_7(test_composition<ColMajor>());
+  CALL_SUBTEST_7(test_composition<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_notification.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_notification.cpp
new file mode 100644
index 000000000..c946007b8
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_notification.cpp
@@ -0,0 +1,82 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Vijay Vasudevan <vrv@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+#include <stdlib.h>
+#include <atomic>
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+#if EIGEN_OS_WIN || EIGEN_OS_WIN64
+#include <windows.h>
+void sleep(int seconds) {  // provides sleep(seconds) on Windows on top of Sleep()
+  Sleep(seconds*1000);     // seconds are scaled by 1000 before being handed to Sleep()
+}
+#else
+#include <unistd.h>
+#endif
+
+
+namespace {
+
+void WaitAndAdd(Eigen::Notification* n, std::atomic<int>* counter) {  // Waits on `n`, then bumps `*counter` by one.
+  n->Wait();
+  counter->fetch_add(1);  // atomic: the counter is read concurrently by the test thread
+}
+
+}  // namespace
+
+static void test_notification_single()  // One task must stay blocked until Notify() and run afterwards.
+{
+  ThreadPool thread_pool(1);
+
+  std::atomic<int> counter(0);  // shared between the pool thread (writer) and this thread (reader)
+  Eigen::Notification n;
+  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  sleep(1);
+
+  // The thread should be waiting for the notification.
+  VERIFY_IS_EQUAL(counter.load(), 0);
+
+  // Unblock the thread
+  n.Notify();
+
+  sleep(1);
+
+  // Verify the counter has been incremented
+  VERIFY_IS_EQUAL(counter.load(), 1);
+}
+
+// Like test_notification_single() but schedules several tasks (on a pool
+// created with a single thread) to validate that all of them run after Notify().
+static void test_notification_multiple()
+{
+  ThreadPool thread_pool(1);
+
+  std::atomic<int> counter(0);  // shared between the pool thread (writer) and this thread (reader)
+  Eigen::Notification n;
+  std::function<void()> func = std::bind(&WaitAndAdd, &n, &counter);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  thread_pool.Schedule(func);
+  sleep(1);
+  VERIFY_IS_EQUAL(counter.load(), 0);  // nothing may have run before Notify()
+  n.Notify();
+  sleep(1);
+  VERIFY_IS_EQUAL(counter.load(), 4);  // all four scheduled tasks must have incremented the counter
+}
+
+void test_cxx11_tensor_notification()  // Test entry point: runs both Notification sub-tests.
+{
+  CALL_SUBTEST(test_notification_single());
+  CALL_SUBTEST(test_notification_multiple());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_complex.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_complex.cpp
new file mode 100644
index 000000000..e9d1b2d3c
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_complex.cpp
@@ -0,0 +1,103 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+
+
+static void test_additions()  // Adds two complex<float> tensors and checks each coefficient exactly.
+{
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<float>, 1> data2(3);
+  for (int i = 0; i < 3; ++i) {
+    data1(i) = std::complex<float>(i, -i);
+    data2(i) = std::complex<float>(i, 7 * i);
+  }
+
+  Tensor<std::complex<float>, 1> sum = data1 + data2;
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_EQUAL(sum(i),  std::complex<float>(2*i, 6*i));  // (i - i*I) + (i + 7i*I) = 2i + 6i*I
+  }
+}
+
+
+static void test_abs()  // abs() of complex tensors is stored in real-valued tensors and compared with std::abs.
+{
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<double>, 1> data2(3);
+  data1.setRandom();
+  data2.setRandom();
+
+  Tensor<float, 1> abs1 = data1.abs();
+  Tensor<double, 1> abs2 = data2.abs();
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_APPROX(abs1(i), std::abs(data1(i)));
+    VERIFY_IS_APPROX(abs2(i), std::abs(data2(i)));
+  }
+}
+
+
+static void test_conjugate()  // conjugate() must match std::conj for complex tensors and leave int tensors unchanged.
+{
+  Tensor<std::complex<float>, 1> data1(3);
+  Tensor<std::complex<double>, 1> data2(3);
+  Tensor<int, 1> data3(3);
+  data1.setRandom();
+  data2.setRandom();
+  data3.setRandom();
+
+  Tensor<std::complex<float>, 1> conj1 = data1.conjugate();
+  Tensor<std::complex<double>, 1> conj2 = data2.conjugate();
+  Tensor<int, 1> conj3 = data3.conjugate();
+  for (int i = 0; i < 3; ++i) {
+    VERIFY_IS_APPROX(conj1(i), std::conj(data1(i)));
+    VERIFY_IS_APPROX(conj2(i), std::conj(data2(i)));
+    VERIFY_IS_APPROX(conj3(i), data3(i));  // real-valued input: expected to be unchanged
+  }
+}
+
+static void test_contractions()  // Contracts two complex tensors and compares with the equivalent matrix product.
+{
+  Tensor<std::complex<float>, 4> t_left(30, 50, 8, 31);
+  Tensor<std::complex<float>, 5> t_right(8, 31, 7, 20, 10);
+  Tensor<std::complex<float>, 5> t_result(30, 50, 7, 20, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  typedef Map<Matrix<std::complex<float>, Dynamic, Dynamic>> MapXcf;
+  MapXcf m_left(t_left.data(), 1500, 248);    // 30*50 rows, 8*31 columns
+  MapXcf m_right(t_right.data(), 248, 1400);  // 8*31 rows, 7*20*10 columns
+  Matrix<std::complex<float>, Dynamic, Dynamic> m_result(1500, 1400);
+
+  // This contraction should be equivalent to a regular matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims;
+  dims[0] = DimPair(2, 0);  // left dimension 2 (size 8) with right dimension 0 (size 8)
+  dims[1] = DimPair(3, 1);  // left dimension 3 (size 31) with right dimension 1 (size 31)
+  t_result = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+  for (int i = 0; i < t_result.dimensions().TotalSize(); i++) {
+    VERIFY_IS_APPROX(t_result.data()[i], m_result.data()[i]);  // compare the raw buffers element by element
+  }
+}
+
+
+void test_cxx11_tensor_of_complex()  // Test entry point: runs the complex-valued tensor sub-tests.
+{
+  CALL_SUBTEST(test_additions());
+  CALL_SUBTEST(test_abs());
+  CALL_SUBTEST(test_conjugate());
+  CALL_SUBTEST(test_contractions());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_const_values.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_const_values.cpp
new file mode 100644
index 000000000..f179a0c21
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_const_values.cpp
@@ -0,0 +1,105 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_assign()  // Copies from a map over const values and from a const map into plain tensors.
+{
+  float data1[6];
+  TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3);  // map whose scalar type is const float
+  float data2[6];
+  const TensorMap<Tensor<float, 2>> mat2(data2, 2, 3);  // const-qualified map object
+
+  for (int i = 0; i < 6; ++i) {
+    data1[i] = i;
+    data2[i] = -i;
+  }
+
+  Tensor<float, 2> rslt1;
+  rslt1 = mat1;  // assignment into a default-constructed tensor
+  Tensor<float, 2> rslt2;
+  rslt2 = mat2;
+
+  Tensor<float, 2> rslt3 = mat1;  // copy-initialization
+  Tensor<float, 2> rslt4 = mat2;
+
+  Tensor<float, 2> rslt5(mat1);  // direct-initialization
+  Tensor<float, 2> rslt6(mat2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(rslt1(i,j), static_cast<float>(i + 2*j));  // i + 2*j is the buffer position written above
+      VERIFY_IS_APPROX(rslt2(i,j), static_cast<float>(-i - 2*j));
+      VERIFY_IS_APPROX(rslt3(i,j), static_cast<float>(i + 2*j));
+      VERIFY_IS_APPROX(rslt4(i,j), static_cast<float>(-i - 2*j));
+      VERIFY_IS_APPROX(rslt5(i,j), static_cast<float>(i + 2*j));
+      VERIFY_IS_APPROX(rslt6(i,j), static_cast<float>(-i - 2*j));
+    }
+  }
+}
+
+
+static void test_plus()  // Adds a map over const values to a mutable map, in both operand orders.
+{
+  float data1[6];
+  TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3);
+  float data2[6];
+  TensorMap<Tensor<float, 2>> mat2(data2, 2, 3);
+
+  for (int i = 0; i < 6; ++i) {
+    data1[i] = i;
+    data2[i] = -i;  // exact negation of data1, so every sum below is zero
+  }
+
+  Tensor<float, 2> sum1;
+  sum1 = mat1 + mat2;
+  Tensor<float, 2> sum2;
+  sum2 = mat2 + mat1;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(sum1(i,j), 0.0f);
+      VERIFY_IS_APPROX(sum2(i,j), 0.0f);
+    }
+  }
+}
+
+
+static void test_plus_equal()  // Accumulates a map over const values into a mutable map with operator+=.
+{
+  float data1[6];
+  TensorMap<Tensor<const float, 2>> mat1(data1, 2, 3);
+  float data2[6];
+  TensorMap<Tensor<float, 2>> mat2(data2, 2, 3);
+
+  for (int i = 0; i < 6; ++i) {
+    data1[i] = i;
+    data2[i] = -i;  // exact negation of data1
+  }
+  mat2 += mat1;  // the mutable map is the destination
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_APPROX(mat2(i,j), 0.0f);
+    }
+  }
+}
+
+
+void test_cxx11_tensor_of_const_values()  // Test entry point: runs the const-value tensor sub-tests.
+{
+  CALL_SUBTEST(test_assign());
+  CALL_SUBTEST(test_plus());
+  CALL_SUBTEST(test_plus_equal());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu
new file mode 100644
index 000000000..e296bf991
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -0,0 +1,491 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<typename>  // unnamed template parameter; the driver instantiates this with <void>
+void test_cuda_numext() {  // isnan on float data vs. the same data cast to half must agree per element
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+  // Device buffers: the float input and one bool result buffer per precision.
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+  bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool));
+  // View the raw device pointers as 1-D tensors.
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+  // Input is random() shifted down by 0.5; isnan runs on it directly and after a cast to half.
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>());
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>());
+  // Bring both result buffers back to the host.
+  Tensor<bool, 1> half_prec(num_elem);
+  Tensor<bool, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool));
+  gpu_device.synchronize();  // synchronize before the host loop reads the copied results
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking numext " << i << std::endl;
+    VERIFY_IS_EQUAL(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+
+#ifdef EIGEN_HAS_CUDA_FP16
+
+template<typename>  // unnamed template parameter; the driver instantiates this with <void>
+void test_cuda_conversion() {  // float -> half -> float round trip must stay approximately equal to the input
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+  // Device buffers: float input, its half-precision copy, and the value converted back to float.
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  float* d_conv = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_half(
+      d_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_conv(
+      d_conv, num_elem);
+  // Random input, cast down to half, then cast back up to float.
+  gpu_float.device(gpu_device) = gpu_float.random();
+  gpu_half.device(gpu_device) = gpu_float.cast<Eigen::half>();
+  gpu_conv.device(gpu_device) = gpu_half.cast<float>();
+
+  Tensor<float, 1> initial(num_elem);
+  Tensor<float, 1> final(num_elem);
+  gpu_device.memcpyDeviceToHost(initial.data(), d_float, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(final.data(), d_conv, num_elem*sizeof(float));
+  gpu_device.synchronize();  // the copies may be asynchronous; wait before reading the host buffers, as the other CUDA tests in this file do
+  for (int i = 0; i < num_elem; ++i) {
+    VERIFY_IS_APPROX(initial(i), final(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_half);
+  gpu_device.deallocate(d_conv);
+}
+
+template<typename>  // unnamed template parameter; the driver instantiates this with <void>
+void test_cuda_unary() {  // abs() computed in float and in half (cast back to float) must agree approximately
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+  // Device buffers: the float input and one float result buffer per precision path.
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+  // Input is random() shifted down by 0.5; the half path is float -> half -> abs -> float.
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.abs();
+  gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().cast<float>();
+  // Bring both result buffers back to the host.
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();  // synchronize before the host loop reads the copied results
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking unary " << i << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>  // unnamed template parameter; the driver instantiates this with <void>
+void test_cuda_elementwise() {  // (a + b) * a computed in float and in half must agree at half precision
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+  // Device buffers: two float inputs and one float result buffer per precision path.
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
+      d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
+      d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+  // The half path casts every operand to half first and casts only the final result back to float.
+  gpu_float1.device(gpu_device) = gpu_float1.random();
+  gpu_float2.device(gpu_device) = gpu_float2.random();
+  gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1;
+  gpu_res_half.device(gpu_device) = ((gpu_float1.cast<Eigen::half>() + gpu_float2.cast<Eigen::half>()) * gpu_float1.cast<Eigen::half>()).cast<float>();
+  // Bring both result buffers back to the host.
+  Tensor<float, 1> half_prec(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();  // synchronize before the host loop reads the copied results
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise " << i << ": full prec = " << full_prec(i) << " vs half prec = " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(static_cast<Eigen::half>(full_prec(i)), static_cast<Eigen::half>(half_prec(i)));  // both sides are cast to half, so the comparison is made on half values
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>  // unnamed template parameter; the driver instantiates this with <void>
+void test_cuda_trancendental() {  // exp, log and log1p: float result cast to half vs. result computed on half inputs
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+  // One float input per function; d_resN_float holds the float-computed result cast to half, d_resN_half the half-computed one.
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
+  // Inputs: exp gets random() - 0.5, log gets random() + 0.5, log1p gets random(). Reference results are computed in float, then cast.
+  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);  // the constant is built from gpu_float1, which has the same size as gpu_float2
+  gpu_float3.device(gpu_device) = gpu_float3.random();
+  gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
+  gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
+  gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+  // Half path: store the half-cast input first, then apply the function in place on the half tensor.
+  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
+  gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
+
+  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
+  gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
+
+  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
+  // Host copies of the inputs (used only in the log output and the log tolerance check) and of all six result buffers.
+  Tensor<float, 1> input1(num_elem);
+  Tensor<Eigen::half, 1> half_prec1(num_elem);
+  Tensor<Eigen::half, 1> full_prec1(num_elem);
+  Tensor<float, 1> input2(num_elem);
+  Tensor<Eigen::half, 1> half_prec2(num_elem);
+  Tensor<Eigen::half, 1> full_prec2(num_elem);
+  Tensor<float, 1> input3(num_elem);
+  Tensor<Eigen::half, 1> half_prec3(num_elem);
+  Tensor<Eigen::half, 1> full_prec3(num_elem);
+  gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
+  gpu_device.synchronize();  // synchronize before the host loops read the copied results
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise exp " << i << " input = " << input1(i) << " full = " << full_prec1(i) << " half = " << half_prec1(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec1(i), half_prec1(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << full_prec2(i) << " half = " << half_prec2(i) << std::endl;
+    if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy near 1; both sides are offset by 0.1 before comparing, presumably to loosen the relative check around zero
+      VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f));
+    else
+      VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
+  }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
+  }
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_float3);
+  gpu_device.deallocate(d_res1_half);
+  gpu_device.deallocate(d_res1_float);
+  gpu_device.deallocate(d_res2_half);
+  gpu_device.deallocate(d_res2_float);
+  gpu_device.deallocate(d_res3_float);
+  gpu_device.deallocate(d_res3_half);
+}
+
+template<typename>  // unnamed template parameter; the driver instantiates this with <void>
+void test_cuda_contractions() {  // 23x23 contraction: float result cast to half vs. contraction of half-cast inputs
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int rows = 23;
+  int cols = 23;
+  int num_elem = rows*cols;
+  // Device buffers: two float inputs and two half result buffers (d_res_float holds the float-computed result cast to half).
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_half(
+      d_res_half, rows, cols);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 2>, Eigen::Aligned> gpu_res_float(
+      d_res_float, rows, cols);
+  // Both inputs are random() shifted down by 0.5.
+  gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
+  gpu_float2.device(gpu_device) = gpu_float2.random() - gpu_float2.constant(0.5f);
+  // A single dimension pair (1, 0) is passed to contract() for both precision paths.
+  typedef Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims(DimPair(1, 0));
+  gpu_res_float.device(gpu_device) = gpu_float1.contract(gpu_float2, dims).cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().contract(gpu_float2.cast<Eigen::half>(), dims);
+  // Bring both result buffers back to the host.
+  Tensor<Eigen::half, 2> half_prec(rows, cols);
+  Tensor<Eigen::half, 2> full_prec(rows, cols);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(Eigen::half));
+  gpu_device.synchronize();  // synchronize before the host loops read the copied results
+
+  for (int i = 0; i < rows; ++i) {
+    for (int j = 0; j < cols; ++j) {
+      std::cout << "Checking contract " << i << " " << j << full_prec(i, j) << " " << half_prec(i, j) << std::endl;
+      if (numext::abs(full_prec(i, j) - half_prec(i, j)) > Eigen::half(1e-2f)) {  // absolute differences up to 1e-2 are accepted outright
+        VERIFY_IS_APPROX(full_prec(i, j), half_prec(i, j));  // larger differences must still pass the approximate comparison
+      }
+    }
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>  // unnamed template parameter; callers instantiate this with <void>
+void test_cuda_reductions(int size1, int size2, int redux) {  // sum of a size1 x size2 tensor along dimension redux, float vs. half
+
+   std::cout << "Reducing " << size1 << " by " << size2
+             << " tensor along dim " << redux << std::endl; 
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = size1*size2;
+  int result_size = (redux == 1 ? size1 : size2);  // the dimension that is not reduced determines the output length
+  // Device buffers: two float inputs and two half result buffers of result_size elements.
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, size1, size2);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_half(
+      d_res_half, result_size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, result_size);
+  // Inputs are random() scaled by 2; gpu_float2 is filled here but not read by the reductions in this function.
+  gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f;
+  gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f;
+  // Reference: sum in float, then cast to half. Under test: cast to half, then sum.
+  Eigen::array<int, 1> redux_dim = {{redux}};
+  gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum(redux_dim);
+  // Bring both result buffers back to the host.
+  Tensor<Eigen::half, 1> half_prec(result_size);
+  Tensor<Eigen::half, 1> full_prec(result_size);
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, result_size*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, result_size*sizeof(Eigen::half));
+  gpu_device.synchronize();  // synchronize before the host loop reads the copied results
+
+  for (int i = 0; i < result_size; ++i) {
+    std::cout << "EXPECTED " << full_prec(i) << " GOT " << half_prec(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec(i));
+  }
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>  // unnamed template parameter; the driver instantiates this with <void>
+void test_cuda_reductions() {  // runs the partial-reduction test over both dimensions for three shapes
+  test_cuda_reductions<void>(13, 13, 0);  // square
+  test_cuda_reductions<void>(13, 13, 1);
+
+  test_cuda_reductions<void>(35, 36, 0);  // second dimension larger
+  test_cuda_reductions<void>(35, 36, 1);
+
+  test_cuda_reductions<void>(36, 35, 0);  // first dimension larger
+  test_cuda_reductions<void>(36, 35, 1);
+}
+
+template<typename>  // unnamed template parameter; the driver instantiates this with <void>
+void test_cuda_full_reductions() {  // full sum() and maximum() of a 13x13 tensor, float vs. half
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int size = 13;
+  int num_elem = size*size;
+  // Device buffers: two float inputs and two single-element half result buffers.
+  float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+  Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float1(
+      d_float1, size, size);
+  Eigen::TensorMap<Eigen::Tensor<float, 2>, Eigen::Aligned> gpu_float2(
+      d_float2, size, size);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_half(
+      d_res_half);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 0>, Eigen::Aligned> gpu_res_float(
+      d_res_float);
+  // gpu_float2 is filled here but not read by the reductions in this function.
+  gpu_float1.device(gpu_device) = gpu_float1.random();
+  gpu_float2.device(gpu_device) = gpu_float2.random();
+  // First check: sum(). Reference reduces in float then casts; the tested path casts to half then reduces.
+  gpu_res_float.device(gpu_device) = gpu_float1.sum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().sum();
+
+  Tensor<Eigen::half, 0> half_prec;
+  Tensor<Eigen::half, 0> full_prec;
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  gpu_device.synchronize();  // synchronize before the host reads the copied results
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+  // Second check: maximum(), reusing the same device and host result buffers.
+  gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast<Eigen::half>();
+  gpu_res_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().maximum();
+  gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half));
+  gpu_device.synchronize();  // synchronize before the host reads the copied results
+
+  VERIFY_IS_APPROX(full_prec(), half_prec());
+
+  gpu_device.deallocate(d_float1);
+  gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_res_half);
+  gpu_device.deallocate(d_res_float);
+}
+
+template<typename>  // unnamed template parameter; the driver instantiates this with <void>
+void test_cuda_forced_evals() {  // abs() in half with an explicit .eval(), with and without a broadcast, vs. abs() in float
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  int num_elem = 101;
+  // Device buffers: the float input, one float result per half-precision variant, and the float reference.
+  float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_half2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_res_float = (float*)gpu_device.allocate(num_elem * sizeof(float));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float(
+      d_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_half1(
+      d_res_half1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Unaligned> gpu_res_half2(
+      d_res_half2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_res_float(
+      d_res_float, num_elem);
+  // Broadcast factor of 1: the broadcast leaves the size unchanged.
+  Eigen::array<int, 1> no_bcast;
+  no_bcast[0] = 1;
+
+  gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f);
+  gpu_res_float.device(gpu_device) = gpu_float.abs();
+  gpu_res_half1.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().eval().cast<float>();
+  gpu_res_half2.device(gpu_device) = gpu_float.cast<Eigen::half>().abs().broadcast(no_bcast).eval().cast<float>();
+  // Each host tensor is copied from its own device buffer, so both half-precision variants are checked.
+  Tensor<float, 1> half_prec1(num_elem);
+  Tensor<float, 1> half_prec2(num_elem);
+  Tensor<float, 1> full_prec(num_elem);
+  gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res_half1, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res_half2, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(float));
+  gpu_device.synchronize();  // synchronize before the host loop reads the copied results
+
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking forced eval " << i << full_prec(i) << " vs " << half_prec1(i) << " vs " << half_prec2(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec(i), half_prec1(i));
+    VERIFY_IS_APPROX(full_prec(i), half_prec2(i));
+  }
+
+  gpu_device.deallocate(d_float);
+  gpu_device.deallocate(d_res_half1);
+  gpu_device.deallocate(d_res_half2);
+  gpu_device.deallocate(d_res_float);
+}
+#endif
+
+
+void test_cxx11_tensor_of_float16_cuda()  // Test entry point for the half-precision CUDA tensor tests.
+{
+  CALL_SUBTEST_1(test_cuda_numext<void>());  // always runs: it is defined outside the EIGEN_HAS_CUDA_FP16 guard
+
+#ifdef EIGEN_HAS_CUDA_FP16
+  CALL_SUBTEST_1(test_cuda_conversion<void>());
+  CALL_SUBTEST_1(test_cuda_unary<void>());
+  CALL_SUBTEST_1(test_cuda_elementwise<void>());
+  CALL_SUBTEST_1(test_cuda_trancendental<void>());
+  CALL_SUBTEST_2(test_cuda_contractions<void>());
+  CALL_SUBTEST_3(test_cuda_reductions<void>());
+  CALL_SUBTEST_4(test_cuda_full_reductions<void>());
+  CALL_SUBTEST_5(test_cuda_forced_evals<void>());
+#else
+  std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl;
+#endif
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_strings.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_strings.cpp
new file mode 100644
index 000000000..4ef9aed91
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_strings.cpp
@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+static void test_assign()  // Copies string tensor maps (mutable and const) into owning tensors three different ways.
+{
+  std::string data1[6];
+  TensorMap<Tensor<std::string, 2>> mat1(data1, 2, 3);  // mutable 2x3 view over data1
+  std::string data2[6];
+  const TensorMap<Tensor<const std::string, 2>> mat2(data2, 2, 3);  // const view with a const scalar type over data2
+  // Give every element a distinct value: "abc<3i>" in data1 and "def<5i>" in data2.
+  for (int i = 0; i < 6; ++i) {
+    std::ostringstream s1;
+    s1 << "abc" << i*3;
+    data1[i] = s1.str();
+    std::ostringstream s2;
+    s2 << "def" << i*5;
+    data2[i] = s2.str();
+  }
+  // Variant 1: default-construct, then assign.
+  Tensor<std::string, 2> rslt1;
+  rslt1 = mat1;
+  Tensor<std::string, 2> rslt2;
+  rslt2 = mat2;
+  // Variant 2: copy-initialization.
+  Tensor<std::string, 2> rslt3 = mat1;
+  Tensor<std::string, 2> rslt4 = mat2;
+  // Variant 3: direct-initialization.
+  Tensor<std::string, 2> rslt5(mat1);
+  Tensor<std::string, 2> rslt6(mat2);
+  // Element (i,j) is expected at linear offset i + 2*j of the backing array (first index varies fastest).
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(rslt1(i,j), data1[i+2*j]);
+      VERIFY_IS_EQUAL(rslt2(i,j), data2[i+2*j]);
+      VERIFY_IS_EQUAL(rslt3(i,j), data1[i+2*j]);
+      VERIFY_IS_EQUAL(rslt4(i,j), data2[i+2*j]);
+      VERIFY_IS_EQUAL(rslt5(i,j), data1[i+2*j]);
+      VERIFY_IS_EQUAL(rslt6(i,j), data2[i+2*j]);
+    }
+  }
+}
+
+
+static void test_concat()  // Concatenates two 2x3 string tensors along dimension 1 and checks both halves.
+{
+  Tensor<std::string, 2> t1(2, 3);
+  Tensor<std::string, 2> t2(2, 3);
+  // t1 gets "abc<i+2j>" and t2 gets "def<5i+32j>".
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      std::ostringstream s1;
+      s1 << "abc" << i + j*2;
+      t1(i, j) = s1.str();
+      std::ostringstream s2;
+      s2 << "def" << i*5 + j*32;
+      t2(i, j) = s2.str();
+    }
+  }
+  // The result is expected to be 2x6: t1 in columns 0..2 and t2 in columns 3..5.
+  Tensor<std::string, 2> result = t1.concatenate(t2, 1);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 6);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(result(i, j),   t1(i, j));
+      VERIFY_IS_EQUAL(result(i, j+3), t2(i, j));
+    }
+  }
+}
+
+
+static void test_slices()  // Slices a 2x6 string tensor into its two 2x3 column halves and checks each one.
+{
+  Tensor<std::string, 2> data(2, 6);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {  // fill all six columns so the second-half check compares assigned, distinct values
+      std::ostringstream s1;
+      s1 << "abc" << i + j*2;
+      data(i, j) = s1.str();
+    }
+  }
+  // Each slice has extent 2x3; the first starts at column 0 and the second at column 3.
+  const Eigen::DSizes<ptrdiff_t, 2> half_size(2, 3);
+  const Eigen::DSizes<ptrdiff_t, 2> first_half(0, 0);
+  const Eigen::DSizes<ptrdiff_t, 2> second_half(0, 3);
+
+  Tensor<std::string, 2> t1 = data.slice(first_half, half_size);
+  Tensor<std::string, 2> t2 = data.slice(second_half, half_size);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      VERIFY_IS_EQUAL(data(i, j),   t1(i, j));
+      VERIFY_IS_EQUAL(data(i, j+3), t2(i, j));
+    }
+  }
+}
+
+
+static void test_additions()  // operator+ on string tensors is expected to concatenate element-wise.
+{
+  Tensor<std::string, 1> data1(3);
+  Tensor<std::string, 1> data2(3);
+  for (int i = 0; i < 3; ++i) {
+    data1(i) = "abc";  // constant prefix
+    std::ostringstream s1;
+    s1 << i;
+    data2(i) = s1.str();  // decimal text of the index
+  }
+
+  Tensor<std::string, 1> sum = data1 + data2;
+  for (int i = 0; i < 3; ++i) {
+    std::ostringstream concat;
+    concat << "abc" << i;  // expected value is built independently of the tensor expression
+    std::string expected = concat.str();
+    VERIFY_IS_EQUAL(sum(i), expected);
+  }
+}
+
+
+static void test_initialization()  // setConstant must store the given string in every element.
+{
+  Tensor<std::string, 2> a(2, 3);
+  a.setConstant(std::string("foo"));
+  for (int i = 0; i < 2*3; ++i) {  // visits all 2*3 elements through the single-index accessor
+    VERIFY_IS_EQUAL(a(i), std::string("foo"));
+  }
+}
+
+
+void test_cxx11_tensor_of_strings()  // Test entry point: runs the five string-tensor sub-tests.
+{
+  // Beware: none of this is likely to ever work on a GPU.
+  CALL_SUBTEST(test_assign());
+  CALL_SUBTEST(test_concat());
+  CALL_SUBTEST(test_slices());
+  CALL_SUBTEST(test_additions());
+  CALL_SUBTEST(test_initialization());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_padding.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_padding.cpp
new file mode 100644
index 000000000..ffa19896e
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_padding.cpp
@@ -0,0 +1,93 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>  // storage order under test; the driver instantiates with ColMajor and RowMajor
+static void test_simple_padding()  // Pads dimensions 1 and 2 of a 2x3x5x7 tensor and checks the sizes and every element.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  // Each pair is (padding before, padding after) for one dimension; dimensions 0 and 3 are not padded.
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+
+  Tensor<float, 4, DataLayout> padded;
+  padded = tensor.pad(paddings);
+  // Expected extent per dimension = original extent + before + after.
+  VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+  VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+  VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+  VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      for (int k = 0; k < 12; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {  // inside the original data, shifted by the leading paddings (2, 3)
+            VERIFY_IS_EQUAL(padded(i,j,k,l), tensor(i,j-2,k-3,l));
+          } else {  // padding region is expected to be zero
+            VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+          }
+        }
+      }
+    }
+  }
+}
+
+template<int DataLayout>  // storage order under test; the driver instantiates with ColMajor and RowMajor
+static void test_padded_expr()  // Uses pad() as a sub-expression: the padded tensor is reshaped to 2-D before being evaluated.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  // Same paddings as the simple test: the padded shape is 2x6x12x7.
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);
+  paddings[2] = std::make_pair(3, 4);
+  paddings[3] = std::make_pair(0, 0);
+  // 12 x 84 holds the same 1008 elements as 2x6x12x7.
+  Eigen::DSizes<ptrdiff_t, 2> reshape_dims;
+  reshape_dims[0] = 12;
+  reshape_dims[1] = 84;
+
+  Tensor<float, 2, DataLayout> result;
+  result = tensor.pad(paddings).reshape(reshape_dims);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      for (int k = 0; k < 12; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          const float result_value = DataLayout == ColMajor ?  // map (i,j,k,l) to the 2-D index according to the storage order
+              result(i+2*j,k+12*l) : result(j+6*i,l+7*k);
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {  // inside the original data, shifted by the leading paddings (2, 3)
+            VERIFY_IS_EQUAL(result_value, tensor(i,j-2,k-3,l));
+          } else {  // padding region is expected to be zero
+            VERIFY_IS_EQUAL(result_value, 0.0f);
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_padding()  // Test entry point: runs both padding sub-tests for each storage order.
+{
+  CALL_SUBTEST(test_simple_padding<ColMajor>());
+  CALL_SUBTEST(test_simple_padding<RowMajor>());
+  CALL_SUBTEST(test_padded_expr<ColMajor>());
+  CALL_SUBTEST(test_padded_expr<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_patch.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_patch.cpp
new file mode 100644
index 000000000..434359730
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_patch.cpp
@@ -0,0 +1,172 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Extracts patches of several sizes from a random 2x3x5x7 tensor and checks the result's shape and contents.
+template<int DataLayout>
+static void test_simple_patch()
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> patch_dims;
+  patch_dims[0] = 1;
+  patch_dims[1] = 1;
+  patch_dims[2] = 1;
+  patch_dims[3] = 1;
+  // 1x1x1x1 patches: one patch per tensor element.
+  Tensor<float, 5, DataLayout> no_patch;
+  no_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), tensor.size());
+  } else {
+    VERIFY_IS_EQUAL(no_patch.dimension(0), tensor.size());
+    VERIFY_IS_EQUAL(no_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(2), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(no_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], no_patch.data()[i]);
+  }
+  // A patch as large as the tensor: exactly one patch, holding the same data.
+  patch_dims[0] = 2;
+  patch_dims[1] = 3;
+  patch_dims[2] = 5;
+  patch_dims[3] = 7;
+  Tensor<float, 5, DataLayout> single_patch;
+  single_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 7);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 1);
+  } else {
+    VERIFY_IS_EQUAL(single_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(single_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(single_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(single_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(single_patch.dimension(4), 7);
+  }
+
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], single_patch.data()[i]);
+  }
+  // 1x2x2x1 patches: 2*2*4*7 of them, each compared element by element.
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 2;
+  patch_dims[3] = 1;
+  Tensor<float, 5, DataLayout> twod_patch;
+  twod_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 2*2*4*7);
+  } else {
+    VERIFY_IS_EQUAL(twod_patch.dimension(0), 2*2*4*7);
+    VERIFY_IS_EQUAL(twod_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(twod_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(3), 2);
+    VERIFY_IS_EQUAL(twod_patch.dimension(4), 1);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 4; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 4 * l));
+          } else {
+            patch_loc = l + 7 * (k + 4 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 2; ++y) {
+              if (DataLayout == ColMajor) {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(0,x,y,0,patch_loc));
+              } else {
+                VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l), twod_patch(patch_loc,0,x,y,0));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+  // 1x2x3x5 patches: 2*2*3*3 of them, each compared element by element.
+  patch_dims[0] = 1;
+  patch_dims[1] = 2;
+  patch_dims[2] = 3;
+  patch_dims[3] = 5;
+  Tensor<float, 5, DataLayout> threed_patch;
+  threed_patch = tensor.extract_patches(patch_dims);
+
+  if (DataLayout == ColMajor) {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 5);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 2*2*3*3);
+  } else {
+    VERIFY_IS_EQUAL(threed_patch.dimension(0), 2*2*3*3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(1), 1);
+    VERIFY_IS_EQUAL(threed_patch.dimension(2), 2);
+    VERIFY_IS_EQUAL(threed_patch.dimension(3), 3);
+    VERIFY_IS_EQUAL(threed_patch.dimension(4), 5);
+  }
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          int patch_loc;
+          if (DataLayout == ColMajor) {
+            patch_loc = i + 2 * (j + 2 * (k + 3 * l));
+          } else {
+            patch_loc = l + 3 * (k + 3 * (j + 2 * i));
+          }
+          for (int x = 0; x < 2; ++x) {
+            for (int y = 0; y < 3; ++y) {
+              for (int z = 0; z < 5; ++z) {
+                if (DataLayout == ColMajor) {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(0,x,y,z,patch_loc));
+                } else {
+                  VERIFY_IS_EQUAL(tensor(i,j+x,k+y,l+z), threed_patch(patch_loc,0,x,y,z));
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_patch()  // Runs the patch-extraction subtest for both storage orders.
+{
+   CALL_SUBTEST(test_simple_patch<ColMajor>());
+   CALL_SUBTEST(test_simple_patch<RowMajor>());
+   //   CALL_SUBTEST(test_expr_shuffling());  NOTE(review): test_expr_shuffling is not defined in this file.
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_random.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_random.cpp
new file mode 100644
index 000000000..0f3dc5787
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_random.cpp
@@ -0,0 +1,78 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+static void test_default()
+{
+  // Fills a 6-element vector using setRandom() with its default generator.
+  Tensor<float, 1> vec(6);
+  vec.setRandom();
+  // Fixme: this only checks that neighbouring values differ; we should check
+  // that the generated numbers follow a uniform distribution instead.
+  for (int i = 1; i < 6; ++i) {
+    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+  }
+}
+
+static void test_normal()
+{
+  // Fills a 6-element vector using setRandom() with NormalRandomGenerator<float>.
+  Tensor<float, 1> vec(6);
+  vec.setRandom<Eigen::internal::NormalRandomGenerator<float>>();
+  // Fixme: this only checks that neighbouring values differ; we should check
+  // that the generated numbers follow a gaussian distribution instead.
+  for (int i = 1; i < 6; ++i) {
+    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+  }
+}
+
+
+// Deterministic stand-in for a random generator: the value for location n is always 3*n.
+struct MyGenerator {
+  MyGenerator() { }
+  MyGenerator(const MyGenerator&) { }
+  // Returns the value to store for one entry.  "element_location" is the
+  // location of the entry to set in the tensor; the result is three times
+  // that location, and the second argument is ignored.
+  int operator()(Eigen::DenseIndex element_location, Eigen::DenseIndex /*unused*/ = 0) const {
+    return static_cast<int>(3 * element_location);
+  }
+
+  // Same as above but generates a whole packet of consecutive entries at a time.
+  internal::packet_traits<int>::type packetOp(
+      Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
+    const int packetSize = internal::packet_traits<int>::size;
+    EIGEN_ALIGN_MAX int values[packetSize];
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = static_cast<int>(3 * (packet_location + i));
+    }
+    return internal::pload<typename internal::packet_traits<int>::type>(values);
+  }
+};
+
+
+static void test_custom()
+{
+  Tensor<int, 1> vec(6);
+  vec.setRandom<MyGenerator>();
+  // MyGenerator yields 3*location, so the expected contents are exact.
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(vec(i), 3*i);
+  }
+}
+
+void test_cxx11_tensor_random()  // Runs the default, normal and custom generator subtests.
+{
+  CALL_SUBTEST(test_default());
+  CALL_SUBTEST(test_normal());
+  CALL_SUBTEST(test_custom());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_random_cuda.cu b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_random_cuda.cu
new file mode 100644
index 000000000..fa1a46732
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_random_cuda.cu
@@ -0,0 +1,85 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <Eigen/CXX11/Tensor>
+
+
+void test_cuda_random_uniform()
+{
+  // Smoke test: fill a 72x97 device buffer through random() and copy it back to the host.
+  Tensor<float, 2> out(72,97);
+  out.setZero();
+  const std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_out = nullptr;
+  VERIFY(cudaMalloc((void**)(&d_out), out_bytes) == cudaSuccess);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+
+  gpu_out.device(gpu_device) = gpu_out.random();
+
+  // The CUDA calls are wrapped in the harness VERIFY instead of assert() so
+  // that the copy and the synchronization do not depend on NDEBUG being unset.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaFree(d_out) == cudaSuccess);  // release the device buffer
+  // TODO: come up with a valid test of randomness; for now this only checks that the code doesn't crash.
+}
+
+
+void test_cuda_random_normal()
+{
+  // Smoke test: fill a 72x97 device buffer using NormalRandomGenerator and copy it back to the host.
+  Tensor<float, 2> out(72,97);
+  out.setZero();
+  const std::size_t out_bytes = out.size() * sizeof(float);
+
+  float* d_out = nullptr;
+  VERIFY(cudaMalloc((void**)(&d_out), out_bytes) == cudaSuccess);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
+  Eigen::internal::NormalRandomGenerator<float> gen(true);
+  gpu_out.device(gpu_device) = gpu_out.random(gen);
+
+  // VERIFY instead of assert(): the CUDA calls must not depend on NDEBUG being unset.
+  VERIFY(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaFree(d_out) == cudaSuccess);  // release the device buffer
+}
+
+static void test_complex()
+{
+  // Fills a 6-element complex vector with setRandom(); no device is involved here.
+  Tensor<std::complex<float>, 1> vec(6);
+  vec.setRandom();
+  // Fixme: this only checks that neighbouring values differ; we should check
+  // that the generated numbers follow a uniform distribution instead.
+  for (int i = 1; i < 6; ++i) {
+    VERIFY_IS_NOT_EQUAL(vec(i), vec(i-1));
+  }
+}
+
+
+void test_cxx11_tensor_random_cuda()  // Runs the two device smoke tests and the complex test.
+{
+  CALL_SUBTEST(test_cuda_random_uniform());
+  CALL_SUBTEST(test_cuda_random_normal());
+  CALL_SUBTEST(test_complex());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction.cpp
new file mode 100644
index 000000000..1490ec3da
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction.cpp
@@ -0,0 +1,508 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <numeric>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+// Summing over an empty axis list must return the input unchanged (checked for ranks 0, 1 and 2).
+template <int DataLayout>
+static void test_trivial_reductions() {
+  {
+    Tensor<float, 0, DataLayout> tensor;
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+    Tensor<float, 0, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result(), tensor());
+  }
+  // Rank 1: shape and every element are preserved.
+  {
+    Tensor<float, 1, DataLayout> tensor(7);
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+
+    Tensor<float, 1, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result.dimension(0), 7);
+    for (int i = 0; i < 7; ++i) {
+      VERIFY_IS_EQUAL(result(i), tensor(i));
+    }
+  }
+  // Rank 2: shape and every element are preserved.
+  {
+    Tensor<float, 2, DataLayout> tensor(2, 3);
+    tensor.setRandom();
+    array<ptrdiff_t, 0> reduction_axis;
+
+    Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis);
+    VERIFY_IS_EQUAL(result.dimension(0), 2);
+    VERIFY_IS_EQUAL(result.dimension(1), 3);
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 3; ++j) {
+        VERIFY_IS_EQUAL(result(i, j), tensor(i, j));
+      }
+    }
+  }
+}
+
+// Checks sum/prod/maximum/minimum/mean over two axes of a 2x3x5x7 tensor, each with a full-reduction cross-check, plus all()/any().
+template <int DataLayout>
+static void test_simple_reductions() {
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> reduction_axis2;
+  reduction_axis2[0] = 1;
+  reduction_axis2[1] = 3;
+  Tensor<float, 2, DataLayout> result = tensor.sum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      float sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor(i, k, j, l);
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), sum);
+    }
+  }
+
+  {
+    Tensor<float, 0, DataLayout> sum1 = tensor.sum();
+    VERIFY_IS_EQUAL(sum1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<float, 0, DataLayout> sum2 = tensor.sum(reduction_axis4);
+    VERIFY_IS_EQUAL(sum2.rank(), 0);
+
+    VERIFY_IS_APPROX(sum1(), sum2());
+  }
+  // Product over axes 0 and 2.
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 2;
+  result = tensor.prod(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 3);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float prod = 1.0f;
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          prod *= tensor(k, i, l, j);
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), prod);
+    }
+  }
+
+  {
+    Tensor<float, 0, DataLayout> prod1 = tensor.prod();
+    VERIFY_IS_EQUAL(prod1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<float, 0, DataLayout> prod2 = tensor.prod(reduction_axis4);
+    VERIFY_IS_EQUAL(prod2.rank(), 0);
+
+    VERIFY_IS_APPROX(prod1(), prod2());
+  }
+  // Maximum over axes 0 and 2.
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 2;
+  result = tensor.maximum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 3);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float max_val = std::numeric_limits<float>::lowest();
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 5; ++l) {
+          max_val = (std::max)(max_val, tensor(k, i, l, j));
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), max_val);
+    }
+  }
+
+  {
+    Tensor<float, 0, DataLayout> max1 = tensor.maximum();
+    VERIFY_IS_EQUAL(max1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<float, 0, DataLayout> max2 = tensor.maximum(reduction_axis4);
+    VERIFY_IS_EQUAL(max2.rank(), 0);
+
+    VERIFY_IS_APPROX(max1(), max2());
+  }
+  // Minimum over axes 0 and 1.
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 1;
+  result = tensor.minimum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float min_val = (std::numeric_limits<float>::max)();
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          min_val = (std::min)(min_val, tensor(k, l, i, j));
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), min_val);
+    }
+  }
+
+  {
+    Tensor<float, 0, DataLayout> min1 = tensor.minimum();
+    VERIFY_IS_EQUAL(min1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<float, 0, DataLayout> min2 = tensor.minimum(reduction_axis4);
+    VERIFY_IS_EQUAL(min2.rank(), 0);
+
+    VERIFY_IS_APPROX(min1(), min2());
+  }
+  // Mean over axes 0 and 1.
+  reduction_axis2[0] = 0;
+  reduction_axis2[1] = 1;
+  result = tensor.mean(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      float sum = 0.0f;
+      int count = 0;
+      for (int k = 0; k < 2; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          sum += tensor(k, l, i, j);
+          ++count;
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), sum / count);
+    }
+  }
+
+  {
+    Tensor<float, 0, DataLayout> mean1 = tensor.mean();
+    VERIFY_IS_EQUAL(mean1.rank(), 0);
+
+    array<ptrdiff_t, 4> reduction_axis4;
+    reduction_axis4[0] = 0;
+    reduction_axis4[1] = 1;
+    reduction_axis4[2] = 2;
+    reduction_axis4[3] = 3;
+    Tensor<float, 0, DataLayout> mean2 = tensor.mean(reduction_axis4);
+    VERIFY_IS_EQUAL(mean2.rank(), 0);
+
+    VERIFY_IS_APPROX(mean1(), mean2());
+  }
+  // Boolean reductions all()/any() over the integers 0..9.
+  {
+    Tensor<int, 1> ints(10);
+    std::iota(ints.data(), ints.data() + ints.dimension(0), 0);
+
+    TensorFixedSize<bool, Sizes<> > all;
+    all = ints.all();
+    VERIFY(!all());
+    all = (ints >= ints.constant(0)).all();
+    VERIFY(all());
+
+    TensorFixedSize<bool, Sizes<> > any;
+    any = (ints > ints.constant(10)).any();
+    VERIFY(!any());
+    any = (ints < ints.constant(1)).any();
+    VERIFY(any());
+  }
+}
+
+
+// Uses a sum over axes 1 and 3 as an operand of a larger expression (1 - sum) and checks the result.
+template <int DataLayout>
+static void test_reductions_in_expr() {
+  Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> reduction_axis2;
+  reduction_axis2[0] = 1;
+  reduction_axis2[1] = 3;
+  Tensor<float, 2, DataLayout> result(2, 5);
+  result = result.constant(1.0f) - tensor.sum(reduction_axis2);
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 5);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      float sum = 0.0f;
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor(i, k, j, l);
+        }
+      }
+      VERIFY_IS_APPROX(result(i, j), 1.0f - sum);
+    }
+  }
+}
+
+
+// Reduces a 2x3 tensor over both axes and checks the rank-0 result, first for a plain sum
+// and then for sqrt(sum of squares).
+template <int DataLayout>
+static void test_full_reductions() {
+  Tensor<float, 2, DataLayout> tensor(2, 3);
+  tensor.setRandom();
+  array<ptrdiff_t, 2> reduction_axis;
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+  Tensor<float, 0, DataLayout> result = tensor.sum(reduction_axis);
+  VERIFY_IS_EQUAL(result.rank(), 0);
+
+  float sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor(i, j);
+    }
+  }
+  VERIFY_IS_APPROX(result(), sum);  // rank-0 tensors are read with the index-free accessor
+  result = tensor.square().sum(reduction_axis).sqrt();
+  VERIFY_IS_EQUAL(result.rank(), 0);
+
+  sum = 0.0f;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      sum += tensor(i, j) * tensor(i, j);
+    }
+  }
+  VERIFY_IS_APPROX(result(), sqrtf(sum));
+}
+
+// Custom reducer: accumulates the sum of squares and finalizes to 1 / (sum + offset).
+struct UserReducer {
+  static const bool PacketAccess = false;
+  UserReducer(float offset) : offset_(offset) {}
+  void reduce(const float val, float* accum) { *accum += val * val; }  // adds val^2 to the accumulator
+  float initialize() const { return 0; }  // starting accumulator value
+  float finalize(const float accum) const { return 1.0f / (accum + offset_); }
+ private:
+  const float offset_;
+};
+
+// Reduces axis 1 of a 5x7 tensor with UserReducer(10) and compares against 1 / (10 + sum of squares).
+template <int DataLayout>
+static void test_user_defined_reductions() {
+  Tensor<float, 2, DataLayout> tensor(5, 7);
+  tensor.setRandom();
+  array<ptrdiff_t, 1> reduction_axis;
+  reduction_axis[0] = 1;
+  UserReducer reducer(10.0f);
+  Tensor<float, 1, DataLayout> result = tensor.reduce(reduction_axis, reducer);
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  for (int i = 0; i < 5; ++i) {
+    float expected = 10.0f;
+    for (int j = 0; j < 7; ++j) {
+      expected += tensor(i, j) * tensor(i, j);
+    }
+    expected = 1.0f / expected;
+    VERIFY_IS_APPROX(result(i), expected);
+  }
+}
+
+// Sums axes 1 and 3 through a mutable map, a map of const data and a const map of const data over one buffer.
+template <int DataLayout>
+static void test_tensor_maps() {
+  int inputs[2 * 3 * 5 * 7];
+  TensorMap<Tensor<int, 4, DataLayout> > tensor_map(inputs, 2, 3, 5, 7);
+  TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const(inputs, 2, 3, 5,
+                                                                7);
+  const TensorMap<Tensor<const int, 4, DataLayout> > tensor_map_const_const(
+      inputs, 2, 3, 5, 7);
+  tensor_map.setRandom();
+  array<ptrdiff_t, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+  // All three maps view the same buffer, so the three results must match the same reference sums.
+  Tensor<int, 2, DataLayout> result = tensor_map.sum(reduction_axis);
+  Tensor<int, 2, DataLayout> result2 = tensor_map_const.sum(reduction_axis);
+  Tensor<int, 2, DataLayout> result3 =
+      tensor_map_const_const.sum(reduction_axis);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      int sum = 0;
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          sum += tensor_map(i, k, j, l);
+        }
+      }
+      VERIFY_IS_EQUAL(result(i, j), sum);
+      VERIFY_IS_EQUAL(result2(i, j), sum);
+      VERIFY_IS_EQUAL(result3(i, j), sum);
+    }
+  }
+}
+
+// Max-reduces axes 1 and 3 of a 72x53x97x113 tensor; the axes are an IndexList when EIGEN_HAS_CONSTEXPR is set.
+template <int DataLayout>
+static void test_static_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 97);
+  in.setRandom();
+
+#if !EIGEN_HAS_CONSTEXPR 
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 3;
+#else
+  Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<3> > reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 97; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, k, j, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+// Max-reduces axes 0 and 1 of a 72x53x97x113 tensor and checks the 97x113 result.
+template <int DataLayout>
+static void test_innermost_last_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(97, 113);
+  in.setRandom();
+// Reduce on the innermost dimensions.
+#if !EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 0;
+  reduction_axis[1] = 1;
+#else
+  // This triggers the use of packets for ColMajor.
+  Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<1> > reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 97; ++i) {
+    for (int j = 0; j < 113; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 72; ++l) {
+          expected = (std::max)(expected, in(l, k, i, j));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+// Max-reduces axes 2 and 3 of a 72x53x97x113 tensor and checks the 72x53 result.
+template <int DataLayout>
+static void test_innermost_first_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 53);
+  in.setRandom();
+// Reduce on the innermost dimensions.
+#if !EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 2;
+  reduction_axis[1] = 3;
+#else
+  // This triggers the use of packets for RowMajor.
+  Eigen::IndexList<Eigen::type2index<2>, Eigen::type2index<3>> reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 53; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 97; ++k) {
+        for (int l = 0; l < 113; ++l) {
+          expected = (std::max)(expected, in(i, j, k, l));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+// Max-reduces the two middle axes (1 and 2) of a 72x53x97x113 tensor and checks every result element.
+template <int DataLayout>
+static void test_reduce_middle_dims() {
+  Tensor<float, 4, DataLayout> in(72, 53, 97, 113);
+  Tensor<float, 2, DataLayout> out(72, 113);  // axes 0 and 3 survive the reduction
+  in.setRandom();
+
+#if !EIGEN_HAS_CONSTEXPR
+  array<int, 2> reduction_axis;
+  reduction_axis[0] = 1;
+  reduction_axis[1] = 2;
+#else
+  // Same axes, expressed as compile-time constants.
+  Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2>> reduction_axis;
+#endif
+
+  out = in.maximum(reduction_axis);
+
+  for (int i = 0; i < 72; ++i) {
+    for (int j = 0; j < 113; ++j) {
+      float expected = -1e10f;
+      for (int k = 0; k < 53; ++k) {
+        for (int l = 0; l < 97; ++l) {
+          expected = (std::max)(expected, in(i, k, l, j));
+        }
+      }
+      VERIFY_IS_APPROX(out(i, j), expected);
+    }
+  }
+}
+
+void test_cxx11_tensor_reduction() {  // Runs every reduction subtest for both storage orders.
+  CALL_SUBTEST(test_trivial_reductions<ColMajor>());
+  CALL_SUBTEST(test_trivial_reductions<RowMajor>());
+  CALL_SUBTEST(test_simple_reductions<ColMajor>());
+  CALL_SUBTEST(test_simple_reductions<RowMajor>());
+  CALL_SUBTEST(test_reductions_in_expr<ColMajor>());
+  CALL_SUBTEST(test_reductions_in_expr<RowMajor>());
+  CALL_SUBTEST(test_full_reductions<ColMajor>());
+  CALL_SUBTEST(test_full_reductions<RowMajor>());
+  CALL_SUBTEST(test_user_defined_reductions<ColMajor>());
+  CALL_SUBTEST(test_user_defined_reductions<RowMajor>());
+  CALL_SUBTEST(test_tensor_maps<ColMajor>());
+  CALL_SUBTEST(test_tensor_maps<RowMajor>());
+  CALL_SUBTEST(test_static_dims<ColMajor>());
+  CALL_SUBTEST(test_static_dims<RowMajor>());
+  CALL_SUBTEST(test_innermost_last_dims<ColMajor>());
+  CALL_SUBTEST(test_innermost_last_dims<RowMajor>());
+  CALL_SUBTEST(test_innermost_first_dims<ColMajor>());
+  CALL_SUBTEST(test_innermost_first_dims<RowMajor>());
+  CALL_SUBTEST(test_reduce_middle_dims<ColMajor>());
+  CALL_SUBTEST(test_reduce_middle_dims<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction_cuda.cu b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction_cuda.cu
new file mode 100644
index 000000000..ec0669704
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+template<typename Type, int DataLayout>
+static void test_full_reductions() {
+  // Compares a full sum computed on the host with the same sum evaluated through a GpuDevice.
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  const int num_rows = internal::random<int>(1024, 5*1024);
+  const int num_cols = internal::random<int>(1024, 5*1024);
+
+  Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
+  in.setRandom();
+  // Host reference result.
+  Tensor<Type, 0, DataLayout> full_redux;
+  full_redux = in.sum();
+  // Allocate device buffers and upload the input.
+  std::size_t in_bytes = in.size() * sizeof(Type);
+  std::size_t out_bytes = full_redux.size() * sizeof(Type);
+  Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
+  Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
+  gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
+
+  TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
+
+  out_gpu.device(gpu_device) = in_gpu.sum();
+  // Download the device result; synchronize before reading it.
+  Tensor<Type, 0, DataLayout> full_redux_gpu;
+  gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
+  gpu_device.synchronize();
+
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_gpu());
+
+  gpu_device.deallocate(gpu_in_ptr);
+  gpu_device.deallocate(gpu_out_ptr);
+}
+
+// Sums a 33x1x128 tensor over axis 0 on the host and through a GpuDevice, then compares the results.
+template<typename Type, int DataLayout>
+static void test_first_dim_reductions() {
+  int dim_x = 33;
+  int dim_y = 1;
+  int dim_z = 128;
+  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+  in.setRandom();
+  // Host reference: sum over axis 0.
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 0;
+  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+  // Create device
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice dev(&stream);
+  
+  // Allocate device buffers and wrap them in tensor maps
+  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+  Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
+  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
+  
+  // Reduce on the device twice ('=' then '+='), so the device result is twice the host sum
+  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+  gpu_out.device(dev) = gpu_in.sum(red_axis);
+  gpu_out.device(dev) += gpu_in.sum(red_axis);
+  Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
+  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+  dev.synchronize();
+
+  // Check that the GPU result equals twice the CPU reduction.
+  for (int i = 0; i < gpu_out.size(); ++i) {
+    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+  }
+
+  dev.deallocate(in_data);
+  dev.deallocate(out_data);
+}
+
+// Sums a 128x1x33 tensor over axis 2 on the host and through a GpuDevice, then compares the results.
+template<typename Type, int DataLayout>
+static void test_last_dim_reductions() {
+  int dim_x = 128;
+  int dim_y = 1;
+  int dim_z = 33;
+  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+  in.setRandom();
+  // Host reference: sum over axis 2.
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 2;
+  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);
+
+  // Create device
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice dev(&stream);
+  
+  // Allocate device buffers and wrap them in tensor maps
+  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+  Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
+  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
+  
+  // Reduce on the device twice ('=' then '+='), so the device result is twice the host sum
+  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+  gpu_out.device(dev) = gpu_in.sum(red_axis);
+  gpu_out.device(dev) += gpu_in.sum(red_axis);
+  Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
+  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+  dev.synchronize();
+
+  // Check that the GPU result equals twice the CPU reduction.
+  for (int i = 0; i < gpu_out.size(); ++i) {
+    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+  }
+
+  dev.deallocate(in_data);
+  dev.deallocate(out_data);
+}
+
+
+void test_cxx11_tensor_reduction_cuda() {
+  CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));
+  CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));
+  // Reductions over the first axis.
+  CALL_SUBTEST_3((test_first_dim_reductions<float, ColMajor>()));
+  CALL_SUBTEST_3((test_first_dim_reductions<double, ColMajor>()));
+  CALL_SUBTEST_4((test_first_dim_reductions<float, RowMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+//  CALL_SUBTEST_4((test_first_dim_reductions<double, RowMajor>()));
+  // Reductions over the last axis.
+  CALL_SUBTEST_5((test_last_dim_reductions<float, ColMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+//  CALL_SUBTEST_5((test_last_dim_reductions<double, ColMajor>()));
+  CALL_SUBTEST_6((test_last_dim_reductions<float, RowMajor>()));
+  CALL_SUBTEST_6((test_last_dim_reductions<double, RowMajor>()));
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp
new file mode 100644
index 000000000..a9ef82907
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -0,0 +1,138 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_reduction_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+
+
+static void test_full_reductions_sycl(const Eigen::SyclDevice&  sycl_device) {
+  // Sums all coefficients of a random 452x765 float tensor on the host and on the SYCL device, then compares both sums.
+  const int num_rows = 452;
+  const int num_cols = 765;
+  array<int, 2> tensorRange = {{num_rows, num_cols}};
+
+  Tensor<float, 2> in(tensorRange);
+  Tensor<float, 0> full_redux;      // host reference result (rank-0 tensor)
+  Tensor<float, 0> full_redux_gpu;  // receives the value copied back from the device
+
+  in.setRandom();
+
+  full_redux = in.sum();
+
+  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
+  float* gpu_out_data = static_cast<float*>(sycl_device.allocate(sizeof(float)));  // single float for the full sum
+
+  TensorMap<Tensor<float, 2> >  in_gpu(gpu_in_data, tensorRange);
+  TensorMap<Tensor<float, 0> >  out_gpu(gpu_out_data);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+  out_gpu.device(sycl_device) = in_gpu.sum();
+  sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(float));
+  // Check that the CPU and GPU reductions return the same result.
+  VERIFY_IS_APPROX(full_redux_gpu(), full_redux());
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+static void test_first_dim_reductions_sycl(const Eigen::SyclDevice& sycl_device) {
+  // Sums a random 145x1x67 float tensor along dimension 0 on the host and on the SYCL device, then compares both results.
+  int dim_x = 145;
+  int dim_y = 1;
+  int dim_z = 67;
+
+  array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 0;  // reduce over the first dimension
+  array<int, 2> reduced_tensorRange = {{dim_y, dim_z}};  // dimensions that remain after the reduction
+
+  Tensor<float, 3> in(tensorRange);
+  Tensor<float, 2> redux(reduced_tensorRange);      // host reference result
+  Tensor<float, 2> redux_gpu(reduced_tensorRange);  // receives the values copied back from the device
+
+  in.setRandom();
+
+  redux= in.sum(red_axis);
+
+  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
+  float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
+
+  TensorMap<Tensor<float, 3> >  in_gpu(gpu_in_data, tensorRange);
+  TensorMap<Tensor<float, 2> >  out_gpu(gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
+
+  // Check that the CPU and GPU reductions return the same result.
+  for(int j=0; j<reduced_tensorRange[0]; j++ )
+    for(int k=0; k<reduced_tensorRange[1]; k++ )
+      VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+
+static void test_last_dim_reductions_sycl(const Eigen::SyclDevice &sycl_device) {
+  // Sums a random 567x1x47 float tensor along dimension 2 on the host and on the SYCL device, then compares both results.
+  int dim_x = 567;
+  int dim_y = 1;
+  int dim_z = 47;
+
+  array<int, 3> tensorRange = {{dim_x, dim_y, dim_z}};
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 2;  // reduce over the last dimension
+  array<int, 2> reduced_tensorRange = {{dim_x, dim_y}};  // dimensions that remain after the reduction
+
+  Tensor<float, 3> in(tensorRange);
+  Tensor<float, 2> redux(reduced_tensorRange);      // host reference result
+  Tensor<float, 2> redux_gpu(reduced_tensorRange);  // receives the values copied back from the device
+
+  in.setRandom();
+
+  redux= in.sum(red_axis);
+
+  float* gpu_in_data = static_cast<float*>(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(float)));
+  float* gpu_out_data = static_cast<float*>(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(float)));
+
+  TensorMap<Tensor<float, 3> >  in_gpu(gpu_in_data, tensorRange);
+  TensorMap<Tensor<float, 2> >  out_gpu(gpu_out_data, reduced_tensorRange);
+
+  sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(float));
+  out_gpu.device(sycl_device) = in_gpu.sum(red_axis);
+  sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(float));
+  // Check that the CPU and GPU reductions return the same result.
+  for(int j=0; j<reduced_tensorRange[0]; j++ )
+    for(int k=0; k<reduced_tensorRange[1]; k++ )
+      VERIFY_IS_APPROX(redux_gpu(j,k), redux(j,k));
+
+  sycl_device.deallocate(gpu_in_data);
+  sycl_device.deallocate(gpu_out_data);
+
+}
+
+void test_cxx11_tensor_reduction_sycl() {  // Test entry point for the SYCL reduction tests.
+  cl::sycl::gpu_selector s;
+  Eigen::SyclDevice sycl_device(s);  // one device object, built from the GPU selector, is shared by all subtests
+  CALL_SUBTEST((test_full_reductions_sycl(sycl_device)));
+  CALL_SUBTEST((test_first_dim_reductions_sycl(sycl_device)));
+  CALL_SUBTEST((test_last_dim_reductions_sycl(sycl_device)));
+
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_ref.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_ref.cpp
new file mode 100644
index 000000000..c8f105e3d
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_ref.cpp
@@ -0,0 +1,248 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_simple_lvalue_ref()  // A TensorRef built from a concrete tensor must alias that tensor's storage.
+{
+  Tensor<int, 1> input(6);
+  input.setRandom();
+
+  TensorRef<Tensor<int, 1>> ref3(input);   // direct-initialization
+  TensorRef<Tensor<int, 1>> ref4 = input;  // copy-initialization
+  // Both refs must report the same data pointer as the tensor they wrap.
+  VERIFY_IS_EQUAL(ref3.data(), input.data());
+  VERIFY_IS_EQUAL(ref4.data(), input.data());
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(ref3(i), input(i));
+    VERIFY_IS_EQUAL(ref4(i), input(i));
+  }
+  // Writes through either ref's coeffRef() must be visible in the original tensor.
+  for (int i = 0; i < 6; ++i) {
+    ref3.coeffRef(i) = i;
+  }
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(input(i), i);
+  }
+  for (int i = 0; i < 6; ++i) {
+    ref4.coeffRef(i) = -i * 2;
+  }
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(input(i), -i*2);
+  }
+}
+
+
+static void test_simple_rvalue_ref()  // A TensorRef built from an expression must yield the expression's values.
+{
+  Tensor<int, 1> input1(6);
+  input1.setRandom();
+  Tensor<int, 1> input2(6);
+  input2.setRandom();
+
+  TensorRef<Tensor<int, 1>> ref3(input1 + input2);   // direct-initialization from an expression
+  TensorRef<Tensor<int, 1>> ref4 = input1 + input2;  // copy-initialization from an expression
+  // A ref to an expression must not report either operand's buffer as its data pointer.
+  VERIFY_IS_NOT_EQUAL(ref3.data(), input1.data());
+  VERIFY_IS_NOT_EQUAL(ref4.data(), input1.data());
+  VERIFY_IS_NOT_EQUAL(ref3.data(), input2.data());
+  VERIFY_IS_NOT_EQUAL(ref4.data(), input2.data());
+
+  for (int i = 0; i < 6; ++i) {
+    VERIFY_IS_EQUAL(ref3(i), input1(i) + input2(i));
+    VERIFY_IS_EQUAL(ref4(i), input1(i) + input2(i));
+  }
+}
+
+
+static void test_multiple_dims()  // A TensorRef to a rank-3 tensor must expose the same data, dimensions and coefficients.
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+
+  TensorRef<Tensor<float, 3>> ref(input);
+  VERIFY_IS_EQUAL(ref.data(), input.data());
+  VERIFY_IS_EQUAL(ref.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref.dimension(2), 7);
+  // Compare every coefficient read through the ref with the underlying tensor.
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(ref(i,j,k), input(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_slice()  // A TensorRef can be bound, and later re-assigned, to slice expressions of a rank-5 tensor.
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+  // Case 1: a 1x1x1x1x1 slice taken at offset (1,2,3,4,5).
+  Eigen::DSizes<ptrdiff_t, 5> indices(1,2,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes(1,1,1,1,1);
+  TensorRef<Tensor<float, 5>> slice = tensor.slice(indices, sizes);
+  VERIFY_IS_EQUAL(slice(0,0,0,0,0), tensor(1,2,3,4,5));
+  // Case 2: re-assign the same ref to a 1x1x2x2x3 slice taken at offset (1,1,3,4,5).
+  Eigen::DSizes<ptrdiff_t, 5> indices2(1,1,3,4,5);
+  Eigen::DSizes<ptrdiff_t, 5> sizes2(1,1,2,2,3);
+  slice = tensor.slice(indices2, sizes2);
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL(slice(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k));
+      }
+    }
+  }
+  // Case 3: a 2x3x1x1x1 slice taken at the origin must report the tensor's own data pointer.
+  Eigen::DSizes<ptrdiff_t, 5> indices3(0,0,0,0,0);
+  Eigen::DSizes<ptrdiff_t, 5> sizes3(2,3,1,1,1);
+  slice = tensor.slice(indices3, sizes3);
+  VERIFY_IS_EQUAL(slice.data(), tensor.data());
+}
+
+
+static void test_ref_of_ref()  // A TensorRef copy-constructed or assigned from another TensorRef must see the same tensor.
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+
+  TensorRef<Tensor<float, 3>> ref(input);
+  TensorRef<Tensor<float, 3>> ref_of_ref(ref);  // constructed from another ref
+  TensorRef<Tensor<float, 3>> ref_of_ref2;      // default-constructed, then assigned
+  ref_of_ref2 = ref;
+
+  VERIFY_IS_EQUAL(ref_of_ref.data(), input.data());
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref_of_ref.dimension(2), 7);
+
+  VERIFY_IS_EQUAL(ref_of_ref2.data(), input.data());
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(0), 3);
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(1), 5);
+  VERIFY_IS_EQUAL(ref_of_ref2.dimension(2), 7);
+  // Both second-level refs must return the original tensor's coefficients.
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(ref_of_ref(i,j,k), input(i,j,k));
+        VERIFY_IS_EQUAL(ref_of_ref2(i,j,k), input(i,j,k));
+     }
+    }
+  }
+}
+
+
+static void test_ref_in_expr()  // Uses a TensorRef as an operand of an expression and as the target of an assignment.
+{
+  Tensor<float, 3> input(3,5,7);
+  input.setRandom();
+  TensorRef<Tensor<float, 3>> input_ref(input);
+
+  Tensor<float, 3> result(3,5,7);
+  result.setRandom();
+  TensorRef<Tensor<float, 3>> result_ref(result);
+
+  Tensor<float, 3> bias(3,5,7);
+  bias.setRandom();
+  // The checks below expect this assignment to change what result_ref yields while leaving `result` itself untouched.
+  result_ref = input_ref + bias;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result_ref(i,j,k), input(i,j,k) + bias(i,j,k));
+        VERIFY_IS_NOT_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k));
+      }
+    }
+  }
+  // Assigning the ref to the tensor is what finally stores the sum in `result`.
+  result = result_ref;
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_EQUAL(result(i,j,k), input(i,j,k) + bias(i,j,k));
+      }
+    }
+  }
+}
+
+
+static void test_coeff_ref()  // Writes through coeffRef() on a TensorRef to a chip must modify the chipped tensor.
+{
+  Tensor<float, 5> tensor(2,3,5,7,11);
+  tensor.setRandom();
+  Tensor<float, 5> original = tensor;  // copy kept to check the += result below
+
+  TensorRef<Tensor<float, 4>> slice = tensor.chip(7, 4);  // rank-4 ref to chip(7, 4) of the rank-5 tensor
+  slice.coeffRef(0, 0, 0, 0) = 1.0f;
+  slice.coeffRef(1, 0, 0, 0) += 2.0f;
+  // The writes are expected to land at index 7 of the last dimension of `tensor`.
+  VERIFY_IS_EQUAL(tensor(0,0,0,0,7), 1.0f);
+  VERIFY_IS_EQUAL(tensor(1,0,0,0,7), original(1,0,0,0,7) + 2.0f);
+}
+
+
+static void test_nested_ops_with_ref()  // Chains shuffle() and pad() on a TensorRef that itself wraps a pad() expression.
+{
+  Tensor<float, 4> t(2, 3, 5, 7);
+  t.setRandom();
+  TensorMap<Tensor<const float, 4> > m(t.data(), 2, 3, 5, 7);  // const view over t's buffer
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> paddings;
+  paddings[0] = std::make_pair(0, 0);
+  paddings[1] = std::make_pair(2, 1);  // dimension 1: 2 before, 1 after
+  paddings[2] = std::make_pair(3, 4);  // dimension 2: 3 before, 4 after
+  paddings[3] = std::make_pair(0, 0);
+  DSizes<Eigen::DenseIndex, 4> shuffle_dims(0, 1, 2, 3);  // identity permutation
+  TensorRef<Tensor<const float, 4> > ref(m.pad(paddings));
+  array<std::pair<ptrdiff_t, ptrdiff_t>, 4> trivial;  // zero padding in every dimension
+  trivial[0] = std::make_pair(0, 0);
+  trivial[1] = std::make_pair(0, 0);
+  trivial[2] = std::make_pair(0, 0);
+  trivial[3] = std::make_pair(0, 0);
+  Tensor<float, 4> padded = ref.shuffle(shuffle_dims).pad(trivial);
+  VERIFY_IS_EQUAL(padded.dimension(0), 2+0);
+  VERIFY_IS_EQUAL(padded.dimension(1), 3+3);
+  VERIFY_IS_EQUAL(padded.dimension(2), 5+7);
+  VERIFY_IS_EQUAL(padded.dimension(3), 7+0);
+  // Inside the original extent the values must match t (shifted by the leading padding); elsewhere they must be 0.
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 6; ++j) {
+      for (int k = 0; k < 12; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          if (j >= 2 && j < 5 && k >= 3 && k < 8) {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), t(i,j-2,k-3,l));
+          } else {
+            VERIFY_IS_EQUAL(padded(i,j,k,l), 0.0f);
+          }
+        }
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_ref()  // Test entry point: runs every TensorRef subtest defined in this file.
+{
+  CALL_SUBTEST(test_simple_lvalue_ref());
+  CALL_SUBTEST(test_simple_rvalue_ref());
+  CALL_SUBTEST(test_multiple_dims());
+  CALL_SUBTEST(test_slice());
+  CALL_SUBTEST(test_ref_of_ref());
+  CALL_SUBTEST(test_ref_in_expr());
+  CALL_SUBTEST(test_coeff_ref());
+  CALL_SUBTEST(test_nested_ops_with_ref());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reverse.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reverse.cpp
new file mode 100644
index 000000000..b35b8d29e
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reverse.cpp
@@ -0,0 +1,190 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com> and
+//                    Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>  // DataLayout: storage order of the tensors under test
+static void test_simple_reverse()  // Reverses a 2x3x5x7 tensor along three different sets of dimensions.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  // Case 1: reverse dimensions 1 and 2.
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = true;
+  dim_rev[3] = false;
+
+  Tensor<float, 4, DataLayout> reversed_tensor;
+  reversed_tensor = tensor.reverse(dim_rev);
+  // Reversal must not change the dimensions.
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l));
+        }
+      }
+    }
+  }
+  // Case 2: reverse dimension 0 only.
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = false;
+
+  reversed_tensor = tensor.reverse(dim_rev);
+
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7);
+
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l));
+        }
+      }
+    }
+  }
+  // Case 3: reverse dimensions 0 and 3.
+  dim_rev[0] = true;
+  dim_rev[1] = false;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+
+  reversed_tensor = tensor.reverse(dim_rev);
+
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(0), 2);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(1), 3);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(2), 5);
+  VERIFY_IS_EQUAL(reversed_tensor.dimension(3), 7);
+
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>  // DataLayout: storage order of the tensors under test
+static void test_expr_reverse(bool LValue)  // LValue: put reverse() on the assignment target (true) or on the source (false)
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  // Reverse dimensions 1 and 3 throughout this test.
+  array<bool, 4> dim_rev;
+  dim_rev[0] = false;
+  dim_rev[1] = true;
+  dim_rev[2] = false;
+  dim_rev[3] = true;
+  // Reference result: one whole-tensor reverse.
+  Tensor<float, 4, DataLayout> expected(2, 3, 5, 7);
+  if (LValue) {
+    expected.reverse(dim_rev) = tensor;
+  } else {
+    expected = tensor.reverse(dim_rev);
+  }
+
+  Tensor<float, 4, DataLayout> result(2,3,5,7);
+  // Slices are 2x3x1x7: full extent in every dimension except dimension 2, which is walked one index at a time.
+  array<ptrdiff_t, 4> src_slice_dim;
+  src_slice_dim[0] = 2;
+  src_slice_dim[1] = 3;
+  src_slice_dim[2] = 1;
+  src_slice_dim[3] = 7;
+  array<ptrdiff_t, 4> src_slice_start;
+  src_slice_start[0] = 0;
+  src_slice_start[1] = 0;
+  src_slice_start[2] = 0;
+  src_slice_start[3] = 0;
+  array<ptrdiff_t, 4> dst_slice_dim = src_slice_dim;
+  array<ptrdiff_t, 4> dst_slice_start = src_slice_start;
+  // Pass 1: build `result` slice by slice, combining reverse() with a slice on each step.
+  for (int i = 0; i < 5; ++i) {
+    if (LValue) {
+      result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) =
+          tensor.slice(src_slice_start, src_slice_dim);
+    } else {
+      result.slice(dst_slice_start, dst_slice_dim) =
+          tensor.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
+    }
+    src_slice_start[2] += 1;
+    dst_slice_start[2] += 1;
+  }
+
+  VERIFY_IS_EQUAL(result.dimension(0), 2);
+  VERIFY_IS_EQUAL(result.dimension(1), 3);
+  VERIFY_IS_EQUAL(result.dimension(2), 5);
+  VERIFY_IS_EQUAL(result.dimension(3), 7);
+  // The slice-wise result must equal the whole-tensor reference.
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+  // Pass 2: refill `result` with random data, then rebuild it; in the rvalue branch the slice is taken of the reversed expression.
+  dst_slice_start[2] = 0;
+  result.setRandom();
+  for (int i = 0; i < 5; ++i) {
+     if (LValue) {
+       result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev) =
+           tensor.slice(dst_slice_start, dst_slice_dim);
+     } else {
+       result.slice(dst_slice_start, dst_slice_dim) =
+           tensor.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+     }
+    dst_slice_start[2] += 1;
+  }
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_reverse()  // Test entry point: runs the reverse tests for both layouts and both LValue settings.
+{
+  CALL_SUBTEST(test_simple_reverse<ColMajor>());
+  CALL_SUBTEST(test_simple_reverse<RowMajor>());
+  CALL_SUBTEST(test_expr_reverse<ColMajor>(true));
+  CALL_SUBTEST(test_expr_reverse<RowMajor>(true));
+  CALL_SUBTEST(test_expr_reverse<ColMajor>(false));
+  CALL_SUBTEST(test_expr_reverse<RowMajor>(false));
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_roundings.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_roundings.cpp
new file mode 100644
index 000000000..2c26151ab
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_roundings.cpp
@@ -0,0 +1,62 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+
+static void test_float_rounding()  // Tensor round() must match numext::round() applied coefficient by coefficient.
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;  // random values scaled by 100
+
+  Tensor<float, 2> result = ftensor.round();
+  // Compare each coefficient with the scalar reference.
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(result(i,j), numext::round(ftensor(i,j)));
+    }
+  }
+}
+
+static void test_float_flooring()  // Tensor floor() must match numext::floor() applied coefficient by coefficient.
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;  // random values scaled by 100
+
+  Tensor<float, 2> result = ftensor.floor();
+  // Compare each coefficient with the scalar reference.
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(result(i,j), numext::floor(ftensor(i,j)));
+    }
+  }
+}
+
+static void test_float_ceiling()  // Tensor ceil() must match numext::ceil() applied coefficient by coefficient.
+{
+  Tensor<float, 2> ftensor(20,30);
+  ftensor = ftensor.random() * 100.f;  // random values scaled by 100
+
+  Tensor<float, 2> result = ftensor.ceil();
+  // Compare each coefficient with the scalar reference.
+  for (int i = 0; i < 20; ++i) {
+    for (int j = 0; j < 30; ++j) {
+      VERIFY_IS_EQUAL(result(i,j), numext::ceil(ftensor(i,j)));
+    }
+  }
+}
+
+void test_cxx11_tensor_roundings()  // Test entry point: runs the round, ceil and floor subtests.
+{
+   CALL_SUBTEST(test_float_rounding());
+   CALL_SUBTEST(test_float_ceiling());
+   CALL_SUBTEST(test_float_flooring());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_scan.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_scan.cpp
new file mode 100644
index 000000000..af59aa3ef
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_scan.cpp
@@ -0,0 +1,110 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <limits>
+#include <numeric>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template <int DataLayout, typename Type=float, bool Exclusive = false>  // layout, scalar type, exclusive-scan flag
+static void test_1d_scan()  // Checks cumsum()/cumprod() of a 50-element tensor against a running accumulator.
+{
+  int size = 50;
+  Tensor<Type, 1, DataLayout> tensor(size);
+  tensor.setRandom();
+  Tensor<Type, 1, DataLayout> result = tensor.cumsum(0, Exclusive);
+
+  VERIFY_IS_EQUAL(tensor.dimension(0), result.dimension(0));
+  // Accumulate in Type (not a hard-coded float) so the reference matches the tensor's scalar type.
+  Type accum = 0;
+  for (int i = 0; i < size; i++) {
+    if (Exclusive) {
+      VERIFY_IS_EQUAL(result(i), accum);  // exclusive: compare before adding element i
+      accum += tensor(i);
+    } else {
+      accum += tensor(i);
+      VERIFY_IS_EQUAL(result(i), accum);  // inclusive: compare after adding element i
+    }
+  }
+  // Same check for the cumulative product, starting from 1.
+  accum = 1;
+  result = tensor.cumprod(0, Exclusive);
+  for (int i = 0; i < size; i++) {
+    if (Exclusive) {
+      VERIFY_IS_EQUAL(result(i), accum);
+      accum *= tensor(i);
+    } else {
+      accum *= tensor(i);
+      VERIFY_IS_EQUAL(result(i), accum);
+    }
+  }
+}
+
+template <int DataLayout, typename Type=float>  // layout and scalar type of the tensors under test
+static void test_4d_scan()  // Checks cumsum() along each dimension of a 5x5x5x5 tensor on one fixed line per dimension.
+{
+  int size = 5;
+  Tensor<Type, 4, DataLayout> tensor(size, size, size, size);
+  tensor.setRandom();
+
+  Tensor<Type, 4, DataLayout> result(size, size, size, size);
+  // Accumulate in Type (not a hard-coded float) so the reference matches the tensor's scalar type.
+  result = tensor.cumsum(0);
+  Type accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(i, 1, 2, 3);
+    VERIFY_IS_EQUAL(result(i, 1, 2, 3), accum);
+  }
+  result = tensor.cumsum(1);
+  accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, i, 2, 3);
+    VERIFY_IS_EQUAL(result(1, i, 2, 3), accum);
+  }
+  result = tensor.cumsum(2);
+  accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, 2, i, 3);
+    VERIFY_IS_EQUAL(result(1, 2, i, 3), accum);
+  }
+  result = tensor.cumsum(3);
+  accum = 0;
+  for (int i = 0; i < size; i++) {
+    accum += tensor(1, 2, 3, i);
+    VERIFY_IS_EQUAL(result(1, 2, 3, i), accum);
+  }
+}
+
+template <int DataLayout>  // DataLayout: storage order of the map and of the result tensor
+static void test_tensor_maps() {  // cumsum() must also work on a TensorMap over a plain int array.
+  int inputs[20];
+  TensorMap<Tensor<int, 1, DataLayout> > tensor_map(inputs, 20);
+  tensor_map.setRandom();  // fills the coefficients of the map
+
+  Tensor<int, 1, DataLayout> result = tensor_map.cumsum(0);
+  // Compare with a running integer sum.
+  int accum = 0;
+  for (int i = 0; i < 20; ++i) {
+    accum += tensor_map(i);
+    VERIFY_IS_EQUAL(result(i), accum);
+  }
+}
+
+void test_cxx11_tensor_scan() {  // Test entry point: scan tests for both layouts; 1-D tests cover exclusive and inclusive scans.
+  CALL_SUBTEST((test_1d_scan<ColMajor, float, true>()));
+  CALL_SUBTEST((test_1d_scan<ColMajor, float, false>()));
+  CALL_SUBTEST((test_1d_scan<RowMajor, float, true>()));
+  CALL_SUBTEST((test_1d_scan<RowMajor, float, false>()));
+  CALL_SUBTEST(test_4d_scan<ColMajor>());
+  CALL_SUBTEST(test_4d_scan<RowMajor>());
+  CALL_SUBTEST(test_tensor_maps<ColMajor>());
+  CALL_SUBTEST(test_tensor_maps<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_scan_cuda.cu b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_scan_cuda.cu
new file mode 100644
index 000000000..de1c0ac95
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_scan_cuda.cu
@@ -0,0 +1,76 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_GPU
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+typedef Tensor<float, 1>::DimensionPair DimPair;
+
+template<int DataLayout>  // DataLayout: storage order of the host tensors and device maps
+void test_cuda_cumsum(int m_size, int k_size, int n_size)  // Compares cumsum(1) computed on the GPU with the host result.
+{
+  std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
+  Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size);
+  Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size);      // host reference
+  Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size);  // receives the device result
+
+  t_input.setRandom();
+
+  std::size_t t_input_bytes = t_input.size()  * sizeof(float);
+  std::size_t t_result_bytes = t_result.size() * sizeof(float);
+
+  float* d_t_input;
+  float* d_t_result;
+  // NOTE(review): the return codes of the CUDA runtime calls in this function are not checked.
+  cudaMalloc((void**)(&d_t_input), t_input_bytes);
+  cudaMalloc((void**)(&d_t_result), t_result_bytes);
+
+  cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+  // Wrap the raw device buffers so they can be used in tensor expressions.
+  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+      gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size));
+  Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> >
+      gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size));
+
+  gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1);
+  t_result = t_input.cumsum(1);
+
+  cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+  for (DenseIndex i = 0; i < t_result.size(); i++) {
+    if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {  // accept a small absolute difference
+      continue;
+    }
+    if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) {  // otherwise accept an isApprox match
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+              << " vs " <<  t_result_gpu(i) << std::endl;
+    assert(false);  // NOTE(review): plain assert, not VERIFY; only fires if asserts are enabled in this build - confirm
+  }
+
+  cudaFree((void*)d_t_input);
+  cudaFree((void*)d_t_result);
+}
+
+
+void test_cxx11_tensor_scan_cuda()  // Test entry point: runs the 128x128x128 GPU cumsum test in both layouts.
+{
+  CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128));
+  CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128));
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_shuffling.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_shuffling.cpp
new file mode 100644
index 000000000..d11444a14
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_shuffling.cpp
@@ -0,0 +1,228 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::array;
+
+template <int DataLayout>  // DataLayout: storage order of the tensors under test
+static void test_simple_shuffling()  // Shuffles a 2x3x5x7 tensor with the identity and with a non-trivial permutation.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> shuffles;  // identity permutation first
+  shuffles[0] = 0;
+  shuffles[1] = 1;
+  shuffles[2] = 2;
+  shuffles[3] = 3;
+
+  Tensor<float, 4, DataLayout> no_shuffle;
+  no_shuffle = tensor.shuffle(shuffles);
+  // The identity shuffle must leave dimensions and coefficients unchanged.
+  VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+  // Permutation (2,3,1,0): output dimension d takes input dimension shuffles[d], giving 5x7x3x2.
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  Tensor<float, 4, DataLayout> shuffle;
+  shuffle = tensor.shuffle(shuffles);
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 5);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 7);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>  // DataLayout: storage order of the tensors under test
+static void test_expr_shuffling()  // Combines shuffle() with slice() and compares against a whole-tensor shuffle.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  array<ptrdiff_t, 4> shuffles;
+  shuffles[0] = 2;
+  shuffles[1] = 3;
+  shuffles[2] = 1;
+  shuffles[3] = 0;
+  Tensor<float, 4, DataLayout> expected;  // reference: one whole-tensor shuffle
+  expected = tensor.shuffle(shuffles);
+
+  Tensor<float, 4, DataLayout> result(5,7,3,2);
+  // Source slices are 2x3x1x7 (one index of dimension 2); they land in 1x7x3x2 slices of the result.
+  array<int, 4> src_slice_dim{{2,3,1,7}};
+  array<int, 4> src_slice_start{{0,0,0,0}};
+  array<int, 4> dst_slice_dim{{1,7,3,2}};
+  array<int, 4> dst_slice_start{{0,0,0,0}};
+  // Pass 1: shuffle each source slice and store it at the matching index of result dimension 0.
+  for (int i = 0; i < 5; ++i) {
+    result.slice(dst_slice_start, dst_slice_dim) =
+        tensor.slice(src_slice_start, src_slice_dim).shuffle(shuffles);
+    src_slice_start[2] += 1;
+    dst_slice_start[0] += 1;
+  }
+
+  VERIFY_IS_EQUAL(result.dimension(0), 5);
+  VERIFY_IS_EQUAL(result.dimension(1), 7);
+  VERIFY_IS_EQUAL(result.dimension(2), 3);
+  VERIFY_IS_EQUAL(result.dimension(3), 2);
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+  // Pass 2: refill `result` with random data, then rebuild it from slices of the shuffled expression.
+  dst_slice_start[0] = 0;
+  result.setRandom();
+  for (int i = 0; i < 5; ++i) {
+    result.slice(dst_slice_start, dst_slice_dim) =
+        tensor.shuffle(shuffles).slice(dst_slice_start, dst_slice_dim);
+    dst_slice_start[0] += 1;
+  }
+
+  for (int i = 0; i < expected.dimension(0); ++i) {
+    for (int j = 0; j < expected.dimension(1); ++j) {
+      for (int k = 0; k < expected.dimension(2); ++k) {
+        for (int l = 0; l < expected.dimension(3); ++l) {
+          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>  // DataLayout: storage order of the tensors under test
+static void test_shuffling_as_value()  // Uses shuffle() on the left-hand side of an assignment.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> shuffles;  // resulting permutation is (3,2,0,1)
+  shuffles[2] = 0;
+  shuffles[3] = 1;
+  shuffles[1] = 2;
+  shuffles[0] = 3;
+  Tensor<float, 4, DataLayout> shuffle(5,7,3,2);
+  shuffle.shuffle(shuffles) = tensor;  // writes tensor through the shuffled view of `shuffle`
+
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 5);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 7);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 2);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i));
+        }
+      }
+    }
+  }
+  // Repeat with an identity-shuffle expression as the source; the outcome must be the same.
+  array<ptrdiff_t, 4> no_shuffle;
+  no_shuffle[0] = 0;
+  no_shuffle[1] = 1;
+  no_shuffle[2] = 2;
+  no_shuffle[3] = 3;
+  Tensor<float, 4, DataLayout> shuffle2(5,7,3,2);
+  shuffle2.shuffle(shuffles) = tensor.shuffle(no_shuffle);
+  for (int i = 0; i < 5; ++i) {
+    for (int j = 0; j < 7; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 2; ++l) {
+          VERIFY_IS_EQUAL(shuffle2(i,j,k,l), shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template <int DataLayout>  // DataLayout: storage order of the tensors under test
+static void test_shuffle_unshuffle()  // Shuffling by a random permutation and then by its inverse must restore the tensor.
+{
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+
+  // Choose a random permutation.
+  array<ptrdiff_t, 4> shuffles;
+  for (int i = 0; i < 4; ++i) {
+    shuffles[i] = i;
+  }
+  array<ptrdiff_t, 4> shuffles_inverse;
+  for (int i = 0; i < 4; ++i) {
+    const ptrdiff_t index = internal::random<ptrdiff_t>(i, 3);  // presumably drawn from [i, 3] inclusive - confirm
+    shuffles_inverse[shuffles[index]] = i;  // the value moved into slot i maps back to i
+    std::swap(shuffles[i], shuffles[index]);
+  }
+
+  Tensor<float, 4, DataLayout> shuffle;
+  shuffle = tensor.shuffle(shuffles).shuffle(shuffles_inverse);
+  // The round trip must restore the original dimensions and coefficients.
+  VERIFY_IS_EQUAL(shuffle.dimension(0), 2);
+  VERIFY_IS_EQUAL(shuffle.dimension(1), 3);
+  VERIFY_IS_EQUAL(shuffle.dimension(2), 5);
+  VERIFY_IS_EQUAL(shuffle.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_shuffling()  // Test entry point: runs every shuffle subtest for both storage layouts.
+{
+  CALL_SUBTEST(test_simple_shuffling<ColMajor>());
+  CALL_SUBTEST(test_simple_shuffling<RowMajor>());
+  CALL_SUBTEST(test_expr_shuffling<ColMajor>());
+  CALL_SUBTEST(test_expr_shuffling<RowMajor>());
+  CALL_SUBTEST(test_shuffling_as_value<ColMajor>());
+  CALL_SUBTEST(test_shuffling_as_value<RowMajor>());
+  CALL_SUBTEST(test_shuffle_unshuffle<ColMajor>());
+  CALL_SUBTEST(test_shuffle_unshuffle<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_simple.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_simple.cpp
new file mode 100644
index 000000000..5a0d339ef
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_simple.cpp
@@ -0,0 +1,327 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_0d()
+{  // Rank-0 tensors: assignment, rank/size queries and copy construction.
+  Tensor<int, 0> scalar1;
+  Tensor<int, 0, RowMajor> scalar2;
+  Tensor<int, 0> scalar3;
+  Tensor<int, 0, RowMajor> scalar4;
+  // resize() with no arguments is exercised on two of the four tensors.
+  scalar3.resize();
+  scalar4.resize();
+  // Three ways of writing the single coefficient: operator(), setValues() and setZero().
+  scalar1() = 7;
+  scalar2() = 13;
+  scalar3.setValues(17);
+  scalar4.setZero();
+  // A rank-0 tensor is expected to report rank 0 and size 1.
+  VERIFY_IS_EQUAL(scalar1.rank(), 0);
+  VERIFY_IS_EQUAL(scalar1.size(), 1);
+  // Read back what was stored above.
+  VERIFY_IS_EQUAL(scalar1(), 7);
+  VERIFY_IS_EQUAL(scalar2(), 13);
+  VERIFY_IS_EQUAL(scalar3(), 17);
+  VERIFY_IS_EQUAL(scalar4(), 0);
+  // Copy construction must carry the value over, also when read through data().
+  Tensor<int, 0> scalar5(scalar1);
+
+  VERIFY_IS_EQUAL(scalar5(), 7);
+  VERIFY_IS_EQUAL(scalar5.data()[0], 7);
+}
+
+static void test_1d()
+{  // Rank-1 tensors: sizing, element access through () and [], copy construction.
+  Tensor<int, 1> vec1(6);
+  Tensor<int, 1, RowMajor> vec2(6);
+  Tensor<int, 1> vec3;
+  Tensor<int, 1, RowMajor> vec4;
+  // vec3 and vec4 start default-constructed and get their size through resize().
+  vec3.resize(6);
+  vec4.resize(6);
+  // vec1..vec3 are filled coefficient by coefficient; vec4 is zeroed in one call.
+  vec1(0) = 4;  vec2(0) = 0; vec3(0) = 5;
+  vec1(1) = 8;  vec2(1) = 1; vec3(1) = 4;
+  vec1(2) = 15; vec2(2) = 2; vec3(2) = 3;
+  vec1(3) = 16; vec2(3) = 3; vec3(3) = 2;
+  vec1(4) = 23; vec2(4) = 4; vec3(4) = 1;
+  vec1(5) = 42; vec2(5) = 5; vec3(5) = 0;
+  vec4.setZero();
+  // Shape queries.
+  VERIFY_IS_EQUAL((vec1.rank()), 1);
+  VERIFY_IS_EQUAL((vec1.size()), 6);
+  VERIFY_IS_EQUAL((vec1.dimensions()[0]), 6);
+  // Values written through operator() are read back through operator[].
+  VERIFY_IS_EQUAL((vec1[0]), 4);
+  VERIFY_IS_EQUAL((vec1[1]), 8);
+  VERIFY_IS_EQUAL((vec1[2]), 15);
+  VERIFY_IS_EQUAL((vec1[3]), 16);
+  VERIFY_IS_EQUAL((vec1[4]), 23);
+  VERIFY_IS_EQUAL((vec1[5]), 42);
+
+  VERIFY_IS_EQUAL((vec2[0]), 0);
+  VERIFY_IS_EQUAL((vec2[1]), 1);
+  VERIFY_IS_EQUAL((vec2[2]), 2);
+  VERIFY_IS_EQUAL((vec2[3]), 3);
+  VERIFY_IS_EQUAL((vec2[4]), 4);
+  VERIFY_IS_EQUAL((vec2[5]), 5);
+
+  VERIFY_IS_EQUAL((vec3[0]), 5);
+  VERIFY_IS_EQUAL((vec3[1]), 4);
+  VERIFY_IS_EQUAL((vec3[2]), 3);
+  VERIFY_IS_EQUAL((vec3[3]), 2);
+  VERIFY_IS_EQUAL((vec3[4]), 1);
+  VERIFY_IS_EQUAL((vec3[5]), 0);
+
+  VERIFY_IS_EQUAL((vec4[0]), 0);
+  VERIFY_IS_EQUAL((vec4[1]), 0);
+  VERIFY_IS_EQUAL((vec4[2]), 0);
+  VERIFY_IS_EQUAL((vec4[3]), 0);
+  VERIFY_IS_EQUAL((vec4[4]), 0);
+  VERIFY_IS_EQUAL((vec4[5]), 0);
+  // A copy of vec1 must hold the same values, via operator() and via the raw data() pointer.
+  Tensor<int, 1> vec5(vec1);
+
+  VERIFY_IS_EQUAL((vec5(0)), 4);
+  VERIFY_IS_EQUAL((vec5(1)), 8);
+  VERIFY_IS_EQUAL((vec5(2)), 15);
+  VERIFY_IS_EQUAL((vec5(3)), 16);
+  VERIFY_IS_EQUAL((vec5(4)), 23);
+  VERIFY_IS_EQUAL((vec5(5)), 42);
+
+  VERIFY_IS_EQUAL((vec5.data()[0]), 4);
+  VERIFY_IS_EQUAL((vec5.data()[1]), 8);
+  VERIFY_IS_EQUAL((vec5.data()[2]), 15);
+  VERIFY_IS_EQUAL((vec5.data()[3]), 16);
+  VERIFY_IS_EQUAL((vec5.data()[4]), 23);
+  VERIFY_IS_EQUAL((vec5.data()[5]), 42);
+}
+
+static void test_2d()
+{  // Rank-2 tensors: shape queries and the buffer order of both storage layouts.
+  Tensor<int, 2> mat1(2,3);
+  Tensor<int, 2, RowMajor> mat2(2,3);
+  // Both tensors receive the same logical contents: value 3*row + col at (row, col).
+  mat1(0,0) = 0;
+  mat1(0,1) = 1;
+  mat1(0,2) = 2;
+  mat1(1,0) = 3;
+  mat1(1,1) = 4;
+  mat1(1,2) = 5;
+
+  mat2(0,0) = 0;
+  mat2(0,1) = 1;
+  mat2(0,2) = 2;
+  mat2(1,0) = 3;
+  mat2(1,1) = 4;
+  mat2(1,2) = 5;
+  // Shape queries agree for both layouts.
+  VERIFY_IS_EQUAL((mat1.rank()), 2);
+  VERIFY_IS_EQUAL((mat1.size()), 6);
+  VERIFY_IS_EQUAL((mat1.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((mat1.dimensions()[1]), 3);
+
+  VERIFY_IS_EQUAL((mat2.rank()), 2);
+  VERIFY_IS_EQUAL((mat2.size()), 6);
+  VERIFY_IS_EQUAL((mat2.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((mat2.dimensions()[1]), 3);
+  // mat1's buffer: the first index varies fastest (column-major order).
+  VERIFY_IS_EQUAL((mat1.data()[0]), 0);
+  VERIFY_IS_EQUAL((mat1.data()[1]), 3);
+  VERIFY_IS_EQUAL((mat1.data()[2]), 1);
+  VERIFY_IS_EQUAL((mat1.data()[3]), 4);
+  VERIFY_IS_EQUAL((mat1.data()[4]), 2);
+  VERIFY_IS_EQUAL((mat1.data()[5]), 5);
+  // mat2's buffer: the last index varies fastest (row-major order).
+  VERIFY_IS_EQUAL((mat2.data()[0]), 0);
+  VERIFY_IS_EQUAL((mat2.data()[1]), 1);
+  VERIFY_IS_EQUAL((mat2.data()[2]), 2);
+  VERIFY_IS_EQUAL((mat2.data()[3]), 3);
+  VERIFY_IS_EQUAL((mat2.data()[4]), 4);
+  VERIFY_IS_EQUAL((mat2.data()[5]), 5);
+}
+
+static void test_3d()
+{  // Rank-3 tensors: coefficient access, construction from a dimension array, buffer order.
+  Tensor<int, 3> epsilon(3,3,3);
+  epsilon.setZero();
+  epsilon(0,1,2) = epsilon(2,0,1) = epsilon(1,2,0) = 1;
+  epsilon(2,1,0) = epsilon(0,2,1) = epsilon(1,0,2) = -1;
+  // Shape of the 3x3x3 tensor.
+  VERIFY_IS_EQUAL((epsilon.size()), 27);
+  VERIFY_IS_EQUAL((epsilon.dimensions()[0]), 3);
+  VERIFY_IS_EQUAL((epsilon.dimensions()[1]), 3);
+  VERIFY_IS_EQUAL((epsilon.dimensions()[2]), 3);
+  // The 21 entries that were not written after setZero() must still be 0.
+  VERIFY_IS_EQUAL((epsilon(0,0,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,0,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,0,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,1,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,1,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,2,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(0,2,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,0,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,0,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,1,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,1,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,1,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,2,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(1,2,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,0,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,0,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,1,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,1,2)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,2,0)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,2,1)), 0);
+  VERIFY_IS_EQUAL((epsilon(2,2,2)), 0);
+  // The six entries assigned above keep their +1 / -1 values.
+  VERIFY_IS_EQUAL((epsilon(0,1,2)), 1);
+  VERIFY_IS_EQUAL((epsilon(2,0,1)), 1);
+  VERIFY_IS_EQUAL((epsilon(1,2,0)), 1);
+  VERIFY_IS_EQUAL((epsilon(2,1,0)), -1);
+  VERIFY_IS_EQUAL((epsilon(0,2,1)), -1);
+  VERIFY_IS_EQUAL((epsilon(1,0,2)), -1);
+  // Construct from an array of dimensions instead of separate size arguments.
+  array<Eigen::DenseIndex, 3> dims;
+  dims[0] = 2;
+  dims[1] = 3;
+  dims[2] = 4;
+  Tensor<int, 3> t1(dims);
+  Tensor<int, 3, RowMajor> t2(dims);
+  // Both layouts report the same shape.
+  VERIFY_IS_EQUAL((t1.size()), 24);
+  VERIFY_IS_EQUAL((t1.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((t1.dimensions()[1]), 3);
+  VERIFY_IS_EQUAL((t1.dimensions()[2]), 4);
+
+  VERIFY_IS_EQUAL((t2.size()), 24);
+  VERIFY_IS_EQUAL((t2.dimensions()[0]), 2);
+  VERIFY_IS_EQUAL((t2.dimensions()[1]), 3);
+  VERIFY_IS_EQUAL((t2.dimensions()[2]), 4);
+  // Store the index triple (i,j,k) as the decimal number ijk so buffer positions are easy to read.
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 3; j++) {
+      for (int k = 0; k < 4; k++) {
+        t1(i, j, k) = 100 * i + 10 * j + k;
+        t2(i, j, k) = 100 * i + 10 * j + k;
+      }
+    }
+  }
+  // t1's buffer: the first index varies fastest, the last index slowest.
+  VERIFY_IS_EQUAL((t1.data()[0]),    0);
+  VERIFY_IS_EQUAL((t1.data()[1]),  100);
+  VERIFY_IS_EQUAL((t1.data()[2]),   10);
+  VERIFY_IS_EQUAL((t1.data()[3]),  110);
+  VERIFY_IS_EQUAL((t1.data()[4]),   20);
+  VERIFY_IS_EQUAL((t1.data()[5]),  120);
+  VERIFY_IS_EQUAL((t1.data()[6]),    1);
+  VERIFY_IS_EQUAL((t1.data()[7]),  101);
+  VERIFY_IS_EQUAL((t1.data()[8]),   11);
+  VERIFY_IS_EQUAL((t1.data()[9]),  111);
+  VERIFY_IS_EQUAL((t1.data()[10]),  21);
+  VERIFY_IS_EQUAL((t1.data()[11]), 121);
+  VERIFY_IS_EQUAL((t1.data()[12]),   2);
+  VERIFY_IS_EQUAL((t1.data()[13]), 102);
+  VERIFY_IS_EQUAL((t1.data()[14]),  12);
+  VERIFY_IS_EQUAL((t1.data()[15]), 112);
+  VERIFY_IS_EQUAL((t1.data()[16]),  22);
+  VERIFY_IS_EQUAL((t1.data()[17]), 122);
+  VERIFY_IS_EQUAL((t1.data()[18]),   3);
+  VERIFY_IS_EQUAL((t1.data()[19]), 103);
+  VERIFY_IS_EQUAL((t1.data()[20]),  13);
+  VERIFY_IS_EQUAL((t1.data()[21]), 113);
+  VERIFY_IS_EQUAL((t1.data()[22]),  23);
+  VERIFY_IS_EQUAL((t1.data()[23]), 123);
+  // t2's buffer (RowMajor): the last index varies fastest, the first index slowest.
+  VERIFY_IS_EQUAL((t2.data()[0]),    0);
+  VERIFY_IS_EQUAL((t2.data()[1]),    1);
+  VERIFY_IS_EQUAL((t2.data()[2]),    2);
+  VERIFY_IS_EQUAL((t2.data()[3]),    3);
+  VERIFY_IS_EQUAL((t2.data()[4]),   10);
+  VERIFY_IS_EQUAL((t2.data()[5]),   11);
+  VERIFY_IS_EQUAL((t2.data()[6]),   12);
+  VERIFY_IS_EQUAL((t2.data()[7]),   13);
+  VERIFY_IS_EQUAL((t2.data()[8]),   20);
+  VERIFY_IS_EQUAL((t2.data()[9]),   21);
+  VERIFY_IS_EQUAL((t2.data()[10]),  22);
+  VERIFY_IS_EQUAL((t2.data()[11]),  23);
+  VERIFY_IS_EQUAL((t2.data()[12]), 100);
+  VERIFY_IS_EQUAL((t2.data()[13]), 101);
+  VERIFY_IS_EQUAL((t2.data()[14]), 102);
+  VERIFY_IS_EQUAL((t2.data()[15]), 103);
+  VERIFY_IS_EQUAL((t2.data()[16]), 110);
+  VERIFY_IS_EQUAL((t2.data()[17]), 111);
+  VERIFY_IS_EQUAL((t2.data()[18]), 112);
+  VERIFY_IS_EQUAL((t2.data()[19]), 113);
+  VERIFY_IS_EQUAL((t2.data()[20]), 120);
+  VERIFY_IS_EQUAL((t2.data()[21]), 121);
+  VERIFY_IS_EQUAL((t2.data()[22]), 122);
+  VERIFY_IS_EQUAL((t2.data()[23]), 123);
+}
+
+static void test_simple_assign()
+{  // Checks that operator= copies coefficients from one rank-3 tensor into another.
+  Tensor<int, 3> epsilon(3,3,3);
+  epsilon.setZero();
+  epsilon(0,1,2) = epsilon(2,0,1) = epsilon(1,2,0) = 1;
+  epsilon(2,1,0) = epsilon(0,2,1) = epsilon(1,0,2) = -1;
+  // The destination starts zeroed, so a nonzero value below can only come from the assignment.
+  Tensor<int, 3> e2(3,3,3);
+  e2.setZero();
+  VERIFY_IS_EQUAL((e2(1,2,0)), 0);
+  // After the assignment the six nonzero entries of epsilon must show up in e2.
+  e2 = epsilon;
+  VERIFY_IS_EQUAL((e2(1,2,0)), 1);
+  VERIFY_IS_EQUAL((e2(0,1,2)), 1);
+  VERIFY_IS_EQUAL((e2(2,0,1)), 1);
+  VERIFY_IS_EQUAL((e2(2,1,0)), -1);
+  VERIFY_IS_EQUAL((e2(0,2,1)), -1);
+  VERIFY_IS_EQUAL((e2(1,0,2)), -1);
+}
+
+static void test_resize()
+{  // resize(): shape bookkeeping, plus buffer identity when the total size does not change.
+  Tensor<int, 3> epsilon;
+  epsilon.resize(2,3,7);
+  VERIFY_IS_EQUAL(epsilon.dimension(0), 2);
+  VERIFY_IS_EQUAL(epsilon.dimension(1), 3);
+  VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
+  VERIFY_IS_EQUAL(epsilon.size(), 2*3*7);
+  // Same coefficient count (3*2*7 == 2*3*7): the test expects data() to return the same pointer.
+  const int* old_data = epsilon.data();
+  epsilon.resize(3,2,7);
+  VERIFY_IS_EQUAL(epsilon.dimension(0), 3);
+  VERIFY_IS_EQUAL(epsilon.dimension(1), 2);
+  VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
+  VERIFY_IS_EQUAL(epsilon.size(), 2*3*7);
+  VERIFY_IS_EQUAL(epsilon.data(), old_data);
+  // Different coefficient count: only the reported shape and size are checked.
+  epsilon.resize(3,5,7);
+  VERIFY_IS_EQUAL(epsilon.dimension(0), 3);
+  VERIFY_IS_EQUAL(epsilon.dimension(1), 5);
+  VERIFY_IS_EQUAL(epsilon.dimension(2), 7);
+  VERIFY_IS_EQUAL(epsilon.size(), 3*5*7);
+}
+
+void test_cxx11_tensor_simple()
+{  // Test entry point: runs the basic Tensor checks defined in this file, rank 0 through rank 3 first.
+  CALL_SUBTEST(test_0d());
+  CALL_SUBTEST(test_1d());
+  CALL_SUBTEST(test_2d());
+  CALL_SUBTEST(test_3d());
+  CALL_SUBTEST(test_simple_assign());
+  CALL_SUBTEST(test_resize());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_striding.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_striding.cpp
new file mode 100644
index 000000000..935b908cc
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_striding.cpp
@@ -0,0 +1,119 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+template<int DataLayout>
+static void test_simple_striding()
+{  // Checks stride() used as a source expression, first with unit strides, then with larger ones.
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> strides;
+  strides[0] = 1;
+  strides[1] = 1;
+  strides[2] = 1;
+  strides[3] = 1;
+  // Unit strides must reproduce the input: same shape, same coefficients.
+  Tensor<float, 4, DataLayout> no_stride;
+  no_stride = tensor.stride(strides);
+
+  VERIFY_IS_EQUAL(no_stride.dimension(0), 2);
+  VERIFY_IS_EQUAL(no_stride.dimension(1), 3);
+  VERIFY_IS_EQUAL(no_stride.dimension(2), 5);
+  VERIFY_IS_EQUAL(no_stride.dimension(3), 7);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), no_stride(i,j,k,l));
+        }
+      }
+    }
+  }
+  // Strides (2,4,2,3) on shape (2,3,5,7): the expected sizes below equal ceil(dim / stride).
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+  Tensor<float, 4, DataLayout> stride;
+  stride = tensor.stride(strides);
+
+  VERIFY_IS_EQUAL(stride.dimension(0), 1);
+  VERIFY_IS_EQUAL(stride.dimension(1), 1);
+  VERIFY_IS_EQUAL(stride.dimension(2), 3);
+  VERIFY_IS_EQUAL(stride.dimension(3), 3);
+  // Output coefficient (i,j,k,l) must equal input coefficient (2i,4j,2k,3l).
+  for (int i = 0; i < 1; ++i) {
+    for (int j = 0; j < 1; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        for (int l = 0; l < 3; ++l) {
+          VERIFY_IS_EQUAL(tensor(2*i,4*j,2*k,3*l), stride(i,j,k,l));
+        }
+      }
+    }
+  }
+}
+
+
+template<int DataLayout>
+static void test_striding_as_lvalue()
+{  // Checks stride() used as the target of an assignment.
+  Tensor<float, 4, DataLayout> tensor(2,3,5,7);
+  tensor.setRandom();
+  array<ptrdiff_t, 4> strides;
+  strides[0] = 2;
+  strides[1] = 4;
+  strides[2] = 2;
+  strides[3] = 3;
+  // result is sized (3,12,10,21) so that every strided position (2i,4j,2k,3l) read below exists.
+  Tensor<float, 4, DataLayout> result(3, 12, 10, 21);
+  result.stride(strides) = tensor;
+  // Each input coefficient must have landed at its strided position.
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), result(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+  // Same check again, this time with a unit-stride expression on the right-hand side.
+  array<ptrdiff_t, 4> no_strides;
+  no_strides[0] = 1;
+  no_strides[1] = 1;
+  no_strides[2] = 1;
+  no_strides[3] = 1;
+  Tensor<float, 4, DataLayout> result2(3, 12, 10, 21);
+  result2.stride(strides) = tensor.stride(no_strides);
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 5; ++k) {
+        for (int l = 0; l < 7; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), result2(2*i,4*j,2*k,3*l));
+        }
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_striding()
+{  // Test entry point: runs both striding subtests once per storage order.
+  CALL_SUBTEST(test_simple_striding<ColMajor>());
+  CALL_SUBTEST(test_simple_striding<RowMajor>());
+  CALL_SUBTEST(test_striding_as_lvalue<ColMajor>());
+  CALL_SUBTEST(test_striding_as_lvalue<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_sugar.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_sugar.cpp
new file mode 100644
index 000000000..2f56eb495
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_sugar.cpp
@@ -0,0 +1,81 @@
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+using Eigen::RowMajor;
+
+static void test_comparison_sugar() {
+  // we already trust comparisons between tensors, we're simply checking that
+  // the sugared versions are doing the same thing
+  Tensor<int, 3> t(6, 7, 5);
+
+  t.setRandom();
+  // make sure we have at least one value == 0
+  t(0,0,0) = 0;
+
+  Tensor<bool,0> b;
+  // Reduces the elementwise equality of two expressions to a single bool in b and checks it.
+#define TEST_TENSOR_EQUAL(e1, e2) \
+  b = ((e1) == (e2)).all();       \
+  VERIFY(b())
+  // Compares "t op 0" (scalar operand) with the explicit "t op t.constant(0)" form.
+#define TEST_OP(op) TEST_TENSOR_EQUAL(t op 0, t op t.constant(0))
+
+  TEST_OP(==);
+  TEST_OP(!=);
+  TEST_OP(<=);
+  TEST_OP(>=);
+  TEST_OP(<);
+  TEST_OP(>);
+#undef TEST_OP
+#undef TEST_TENSOR_EQUAL
+}
+
+
+static void test_scalar_sugar_add_mul() {  // Scalar operands of + and * must match the constant() form.
+  Tensor<float, 3> A(6, 7, 5);
+  Tensor<float, 3> B(6, 7, 5);
+  A.setRandom();
+  B.setRandom();
+
+  const float alpha = 0.43f;
+  const float beta = 0.21f;
+  const float gamma = 0.14f;
+  // R: explicit constant() tensors; S: scalars as right-hand operands; T: scalars as left-hand operands.
+  Tensor<float, 3> R = A.constant(gamma) + A * A.constant(alpha) + B * B.constant(beta);
+  Tensor<float, 3> S = A * alpha + B * beta + gamma;
+  Tensor<float, 3> T = gamma + alpha * A + beta * B;
+  // Compare all 6*7*5 coefficients through single-index access.
+  for (int i = 0; i < 6*7*5; ++i) {
+    VERIFY_IS_APPROX(R(i), S(i));
+    VERIFY_IS_APPROX(R(i), T(i));
+  }
+}
+
+static void test_scalar_sugar_sub_div() {  // Scalar operands of - and / must match the constant() form.
+  Tensor<float, 3> A(6, 7, 5);
+  Tensor<float, 3> B(6, 7, 5);
+  A.setRandom();
+  B.setRandom();
+
+  const float alpha = 0.43f;
+  const float beta = 0.21f;
+  const float gamma = 0.14f;
+  const float delta = 0.32f;
+  // R spells every scalar as a constant() tensor; S uses the scalars directly, on either side of - and /.
+  Tensor<float, 3> R = A.constant(gamma) - A / A.constant(alpha)
+      - B.constant(beta) / B - A.constant(delta);
+  Tensor<float, 3> S = gamma - A / alpha - beta / B - delta;
+  // Compare all 6*7*5 coefficients through single-index access.
+  for (int i = 0; i < 6*7*5; ++i) {
+    VERIFY_IS_APPROX(R(i), S(i));
+  }
+}
+
+void test_cxx11_tensor_sugar()
+{  // Test entry point: runs the three scalar-operand ("sugar") subtests.
+  CALL_SUBTEST(test_comparison_sugar());
+  CALL_SUBTEST(test_scalar_sugar_add_mul());
+  CALL_SUBTEST(test_scalar_sugar_sub_div());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_sycl.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_sycl.cpp
new file mode 100644
index 000000000..6a9c33422
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_sycl.cpp
@@ -0,0 +1,159 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_NO_COMPLEX
+#define EIGEN_TEST_FUNC cxx11_tensor_sycl
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_USE_SYCL
+
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::array;
+using Eigen::SyclDevice;
+using Eigen::Tensor;
+using Eigen::TensorMap;
+
+void test_sycl_cpu(const Eigen::SyclDevice &sycl_device) {
+  // Evaluates several elementwise expressions on sycl_device and compares them with host-side results.
+  int sizeDim1 = 100;
+  int sizeDim2 = 100;
+  int sizeDim3 = 100;
+  array<int, 3> tensorRange = {{sizeDim1, sizeDim2, sizeDim3}};
+  Tensor<float, 3> in1(tensorRange);
+  Tensor<float, 3> in2(tensorRange);
+  Tensor<float, 3> in3(tensorRange);
+  Tensor<float, 3> out(tensorRange);
+  // Only in2 and in3 get host-side contents here; in1 and out are filled by device-to-host copies below.
+  in2 = in2.random();
+  in3 = in3.random();
+  // One device allocation per host tensor, sized in bytes.
+  float * gpu_in1_data  = static_cast<float*>(sycl_device.allocate(in1.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_in2_data  = static_cast<float*>(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_in3_data  = static_cast<float*>(sycl_device.allocate(in3.dimensions().TotalSize()*sizeof(float)));
+  float * gpu_out_data =  static_cast<float*>(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(float)));
+  // TensorMap views over the device allocations so they can appear in tensor expressions.
+  TensorMap<Tensor<float, 3>> gpu_in1(gpu_in1_data, tensorRange);
+  TensorMap<Tensor<float, 3>> gpu_in2(gpu_in2_data, tensorRange);
+  TensorMap<Tensor<float, 3>> gpu_in3(gpu_in3_data, tensorRange);
+  TensorMap<Tensor<float, 3>> gpu_out(gpu_out_data, tensorRange);
+
+  /// a=1.2f
+  gpu_in1.device(sycl_device) = gpu_in1.constant(1.2f);
+  sycl_device.memcpyDeviceToHost(in1.data(), gpu_in1_data ,(in1.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(in1(i,j,k), 1.2f);
+      }
+    }
+  }
+  printf("a=1.2f Test passed\n");
+
+  /// a=b*1.2f
+  gpu_out.device(sycl_device) = gpu_in1 * 1.2f;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data ,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) * 1.2f);
+      }
+    }
+  }
+  printf("a=b*1.2f Test Passed\n");
+
+  /// c=a*b
+  sycl_device.memcpyHostToDevice(gpu_in2_data, in2.data(),(in2.dimensions().TotalSize())*sizeof(float));
+  gpu_out.device(sycl_device) = gpu_in1 * gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) *
+                             in2(i,j,k));
+      }
+    }
+  }
+  printf("c=a*b Test Passed\n");
+
+  /// c=a+b
+  gpu_out.device(sycl_device) = gpu_in1 + gpu_in2;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) +
+                             in2(i,j,k));
+      }
+    }
+  }
+  printf("c=a+b Test Passed\n");
+
+  /// c=a*a
+  gpu_out.device(sycl_device) = gpu_in1 * gpu_in1;
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) *
+                             in1(i,j,k));
+      }
+    }
+  }
+  printf("c= a*a Test Passed\n");
+
+  //a*3.14f + b*2.7f
+  gpu_out.device(sycl_device) =  gpu_in1 * gpu_in1.constant(3.14f) + gpu_in2 * gpu_in2.constant(2.7f);
+  sycl_device.memcpyDeviceToHost(out.data(),gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k),
+                         in1(i,j,k) * 3.14f
+                       + in2(i,j,k) * 2.7f);
+      }
+    }
+  }
+  printf("a*3.14f + b*2.7f Test Passed\n");
+
+  ///d= (a>0.5? b:c) -- a was set to 1.2f everywhere above, so only the b branch is exercised
+  sycl_device.memcpyHostToDevice(gpu_in3_data, in3.data(),(in3.dimensions().TotalSize())*sizeof(float));
+  gpu_out.device(sycl_device) =(gpu_in1 > gpu_in1.constant(0.5f)).select(gpu_in2, gpu_in3);
+  sycl_device.memcpyDeviceToHost(out.data(), gpu_out_data,(out.dimensions().TotalSize())*sizeof(float));
+  for (int i = 0; i < sizeDim1; ++i) {
+    for (int j = 0; j < sizeDim2; ++j) {
+      for (int k = 0; k < sizeDim3; ++k) {
+        VERIFY_IS_APPROX(out(i, j, k), (in1(i, j, k) > 0.5f)
+                                                ? in2(i, j, k)
+                                                : in3(i, j, k));
+      }
+    }
+  }
+  printf("d= (a>0.5? b:c) Test Passed\n");
+  sycl_device.deallocate(gpu_in1_data);  // release the four device allocations made at the top
+  sycl_device.deallocate(gpu_in2_data);
+  sycl_device.deallocate(gpu_in3_data);
+  sycl_device.deallocate(gpu_out_data);
+}
+void test_cxx11_tensor_sycl() {  // Test entry point: builds one SyclDevice and runs the subtest on it.
+  cl::sycl::gpu_selector s;  // NOTE(review): a GPU selector is used although the subtest is named test_sycl_cpu -- confirm intended
+  Eigen::SyclDevice sycl_device(s);
+  CALL_SUBTEST(test_sycl_cpu(sycl_device));
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_symmetry.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_symmetry.cpp
new file mode 100644
index 000000000..d680e9b3b
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_symmetry.cpp
@@ -0,0 +1,818 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+#include <Eigen/CXX11/TensorSymmetry>
+
+#include <map>
+#include <set>
+
+using Eigen::Tensor;
+using Eigen::SGroup;
+using Eigen::DynamicSGroup;
+using Eigen::StaticSGroup;
+using Eigen::Symmetry;
+using Eigen::AntiSymmetry;
+using Eigen::Hermiticity;
+using Eigen::AntiHermiticity;
+
+using Eigen::NegationFlag;
+using Eigen::ConjugationFlag;
+using Eigen::GlobalZeroFlag;
+using Eigen::GlobalRealFlag;
+using Eigen::GlobalImagFlag;
+
+// helper function to determine if the compiler instantiated a static
+// or dynamic symmetry group
+template<typename... Generators>  // overload chosen when the group type is a StaticSGroup
+bool isDynGroup(StaticSGroup<Generators...> const& dummy)
+{
+  static_cast<void>(dummy);  // the argument is only used to select this overload
+  return false;
+}
+
+bool isDynGroup(DynamicSGroup const& dummy)  // overload chosen when the group type is DynamicSGroup
+{
+  static_cast<void>(dummy);  // the argument is only used to select this overload
+  return true;
+}
+
+// helper class for checking that the symmetry groups are correct
+// helper class for checking that the symmetry groups are correct
+struct checkIdx {
+  template<typename ArrType>  // shared implementation behind both run() overloads; returns dummy unchanged
+  static inline int doCheck_(ArrType e, int flags, int dummy, std::set<uint64_t>& found, std::map<uint64_t, int> const& expected)
+  {
+    // use decimal representation of value
+    uint64_t value = e[0];
+    for (std::size_t i = 1; i < e.size(); i++)
+      value = value * 10 + e[i];  // unambiguous only while every entry is a single decimal digit
+
+    // we want to make sure that we find each element
+    auto it = expected.find(value);
+    VERIFY((it != expected.end()));
+    VERIFY_IS_EQUAL(it->second, flags);
+
+    // we want to make sure we only have each element once;
+    // set::insert returns true for the second part of the pair
+    // if the element was really inserted and not already there
+    auto p = found.insert(value);
+    VERIFY((p.second));
+
+    return dummy;
+  }
+  // Entry points for the two index containers used by the tests; both forward to doCheck_.
+  static inline int run(std::vector<int> e, int flags, int dummy, std::set<uint64_t>& found, std::map<uint64_t, int> const& expected)
+  {
+    return doCheck_(e, flags, dummy, found, expected);
+  }
+
+  template<std::size_t N>
+  static inline int run(std::array<int, N> e, int flags, int dummy, std::set<uint64_t>& found, std::map<uint64_t, int> const& expected)
+  {
+    return doCheck_(e, flags, dummy, found, expected);
+  }
+};
+
+static void test_symgroups_static()
+{  // Checks size, global flags and elements of a StaticSGroup built from two generators.
+  std::array<int, 7> identity{{0,1,2,3,4,5,6}};
+
+  // Simple static symmetry group
+  StaticSGroup<
+    AntiSymmetry<0,1>,
+    Hermiticity<0,2>
+  > group;
+  // Expected elements: key = permuted index list written as a decimal number (a leading 0 drops out).
+  std::set<uint64_t> found;
+  std::map<uint64_t, int> expected;
+  expected[ 123456] = 0;
+  expected[1023456] = NegationFlag;
+  expected[2103456] = ConjugationFlag;
+  expected[1203456] = ConjugationFlag | NegationFlag;
+  expected[2013456] = ConjugationFlag | NegationFlag;
+  expected[ 213456] = ConjugationFlag;
+  // apply() presumably invokes checkIdx::run once per group element; found must end up with all six keys.
+  VERIFY_IS_EQUAL(group.size(), 6u);
+  VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+  group.apply<checkIdx, int>(identity, 0, found, expected);
+  VERIFY_IS_EQUAL(found.size(), 6u);
+}
+
+static void test_symgroups_dynamic()
+{  // Same scenario as the static-group test, with the generators added to a DynamicSGroup at run time.
+  std::vector<int> identity;
+  for (int i = 0; i <= 6; i++)
+    identity.push_back(i);
+
+  // Simple dynamic symmetry group
+  DynamicSGroup group;
+  group.add(0,1,NegationFlag);
+  group.add(0,2,ConjugationFlag);
+
+  VERIFY_IS_EQUAL(group.size(), 6u);
+  VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+  // Expected elements: key = permuted index list written as a decimal number (a leading 0 drops out).
+  std::set<uint64_t> found;
+  std::map<uint64_t, int> expected;
+  expected[ 123456] = 0;
+  expected[1023456] = NegationFlag;
+  expected[2103456] = ConjugationFlag;
+  expected[1203456] = ConjugationFlag | NegationFlag;
+  expected[2013456] = ConjugationFlag | NegationFlag;
+  expected[ 213456] = ConjugationFlag;
+  // Size and global flags are checked a second time here, identically to the checks above.
+  VERIFY_IS_EQUAL(group.size(), 6u);
+  VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+  group.apply<checkIdx, int>(identity, 0, found, expected);
+  VERIFY_IS_EQUAL(found.size(), 6u);
+}
+
+static void test_symgroups_selection()
+{
+  std::array<int, 7> identity7{{0,1,2,3,4,5,6}};
+  std::array<int, 10> identity10{{0,1,2,3,4,5,6,7,8,9}};
+
+  {
+    // Do the same test as in test_symgroups_static but
+    // require selection via SGroup
+    SGroup<
+      AntiSymmetry<0,1>,
+      Hermiticity<0,2>
+    > group;
+
+    std::set<uint64_t> found;
+    std::map<uint64_t, int> expected;
+    expected[ 123456] = 0;
+    expected[1023456] = NegationFlag;
+    expected[2103456] = ConjugationFlag;
+    expected[1203456] = ConjugationFlag | NegationFlag;
+    expected[2013456] = ConjugationFlag | NegationFlag;
+    expected[ 213456] = ConjugationFlag;
+
+    VERIFY(!isDynGroup(group));
+    VERIFY_IS_EQUAL(group.size(), 6u);
+    VERIFY_IS_EQUAL(group.globalFlags(), GlobalImagFlag);
+    group.apply<checkIdx, int>(identity7, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 6u);
+  }
+
+  {
+    // simple factorizing group: 5 generators, 2^5 = 32 elements
+    // selection should make this dynamic, although static group
+    // can still be reasonably generated
+    SGroup<
+      Symmetry<0,1>,
+      Symmetry<2,3>,
+      Symmetry<4,5>,
+      Symmetry<6,7>,
+      Symmetry<8,9>
+    > group;
+
+    std::set<uint64_t> found;
+    std::map<uint64_t, int> expected;
+    expected[ 123456789] = 0; expected[ 123456798] = 0; expected[ 123457689] = 0; expected[ 123457698] = 0;
+    expected[ 123546789] = 0; expected[ 123546798] = 0; expected[ 123547689] = 0; expected[ 123547698] = 0;
+    expected[ 132456789] = 0; expected[ 132456798] = 0; expected[ 132457689] = 0; expected[ 132457698] = 0;
+    expected[ 132546789] = 0; expected[ 132546798] = 0; expected[ 132547689] = 0; expected[ 132547698] = 0;
+    expected[1023456789] = 0; expected[1023456798] = 0; expected[1023457689] = 0; expected[1023457698] = 0;
+    expected[1023546789] = 0; expected[1023546798] = 0; expected[1023547689] = 0; expected[1023547698] = 0;
+    expected[1032456789] = 0; expected[1032456798] = 0; expected[1032457689] = 0; expected[1032457698] = 0;
+    expected[1032546789] = 0; expected[1032546798] = 0; expected[1032547689] = 0; expected[1032547698] = 0;
+
+    VERIFY(isDynGroup(group));
+    VERIFY_IS_EQUAL(group.size(), 32u);
+    VERIFY_IS_EQUAL(group.globalFlags(), 0);
+    group.apply<checkIdx, int>(identity10, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 32u);
+
+    // now verify that we could also generate a static group
+    // with these generators
+    found.clear();
+    StaticSGroup<
+      Symmetry<0,1>,
+      Symmetry<2,3>,
+      Symmetry<4,5>,
+      Symmetry<6,7>,
+      Symmetry<8,9>
+    > group_static;
+    VERIFY_IS_EQUAL(group_static.size(), 32u);
+    VERIFY_IS_EQUAL(group_static.globalFlags(), 0);
+    group_static.apply<checkIdx, int>(identity10, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 32u);
+  }
+
+  {
+    // try to create a HUGE group
+    SGroup<
+      Symmetry<0,1>,
+      Symmetry<1,2>,
+      Symmetry<2,3>,
+      Symmetry<3,4>,
+      Symmetry<4,5>,
+      Symmetry<5,6>
+    > group;
+
+    std::set<uint64_t> found;
+    uint64_t pre_expected[5040] = {
+       123456, 1023456,  213456, 2013456, 1203456, 2103456,  132456, 1032456,  312456, 3012456, 1302456, 3102456,
+       231456, 2031456,  321456, 3021456, 2301456, 3201456, 1230456, 2130456, 1320456, 3120456, 2310456, 3210456,
+       124356, 1024356,  214356, 2014356, 1204356, 2104356,  142356, 1042356,  412356, 4012356, 1402356, 4102356,
+       241356, 2041356,  421356, 4021356, 2401356, 4201356, 1240356, 2140356, 1420356, 4120356, 2410356, 4210356,
+       134256, 1034256,  314256, 3014256, 1304256, 3104256,  143256, 1043256,  413256, 4013256, 1403256, 4103256,
+       341256, 3041256,  431256, 4031256, 3401256, 4301256, 1340256, 3140256, 1430256, 4130256, 3410256, 4310256,
+       234156, 2034156,  324156, 3024156, 2304156, 3204156,  243156, 2043156,  423156, 4023156, 2403156, 4203156,
+       342156, 3042156,  432156, 4032156, 3402156, 4302156, 2340156, 3240156, 2430156, 4230156, 3420156, 4320156,
+      1234056, 2134056, 1324056, 3124056, 2314056, 3214056, 1243056, 2143056, 1423056, 4123056, 2413056, 4213056,
+      1342056, 3142056, 1432056, 4132056, 3412056, 4312056, 2341056, 3241056, 2431056, 4231056, 3421056, 4321056,
+       123546, 1023546,  213546, 2013546, 1203546, 2103546,  132546, 1032546,  312546, 3012546, 1302546, 3102546,
+       231546, 2031546,  321546, 3021546, 2301546, 3201546, 1230546, 2130546, 1320546, 3120546, 2310546, 3210546,
+       125346, 1025346,  215346, 2015346, 1205346, 2105346,  152346, 1052346,  512346, 5012346, 1502346, 5102346,
+       251346, 2051346,  521346, 5021346, 2501346, 5201346, 1250346, 2150346, 1520346, 5120346, 2510346, 5210346,
+       135246, 1035246,  315246, 3015246, 1305246, 3105246,  153246, 1053246,  513246, 5013246, 1503246, 5103246,
+       351246, 3051246,  531246, 5031246, 3501246, 5301246, 1350246, 3150246, 1530246, 5130246, 3510246, 5310246,
+       235146, 2035146,  325146, 3025146, 2305146, 3205146,  253146, 2053146,  523146, 5023146, 2503146, 5203146,
+       352146, 3052146,  532146, 5032146, 3502146, 5302146, 2350146, 3250146, 2530146, 5230146, 3520146, 5320146,
+      1235046, 2135046, 1325046, 3125046, 2315046, 3215046, 1253046, 2153046, 1523046, 5123046, 2513046, 5213046,
+      1352046, 3152046, 1532046, 5132046, 3512046, 5312046, 2351046, 3251046, 2531046, 5231046, 3521046, 5321046,
+       124536, 1024536,  214536, 2014536, 1204536, 2104536,  142536, 1042536,  412536, 4012536, 1402536, 4102536,
+       241536, 2041536,  421536, 4021536, 2401536, 4201536, 1240536, 2140536, 1420536, 4120536, 2410536, 4210536,
+       125436, 1025436,  215436, 2015436, 1205436, 2105436,  152436, 1052436,  512436, 5012436, 1502436, 5102436,
+       251436, 2051436,  521436, 5021436, 2501436, 5201436, 1250436, 2150436, 1520436, 5120436, 2510436, 5210436,
+       145236, 1045236,  415236, 4015236, 1405236, 4105236,  154236, 1054236,  514236, 5014236, 1504236, 5104236,
+       451236, 4051236,  541236, 5041236, 4501236, 5401236, 1450236, 4150236, 1540236, 5140236, 4510236, 5410236,
+       245136, 2045136,  425136, 4025136, 2405136, 4205136,  254136, 2054136,  524136, 5024136, 2504136, 5204136,
+       452136, 4052136,  542136, 5042136, 4502136, 5402136, 2450136, 4250136, 2540136, 5240136, 4520136, 5420136,
+      1245036, 2145036, 1425036, 4125036, 2415036, 4215036, 1254036, 2154036, 1524036, 5124036, 2514036, 5214036,
+      1452036, 4152036, 1542036, 5142036, 4512036, 5412036, 2451036, 4251036, 2541036, 5241036, 4521036, 5421036,
+       134526, 1034526,  314526, 3014526, 1304526, 3104526,  143526, 1043526,  413526, 4013526, 1403526, 4103526,
+       341526, 3041526,  431526, 4031526, 3401526, 4301526, 1340526, 3140526, 1430526, 4130526, 3410526, 4310526,
+       135426, 1035426,  315426, 3015426, 1305426, 3105426,  153426, 1053426,  513426, 5013426, 1503426, 5103426,
+       351426, 3051426,  531426, 5031426, 3501426, 5301426, 1350426, 3150426, 1530426, 5130426, 3510426, 5310426,
+       145326, 1045326,  415326, 4015326, 1405326, 4105326,  154326, 1054326,  514326, 5014326, 1504326, 5104326,
+       451326, 4051326,  541326, 5041326, 4501326, 5401326, 1450326, 4150326, 1540326, 5140326, 4510326, 5410326,
+       345126, 3045126,  435126, 4035126, 3405126, 4305126,  354126, 3054126,  534126, 5034126, 3504126, 5304126,
+       453126, 4053126,  543126, 5043126, 4503126, 5403126, 3450126, 4350126, 3540126, 5340126, 4530126, 5430126,
+      1345026, 3145026, 1435026, 4135026, 3415026, 4315026, 1354026, 3154026, 1534026, 5134026, 3514026, 5314026,
+      1453026, 4153026, 1543026, 5143026, 4513026, 5413026, 3451026, 4351026, 3541026, 5341026, 4531026, 5431026,
+       234516, 2034516,  324516, 3024516, 2304516, 3204516,  243516, 2043516,  423516, 4023516, 2403516, 4203516,
+       342516, 3042516,  432516, 4032516, 3402516, 4302516, 2340516, 3240516, 2430516, 4230516, 3420516, 4320516,
+       235416, 2035416,  325416, 3025416, 2305416, 3205416,  253416, 2053416,  523416, 5023416, 2503416, 5203416,
+       352416, 3052416,  532416, 5032416, 3502416, 5302416, 2350416, 3250416, 2530416, 5230416, 3520416, 5320416,
+       245316, 2045316,  425316, 4025316, 2405316, 4205316,  254316, 2054316,  524316, 5024316, 2504316, 5204316,
+       452316, 4052316,  542316, 5042316, 4502316, 5402316, 2450316, 4250316, 2540316, 5240316, 4520316, 5420316,
+       345216, 3045216,  435216, 4035216, 3405216, 4305216,  354216, 3054216,  534216, 5034216, 3504216, 5304216,
+       453216, 4053216,  543216, 5043216, 4503216, 5403216, 3450216, 4350216, 3540216, 5340216, 4530216, 5430216,
+      2345016, 3245016, 2435016, 4235016, 3425016, 4325016, 2354016, 3254016, 2534016, 5234016, 3524016, 5324016,
+      2453016, 4253016, 2543016, 5243016, 4523016, 5423016, 3452016, 4352016, 3542016, 5342016, 4532016, 5432016,
+      1234506, 2134506, 1324506, 3124506, 2314506, 3214506, 1243506, 2143506, 1423506, 4123506, 2413506, 4213506,
+      1342506, 3142506, 1432506, 4132506, 3412506, 4312506, 2341506, 3241506, 2431506, 4231506, 3421506, 4321506,
+      1235406, 2135406, 1325406, 3125406, 2315406, 3215406, 1253406, 2153406, 1523406, 5123406, 2513406, 5213406,
+      1352406, 3152406, 1532406, 5132406, 3512406, 5312406, 2351406, 3251406, 2531406, 5231406, 3521406, 5321406,
+      1245306, 2145306, 1425306, 4125306, 2415306, 4215306, 1254306, 2154306, 1524306, 5124306, 2514306, 5214306,
+      1452306, 4152306, 1542306, 5142306, 4512306, 5412306, 2451306, 4251306, 2541306, 5241306, 4521306, 5421306,
+      1345206, 3145206, 1435206, 4135206, 3415206, 4315206, 1354206, 3154206, 1534206, 5134206, 3514206, 5314206,
+      1453206, 4153206, 1543206, 5143206, 4513206, 5413206, 3451206, 4351206, 3541206, 5341206, 4531206, 5431206,
+      2345106, 3245106, 2435106, 4235106, 3425106, 4325106, 2354106, 3254106, 2534106, 5234106, 3524106, 5324106,
+      2453106, 4253106, 2543106, 5243106, 4523106, 5423106, 3452106, 4352106, 3542106, 5342106, 4532106, 5432106,
+       123465, 1023465,  213465, 2013465, 1203465, 2103465,  132465, 1032465,  312465, 3012465, 1302465, 3102465,
+       231465, 2031465,  321465, 3021465, 2301465, 3201465, 1230465, 2130465, 1320465, 3120465, 2310465, 3210465,
+       124365, 1024365,  214365, 2014365, 1204365, 2104365,  142365, 1042365,  412365, 4012365, 1402365, 4102365,
+       241365, 2041365,  421365, 4021365, 2401365, 4201365, 1240365, 2140365, 1420365, 4120365, 2410365, 4210365,
+       134265, 1034265,  314265, 3014265, 1304265, 3104265,  143265, 1043265,  413265, 4013265, 1403265, 4103265,
+       341265, 3041265,  431265, 4031265, 3401265, 4301265, 1340265, 3140265, 1430265, 4130265, 3410265, 4310265,
+       234165, 2034165,  324165, 3024165, 2304165, 3204165,  243165, 2043165,  423165, 4023165, 2403165, 4203165,
+       342165, 3042165,  432165, 4032165, 3402165, 4302165, 2340165, 3240165, 2430165, 4230165, 3420165, 4320165,
+      1234065, 2134065, 1324065, 3124065, 2314065, 3214065, 1243065, 2143065, 1423065, 4123065, 2413065, 4213065,
+      1342065, 3142065, 1432065, 4132065, 3412065, 4312065, 2341065, 3241065, 2431065, 4231065, 3421065, 4321065,
+       123645, 1023645,  213645, 2013645, 1203645, 2103645,  132645, 1032645,  312645, 3012645, 1302645, 3102645,
+       231645, 2031645,  321645, 3021645, 2301645, 3201645, 1230645, 2130645, 1320645, 3120645, 2310645, 3210645,
+       126345, 1026345,  216345, 2016345, 1206345, 2106345,  162345, 1062345,  612345, 6012345, 1602345, 6102345,
+       261345, 2061345,  621345, 6021345, 2601345, 6201345, 1260345, 2160345, 1620345, 6120345, 2610345, 6210345,
+       136245, 1036245,  316245, 3016245, 1306245, 3106245,  163245, 1063245,  613245, 6013245, 1603245, 6103245,
+       361245, 3061245,  631245, 6031245, 3601245, 6301245, 1360245, 3160245, 1630245, 6130245, 3610245, 6310245,
+       236145, 2036145,  326145, 3026145, 2306145, 3206145,  263145, 2063145,  623145, 6023145, 2603145, 6203145,
+       362145, 3062145,  632145, 6032145, 3602145, 6302145, 2360145, 3260145, 2630145, 6230145, 3620145, 6320145,
+      1236045, 2136045, 1326045, 3126045, 2316045, 3216045, 1263045, 2163045, 1623045, 6123045, 2613045, 6213045,
+      1362045, 3162045, 1632045, 6132045, 3612045, 6312045, 2361045, 3261045, 2631045, 6231045, 3621045, 6321045,
+       124635, 1024635,  214635, 2014635, 1204635, 2104635,  142635, 1042635,  412635, 4012635, 1402635, 4102635,
+       241635, 2041635,  421635, 4021635, 2401635, 4201635, 1240635, 2140635, 1420635, 4120635, 2410635, 4210635,
+       126435, 1026435,  216435, 2016435, 1206435, 2106435,  162435, 1062435,  612435, 6012435, 1602435, 6102435,
+       261435, 2061435,  621435, 6021435, 2601435, 6201435, 1260435, 2160435, 1620435, 6120435, 2610435, 6210435,
+       146235, 1046235,  416235, 4016235, 1406235, 4106235,  164235, 1064235,  614235, 6014235, 1604235, 6104235,
+       461235, 4061235,  641235, 6041235, 4601235, 6401235, 1460235, 4160235, 1640235, 6140235, 4610235, 6410235,
+       246135, 2046135,  426135, 4026135, 2406135, 4206135,  264135, 2064135,  624135, 6024135, 2604135, 6204135,
+       462135, 4062135,  642135, 6042135, 4602135, 6402135, 2460135, 4260135, 2640135, 6240135, 4620135, 6420135,
+      1246035, 2146035, 1426035, 4126035, 2416035, 4216035, 1264035, 2164035, 1624035, 6124035, 2614035, 6214035,
+      1462035, 4162035, 1642035, 6142035, 4612035, 6412035, 2461035, 4261035, 2641035, 6241035, 4621035, 6421035,
+       134625, 1034625,  314625, 3014625, 1304625, 3104625,  143625, 1043625,  413625, 4013625, 1403625, 4103625,
+       341625, 3041625,  431625, 4031625, 3401625, 4301625, 1340625, 3140625, 1430625, 4130625, 3410625, 4310625,
+       136425, 1036425,  316425, 3016425, 1306425, 3106425,  163425, 1063425,  613425, 6013425, 1603425, 6103425,
+       361425, 3061425,  631425, 6031425, 3601425, 6301425, 1360425, 3160425, 1630425, 6130425, 3610425, 6310425,
+       146325, 1046325,  416325, 4016325, 1406325, 4106325,  164325, 1064325,  614325, 6014325, 1604325, 6104325,
+       461325, 4061325,  641325, 6041325, 4601325, 6401325, 1460325, 4160325, 1640325, 6140325, 4610325, 6410325,
+       346125, 3046125,  436125, 4036125, 3406125, 4306125,  364125, 3064125,  634125, 6034125, 3604125, 6304125,
+       463125, 4063125,  643125, 6043125, 4603125, 6403125, 3460125, 4360125, 3640125, 6340125, 4630125, 6430125,
+      1346025, 3146025, 1436025, 4136025, 3416025, 4316025, 1364025, 3164025, 1634025, 6134025, 3614025, 6314025,
+      1463025, 4163025, 1643025, 6143025, 4613025, 6413025, 3461025, 4361025, 3641025, 6341025, 4631025, 6431025,
+       234615, 2034615,  324615, 3024615, 2304615, 3204615,  243615, 2043615,  423615, 4023615, 2403615, 4203615,
+       342615, 3042615,  432615, 4032615, 3402615, 4302615, 2340615, 3240615, 2430615, 4230615, 3420615, 4320615,
+       236415, 2036415,  326415, 3026415, 2306415, 3206415,  263415, 2063415,  623415, 6023415, 2603415, 6203415,
+       362415, 3062415,  632415, 6032415, 3602415, 6302415, 2360415, 3260415, 2630415, 6230415, 3620415, 6320415,
+       246315, 2046315,  426315, 4026315, 2406315, 4206315,  264315, 2064315,  624315, 6024315, 2604315, 6204315,
+       462315, 4062315,  642315, 6042315, 4602315, 6402315, 2460315, 4260315, 2640315, 6240315, 4620315, 6420315,
+       346215, 3046215,  436215, 4036215, 3406215, 4306215,  364215, 3064215,  634215, 6034215, 3604215, 6304215,
+       463215, 4063215,  643215, 6043215, 4603215, 6403215, 3460215, 4360215, 3640215, 6340215, 4630215, 6430215,
+      2346015, 3246015, 2436015, 4236015, 3426015, 4326015, 2364015, 3264015, 2634015, 6234015, 3624015, 6324015,
+      2463015, 4263015, 2643015, 6243015, 4623015, 6423015, 3462015, 4362015, 3642015, 6342015, 4632015, 6432015,
+      1234605, 2134605, 1324605, 3124605, 2314605, 3214605, 1243605, 2143605, 1423605, 4123605, 2413605, 4213605,
+      1342605, 3142605, 1432605, 4132605, 3412605, 4312605, 2341605, 3241605, 2431605, 4231605, 3421605, 4321605,
+      1236405, 2136405, 1326405, 3126405, 2316405, 3216405, 1263405, 2163405, 1623405, 6123405, 2613405, 6213405,
+      1362405, 3162405, 1632405, 6132405, 3612405, 6312405, 2361405, 3261405, 2631405, 6231405, 3621405, 6321405,
+      1246305, 2146305, 1426305, 4126305, 2416305, 4216305, 1264305, 2164305, 1624305, 6124305, 2614305, 6214305,
+      1462305, 4162305, 1642305, 6142305, 4612305, 6412305, 2461305, 4261305, 2641305, 6241305, 4621305, 6421305,
+      1346205, 3146205, 1436205, 4136205, 3416205, 4316205, 1364205, 3164205, 1634205, 6134205, 3614205, 6314205,
+      1463205, 4163205, 1643205, 6143205, 4613205, 6413205, 3461205, 4361205, 3641205, 6341205, 4631205, 6431205,
+      2346105, 3246105, 2436105, 4236105, 3426105, 4326105, 2364105, 3264105, 2634105, 6234105, 3624105, 6324105,
+      2463105, 4263105, 2643105, 6243105, 4623105, 6423105, 3462105, 4362105, 3642105, 6342105, 4632105, 6432105,
+       123564, 1023564,  213564, 2013564, 1203564, 2103564,  132564, 1032564,  312564, 3012564, 1302564, 3102564,
+       231564, 2031564,  321564, 3021564, 2301564, 3201564, 1230564, 2130564, 1320564, 3120564, 2310564, 3210564,
+       125364, 1025364,  215364, 2015364, 1205364, 2105364,  152364, 1052364,  512364, 5012364, 1502364, 5102364,
+       251364, 2051364,  521364, 5021364, 2501364, 5201364, 1250364, 2150364, 1520364, 5120364, 2510364, 5210364,
+       135264, 1035264,  315264, 3015264, 1305264, 3105264,  153264, 1053264,  513264, 5013264, 1503264, 5103264,
+       351264, 3051264,  531264, 5031264, 3501264, 5301264, 1350264, 3150264, 1530264, 5130264, 3510264, 5310264,
+       235164, 2035164,  325164, 3025164, 2305164, 3205164,  253164, 2053164,  523164, 5023164, 2503164, 5203164,
+       352164, 3052164,  532164, 5032164, 3502164, 5302164, 2350164, 3250164, 2530164, 5230164, 3520164, 5320164,
+      1235064, 2135064, 1325064, 3125064, 2315064, 3215064, 1253064, 2153064, 1523064, 5123064, 2513064, 5213064,
+      1352064, 3152064, 1532064, 5132064, 3512064, 5312064, 2351064, 3251064, 2531064, 5231064, 3521064, 5321064,
+       123654, 1023654,  213654, 2013654, 1203654, 2103654,  132654, 1032654,  312654, 3012654, 1302654, 3102654,
+       231654, 2031654,  321654, 3021654, 2301654, 3201654, 1230654, 2130654, 1320654, 3120654, 2310654, 3210654,
+       126354, 1026354,  216354, 2016354, 1206354, 2106354,  162354, 1062354,  612354, 6012354, 1602354, 6102354,
+       261354, 2061354,  621354, 6021354, 2601354, 6201354, 1260354, 2160354, 1620354, 6120354, 2610354, 6210354,
+       136254, 1036254,  316254, 3016254, 1306254, 3106254,  163254, 1063254,  613254, 6013254, 1603254, 6103254,
+       361254, 3061254,  631254, 6031254, 3601254, 6301254, 1360254, 3160254, 1630254, 6130254, 3610254, 6310254,
+       236154, 2036154,  326154, 3026154, 2306154, 3206154,  263154, 2063154,  623154, 6023154, 2603154, 6203154,
+       362154, 3062154,  632154, 6032154, 3602154, 6302154, 2360154, 3260154, 2630154, 6230154, 3620154, 6320154,
+      1236054, 2136054, 1326054, 3126054, 2316054, 3216054, 1263054, 2163054, 1623054, 6123054, 2613054, 6213054,
+      1362054, 3162054, 1632054, 6132054, 3612054, 6312054, 2361054, 3261054, 2631054, 6231054, 3621054, 6321054,
+       125634, 1025634,  215634, 2015634, 1205634, 2105634,  152634, 1052634,  512634, 5012634, 1502634, 5102634,
+       251634, 2051634,  521634, 5021634, 2501634, 5201634, 1250634, 2150634, 1520634, 5120634, 2510634, 5210634,
+       126534, 1026534,  216534, 2016534, 1206534, 2106534,  162534, 1062534,  612534, 6012534, 1602534, 6102534,
+       261534, 2061534,  621534, 6021534, 2601534, 6201534, 1260534, 2160534, 1620534, 6120534, 2610534, 6210534,
+       156234, 1056234,  516234, 5016234, 1506234, 5106234,  165234, 1065234,  615234, 6015234, 1605234, 6105234,
+       561234, 5061234,  651234, 6051234, 5601234, 6501234, 1560234, 5160234, 1650234, 6150234, 5610234, 6510234,
+       256134, 2056134,  526134, 5026134, 2506134, 5206134,  265134, 2065134,  625134, 6025134, 2605134, 6205134,
+       562134, 5062134,  652134, 6052134, 5602134, 6502134, 2560134, 5260134, 2650134, 6250134, 5620134, 6520134,
+      1256034, 2156034, 1526034, 5126034, 2516034, 5216034, 1265034, 2165034, 1625034, 6125034, 2615034, 6215034,
+      1562034, 5162034, 1652034, 6152034, 5612034, 6512034, 2561034, 5261034, 2651034, 6251034, 5621034, 6521034,
+       135624, 1035624,  315624, 3015624, 1305624, 3105624,  153624, 1053624,  513624, 5013624, 1503624, 5103624,
+       351624, 3051624,  531624, 5031624, 3501624, 5301624, 1350624, 3150624, 1530624, 5130624, 3510624, 5310624,
+       136524, 1036524,  316524, 3016524, 1306524, 3106524,  163524, 1063524,  613524, 6013524, 1603524, 6103524,
+       361524, 3061524,  631524, 6031524, 3601524, 6301524, 1360524, 3160524, 1630524, 6130524, 3610524, 6310524,
+       156324, 1056324,  516324, 5016324, 1506324, 5106324,  165324, 1065324,  615324, 6015324, 1605324, 6105324,
+       561324, 5061324,  651324, 6051324, 5601324, 6501324, 1560324, 5160324, 1650324, 6150324, 5610324, 6510324,
+       356124, 3056124,  536124, 5036124, 3506124, 5306124,  365124, 3065124,  635124, 6035124, 3605124, 6305124,
+       563124, 5063124,  653124, 6053124, 5603124, 6503124, 3560124, 5360124, 3650124, 6350124, 5630124, 6530124,
+      1356024, 3156024, 1536024, 5136024, 3516024, 5316024, 1365024, 3165024, 1635024, 6135024, 3615024, 6315024,
+      1563024, 5163024, 1653024, 6153024, 5613024, 6513024, 3561024, 5361024, 3651024, 6351024, 5631024, 6531024,
+       235614, 2035614,  325614, 3025614, 2305614, 3205614,  253614, 2053614,  523614, 5023614, 2503614, 5203614,
+       352614, 3052614,  532614, 5032614, 3502614, 5302614, 2350614, 3250614, 2530614, 5230614, 3520614, 5320614,
+       236514, 2036514,  326514, 3026514, 2306514, 3206514,  263514, 2063514,  623514, 6023514, 2603514, 6203514,
+       362514, 3062514,  632514, 6032514, 3602514, 6302514, 2360514, 3260514, 2630514, 6230514, 3620514, 6320514,
+       256314, 2056314,  526314, 5026314, 2506314, 5206314,  265314, 2065314,  625314, 6025314, 2605314, 6205314,
+       562314, 5062314,  652314, 6052314, 5602314, 6502314, 2560314, 5260314, 2650314, 6250314, 5620314, 6520314,
+       356214, 3056214,  536214, 5036214, 3506214, 5306214,  365214, 3065214,  635214, 6035214, 3605214, 6305214,
+       563214, 5063214,  653214, 6053214, 5603214, 6503214, 3560214, 5360214, 3650214, 6350214, 5630214, 6530214,
+      2356014, 3256014, 2536014, 5236014, 3526014, 5326014, 2365014, 3265014, 2635014, 6235014, 3625014, 6325014,
+      2563014, 5263014, 2653014, 6253014, 5623014, 6523014, 3562014, 5362014, 3652014, 6352014, 5632014, 6532014,
+      1235604, 2135604, 1325604, 3125604, 2315604, 3215604, 1253604, 2153604, 1523604, 5123604, 2513604, 5213604,
+      1352604, 3152604, 1532604, 5132604, 3512604, 5312604, 2351604, 3251604, 2531604, 5231604, 3521604, 5321604,
+      1236504, 2136504, 1326504, 3126504, 2316504, 3216504, 1263504, 2163504, 1623504, 6123504, 2613504, 6213504,
+      1362504, 3162504, 1632504, 6132504, 3612504, 6312504, 2361504, 3261504, 2631504, 6231504, 3621504, 6321504,
+      1256304, 2156304, 1526304, 5126304, 2516304, 5216304, 1265304, 2165304, 1625304, 6125304, 2615304, 6215304,
+      1562304, 5162304, 1652304, 6152304, 5612304, 6512304, 2561304, 5261304, 2651304, 6251304, 5621304, 6521304,
+      1356204, 3156204, 1536204, 5136204, 3516204, 5316204, 1365204, 3165204, 1635204, 6135204, 3615204, 6315204,
+      1563204, 5163204, 1653204, 6153204, 5613204, 6513204, 3561204, 5361204, 3651204, 6351204, 5631204, 6531204,
+      2356104, 3256104, 2536104, 5236104, 3526104, 5326104, 2365104, 3265104, 2635104, 6235104, 3625104, 6325104,
+      2563104, 5263104, 2653104, 6253104, 5623104, 6523104, 3562104, 5362104, 3652104, 6352104, 5632104, 6532104,
+       124563, 1024563,  214563, 2014563, 1204563, 2104563,  142563, 1042563,  412563, 4012563, 1402563, 4102563,
+       241563, 2041563,  421563, 4021563, 2401563, 4201563, 1240563, 2140563, 1420563, 4120563, 2410563, 4210563,
+       125463, 1025463,  215463, 2015463, 1205463, 2105463,  152463, 1052463,  512463, 5012463, 1502463, 5102463,
+       251463, 2051463,  521463, 5021463, 2501463, 5201463, 1250463, 2150463, 1520463, 5120463, 2510463, 5210463,
+       145263, 1045263,  415263, 4015263, 1405263, 4105263,  154263, 1054263,  514263, 5014263, 1504263, 5104263,
+       451263, 4051263,  541263, 5041263, 4501263, 5401263, 1450263, 4150263, 1540263, 5140263, 4510263, 5410263,
+       245163, 2045163,  425163, 4025163, 2405163, 4205163,  254163, 2054163,  524163, 5024163, 2504163, 5204163,
+       452163, 4052163,  542163, 5042163, 4502163, 5402163, 2450163, 4250163, 2540163, 5240163, 4520163, 5420163,
+      1245063, 2145063, 1425063, 4125063, 2415063, 4215063, 1254063, 2154063, 1524063, 5124063, 2514063, 5214063,
+      1452063, 4152063, 1542063, 5142063, 4512063, 5412063, 2451063, 4251063, 2541063, 5241063, 4521063, 5421063,
+       124653, 1024653,  214653, 2014653, 1204653, 2104653,  142653, 1042653,  412653, 4012653, 1402653, 4102653,
+       241653, 2041653,  421653, 4021653, 2401653, 4201653, 1240653, 2140653, 1420653, 4120653, 2410653, 4210653,
+       126453, 1026453,  216453, 2016453, 1206453, 2106453,  162453, 1062453,  612453, 6012453, 1602453, 6102453,
+       261453, 2061453,  621453, 6021453, 2601453, 6201453, 1260453, 2160453, 1620453, 6120453, 2610453, 6210453,
+       146253, 1046253,  416253, 4016253, 1406253, 4106253,  164253, 1064253,  614253, 6014253, 1604253, 6104253,
+       461253, 4061253,  641253, 6041253, 4601253, 6401253, 1460253, 4160253, 1640253, 6140253, 4610253, 6410253,
+       246153, 2046153,  426153, 4026153, 2406153, 4206153,  264153, 2064153,  624153, 6024153, 2604153, 6204153,
+       462153, 4062153,  642153, 6042153, 4602153, 6402153, 2460153, 4260153, 2640153, 6240153, 4620153, 6420153,
+      1246053, 2146053, 1426053, 4126053, 2416053, 4216053, 1264053, 2164053, 1624053, 6124053, 2614053, 6214053,
+      1462053, 4162053, 1642053, 6142053, 4612053, 6412053, 2461053, 4261053, 2641053, 6241053, 4621053, 6421053,
+       125643, 1025643,  215643, 2015643, 1205643, 2105643,  152643, 1052643,  512643, 5012643, 1502643, 5102643,
+       251643, 2051643,  521643, 5021643, 2501643, 5201643, 1250643, 2150643, 1520643, 5120643, 2510643, 5210643,
+       126543, 1026543,  216543, 2016543, 1206543, 2106543,  162543, 1062543,  612543, 6012543, 1602543, 6102543,
+       261543, 2061543,  621543, 6021543, 2601543, 6201543, 1260543, 2160543, 1620543, 6120543, 2610543, 6210543,
+       156243, 1056243,  516243, 5016243, 1506243, 5106243,  165243, 1065243,  615243, 6015243, 1605243, 6105243,
+       561243, 5061243,  651243, 6051243, 5601243, 6501243, 1560243, 5160243, 1650243, 6150243, 5610243, 6510243,
+       256143, 2056143,  526143, 5026143, 2506143, 5206143,  265143, 2065143,  625143, 6025143, 2605143, 6205143,
+       562143, 5062143,  652143, 6052143, 5602143, 6502143, 2560143, 5260143, 2650143, 6250143, 5620143, 6520143,
+      1256043, 2156043, 1526043, 5126043, 2516043, 5216043, 1265043, 2165043, 1625043, 6125043, 2615043, 6215043,
+      1562043, 5162043, 1652043, 6152043, 5612043, 6512043, 2561043, 5261043, 2651043, 6251043, 5621043, 6521043,
+       145623, 1045623,  415623, 4015623, 1405623, 4105623,  154623, 1054623,  514623, 5014623, 1504623, 5104623,
+       451623, 4051623,  541623, 5041623, 4501623, 5401623, 1450623, 4150623, 1540623, 5140623, 4510623, 5410623,
+       146523, 1046523,  416523, 4016523, 1406523, 4106523,  164523, 1064523,  614523, 6014523, 1604523, 6104523,
+       461523, 4061523,  641523, 6041523, 4601523, 6401523, 1460523, 4160523, 1640523, 6140523, 4610523, 6410523,
+       156423, 1056423,  516423, 5016423, 1506423, 5106423,  165423, 1065423,  615423, 6015423, 1605423, 6105423,
+       561423, 5061423,  651423, 6051423, 5601423, 6501423, 1560423, 5160423, 1650423, 6150423, 5610423, 6510423,
+       456123, 4056123,  546123, 5046123, 4506123, 5406123,  465123, 4065123,  645123, 6045123, 4605123, 6405123,
+       564123, 5064123,  654123, 6054123, 5604123, 6504123, 4560123, 5460123, 4650123, 6450123, 5640123, 6540123,
+      1456023, 4156023, 1546023, 5146023, 4516023, 5416023, 1465023, 4165023, 1645023, 6145023, 4615023, 6415023,
+      1564023, 5164023, 1654023, 6154023, 5614023, 6514023, 4561023, 5461023, 4651023, 6451023, 5641023, 6541023,
+       245613, 2045613,  425613, 4025613, 2405613, 4205613,  254613, 2054613,  524613, 5024613, 2504613, 5204613,
+       452613, 4052613,  542613, 5042613, 4502613, 5402613, 2450613, 4250613, 2540613, 5240613, 4520613, 5420613,
+       246513, 2046513,  426513, 4026513, 2406513, 4206513,  264513, 2064513,  624513, 6024513, 2604513, 6204513,
+       462513, 4062513,  642513, 6042513, 4602513, 6402513, 2460513, 4260513, 2640513, 6240513, 4620513, 6420513,
+       256413, 2056413,  526413, 5026413, 2506413, 5206413,  265413, 2065413,  625413, 6025413, 2605413, 6205413,
+       562413, 5062413,  652413, 6052413, 5602413, 6502413, 2560413, 5260413, 2650413, 6250413, 5620413, 6520413,
+       456213, 4056213,  546213, 5046213, 4506213, 5406213,  465213, 4065213,  645213, 6045213, 4605213, 6405213,
+       564213, 5064213,  654213, 6054213, 5604213, 6504213, 4560213, 5460213, 4650213, 6450213, 5640213, 6540213,
+      2456013, 4256013, 2546013, 5246013, 4526013, 5426013, 2465013, 4265013, 2645013, 6245013, 4625013, 6425013,
+      2564013, 5264013, 2654013, 6254013, 5624013, 6524013, 4562013, 5462013, 4652013, 6452013, 5642013, 6542013,
+      1245603, 2145603, 1425603, 4125603, 2415603, 4215603, 1254603, 2154603, 1524603, 5124603, 2514603, 5214603,
+      1452603, 4152603, 1542603, 5142603, 4512603, 5412603, 2451603, 4251603, 2541603, 5241603, 4521603, 5421603,
+      1246503, 2146503, 1426503, 4126503, 2416503, 4216503, 1264503, 2164503, 1624503, 6124503, 2614503, 6214503,
+      1462503, 4162503, 1642503, 6142503, 4612503, 6412503, 2461503, 4261503, 2641503, 6241503, 4621503, 6421503,
+      1256403, 2156403, 1526403, 5126403, 2516403, 5216403, 1265403, 2165403, 1625403, 6125403, 2615403, 6215403,
+      1562403, 5162403, 1652403, 6152403, 5612403, 6512403, 2561403, 5261403, 2651403, 6251403, 5621403, 6521403,
+      1456203, 4156203, 1546203, 5146203, 4516203, 5416203, 1465203, 4165203, 1645203, 6145203, 4615203, 6415203,
+      1564203, 5164203, 1654203, 6154203, 5614203, 6514203, 4561203, 5461203, 4651203, 6451203, 5641203, 6541203,
+      2456103, 4256103, 2546103, 5246103, 4526103, 5426103, 2465103, 4265103, 2645103, 6245103, 4625103, 6425103,
+      2564103, 5264103, 2654103, 6254103, 5624103, 6524103, 4562103, 5462103, 4652103, 6452103, 5642103, 6542103,
+       134562, 1034562,  314562, 3014562, 1304562, 3104562,  143562, 1043562,  413562, 4013562, 1403562, 4103562,
+       341562, 3041562,  431562, 4031562, 3401562, 4301562, 1340562, 3140562, 1430562, 4130562, 3410562, 4310562,
+       135462, 1035462,  315462, 3015462, 1305462, 3105462,  153462, 1053462,  513462, 5013462, 1503462, 5103462,
+       351462, 3051462,  531462, 5031462, 3501462, 5301462, 1350462, 3150462, 1530462, 5130462, 3510462, 5310462,
+       145362, 1045362,  415362, 4015362, 1405362, 4105362,  154362, 1054362,  514362, 5014362, 1504362, 5104362,
+       451362, 4051362,  541362, 5041362, 4501362, 5401362, 1450362, 4150362, 1540362, 5140362, 4510362, 5410362,
+       345162, 3045162,  435162, 4035162, 3405162, 4305162,  354162, 3054162,  534162, 5034162, 3504162, 5304162,
+       453162, 4053162,  543162, 5043162, 4503162, 5403162, 3450162, 4350162, 3540162, 5340162, 4530162, 5430162,
+      1345062, 3145062, 1435062, 4135062, 3415062, 4315062, 1354062, 3154062, 1534062, 5134062, 3514062, 5314062,
+      1453062, 4153062, 1543062, 5143062, 4513062, 5413062, 3451062, 4351062, 3541062, 5341062, 4531062, 5431062,
+       134652, 1034652,  314652, 3014652, 1304652, 3104652,  143652, 1043652,  413652, 4013652, 1403652, 4103652,
+       341652, 3041652,  431652, 4031652, 3401652, 4301652, 1340652, 3140652, 1430652, 4130652, 3410652, 4310652,
+       136452, 1036452,  316452, 3016452, 1306452, 3106452,  163452, 1063452,  613452, 6013452, 1603452, 6103452,
+       361452, 3061452,  631452, 6031452, 3601452, 6301452, 1360452, 3160452, 1630452, 6130452, 3610452, 6310452,
+       146352, 1046352,  416352, 4016352, 1406352, 4106352,  164352, 1064352,  614352, 6014352, 1604352, 6104352,
+       461352, 4061352,  641352, 6041352, 4601352, 6401352, 1460352, 4160352, 1640352, 6140352, 4610352, 6410352,
+       346152, 3046152,  436152, 4036152, 3406152, 4306152,  364152, 3064152,  634152, 6034152, 3604152, 6304152,
+       463152, 4063152,  643152, 6043152, 4603152, 6403152, 3460152, 4360152, 3640152, 6340152, 4630152, 6430152,
+      1346052, 3146052, 1436052, 4136052, 3416052, 4316052, 1364052, 3164052, 1634052, 6134052, 3614052, 6314052,
+      1463052, 4163052, 1643052, 6143052, 4613052, 6413052, 3461052, 4361052, 3641052, 6341052, 4631052, 6431052,
+       135642, 1035642,  315642, 3015642, 1305642, 3105642,  153642, 1053642,  513642, 5013642, 1503642, 5103642,
+       351642, 3051642,  531642, 5031642, 3501642, 5301642, 1350642, 3150642, 1530642, 5130642, 3510642, 5310642,
+       136542, 1036542,  316542, 3016542, 1306542, 3106542,  163542, 1063542,  613542, 6013542, 1603542, 6103542,
+       361542, 3061542,  631542, 6031542, 3601542, 6301542, 1360542, 3160542, 1630542, 6130542, 3610542, 6310542,
+       156342, 1056342,  516342, 5016342, 1506342, 5106342,  165342, 1065342,  615342, 6015342, 1605342, 6105342,
+       561342, 5061342,  651342, 6051342, 5601342, 6501342, 1560342, 5160342, 1650342, 6150342, 5610342, 6510342,
+       356142, 3056142,  536142, 5036142, 3506142, 5306142,  365142, 3065142,  635142, 6035142, 3605142, 6305142,
+       563142, 5063142,  653142, 6053142, 5603142, 6503142, 3560142, 5360142, 3650142, 6350142, 5630142, 6530142,
+      1356042, 3156042, 1536042, 5136042, 3516042, 5316042, 1365042, 3165042, 1635042, 6135042, 3615042, 6315042,
+      1563042, 5163042, 1653042, 6153042, 5613042, 6513042, 3561042, 5361042, 3651042, 6351042, 5631042, 6531042,
+       145632, 1045632,  415632, 4015632, 1405632, 4105632,  154632, 1054632,  514632, 5014632, 1504632, 5104632,
+       451632, 4051632,  541632, 5041632, 4501632, 5401632, 1450632, 4150632, 1540632, 5140632, 4510632, 5410632,
+       146532, 1046532,  416532, 4016532, 1406532, 4106532,  164532, 1064532,  614532, 6014532, 1604532, 6104532,
+       461532, 4061532,  641532, 6041532, 4601532, 6401532, 1460532, 4160532, 1640532, 6140532, 4610532, 6410532,
+       156432, 1056432,  516432, 5016432, 1506432, 5106432,  165432, 1065432,  615432, 6015432, 1605432, 6105432,
+       561432, 5061432,  651432, 6051432, 5601432, 6501432, 1560432, 5160432, 1650432, 6150432, 5610432, 6510432,
+       456132, 4056132,  546132, 5046132, 4506132, 5406132,  465132, 4065132,  645132, 6045132, 4605132, 6405132,
+       564132, 5064132,  654132, 6054132, 5604132, 6504132, 4560132, 5460132, 4650132, 6450132, 5640132, 6540132,
+      1456032, 4156032, 1546032, 5146032, 4516032, 5416032, 1465032, 4165032, 1645032, 6145032, 4615032, 6415032,
+      1564032, 5164032, 1654032, 6154032, 5614032, 6514032, 4561032, 5461032, 4651032, 6451032, 5641032, 6541032,
+       345612, 3045612,  435612, 4035612, 3405612, 4305612,  354612, 3054612,  534612, 5034612, 3504612, 5304612,
+       453612, 4053612,  543612, 5043612, 4503612, 5403612, 3450612, 4350612, 3540612, 5340612, 4530612, 5430612,
+       346512, 3046512,  436512, 4036512, 3406512, 4306512,  364512, 3064512,  634512, 6034512, 3604512, 6304512,
+       463512, 4063512,  643512, 6043512, 4603512, 6403512, 3460512, 4360512, 3640512, 6340512, 4630512, 6430512,
+       356412, 3056412,  536412, 5036412, 3506412, 5306412,  365412, 3065412,  635412, 6035412, 3605412, 6305412,
+       563412, 5063412,  653412, 6053412, 5603412, 6503412, 3560412, 5360412, 3650412, 6350412, 5630412, 6530412,
+       456312, 4056312,  546312, 5046312, 4506312, 5406312,  465312, 4065312,  645312, 6045312, 4605312, 6405312,
+       564312, 5064312,  654312, 6054312, 5604312, 6504312, 4560312, 5460312, 4650312, 6450312, 5640312, 6540312,
+      3456012, 4356012, 3546012, 5346012, 4536012, 5436012, 3465012, 4365012, 3645012, 6345012, 4635012, 6435012,
+      3564012, 5364012, 3654012, 6354012, 5634012, 6534012, 4563012, 5463012, 4653012, 6453012, 5643012, 6543012,
+      1345602, 3145602, 1435602, 4135602, 3415602, 4315602, 1354602, 3154602, 1534602, 5134602, 3514602, 5314602,
+      1453602, 4153602, 1543602, 5143602, 4513602, 5413602, 3451602, 4351602, 3541602, 5341602, 4531602, 5431602,
+      1346502, 3146502, 1436502, 4136502, 3416502, 4316502, 1364502, 3164502, 1634502, 6134502, 3614502, 6314502,
+      1463502, 4163502, 1643502, 6143502, 4613502, 6413502, 3461502, 4361502, 3641502, 6341502, 4631502, 6431502,
+      1356402, 3156402, 1536402, 5136402, 3516402, 5316402, 1365402, 3165402, 1635402, 6135402, 3615402, 6315402,
+      1563402, 5163402, 1653402, 6153402, 5613402, 6513402, 3561402, 5361402, 3651402, 6351402, 5631402, 6531402,
+      1456302, 4156302, 1546302, 5146302, 4516302, 5416302, 1465302, 4165302, 1645302, 6145302, 4615302, 6415302,
+      1564302, 5164302, 1654302, 6154302, 5614302, 6514302, 4561302, 5461302, 4651302, 6451302, 5641302, 6541302,
+      3456102, 4356102, 3546102, 5346102, 4536102, 5436102, 3465102, 4365102, 3645102, 6345102, 4635102, 6435102,
+      3564102, 5364102, 3654102, 6354102, 5634102, 6534102, 4563102, 5463102, 4653102, 6453102, 5643102, 6543102,
+       234561, 2034561,  324561, 3024561, 2304561, 3204561,  243561, 2043561,  423561, 4023561, 2403561, 4203561,
+       342561, 3042561,  432561, 4032561, 3402561, 4302561, 2340561, 3240561, 2430561, 4230561, 3420561, 4320561,
+       235461, 2035461,  325461, 3025461, 2305461, 3205461,  253461, 2053461,  523461, 5023461, 2503461, 5203461,
+       352461, 3052461,  532461, 5032461, 3502461, 5302461, 2350461, 3250461, 2530461, 5230461, 3520461, 5320461,
+       245361, 2045361,  425361, 4025361, 2405361, 4205361,  254361, 2054361,  524361, 5024361, 2504361, 5204361,
+       452361, 4052361,  542361, 5042361, 4502361, 5402361, 2450361, 4250361, 2540361, 5240361, 4520361, 5420361,
+       345261, 3045261,  435261, 4035261, 3405261, 4305261,  354261, 3054261,  534261, 5034261, 3504261, 5304261,
+       453261, 4053261,  543261, 5043261, 4503261, 5403261, 3450261, 4350261, 3540261, 5340261, 4530261, 5430261,
+      2345061, 3245061, 2435061, 4235061, 3425061, 4325061, 2354061, 3254061, 2534061, 5234061, 3524061, 5324061,
+      2453061, 4253061, 2543061, 5243061, 4523061, 5423061, 3452061, 4352061, 3542061, 5342061, 4532061, 5432061,
+       234651, 2034651,  324651, 3024651, 2304651, 3204651,  243651, 2043651,  423651, 4023651, 2403651, 4203651,
+       342651, 3042651,  432651, 4032651, 3402651, 4302651, 2340651, 3240651, 2430651, 4230651, 3420651, 4320651,
+       236451, 2036451,  326451, 3026451, 2306451, 3206451,  263451, 2063451,  623451, 6023451, 2603451, 6203451,
+       362451, 3062451,  632451, 6032451, 3602451, 6302451, 2360451, 3260451, 2630451, 6230451, 3620451, 6320451,
+       246351, 2046351,  426351, 4026351, 2406351, 4206351,  264351, 2064351,  624351, 6024351, 2604351, 6204351,
+       462351, 4062351,  642351, 6042351, 4602351, 6402351, 2460351, 4260351, 2640351, 6240351, 4620351, 6420351,
+       346251, 3046251,  436251, 4036251, 3406251, 4306251,  364251, 3064251,  634251, 6034251, 3604251, 6304251,
+       463251, 4063251,  643251, 6043251, 4603251, 6403251, 3460251, 4360251, 3640251, 6340251, 4630251, 6430251,
+      2346051, 3246051, 2436051, 4236051, 3426051, 4326051, 2364051, 3264051, 2634051, 6234051, 3624051, 6324051,
+      2463051, 4263051, 2643051, 6243051, 4623051, 6423051, 3462051, 4362051, 3642051, 6342051, 4632051, 6432051,
+       235641, 2035641,  325641, 3025641, 2305641, 3205641,  253641, 2053641,  523641, 5023641, 2503641, 5203641,
+       352641, 3052641,  532641, 5032641, 3502641, 5302641, 2350641, 3250641, 2530641, 5230641, 3520641, 5320641,
+       236541, 2036541,  326541, 3026541, 2306541, 3206541,  263541, 2063541,  623541, 6023541, 2603541, 6203541,
+       362541, 3062541,  632541, 6032541, 3602541, 6302541, 2360541, 3260541, 2630541, 6230541, 3620541, 6320541,
+       256341, 2056341,  526341, 5026341, 2506341, 5206341,  265341, 2065341,  625341, 6025341, 2605341, 6205341,
+       562341, 5062341,  652341, 6052341, 5602341, 6502341, 2560341, 5260341, 2650341, 6250341, 5620341, 6520341,
+       356241, 3056241,  536241, 5036241, 3506241, 5306241,  365241, 3065241,  635241, 6035241, 3605241, 6305241,
+       563241, 5063241,  653241, 6053241, 5603241, 6503241, 3560241, 5360241, 3650241, 6350241, 5630241, 6530241,
+      2356041, 3256041, 2536041, 5236041, 3526041, 5326041, 2365041, 3265041, 2635041, 6235041, 3625041, 6325041,
+      2563041, 5263041, 2653041, 6253041, 5623041, 6523041, 3562041, 5362041, 3652041, 6352041, 5632041, 6532041,
+       245631, 2045631,  425631, 4025631, 2405631, 4205631,  254631, 2054631,  524631, 5024631, 2504631, 5204631,
+       452631, 4052631,  542631, 5042631, 4502631, 5402631, 2450631, 4250631, 2540631, 5240631, 4520631, 5420631,
+       246531, 2046531,  426531, 4026531, 2406531, 4206531,  264531, 2064531,  624531, 6024531, 2604531, 6204531,
+       462531, 4062531,  642531, 6042531, 4602531, 6402531, 2460531, 4260531, 2640531, 6240531, 4620531, 6420531,
+       256431, 2056431,  526431, 5026431, 2506431, 5206431,  265431, 2065431,  625431, 6025431, 2605431, 6205431,
+       562431, 5062431,  652431, 6052431, 5602431, 6502431, 2560431, 5260431, 2650431, 6250431, 5620431, 6520431,
+       456231, 4056231,  546231, 5046231, 4506231, 5406231,  465231, 4065231,  645231, 6045231, 4605231, 6405231,
+       564231, 5064231,  654231, 6054231, 5604231, 6504231, 4560231, 5460231, 4650231, 6450231, 5640231, 6540231,
+      2456031, 4256031, 2546031, 5246031, 4526031, 5426031, 2465031, 4265031, 2645031, 6245031, 4625031, 6425031,
+      2564031, 5264031, 2654031, 6254031, 5624031, 6524031, 4562031, 5462031, 4652031, 6452031, 5642031, 6542031,
+       345621, 3045621,  435621, 4035621, 3405621, 4305621,  354621, 3054621,  534621, 5034621, 3504621, 5304621,
+       453621, 4053621,  543621, 5043621, 4503621, 5403621, 3450621, 4350621, 3540621, 5340621, 4530621, 5430621,
+       346521, 3046521,  436521, 4036521, 3406521, 4306521,  364521, 3064521,  634521, 6034521, 3604521, 6304521,
+       463521, 4063521,  643521, 6043521, 4603521, 6403521, 3460521, 4360521, 3640521, 6340521, 4630521, 6430521,
+       356421, 3056421,  536421, 5036421, 3506421, 5306421,  365421, 3065421,  635421, 6035421, 3605421, 6305421,
+       563421, 5063421,  653421, 6053421, 5603421, 6503421, 3560421, 5360421, 3650421, 6350421, 5630421, 6530421,
+       456321, 4056321,  546321, 5046321, 4506321, 5406321,  465321, 4065321,  645321, 6045321, 4605321, 6405321,
+       564321, 5064321,  654321, 6054321, 5604321, 6504321, 4560321, 5460321, 4650321, 6450321, 5640321, 6540321,
+      3456021, 4356021, 3546021, 5346021, 4536021, 5436021, 3465021, 4365021, 3645021, 6345021, 4635021, 6435021,
+      3564021, 5364021, 3654021, 6354021, 5634021, 6534021, 4563021, 5463021, 4653021, 6453021, 5643021, 6543021,
+      2345601, 3245601, 2435601, 4235601, 3425601, 4325601, 2354601, 3254601, 2534601, 5234601, 3524601, 5324601,
+      2453601, 4253601, 2543601, 5243601, 4523601, 5423601, 3452601, 4352601, 3542601, 5342601, 4532601, 5432601,
+      2346501, 3246501, 2436501, 4236501, 3426501, 4326501, 2364501, 3264501, 2634501, 6234501, 3624501, 6324501,
+      2463501, 4263501, 2643501, 6243501, 4623501, 6423501, 3462501, 4362501, 3642501, 6342501, 4632501, 6432501,
+      2356401, 3256401, 2536401, 5236401, 3526401, 5326401, 2365401, 3265401, 2635401, 6235401, 3625401, 6325401,
+      2563401, 5263401, 2653401, 6253401, 5623401, 6523401, 3562401, 5362401, 3652401, 6352401, 5632401, 6532401,
+      2456301, 4256301, 2546301, 5246301, 4526301, 5426301, 2465301, 4265301, 2645301, 6245301, 4625301, 6425301,
+      2564301, 5264301, 2654301, 6254301, 5624301, 6524301, 4562301, 5462301, 4652301, 6452301, 5642301, 6542301,
+      3456201, 4356201, 3546201, 5346201, 4536201, 5436201, 3465201, 4365201, 3645201, 6345201, 4635201, 6435201,
+      3564201, 5364201, 3654201, 6354201, 5634201, 6534201, 4563201, 5463201, 4653201, 6453201, 5643201, 6543201,
+      1234560, 2134560, 1324560, 3124560, 2314560, 3214560, 1243560, 2143560, 1423560, 4123560, 2413560, 4213560,
+      1342560, 3142560, 1432560, 4132560, 3412560, 4312560, 2341560, 3241560, 2431560, 4231560, 3421560, 4321560,
+      1235460, 2135460, 1325460, 3125460, 2315460, 3215460, 1253460, 2153460, 1523460, 5123460, 2513460, 5213460,
+      1352460, 3152460, 1532460, 5132460, 3512460, 5312460, 2351460, 3251460, 2531460, 5231460, 3521460, 5321460,
+      1245360, 2145360, 1425360, 4125360, 2415360, 4215360, 1254360, 2154360, 1524360, 5124360, 2514360, 5214360,
+      1452360, 4152360, 1542360, 5142360, 4512360, 5412360, 2451360, 4251360, 2541360, 5241360, 4521360, 5421360,
+      1345260, 3145260, 1435260, 4135260, 3415260, 4315260, 1354260, 3154260, 1534260, 5134260, 3514260, 5314260,
+      1453260, 4153260, 1543260, 5143260, 4513260, 5413260, 3451260, 4351260, 3541260, 5341260, 4531260, 5431260,
+      2345160, 3245160, 2435160, 4235160, 3425160, 4325160, 2354160, 3254160, 2534160, 5234160, 3524160, 5324160,
+      2453160, 4253160, 2543160, 5243160, 4523160, 5423160, 3452160, 4352160, 3542160, 5342160, 4532160, 5432160,
+      1234650, 2134650, 1324650, 3124650, 2314650, 3214650, 1243650, 2143650, 1423650, 4123650, 2413650, 4213650,
+      1342650, 3142650, 1432650, 4132650, 3412650, 4312650, 2341650, 3241650, 2431650, 4231650, 3421650, 4321650,
+      1236450, 2136450, 1326450, 3126450, 2316450, 3216450, 1263450, 2163450, 1623450, 6123450, 2613450, 6213450,
+      1362450, 3162450, 1632450, 6132450, 3612450, 6312450, 2361450, 3261450, 2631450, 6231450, 3621450, 6321450,
+      1246350, 2146350, 1426350, 4126350, 2416350, 4216350, 1264350, 2164350, 1624350, 6124350, 2614350, 6214350,
+      1462350, 4162350, 1642350, 6142350, 4612350, 6412350, 2461350, 4261350, 2641350, 6241350, 4621350, 6421350,
+      1346250, 3146250, 1436250, 4136250, 3416250, 4316250, 1364250, 3164250, 1634250, 6134250, 3614250, 6314250,
+      1463250, 4163250, 1643250, 6143250, 4613250, 6413250, 3461250, 4361250, 3641250, 6341250, 4631250, 6431250,
+      2346150, 3246150, 2436150, 4236150, 3426150, 4326150, 2364150, 3264150, 2634150, 6234150, 3624150, 6324150,
+      2463150, 4263150, 2643150, 6243150, 4623150, 6423150, 3462150, 4362150, 3642150, 6342150, 4632150, 6432150,
+      1235640, 2135640, 1325640, 3125640, 2315640, 3215640, 1253640, 2153640, 1523640, 5123640, 2513640, 5213640,
+      1352640, 3152640, 1532640, 5132640, 3512640, 5312640, 2351640, 3251640, 2531640, 5231640, 3521640, 5321640,
+      1236540, 2136540, 1326540, 3126540, 2316540, 3216540, 1263540, 2163540, 1623540, 6123540, 2613540, 6213540,
+      1362540, 3162540, 1632540, 6132540, 3612540, 6312540, 2361540, 3261540, 2631540, 6231540, 3621540, 6321540,
+      1256340, 2156340, 1526340, 5126340, 2516340, 5216340, 1265340, 2165340, 1625340, 6125340, 2615340, 6215340,
+      1562340, 5162340, 1652340, 6152340, 5612340, 6512340, 2561340, 5261340, 2651340, 6251340, 5621340, 6521340,
+      1356240, 3156240, 1536240, 5136240, 3516240, 5316240, 1365240, 3165240, 1635240, 6135240, 3615240, 6315240,
+      1563240, 5163240, 1653240, 6153240, 5613240, 6513240, 3561240, 5361240, 3651240, 6351240, 5631240, 6531240,
+      2356140, 3256140, 2536140, 5236140, 3526140, 5326140, 2365140, 3265140, 2635140, 6235140, 3625140, 6325140,
+      2563140, 5263140, 2653140, 6253140, 5623140, 6523140, 3562140, 5362140, 3652140, 6352140, 5632140, 6532140,
+      1245630, 2145630, 1425630, 4125630, 2415630, 4215630, 1254630, 2154630, 1524630, 5124630, 2514630, 5214630,
+      1452630, 4152630, 1542630, 5142630, 4512630, 5412630, 2451630, 4251630, 2541630, 5241630, 4521630, 5421630,
+      1246530, 2146530, 1426530, 4126530, 2416530, 4216530, 1264530, 2164530, 1624530, 6124530, 2614530, 6214530,
+      1462530, 4162530, 1642530, 6142530, 4612530, 6412530, 2461530, 4261530, 2641530, 6241530, 4621530, 6421530,
+      1256430, 2156430, 1526430, 5126430, 2516430, 5216430, 1265430, 2165430, 1625430, 6125430, 2615430, 6215430,
+      1562430, 5162430, 1652430, 6152430, 5612430, 6512430, 2561430, 5261430, 2651430, 6251430, 5621430, 6521430,
+      1456230, 4156230, 1546230, 5146230, 4516230, 5416230, 1465230, 4165230, 1645230, 6145230, 4615230, 6415230,
+      1564230, 5164230, 1654230, 6154230, 5614230, 6514230, 4561230, 5461230, 4651230, 6451230, 5641230, 6541230,
+      2456130, 4256130, 2546130, 5246130, 4526130, 5426130, 2465130, 4265130, 2645130, 6245130, 4625130, 6425130,
+      2564130, 5264130, 2654130, 6254130, 5624130, 6524130, 4562130, 5462130, 4652130, 6452130, 5642130, 6542130,
+      1345620, 3145620, 1435620, 4135620, 3415620, 4315620, 1354620, 3154620, 1534620, 5134620, 3514620, 5314620,
+      1453620, 4153620, 1543620, 5143620, 4513620, 5413620, 3451620, 4351620, 3541620, 5341620, 4531620, 5431620,
+      1346520, 3146520, 1436520, 4136520, 3416520, 4316520, 1364520, 3164520, 1634520, 6134520, 3614520, 6314520,
+      1463520, 4163520, 1643520, 6143520, 4613520, 6413520, 3461520, 4361520, 3641520, 6341520, 4631520, 6431520,
+      1356420, 3156420, 1536420, 5136420, 3516420, 5316420, 1365420, 3165420, 1635420, 6135420, 3615420, 6315420,
+      1563420, 5163420, 1653420, 6153420, 5613420, 6513420, 3561420, 5361420, 3651420, 6351420, 5631420, 6531420,
+      1456320, 4156320, 1546320, 5146320, 4516320, 5416320, 1465320, 4165320, 1645320, 6145320, 4615320, 6415320,
+      1564320, 5164320, 1654320, 6154320, 5614320, 6514320, 4561320, 5461320, 4651320, 6451320, 5641320, 6541320,
+      3456120, 4356120, 3546120, 5346120, 4536120, 5436120, 3465120, 4365120, 3645120, 6345120, 4635120, 6435120,
+      3564120, 5364120, 3654120, 6354120, 5634120, 6534120, 4563120, 5463120, 4653120, 6453120, 5643120, 6543120,
+      2345610, 3245610, 2435610, 4235610, 3425610, 4325610, 2354610, 3254610, 2534610, 5234610, 3524610, 5324610,
+      2453610, 4253610, 2543610, 5243610, 4523610, 5423610, 3452610, 4352610, 3542610, 5342610, 4532610, 5432610,
+      2346510, 3246510, 2436510, 4236510, 3426510, 4326510, 2364510, 3264510, 2634510, 6234510, 3624510, 6324510,
+      2463510, 4263510, 2643510, 6243510, 4623510, 6423510, 3462510, 4362510, 3642510, 6342510, 4632510, 6432510,
+      2356410, 3256410, 2536410, 5236410, 3526410, 5326410, 2365410, 3265410, 2635410, 6235410, 3625410, 6325410,
+      2563410, 5263410, 2653410, 6253410, 5623410, 6523410, 3562410, 5362410, 3652410, 6352410, 5632410, 6532410,
+      2456310, 4256310, 2546310, 5246310, 4526310, 5426310, 2465310, 4265310, 2645310, 6245310, 4625310, 6425310,
+      2564310, 5264310, 2654310, 6254310, 5624310, 6524310, 4562310, 5462310, 4652310, 6452310, 5642310, 6542310,
+      3456210, 4356210, 3546210, 5346210, 4536210, 5436210, 3465210, 4365210, 3645210, 6345210, 4635210, 6435210,
+      3564210, 5364210, 3654210, 6354210, 5634210, 6534210, 4563210, 5463210, 4653210, 6453210, 5643210, 6543210
+    };
+    std::map<uint64_t, int> expected;
+    for (std::size_t i = 0; i < 5040; i++)
+      expected[pre_expected[i]] = 0; // flags are 0, everything is symmetric here
+
+    VERIFY(isDynGroup(group));
+    VERIFY_IS_EQUAL(group.size(), 5040u);
+    VERIFY_IS_EQUAL(group.globalFlags(), 0);
+    group.apply<checkIdx, int>(identity7, 0, found, expected);
+    VERIFY_IS_EQUAL(found.size(), 5040u);
+  }
+}
+
+static void test_tensor_epsilon()
+{
+  // Only entry (0,1,2) is assigned; the loop expects all 27 to match the formula.
+  Tensor<int, 3> epsilon(3,3,3);
+  epsilon.setZero();
+  SGroup<AntiSymmetry<0,1>, AntiSymmetry<1,2>> sym;
+  sym(epsilon, 0, 1, 2) = 1;
+
+  for (int i = 0; i < 3; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        VERIFY_IS_EQUAL((epsilon(i,j,k)), (- (j - i) * (k - j) * (i - k) / 2) );
+      }
+    }
+  }
+}
+
+static void test_tensor_sym()
+{
+  Tensor<int, 4> t(10,10,10,10);
+  t.setZero();
+  SGroup<Symmetry<0,1>, Symmetry<2,3>> sym;
+
+  // Write only the entries with i >= j and k >= l, going through the group.
+  for (int l = 0; l < 10; ++l) {
+    for (int k = l; k < 10; ++k) {
+      for (int j = 0; j < 10; ++j) {
+        for (int i = j; i < 10; ++i) {
+          sym(t, i, j, k, l) = (i + j) * (k + l);
+        }
+      }
+    }
+  }
+
+  // Every index combination, in any order, is expected to hold the formula.
+  for (int l = 0; l < 10; ++l) {
+    for (int k = 0; k < 10; ++k) {
+      for (int j = 0; j < 10; ++j) {
+        for (int i = 0; i < 10; ++i) {
+          VERIFY_IS_EQUAL((t(i, j, k, l)), ((i + j) * (k + l)));
+        }
+      }
+    }
+  }
+}
+
+static void test_tensor_asym()
+{
+  Tensor<int, 4> t(10,10,10,10);
+  t.setZero();
+  SGroup<AntiSymmetry<0,1>, AntiSymmetry<2,3>> sym;
+
+  // Write only the strictly ordered entries (i > j, k > l) through the group.
+  for (int l = 0; l < 10; ++l) {
+    for (int k = l + 1; k < 10; ++k) {
+      for (int j = 0; j < 10; ++j) {
+        for (int i = j + 1; i < 10; ++i) {
+          sym(t, i, j, k, l) = ((i * j) + (k * l));
+        }
+      }
+    }
+  }
+
+  // Check all index combinations, including those never written directly.
+  for (int l = 0; l < 10; ++l) {
+    for (int k = 0; k < 10; ++k) {
+      for (int j = 0; j < 10; ++j) {
+        for (int i = 0; i < 10; ++i) {
+          // A repeated index inside either pair is expected to give zero.
+          if (i == j || k == l)
+            VERIFY_IS_EQUAL((t(i, j, k, l)), 0);
+          // Both pairs ordered the same way: the value keeps its sign.
+          else if ((i < j) == (k < l))
+            VERIFY_IS_EQUAL((t(i, j, k, l)), (((i * j) + (k * l))));
+          // Exactly one pair is swapped: the value is negated.
+          else
+            VERIFY_IS_EQUAL((t(i, j, k, l)), (- ((i * j) + (k * l))));
+        }
+      }
+    }
+  }
+}
+
+static void test_tensor_dynsym()
+{
+  Tensor<int, 4> t(10,10,10,10);
+  t.setZero();
+  // The two pair symmetries (0,1) and (2,3) are registered at run time.
+  DynamicSGroup sym;
+  sym.addSymmetry(0,1);
+  sym.addSymmetry(2,3);
+
+  for (int l = 0; l < 10; ++l) {
+    for (int k = l; k < 10; ++k) {
+      for (int j = 0; j < 10; ++j) {
+        for (int i = j; i < 10; ++i) {
+          sym(t, i, j, k, l) = (i + j) * (k + l);
+        }
+      }
+    }
+  }
+
+  for (int l = 0; l < 10; ++l) {
+    for (int k = 0; k < 10; ++k) {
+      for (int j = 0; j < 10; ++j) {
+        for (int i = 0; i < 10; ++i) {
+          VERIFY_IS_EQUAL((t(i, j, k, l)), ((i + j) * (k + l)));
+        }
+      }
+    }
+  }
+}
+
+static void test_tensor_randacc()
+{
+  Tensor<int, 4> t(10,10,10,10);
+  t.setZero();
+  SGroup<Symmetry<0,1>, Symmetry<2,3>> sym;
+
+  // One million random writes; the check below expects every entry of the
+  // tensor to have been reached by then.
+  for (int n = 0; n < 1000000; ++n) {
+    int i = rand() % 10;
+    int j = rand() % 10;
+    int k = rand() % 10;
+    int l = rand() % 10;
+    // Normalise each pair so that writes always use i >= j and k >= l.
+    if (j > i)
+      std::swap(i, j);
+    if (l > k)
+      std::swap(k, l);
+    sym(t, i, j, k, l) = (i + j) * (k + l);
+  }
+
+  // Read back every index combination, including the orders never written.
+  for (int l = 0; l < 10; ++l) {
+    for (int k = 0; k < 10; ++k) {
+      for (int j = 0; j < 10; ++j) {
+        for (int i = 0; i < 10; ++i) {
+          VERIFY_IS_EQUAL((t(i, j, k, l)), ((i + j) * (k + l)));
+        }
+      }
+    }
+  }
+}
+
+void test_cxx11_tensor_symmetry()
+{ // Group-level subtests are run first, then the tensor-level ones.
+  CALL_SUBTEST(test_symgroups_static());
+  CALL_SUBTEST(test_symgroups_dynamic());
+  CALL_SUBTEST(test_symgroups_selection());
+  CALL_SUBTEST(test_tensor_epsilon());
+  CALL_SUBTEST(test_tensor_sym());
+  CALL_SUBTEST(test_tensor_asym());
+  CALL_SUBTEST(test_tensor_dynsym());
+  CALL_SUBTEST(test_tensor_randacc());
+}
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_thread_pool.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_thread_pool.cpp
new file mode 100644
index 000000000..2ef665f30
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_thread_pool.cpp
@@ -0,0 +1,373 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_USE_THREADS
+
+
+#include "main.h"
+#include <iostream>
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+
+void test_multithread_elementwise()
+{
+  Tensor<float, 3> in1(2,3,7);
+  in1.setRandom();
+  Tensor<float, 3> in2(2,3,7);
+  in2.setRandom();
+
+  // The pool size and the device's second argument are drawn independently.
+  Eigen::ThreadPool tp(internal::random<int>(3, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
+  Tensor<float, 3> out(2,3,7);
+  out.device(thread_pool_device) = in1 + in2 * 3.14f;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
+      }
+    }
+  }
+}
+
+
+void test_multithread_compound_assignment()
+{
+  Tensor<float, 3> in1(2,3,7);
+  in1.setRandom();
+  Tensor<float, 3> in2(2,3,7);
+  in2.setRandom();
+  Eigen::ThreadPool tp(internal::random<int>(3, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(3, 11));
+
+  // Plain assignment followed by += on the same device, checked as one sum.
+  Tensor<float, 3> out(2,3,7);
+  out.device(thread_pool_device) = in1;
+  out.device(thread_pool_device) += in2 * 3.14f;
+
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 3; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        VERIFY_IS_APPROX(out(i,j,k), in1(i,j,k) + in2(i,j,k) * 3.14f);
+      }
+    }
+  }
+}
+
+template<int DataLayout>
+void test_multithread_contraction()
+{
+  // Compares a thread-pool tensor contraction with an ordinary matrix product.
+  Tensor<float, 4, DataLayout> t_left(30, 50, 37, 31);
+  Tensor<float, 5, DataLayout> t_right(37, 31, 70, 2, 10);
+  Tensor<float, 5, DataLayout> t_result(30, 50, 70, 2, 10);
+
+  t_left.setRandom();
+  t_right.setRandom();
+
+  // this contraction should be equivalent to a single matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(2, 0), DimPair(3, 1)}});
+
+  // m_left/m_right map the tensor buffers: 1500 = 30*50, 1147 = 37*31, 1400 = 70*2*10.
+  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 1500, 1147);
+  MapXf m_right(t_right.data(), 1147, 1400);
+  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(1500, 1400);
+  Eigen::ThreadPool tp(4);
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, 4);
+
+  // compute results by separate methods
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  m_result = m_left * m_right;
+
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(&t_result.data()[i] != &m_result.data()[i]);
+    // Accept a small absolute difference, or agreement under Eigen's isApprox.
+    if (fabsf(t_result(i) - m_result(i)) < 1e-4f ||
+        Eigen::internal::isApprox(t_result(i), m_result(i), 1e-4f)) {
+      continue;
+    }
+    std::cout << "mismatch detected at index " << i << ": " << t_result(i)
+              << " vs " <<  m_result(i) << std::endl;
+    VERIFY(false);
+  }
+}
+
+template<int DataLayout>
+void test_contraction_corner_cases()
+{
+  Tensor<float, 2, DataLayout> t_left(32, 500);
+  Tensor<float, 2, DataLayout> t_right(32, 28*28);
+  Tensor<float, 2, DataLayout> t_result(500, 28*28);
+  // Contracts over dimension 0 for four operand shapes; results start as NaN to expose unwritten entries.
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result = t_result.constant(NAN);
+
+  // this contraction should be equivalent to a single matrix multiplication
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims{{DimPair(0, 0)}};
+
+  typedef Map<Matrix<float, Dynamic, Dynamic, DataLayout>> MapXf;
+  MapXf m_left(t_left.data(), 32, 500);
+  MapXf m_right(t_right.data(), 32, 28*28);
+  Matrix<float, Dynamic, Dynamic, DataLayout> m_result(500, 28*28);
+
+  Eigen::ThreadPool tp(12);
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, 12);
+
+  // compute results by separate methods
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  m_result = m_left.transpose() * m_right;
+
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected at index " << i << " : " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      VERIFY(false);
+    }
+  }
+
+  t_left.resize(32, 1);  // case 2: left operand with a single column
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_result.resize (1, 28*28);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 1);  // re-seat the Map on the resized buffer
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      VERIFY(false);
+    }
+  }
+
+  t_left.resize(32, 500);  // case 3: right operand with only four columns
+  t_right.resize(32, 4);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result.resize (500, 4);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 500);
+  new(&m_right) MapXf(t_right.data(), 32, 4);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      VERIFY(false);
+    }
+  }
+
+  t_left.resize(32, 1);  // case 4: both operands narrow (1 and 4 columns)
+  t_right.resize(32, 4);
+  t_left = (t_left.constant(-0.5f) + t_left.random()) * 2.0f;
+  t_right = (t_right.constant(-0.6f) + t_right.random()) * 2.0f;
+  t_result.resize (1, 4);
+  t_result = t_result.constant(NAN);
+  t_result.device(thread_pool_device) = t_left.contract(t_right, dims);
+  new(&m_left) MapXf(t_left.data(), 32, 1);
+  new(&m_right) MapXf(t_right.data(), 32, 4);
+  m_result = m_left.transpose() * m_right;
+  for (ptrdiff_t i = 0; i < t_result.size(); i++) {
+    VERIFY(!(numext::isnan)(t_result.data()[i]));
+    if (fabsf(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) {
+      std::cout << "mismatch detected: " << t_result.data()[i] << " vs " <<  m_result.data()[i] << std::endl;
+      VERIFY(false);
+    }
+  }
+}
+
+template<int DataLayout>
+void test_multithread_contraction_agrees_with_singlethread() {
+  int contract_size = internal::random<int>(1, 5000);
+  // Randomly shaped operands; contract_size is left dimension 1 and right dimension 2.
+  Tensor<float, 3, DataLayout> left(internal::random<int>(1, 80),
+                                    contract_size,
+                                    internal::random<int>(1, 100));
+
+  Tensor<float, 4, DataLayout> right(internal::random<int>(1, 25),
+                                     internal::random<int>(1, 37),
+                                     contract_size,
+                                     internal::random<int>(1, 51));
+
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 1>::DimensionPair DimPair;
+  Eigen::array<DimPair, 1> dims({{DimPair(1, 2)}});
+
+  Eigen::ThreadPool tp(internal::random<int>(2, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
+
+  Tensor<float, 5, DataLayout> st_result;
+  st_result = left.contract(right, dims);  // reference: evaluated without a device
+
+  Tensor<float, 5, DataLayout> tp_result(st_result.dimensions());
+  tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  for (ptrdiff_t i = 0; i < st_result.size(); i++) {
+    // entries whose absolute difference is below 1e-4 are accepted outright;
+    // only larger differences are passed on to VERIFY_IS_APPROX
+    if (numext::abs(st_result.data()[i] - tp_result.data()[i]) >= 1e-4f) {
+      VERIFY_IS_APPROX(st_result.data()[i], tp_result.data()[i]);
+    }
+  }
+}
+
+
+template<int DataLayout>
+void test_full_contraction() {
+  int contract_size1 = internal::random<int>(1, 500);
+  int contract_size2 = internal::random<int>(1, 500);
+  // Both operands have the same shape and both dimensions are contracted, giving a rank-0 result.
+  Tensor<float, 2, DataLayout> left(contract_size1,
+                                    contract_size2);
+  Tensor<float, 2, DataLayout> right(contract_size1,
+                                    contract_size2);
+  left.setRandom();
+  right.setRandom();
+
+  // add constants to shift values away from 0 for more precision
+  left += left.constant(1.5f);
+  right += right.constant(1.5f);
+
+  typedef Tensor<float, 2>::DimensionPair DimPair;
+  Eigen::array<DimPair, 2> dims({{DimPair(0, 0), DimPair(1, 1)}});
+
+  Eigen::ThreadPool tp(internal::random<int>(2, 11));
+  Eigen::ThreadPoolDevice thread_pool_device(&tp, internal::random<int>(2, 11));
+
+  Tensor<float, 0, DataLayout> st_result;
+  st_result = left.contract(right, dims);  // reference: evaluated without a device
+
+  Tensor<float, 0, DataLayout> tp_result;
+  tp_result.device(thread_pool_device) = left.contract(right, dims);
+
+  VERIFY(dimensions_match(st_result.dimensions(), tp_result.dimensions()));
+  // an absolute difference below 1e-4 is accepted outright; only a larger
+  // difference is passed on to VERIFY_IS_APPROX
+  if (numext::abs(st_result() - tp_result()) >= 1e-4f) {
+    VERIFY_IS_APPROX(st_result(), tp_result());
+  }
+}
+
+template<int DataLayout>
+void test_multithreaded_reductions() {
+  const int num_threads = internal::random<int>(3, 11);
+  ThreadPool thread_pool(num_threads);
+  Eigen::ThreadPoolDevice thread_pool_device(&thread_pool, num_threads);
+
+  const int num_rows = internal::random<int>(13, 732);
+  const int num_cols = internal::random<int>(13, 732);
+  Tensor<float, 2, DataLayout> t1(num_rows, num_cols);
+  t1.setRandom();
+
+  Tensor<float, 0, DataLayout> full_redux;
+  full_redux = t1.sum();  // reference: full sum evaluated without a device
+
+  Tensor<float, 0, DataLayout> full_redux_tp;
+  full_redux_tp.device(thread_pool_device) = t1.sum();  // same sum on the thread pool device
+
+  // Check that the single threaded and the multi threaded reductions return
+  // the same result.
+  VERIFY_IS_APPROX(full_redux(), full_redux_tp());
+}
+
+
+void test_memcpy() {
+  // Five rounds, each with a fresh pool and a freshly sized random tensor.
+  for (int i = 0; i < 5; ++i) {
+    const int num_threads = internal::random<int>(3, 11);
+    Eigen::ThreadPool tp(num_threads);
+    Eigen::ThreadPoolDevice thread_pool_device(&tp, num_threads);
+
+    const int size = internal::random<int>(13, 7632);
+    Tensor<float, 1> t1(size);
+    t1.setRandom();
+    std::vector<float> result(size);
+    thread_pool_device.memcpy(result.data(), t1.data(), size*sizeof(float));
+    for (int j = 0; j < size; ++j) {
+      VERIFY_IS_EQUAL(t1(j), result[j]);
+    }
+  }
+}
+
+
+void test_multithread_random()
+{
+  Eigen::ThreadPool tp(2);
+  Eigen::ThreadPoolDevice device(&tp, 2);
+  Tensor<float, 1> t(1 << 20);  // 1 << 20 = 1048576 elements
+  t.device(device) = t.random<Eigen::internal::NormalRandomGenerator<float>>();  // no VERIFY follows: nothing is checked beyond running the fill
+}
+
+template<int DataLayout>
+void test_multithread_shuffle()
+{
+  Tensor<float, 4, DataLayout> tensor(17,5,7,11);
+  tensor.setRandom();
+  // Output dimension d takes input dimension shuffles[d], hence (7,5,11,17).
+  Tensor<float, 4, DataLayout> shuffle(7,5,11,17);
+  array<ptrdiff_t, 4> shuffles = {{2,1,3,0}};
+
+  const int num_threads = internal::random<int>(2, 11);
+  ThreadPool threads(num_threads);
+  Eigen::ThreadPoolDevice device(&threads, num_threads);
+  shuffle.device(device) = tensor.shuffle(shuffles);
+
+  for (int i = 0; i < 17; ++i) {
+    for (int j = 0; j < 5; ++j) {
+      for (int k = 0; k < 7; ++k) {
+        for (int l = 0; l < 11; ++l) {
+          VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,j,l,i));
+        }
+      }
+    }
+  }
+}
+
+
+void test_cxx11_tensor_thread_pool()
+{ // Subtests are spread over the numbered CALL_SUBTEST_1 .. CALL_SUBTEST_6 macros.
+  CALL_SUBTEST_1(test_multithread_elementwise());
+  CALL_SUBTEST_1(test_multithread_compound_assignment());
+
+  CALL_SUBTEST_2(test_multithread_contraction<ColMajor>());
+  CALL_SUBTEST_2(test_multithread_contraction<RowMajor>());
+
+  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<ColMajor>());
+  CALL_SUBTEST_3(test_multithread_contraction_agrees_with_singlethread<RowMajor>());
+
+  // Exercise various cases that have been problematic in the past.
+  CALL_SUBTEST_4(test_contraction_corner_cases<ColMajor>());
+  CALL_SUBTEST_4(test_contraction_corner_cases<RowMajor>());
+
+  CALL_SUBTEST_4(test_full_contraction<ColMajor>());
+  CALL_SUBTEST_4(test_full_contraction<RowMajor>());
+
+  CALL_SUBTEST_5(test_multithreaded_reductions<ColMajor>());
+  CALL_SUBTEST_5(test_multithreaded_reductions<RowMajor>());
+
+  CALL_SUBTEST_6(test_memcpy());
+  CALL_SUBTEST_6(test_multithread_random());  // runs only; it contains no checks
+  CALL_SUBTEST_6(test_multithread_shuffle<ColMajor>());
+  CALL_SUBTEST_6(test_multithread_shuffle<RowMajor>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_uint128.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_uint128.cpp
new file mode 100644
index 000000000..d2a1e8673
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_uint128.cpp
@@ -0,0 +1,160 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+// When EIGEN_COMP_MSVC is set, define EIGEN_NO_INT128; otherwise alias __uint128_t as uint128_t.
+#if EIGEN_COMP_MSVC
+#define EIGEN_NO_INT128
+#else
+typedef __uint128_t uint128_t;
+#endif
+
+// Only run the test on compilers that support 128bit integers natively
+#ifndef EIGEN_NO_INT128
+
+using Eigen::internal::TensorUInt128;
+using Eigen::internal::static_val;
+
+void VERIFY_EQUAL(TensorUInt128<uint64_t, uint64_t> actual, uint128_t expected) {
+  // Split the native 128-bit reference into 64-bit halves and compare each with actual.
+  const uint64_t expected_lo = static_cast<uint64_t>(expected);
+  const uint64_t expected_hi = static_cast<uint64_t>(expected >> 64);
+  if (actual.lower() == expected_lo && actual.upper() == expected_hi) return;
+  std::cerr << "Test " << g_test_stack.back().c_str() << " failed in " << __FILE__
+            << " (" << __LINE__ << ")"
+            << std::endl;
+  // A mismatch is fatal: the process aborts right after the report above.
+  abort();
+}
+
+
+void test_add() {
+  // Sweep pairs of 64-bit words and compare operator+ with native 128-bit addition.
+  const uint64_t step = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t lhs_hi = 0; lhs_hi < 100; ++lhs_hi) {
+    for (uint64_t lhs_lo = 1; lhs_lo < 100 * step; lhs_lo += step) {
+      TensorUInt128<uint64_t, uint64_t> lhs(lhs_hi, lhs_lo);
+      const uint128_t lhs_ref = (static_cast<uint128_t>(lhs_hi) << 64) + static_cast<uint128_t>(lhs_lo);
+      for (uint64_t rhs_hi = 0; rhs_hi < 100; ++rhs_hi) {
+        for (uint64_t rhs_lo = 1; rhs_lo < 100 * step; rhs_lo += step) {
+          TensorUInt128<uint64_t, uint64_t> rhs(rhs_hi, rhs_lo);
+          const uint128_t rhs_ref = (static_cast<uint128_t>(rhs_hi) << 64) + static_cast<uint128_t>(rhs_lo);
+          TensorUInt128<uint64_t, uint64_t> sum = lhs + rhs;
+          VERIFY_EQUAL(sum, lhs_ref + rhs_ref);
+        }
+      }
+    }
+  }
+}
+
+void test_sub() {
+  // Sweep pairs of 64-bit words and compare operator- with native 128-bit subtraction.
+  const uint64_t step = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t lhs_hi = 0; lhs_hi < 100; ++lhs_hi) {
+    for (uint64_t lhs_lo = 1; lhs_lo < 100 * step; lhs_lo += step) {
+      TensorUInt128<uint64_t, uint64_t> lhs(lhs_hi, lhs_lo);
+      const uint128_t lhs_ref = (static_cast<uint128_t>(lhs_hi) << 64) + static_cast<uint128_t>(lhs_lo);
+      for (uint64_t rhs_hi = 0; rhs_hi < 100; ++rhs_hi) {
+        for (uint64_t rhs_lo = 1; rhs_lo < 100 * step; rhs_lo += step) {
+          TensorUInt128<uint64_t, uint64_t> rhs(rhs_hi, rhs_lo);
+          const uint128_t rhs_ref = (static_cast<uint128_t>(rhs_hi) << 64) + static_cast<uint128_t>(rhs_lo);
+          TensorUInt128<uint64_t, uint64_t> difference = lhs - rhs;
+          VERIFY_EQUAL(difference, lhs_ref - rhs_ref);
+        }
+      }
+    }
+  }
+}
+
+void test_mul() {
+  // Sweep pairs of 64-bit words and compare operator* with native 128-bit multiplication.
+  const uint64_t step = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t lhs_hi = 0; lhs_hi < 100; ++lhs_hi) {
+    for (uint64_t lhs_lo = 1; lhs_lo < 100 * step; lhs_lo += step) {
+      TensorUInt128<uint64_t, uint64_t> lhs(lhs_hi, lhs_lo);
+      const uint128_t lhs_ref = (static_cast<uint128_t>(lhs_hi) << 64) + static_cast<uint128_t>(lhs_lo);
+      for (uint64_t rhs_hi = 0; rhs_hi < 100; ++rhs_hi) {
+        for (uint64_t rhs_lo = 1; rhs_lo < 100 * step; rhs_lo += step) {
+          TensorUInt128<uint64_t, uint64_t> rhs(rhs_hi, rhs_lo);
+          const uint128_t rhs_ref = (static_cast<uint128_t>(rhs_hi) << 64) + static_cast<uint128_t>(rhs_lo);
+          TensorUInt128<uint64_t, uint64_t> product = lhs * rhs;
+          VERIFY_EQUAL(product, lhs_ref * rhs_ref);
+        }
+      }
+    }
+  }
+}
+
+void test_div() {
+  // Sweep pairs of 64-bit words and compare operator/ with native division (rhs_lo starts at 1).
+  const uint64_t step = internal::random<uint64_t>(1, 9999999999);
+  for (uint64_t lhs_hi = 0; lhs_hi < 100; ++lhs_hi) {
+    for (uint64_t lhs_lo = 1; lhs_lo < 100 * step; lhs_lo += step) {
+      TensorUInt128<uint64_t, uint64_t> lhs(lhs_hi, lhs_lo);
+      const uint128_t lhs_ref = (static_cast<uint128_t>(lhs_hi) << 64) + static_cast<uint128_t>(lhs_lo);
+      for (uint64_t rhs_hi = 0; rhs_hi < 100; ++rhs_hi) {
+        for (uint64_t rhs_lo = 1; rhs_lo < 100 * step; rhs_lo += step) {
+          TensorUInt128<uint64_t, uint64_t> rhs(rhs_hi, rhs_lo);
+          const uint128_t rhs_ref = (static_cast<uint128_t>(rhs_hi) << 64) + static_cast<uint128_t>(rhs_lo);
+          TensorUInt128<uint64_t, uint64_t> quotient = lhs / rhs;
+          VERIFY_EQUAL(quotient, lhs_ref / rhs_ref);
+        }
+      }
+    }
+  }
+}
+
+void test_misc1() {
+  const uint64_t step = internal::random<uint64_t>(1, 9999999999);  // random stride of the sweep
+  for (uint64_t lhs_lo = 1; lhs_lo < 100 * step; lhs_lo += step) {
+    TensorUInt128<static_val<0>, uint64_t> lhs(0, lhs_lo);
+    const uint128_t lhs_ref = static_cast<uint128_t>(lhs_lo);
+    for (uint64_t rhs_lo = 1; rhs_lo < 100 * step; rhs_lo += step) {
+      TensorUInt128<static_val<0>, uint64_t> rhs(0, rhs_lo);
+      const uint128_t rhs_ref = static_cast<uint128_t>(rhs_lo);
+      uint64_t actual = (lhs * rhs).upper();  // upper word of the product
+      uint64_t expected = (lhs_ref * rhs_ref) >> 64;  // same word from the native product
+      VERIFY_IS_EQUAL(actual, expected);
+    }
+  }
+}
+
+void test_misc2() {
+  const int64_t step = internal::random<int64_t>(1, 100);
+  for (int64_t log_div = 0; log_div < 63; ++log_div) {
+    for (int64_t divider = 1; divider <= 1000000 * step; divider += step) {
+      uint64_t expected = (static_cast<uint128_t>(1) << (64+log_div)) / static_cast<uint128_t>(divider) - (static_cast<uint128_t>(1) << 64) + 1;
+      uint64_t upper_word = 1ULL << log_div;
+      // Same formula as expected, rebuilt from TensorUInt128 operands with static_val words.
+      TensorUInt128<uint64_t, uint64_t> tensor_value = (TensorUInt128<uint64_t, static_val<0> >(upper_word, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+      uint64_t actual = static_cast<uint64_t>(tensor_value);
+      VERIFY_IS_EQUAL(actual, expected);
+    }
+  }
+}
+#endif
+
+// Test entry point; returns immediately when EIGEN_NO_INT128 is defined.
+void test_cxx11_tensor_uint128()
+{
+#ifdef EIGEN_NO_INT128
+  // Skip the test on compilers that don't support 128bit integers natively
+  return;
+#else
+  CALL_SUBTEST_1(test_add());
+  CALL_SUBTEST_2(test_sub());
+  CALL_SUBTEST_3(test_mul());
+  CALL_SUBTEST_4(test_div());
+  CALL_SUBTEST_5(test_misc1());
+  CALL_SUBTEST_6(test_misc2());
+#endif
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_volume_patch.cpp b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_volume_patch.cpp
new file mode 100644
index 000000000..ca6840f3b
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_volume_patch.cpp
@@ -0,0 +1,112 @@
+#include "main.h"
+
+#include <Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+// Checks 1x1x1 volume patches: result dimensions for both layouts, then the raw data buffers.
+static void test_single_voxel_patch()
+{
+  Tensor<float, 5> tensor(4,2,3,5,7);
+  tensor.setRandom();
+  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();
+  // Default-layout result: the patch index dimension (4) has extent 2 * 3 * 5.
+  Tensor<float, 6> single_voxel_patch;
+  single_voxel_patch = tensor.extract_volume_patches(1, 1, 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(0), 4);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(1), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(4), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch.dimension(5), 7);
+  // Row-major result: the same extents are expected in reverse order.
+  Tensor<float, 6, RowMajor> single_voxel_patch_row_major;
+  single_voxel_patch_row_major = tensor_row_major.extract_volume_patches(1, 1, 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(0), 7);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(1), 2 * 3 * 5);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(2), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(3), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(4), 1);
+  VERIFY_IS_EQUAL(single_voxel_patch_row_major.dimension(5), 4);
+  // The inputs and both patch tensors are compared element by element through data().
+  for (int i = 0; i < tensor.size(); ++i) {
+    VERIFY_IS_EQUAL(tensor.data()[i], single_voxel_patch.data()[i]);
+    VERIFY_IS_EQUAL(tensor_row_major.data()[i], single_voxel_patch_row_major.data()[i]);
+    VERIFY_IS_EQUAL(tensor.data()[i], tensor_row_major.data()[i]);
+  }
+}
+
+// Checks volume patches whose size equals the full z/y/x extent of the input, for both layouts.
+static void test_entire_volume_patch()
+{
+  const int depth = 4;
+  const int patch_z = 2;
+  const int patch_y = 3;
+  const int patch_x = 5;
+  const int batch = 7;
+  // The input extents are (depth, patch_z, patch_y, patch_x, batch).
+  Tensor<float, 5> tensor(depth, patch_z, patch_y, patch_x, batch);
+  tensor.setRandom();
+  Tensor<float, 5, RowMajor> tensor_row_major = tensor.swap_layout();
+  // Default-layout extraction: each result dimension is checked individually.
+  Tensor<float, 6> entire_volume_patch;
+  entire_volume_patch = tensor.extract_volume_patches(patch_z, patch_y, patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(0), depth);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(1), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(2), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(3), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(4), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch.dimension(5), batch);
+  // Row-major extraction: the same extents are expected in reverse order.
+  Tensor<float, 6, RowMajor> entire_volume_patch_row_major;
+  entire_volume_patch_row_major = tensor_row_major.extract_volume_patches(patch_z, patch_y, patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(0), batch);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(1), patch_z * patch_y * patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(2), patch_x);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(3), patch_y);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(4), patch_z);
+  VERIFY_IS_EQUAL(entire_volume_patch_row_major.dimension(5), depth);
+  // NOTE(review): forward_pad_* below look like the leading padding of a centered patch; confirm.
+  const int dz = patch_z - 1;
+  const int dy = patch_y - 1;
+  const int dx = patch_x - 1;
+
+  const int forward_pad_z = dz - dz / 2;
+  const int forward_pad_y = dy - dy / 2;
+  const int forward_pad_x = dx - dx / 2;
+  // Each entry must equal the input at the shifted position, or zero when that position is out of range.
+  for (int pz = 0; pz < patch_z; pz++) {
+    for (int py = 0; py < patch_y; py++) {
+      for (int px = 0; px < patch_x; px++) {
+        const int patchId = pz + patch_z * (py + px * patch_y);
+        for (int z = 0; z < patch_z; z++) {
+          for (int y = 0; y < patch_y; y++) {
+            for (int x = 0; x < patch_x; x++) {
+              for (int b = 0; b < batch; b++) {
+                for (int d = 0; d < depth; d++) {
+                  float expected = 0.0f;
+                  float expected_row_major = 0.0f;
+                  const int eff_z = z - forward_pad_z + pz;
+                  const int eff_y = y - forward_pad_y + py;
+                  const int eff_x = x - forward_pad_x + px;
+                  if (eff_z >= 0 && eff_y >= 0 && eff_x >= 0 &&
+                      eff_z < patch_z && eff_y < patch_y && eff_x < patch_x) {
+                    expected = tensor(d, eff_z, eff_y, eff_x, b);
+                    expected_row_major = tensor_row_major(b, eff_x, eff_y, eff_z, d);
+                  }
+                  VERIFY_IS_EQUAL(entire_volume_patch(d, z, y, x, patchId, b), expected);
+                  VERIFY_IS_EQUAL(entire_volume_patch_row_major(b, patchId, x, y, z, d), expected_row_major);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+// Test entry point: runs the single-voxel and the entire-volume patch checks.
+void test_cxx11_tensor_volume_patch()
+{
+  CALL_SUBTEST(test_single_voxel_patch());
+  CALL_SUBTEST(test_entire_volume_patch());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/dgmres.cpp b/uppsrc/plugin/Eigen/unsupported/test/dgmres.cpp
new file mode 100644
index 000000000..2b11807c8
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/dgmres.cpp
@@ -0,0 +1,31 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2012 desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "../../test/sparse_solver.h"
+#include <Eigen/src/IterativeSolvers/DGMRES.h>
+// Runs check_sparse_square_solving on DGMRES with the diagonal and IncompleteLUT preconditioners.
+template<typename T> void test_dgmres_T()
+{
+  DGMRES<SparseMatrix<T>, DiagonalPreconditioner<T> > dgmres_colmajor_diag;
+  DGMRES<SparseMatrix<T>, IdentityPreconditioner    > dgmres_colmajor_I;
+  DGMRES<SparseMatrix<T>, IncompleteLUT<T> >           dgmres_colmajor_ilut;
+  //GMRES<SparseMatrix<T>, SSORPreconditioner<T> >     dgmres_colmajor_ssor;
+  // dgmres_colmajor_I is declared above, but its check is commented out below.
+  CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_diag)  );
+//   CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_I)     );
+  CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_ilut)     );
+  //CALL_SUBTEST( check_sparse_square_solving(dgmres_colmajor_ssor)     );
+}
+// Test entry point: runs the DGMRES checks for double and std::complex<double>.
+void test_dgmres()
+{
+  CALL_SUBTEST_1(test_dgmres_T<double>());
+  CALL_SUBTEST_2(test_dgmres_T<std::complex<double> >());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/forward_adolc.cpp b/uppsrc/plugin/Eigen/unsupported/test/forward_adolc.cpp
new file mode 100644
index 000000000..866db8e86
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/forward_adolc.cpp
@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <Eigen/Dense>
+
+#define NUMBER_DIRECTIONS 16
+#include <unsupported/Eigen/AdolcForward>
+// Scalar-valued expression of p (norm, array sqrt/abs/sin and dot), kept out of line.
+template<typename Vector>
+EIGEN_DONT_INLINE typename Vector::Scalar foo(const Vector& p)
+{
+  const Vector offset(typename Vector::Scalar(-1), typename Vector::Scalar(1.));
+  return (p - offset).norm() + (p.array().sqrt().abs() * p.array().sin()).sum() + p.dot(p);
+}
+// Test functor with up to three inputs/values and a hand-written Jacobian used as reference.
+template<typename _Scalar, int NX=Dynamic, int NY=Dynamic>
+struct TestFunc1
+{
+  typedef _Scalar Scalar;
+  enum {
+    InputsAtCompileTime = NX,
+    ValuesAtCompileTime = NY
+  };
+  typedef Matrix<Scalar,InputsAtCompileTime,1> InputType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,1> ValueType;
+  typedef Matrix<Scalar,ValuesAtCompileTime,InputsAtCompileTime> JacobianType;
+  // Runtime sizes: the compile-time values by default, or those given to the two-argument constructor.
+  int m_inputs, m_values;
+
+  TestFunc1() : m_inputs(InputsAtCompileTime), m_values(ValuesAtCompileTime) {}
+  TestFunc1(int inputs, int values) : m_inputs(inputs), m_values(values) {}
+
+  int inputs() const { return m_inputs; }
+  int values() const { return m_values; }
+  // Writes the function values for x into *_v; x[2] and v[2] are only touched when the sizes exceed 2.
+  template<typename T>
+  void operator() (const Matrix<T,InputsAtCompileTime,1>& x, Matrix<T,ValuesAtCompileTime,1>* _v) const
+  {
+    Matrix<T,ValuesAtCompileTime,1>& v = *_v;
+
+    v[0] = 2 * x[0] * x[0] + x[0] * x[1];
+    v[1] = 3 * x[1] * x[0] + 0.5 * x[1] * x[1];
+    if(inputs()>2)
+    {
+      v[0] += 0.5 * x[2];
+      v[1] += x[2];
+    }
+    if(values()>2)
+    {
+      v[2] = 3 * x[1] * x[0] * x[0];
+    }
+    if (inputs()>2 && values()>2)
+      v[2] *= x[2];
+  }
+  // Writes the values into *v and, when _j is non-null, the analytic Jacobian into *_j.
+  void operator() (const InputType& x, ValueType* v, JacobianType* _j) const
+  {
+    (*this)(x, v);
+
+    if(_j)
+    {
+      JacobianType& j = *_j;
+
+      j(0,0) = 4 * x[0] + x[1];
+      j(1,0) = 3 * x[1];
+
+      j(0,1) = x[0];
+      j(1,1) = 3 * x[0] + 2 * 0.5 * x[1];
+
+      if (inputs()>2)
+      {
+        j(0,2) = 0.5;
+        j(1,2) = 1;
+      }
+      if(values()>2)
+      {
+        j(2,0) = 3 * x[1] * 2 * x[0];
+        j(2,1) = 3 * x[0] * x[0];
+      }
+      if (inputs()>2 && values()>2)
+      {
+        j(2,0) *= x[2];
+        j(2,1) *= x[2];
+
+        // d v[2] / d x[2]: v[2] is linear in x[2], so only the other factors remain.
+        j(2,2) = 3 * x[1] * x[0] * x[0];
+      }
+    }
+  }
+};
+// Compares values and Jacobian from AdolcForwardJacobian<Func> with the functor's own analytic ones.
+template<typename Func> void adolc_forward_jacobian(const Func& f)
+{
+    typename Func::InputType x = Func::InputType::Random(f.inputs());
+    typename Func::ValueType y(f.values()), yref(f.values());
+    typename Func::JacobianType j(f.values(),f.inputs()), jref(f.values(),f.inputs());
+    // Reference pass: call the functor directly, passing the Jacobian output pointer.
+    jref.setZero();
+    yref.setZero();
+    f(x,&yref,&jref);
+//     std::cerr << y.transpose() << "\n\n";;
+//     std::cerr << j << "\n\n";;
+    // Pass under test: the same input evaluated through the AdolcForwardJacobian wrapper.
+    j.setZero();
+    y.setZero();
+    AdolcForwardJacobian<Func> autoj(f);
+    autoj(x, &y, &j);
+//     std::cerr << y.transpose() << "\n\n";;
+//     std::cerr << j << "\n\n";;
+    // Both passes must agree approximately.
+    VERIFY_IS_APPROX(y, yref);
+    VERIFY_IS_APPROX(j, jref);
+}
+// Test entry point: sets the direction count, then checks several TestFunc1 size combinations.
+void test_forward_adolc()
+{
+  adtl::setNumDir(NUMBER_DIRECTIONS);
+  // Fixed-size 2x2 .. 3x3 functors plus one dynamically sized 3x3 functor, repeated g_repeat times.
+  for(int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double,2,2>()) ));
+    CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double,2,3>()) ));
+    CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double,3,2>()) ));
+    CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double,3,3>()) ));
+    CALL_SUBTEST(( adolc_forward_jacobian(TestFunc1<double>(3,3)) ));
+  }
+
+  {
+    // simple instantiation tests
+    Matrix<adtl::adouble,2,1> x;
+    foo(x);
+    Matrix<adtl::adouble,Dynamic,Dynamic> A(4,4);
+    A.selfadjointView<Lower>().eigenvalues();
+  }
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/gmres.cpp b/uppsrc/plugin/Eigen/unsupported/test/gmres.cpp
new file mode 100644
index 000000000..f2969116b
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/gmres.cpp
@@ -0,0 +1,31 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+// Copyright (C) 2012 Kolja Brix <brix@igpm.rwth-aachen.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "../../test/sparse_solver.h"
+#include <Eigen/IterativeSolvers>
+// Runs check_sparse_square_solving on GMRES with the diagonal and IncompleteLUT preconditioners.
+template<typename T> void test_gmres_T()
+{
+  GMRES<SparseMatrix<T>, DiagonalPreconditioner<T> > gmres_colmajor_diag;
+  GMRES<SparseMatrix<T>, IdentityPreconditioner    > gmres_colmajor_I;
+  GMRES<SparseMatrix<T>, IncompleteLUT<T> >           gmres_colmajor_ilut;
+  //GMRES<SparseMatrix<T>, SSORPreconditioner<T> >     gmres_colmajor_ssor;
+  // gmres_colmajor_I is declared above, but its check is commented out below.
+  CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_diag)  );
+//   CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_I)     );
+  CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_ilut)     );
+  //CALL_SUBTEST( check_sparse_square_solving(gmres_colmajor_ssor)     );
+}
+// Test entry point: runs the GMRES checks for double and std::complex<double>.
+void test_gmres()
+{
+  CALL_SUBTEST_1(test_gmres_T<double>());
+  CALL_SUBTEST_2(test_gmres_T<std::complex<double> >());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/kronecker_product.cpp b/uppsrc/plugin/Eigen/unsupported/test/kronecker_product.cpp
new file mode 100644
index 000000000..e770049e5
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/kronecker_product.cpp
@@ -0,0 +1,252 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Kolja Brix <brix@igpm.rwth-aachen.de>
+// Copyright (C) 2011 Andreas Platen <andiplaten@gmx.de>
+// Copyright (C) 2012 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifdef EIGEN_TEST_PART_1
+
+#include "sparse.h"
+#include <Eigen/SparseExtra>
+#include <Eigen/KroneckerProduct>
+// Verifies that ab has exactly the given number of rows and columns.
+template<typename MatrixType>
+void check_dimension(const MatrixType& ab, const int rows,  const int cols)
+{
+  VERIFY_IS_EQUAL(ab.rows(), rows);
+  VERIFY_IS_EQUAL(ab.cols(), cols);
+}
+
+// Verifies that ab is 6x6 with 36 non-zeros and matches the hard-coded reference coefficients.
+template<typename MatrixType>
+void check_kronecker_product(const MatrixType& ab)
+{
+  VERIFY_IS_EQUAL(ab.rows(), 6);
+  VERIFY_IS_EQUAL(ab.cols(), 6);
+  VERIFY_IS_EQUAL(ab.nonZeros(),  36);
+  VERIFY_IS_APPROX(ab.coeff(0,0), -0.4017367630386106);
+  VERIFY_IS_APPROX(ab.coeff(0,1),  0.1056863433932735);
+  VERIFY_IS_APPROX(ab.coeff(0,2), -0.7255206194554212);
+  VERIFY_IS_APPROX(ab.coeff(0,3),  0.1908653336744706);
+  VERIFY_IS_APPROX(ab.coeff(0,4),  0.350864567234111);
+  VERIFY_IS_APPROX(ab.coeff(0,5), -0.0923032108308013);
+  VERIFY_IS_APPROX(ab.coeff(1,0),  0.415417514804677);
+  VERIFY_IS_APPROX(ab.coeff(1,1), -0.2369227701722048);
+  VERIFY_IS_APPROX(ab.coeff(1,2),  0.7502275131458511);
+  VERIFY_IS_APPROX(ab.coeff(1,3), -0.4278731019742696);
+  VERIFY_IS_APPROX(ab.coeff(1,4), -0.3628129162264507);
+  VERIFY_IS_APPROX(ab.coeff(1,5),  0.2069210808481275);
+  VERIFY_IS_APPROX(ab.coeff(2,0),  0.05465890160863986);
+  VERIFY_IS_APPROX(ab.coeff(2,1), -0.2634092511419858);
+  VERIFY_IS_APPROX(ab.coeff(2,2),  0.09871180285793758);
+  VERIFY_IS_APPROX(ab.coeff(2,3), -0.4757066334017702);
+  VERIFY_IS_APPROX(ab.coeff(2,4), -0.04773740823058334);
+  VERIFY_IS_APPROX(ab.coeff(2,5),  0.2300535609645254);
+  VERIFY_IS_APPROX(ab.coeff(3,0), -0.8172945853260133);
+  VERIFY_IS_APPROX(ab.coeff(3,1),  0.2150086428359221);
+  VERIFY_IS_APPROX(ab.coeff(3,2),  0.5825113847292743);
+  VERIFY_IS_APPROX(ab.coeff(3,3), -0.1532433770097174);
+  VERIFY_IS_APPROX(ab.coeff(3,4), -0.329383387282399);
+  VERIFY_IS_APPROX(ab.coeff(3,5),  0.08665207912033064);
+  VERIFY_IS_APPROX(ab.coeff(4,0),  0.8451267514863225);
+  VERIFY_IS_APPROX(ab.coeff(4,1), -0.481996458918977);
+  VERIFY_IS_APPROX(ab.coeff(4,2), -0.6023482390791535);
+  VERIFY_IS_APPROX(ab.coeff(4,3),  0.3435339347164565);
+  VERIFY_IS_APPROX(ab.coeff(4,4),  0.3406002157428891);
+  VERIFY_IS_APPROX(ab.coeff(4,5), -0.1942526344200915);
+  VERIFY_IS_APPROX(ab.coeff(5,0),  0.1111982482925399);
+  VERIFY_IS_APPROX(ab.coeff(5,1), -0.5358806424754169);
+  VERIFY_IS_APPROX(ab.coeff(5,2), -0.07925446559335647);
+  VERIFY_IS_APPROX(ab.coeff(5,3),  0.3819388757769038);
+  VERIFY_IS_APPROX(ab.coeff(5,4),  0.04481475387219876);
+  VERIFY_IS_APPROX(ab.coeff(5,5), -0.2159688616158057);
+}
+
+// Verifies that ab is 12x10 with 3*2 non-zeros located at the six expected positions.
+template<typename MatrixType>
+void check_sparse_kronecker_product(const MatrixType& ab)
+{
+  VERIFY_IS_EQUAL(ab.rows(), 12);
+  VERIFY_IS_EQUAL(ab.cols(), 10);
+  VERIFY_IS_EQUAL(ab.nonZeros(), 3*2);
+  VERIFY_IS_APPROX(ab.coeff(3,0), -0.04);
+  VERIFY_IS_APPROX(ab.coeff(5,1),  0.05);
+  VERIFY_IS_APPROX(ab.coeff(0,6), -0.08);
+  VERIFY_IS_APPROX(ab.coeff(2,7),  0.10);
+  VERIFY_IS_APPROX(ab.coeff(6,8),  0.12);
+  VERIFY_IS_APPROX(ab.coeff(8,9), -0.15);
+}
+
+// Part 1 entry point: checks kroneckerProduct for every dense/sparse operand combination.
+void test_kronecker_product()
+{
+  // DM = dense matrix; SM = sparse matrix
+
+  Matrix<double, 2, 3> DM_a;
+  SparseMatrix<double> SM_a(2,3);
+  SM_a.insert(0,0) = DM_a.coeffRef(0,0) = -0.4461540300782201;
+  SM_a.insert(0,1) = DM_a.coeffRef(0,1) = -0.8057364375283049;
+  SM_a.insert(0,2) = DM_a.coeffRef(0,2) =  0.3896572459516341;
+  SM_a.insert(1,0) = DM_a.coeffRef(1,0) = -0.9076572187376921;
+  SM_a.insert(1,1) = DM_a.coeffRef(1,1) =  0.6469156566545853;
+  SM_a.insert(1,2) = DM_a.coeffRef(1,2) = -0.3658010398782789;
+ 
+  MatrixXd             DM_b(3,2);
+  SparseMatrix<double> SM_b(3,2);
+  SM_b.insert(0,0) = DM_b.coeffRef(0,0) =  0.9004440976767099;
+  SM_b.insert(0,1) = DM_b.coeffRef(0,1) = -0.2368830858139832;
+  SM_b.insert(1,0) = DM_b.coeffRef(1,0) = -0.9311078389941825;
+  SM_b.insert(1,1) = DM_b.coeffRef(1,1) =  0.5310335762980047;
+  SM_b.insert(2,0) = DM_b.coeffRef(2,0) = -0.1225112806872035;
+  SM_b.insert(2,1) = DM_b.coeffRef(2,1) =  0.5903998022741264;
+
+  SparseMatrix<double,RowMajor> SM_row_a(SM_a), SM_row_b(SM_b);
+
+  // test DM_fixedSize = kroneckerProduct(DM_block,DM)
+  Matrix<double, 6, 6> DM_fix_ab = kroneckerProduct(DM_a.topLeftCorner<2,3>(),DM_b);
+
+  CALL_SUBTEST(check_kronecker_product(DM_fix_ab));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(DM_a.topLeftCorner<2,3>(),DM_b)));
+  // Coefficient access on the unevaluated product must agree with the evaluated matrix.
+  for(int i=0;i<DM_fix_ab.rows();++i)
+    for(int j=0;j<DM_fix_ab.cols();++j)
+       VERIFY_IS_APPROX(kroneckerProduct(DM_a,DM_b).coeff(i,j), DM_fix_ab(i,j));
+
+  // test DM_block = kroneckerProduct(DM,DM)
+  MatrixXd DM_block_ab(10,15);
+  DM_block_ab.block<6,6>(2,5) = kroneckerProduct(DM_a,DM_b);
+  CALL_SUBTEST(check_kronecker_product(DM_block_ab.block<6,6>(2,5)));
+
+  // test DM = kroneckerProduct(DM,DM)
+  MatrixXd DM_ab = kroneckerProduct(DM_a,DM_b);
+  CALL_SUBTEST(check_kronecker_product(DM_ab));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(DM_a,DM_b)));
+
+  // test SM = kroneckerProduct(SM,DM)
+  SparseMatrix<double> SM_ab = kroneckerProduct(SM_a,DM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab));
+  SparseMatrix<double,RowMajor> SM_ab2 = kroneckerProduct(SM_a,DM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab2));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(SM_a,DM_b)));
+
+  // test SM = kroneckerProduct(DM,SM)
+  SM_ab.setZero();
+  SM_ab.insert(0,0)=37.0;
+  SM_ab = kroneckerProduct(DM_a,SM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab));
+  SM_ab2.setZero();
+  SM_ab2.insert(0,0)=37.0;
+  SM_ab2 = kroneckerProduct(DM_a,SM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab2));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(DM_a,SM_b)));
+
+  // test SM = kroneckerProduct(SM,SM)
+  SM_ab.resize(2,33);
+  SM_ab.insert(0,0)=37.0;
+  SM_ab = kroneckerProduct(SM_a,SM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab));
+  SM_ab2.resize(5,11);
+  SM_ab2.insert(0,0)=37.0;
+  SM_ab2 = kroneckerProduct(SM_a,SM_b);
+  CALL_SUBTEST(check_kronecker_product(SM_ab2));
+  CALL_SUBTEST(check_kronecker_product(kroneckerProduct(SM_a,SM_b)));
+
+  // test SM = kroneckerProduct(SM,SM) with sparse pattern
+  SM_a.resize(4,5);
+  SM_b.resize(3,2);
+  SM_a.resizeNonZeros(0);
+  SM_b.resizeNonZeros(0);
+  SM_a.insert(1,0) = -0.1;
+  SM_a.insert(0,3) = -0.2;
+  SM_a.insert(2,4) =  0.3;
+  SM_a.finalize();
+  
+  SM_b.insert(0,0) =  0.4;
+  SM_b.insert(2,1) = -0.5;
+  SM_b.finalize();
+  SM_ab.resize(1,1);
+  SM_ab.insert(0,0)=37.0;
+  SM_ab = kroneckerProduct(SM_a,SM_b);
+  CALL_SUBTEST(check_sparse_kronecker_product(SM_ab));
+
+  // test dimension of result of DM = kroneckerProduct(DM,DM)
+  MatrixXd DM_a2(2,1);
+  MatrixXd DM_b2(5,4);
+  MatrixXd DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
+  CALL_SUBTEST(check_dimension(DM_ab2,2*5,1*4));
+  DM_a2.resize(10,9);
+  DM_b2.resize(4,8);
+  DM_ab2 = kroneckerProduct(DM_a2,DM_b2);
+  CALL_SUBTEST(check_dimension(DM_ab2,10*4,9*8));
+  // Randomized float matrices: each sparse/mixed product is compared with the dense product.
+  for(int i = 0; i < g_repeat; i++)
+  {
+    double density = Eigen::internal::random<double>(0.01,0.5);
+    int ra = Eigen::internal::random<int>(1,50);
+    int ca = Eigen::internal::random<int>(1,50);
+    int rb = Eigen::internal::random<int>(1,50);
+    int cb = Eigen::internal::random<int>(1,50);
+    SparseMatrix<float,ColMajor> sA(ra,ca), sB(rb,cb), sC;
+    SparseMatrix<float,RowMajor> sC2;
+    MatrixXf dA(ra,ca), dB(rb,cb), dC;
+    initSparse(density, dA, sA);
+    initSparse(density, dB, sB);
+    
+    sC = kroneckerProduct(sA,sB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+    
+    sC = kroneckerProduct(sA.transpose(),sB);
+    dC = kroneckerProduct(dA.transpose(),dB);
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+    
+    sC = kroneckerProduct(sA.transpose(),sB.transpose());
+    dC = kroneckerProduct(dA.transpose(),dB.transpose());
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+    
+    sC = kroneckerProduct(sA,sB.transpose());
+    dC = kroneckerProduct(dA,dB.transpose());
+    VERIFY_IS_APPROX(MatrixXf(sC),dC);
+    
+    sC2 = kroneckerProduct(sA,sB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+    
+    sC2 = kroneckerProduct(dA,sB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+    
+    sC2 = kroneckerProduct(sA,dB);
+    dC = kroneckerProduct(dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+    
+    sC2 = kroneckerProduct(2*sA,sB);
+    dC = kroneckerProduct(2*dA,dB);
+    VERIFY_IS_APPROX(MatrixXf(sC2),dC);
+  }
+}
+
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+
+// simply check that for a dense kronecker product, sparse module is not needed
+
+#include "main.h"
+#include <Eigen/KroneckerProduct>
+// Part 2 entry point: the 3x3 block at (3,3) of kroneckerProduct(a,b) must equal a(1,1)*b.
+void test_kronecker_product()
+{
+  MatrixXd a(2,2), b(3,3), c;
+  a.setRandom();
+  b.setRandom();
+  c = kroneckerProduct(a,b);
+  VERIFY_IS_APPROX(c.block(3,3,3,3), a(1,1)*b);
+}
+
+#endif
diff --git a/uppsrc/plugin/Eigen/unsupported/test/levenberg_marquardt.cpp b/uppsrc/plugin/Eigen/unsupported/test/levenberg_marquardt.cpp
new file mode 100644
index 000000000..64f168c16
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/levenberg_marquardt.cpp
@@ -0,0 +1,1477 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Thomas Capricelli <orzel@freehackers.org>
+// Copyright (C) 2012 desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+// FIXME: These tests all check for hard-coded values. Ideally, parameters and start estimates should be randomized.
+
+
+#include <stdio.h>
+
+#include "main.h"
+#include <unsupported/Eigen/LevenbergMarquardt>
+
+// This disables some useless Warnings on MSVC.
+// It is intended to be done for this test only.
+#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+
+using std::sqrt;
+
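+// tolerance for checking number of iterations; keep the macro unparenthesized: `N * LM_EVAL_COUNT_TOL` expands to `N * 4/3`, i.e. (N*4)/3 in integer arithmetic, whereas `(4/3)` would be 1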
+#define LM_EVAL_COUNT_TOL 4/3
+
+struct lmder_functor : DenseFunctor<double>
+{
+    lmder_functor(void): DenseFunctor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const
+    {
+        double tmp1, tmp2, tmp3;
+        static const double y[15] = {1.4e-1, 1.8e-1, 2.2e-1, 2.5e-1, 2.9e-1, 3.2e-1, 3.5e-1,
+            3.9e-1, 3.7e-1, 5.8e-1, 7.3e-1, 9.6e-1, 1.34, 2.1, 4.39};
+
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &x, MatrixXd &fjac) const
+    {
+        double tmp1, tmp2, tmp3, tmp4;
+        for (int i = 0; i < values(); i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 16 - i - 1;
+            tmp3 = (i>=8)? tmp2 : tmp1;
+            tmp4 = (x[1]*tmp2 + x[2]*tmp3); tmp4 = tmp4*tmp4;
+            fjac(i,0) = -1;
+            fjac(i,1) = tmp1*tmp2/tmp4;
+            fjac(i,2) = tmp1*tmp3/tmp4;
+        }
+        return 0;
+    }
+};
+
+void testLmder1()
+{
+  int n=3, info;
+
+  VectorXd x;
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmder_functor functor;
+  LevenbergMarquardt<lmder_functor> lm(functor);
+  info = lm.lmder1(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 6);
+  VERIFY_IS_EQUAL(lm.njev(), 5);
+
+  // check norm
+  VERIFY_IS_APPROX(lm.fvec().blueNorm(), 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+}
+
+void testLmder()
+{
+  const int m=15, n=3;
+  int info;
+  double fnorm, covfac;
+  VectorXd x;
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmder_functor functor;
+  LevenbergMarquardt<lmder_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return values
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 6);
+  VERIFY_IS_EQUAL(lm.njev(), 5);
+
+  // check norm
+  fnorm = lm.fvec().blueNorm();
+  VERIFY_IS_APPROX(fnorm, 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+
+  // check covariance
+  covfac = fnorm*fnorm/(m-n);
+  internal::covar(lm.matrixR(), lm.permutation().indices()); // TODO : move this as a function of lm
+
+  MatrixXd cov_ref(n,n);
+  cov_ref <<
+      0.0001531202,   0.002869941,  -0.002656662,
+      0.002869941,    0.09480935,   -0.09098995,
+      -0.002656662,   -0.09098995,    0.08778727;
+
+//  std::cout << fjac*covfac << std::endl;
+
+  MatrixXd cov;
+  cov =  covfac*lm.matrixR().topLeftCorner<n,n>();
+  VERIFY_IS_APPROX( cov, cov_ref);
+  // TODO: why isn't this allowed ? :
+  // VERIFY_IS_APPROX( covfac*fjac.topLeftCorner<n,n>() , cov_ref);
+}
+
+struct lmdif_functor : DenseFunctor<double>
+{
+    lmdif_functor(void) : DenseFunctor<double>(3,15) {}
+    int operator()(const VectorXd &x, VectorXd &fvec) const
+    {
+        int i;
+        double tmp1,tmp2,tmp3;
+        static const double y[15]={1.4e-1,1.8e-1,2.2e-1,2.5e-1,2.9e-1,3.2e-1,3.5e-1,3.9e-1,
+            3.7e-1,5.8e-1,7.3e-1,9.6e-1,1.34e0,2.1e0,4.39e0};
+
+        assert(x.size()==3);
+        assert(fvec.size()==15);
+        for (i=0; i<15; i++)
+        {
+            tmp1 = i+1;
+            tmp2 = 15 - i;
+            tmp3 = tmp1;
+
+            if (i >= 8) tmp3 = tmp2;
+            fvec[i] = y[i] - (x[0] + tmp1/(x[1]*tmp2 + x[2]*tmp3));
+        }
+        return 0;
+    }
+};
+
+void testLmdif1()
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n), fvec(15);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmdif_functor functor;
+  DenseIndex nfev;
+  info = LevenbergMarquardt<lmdif_functor>::lmdif1(functor, x, &nfev);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+//   VERIFY_IS_EQUAL(nfev, 26);
+
+  // check norm
+  functor(x, fvec);
+  VERIFY_IS_APPROX(fvec.blueNorm(), 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.0824106, 1.1330366, 2.3436947;
+  VERIFY_IS_APPROX(x, x_ref);
+
+}
+
+void testLmdif()
+{
+  const int m=15, n=3;
+  int info;
+  double fnorm, covfac;
+  VectorXd x(n);
+
+  /* the following starting values provide a rough fit. */
+  x.setConstant(n, 1.);
+
+  // do the computation
+  lmdif_functor functor;
+  NumericalDiff<lmdif_functor> numDiff(functor);
+  LevenbergMarquardt<NumericalDiff<lmdif_functor> > lm(numDiff);
+  info = lm.minimize(x);
+
+  // check return values
+  VERIFY_IS_EQUAL(info, 1);
+//   VERIFY_IS_EQUAL(lm.nfev(), 26);
+
+  // check norm
+  fnorm = lm.fvec().blueNorm();
+  VERIFY_IS_APPROX(fnorm, 0.09063596);
+
+  // check x
+  VectorXd x_ref(n);
+  x_ref << 0.08241058, 1.133037, 2.343695;
+  VERIFY_IS_APPROX(x, x_ref);
+
+  // check covariance
+  covfac = fnorm*fnorm/(m-n);
+  internal::covar(lm.matrixR(), lm.permutation().indices()); // TODO : move this as a function of lm
+
+  MatrixXd cov_ref(n,n);
+  cov_ref <<
+      0.0001531202,   0.002869942,  -0.002656662,
+      0.002869942,    0.09480937,   -0.09098997,
+      -0.002656662,   -0.09098997,    0.08778729;
+
+//  std::cout << fjac*covfac << std::endl;
+
+  MatrixXd cov;
+  cov =  covfac*lm.matrixR().topLeftCorner<n,n>();
+  VERIFY_IS_APPROX( cov, cov_ref);
+  // TODO: why isn't this allowed ? :
+  // VERIFY_IS_APPROX( covfac*fjac.topLeftCorner<n,n>() , cov_ref);
+}
+
+struct chwirut2_functor : DenseFunctor<double>
+{
+    chwirut2_functor(void) : DenseFunctor<double>(3,54) {}
+    static const double m_x[54];
+    static const double m_y[54];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        int i;
+
+        assert(b.size()==3);
+        assert(fvec.size()==54);
+        for(i=0; i<54; i++) {
+            double x = m_x[i];
+            fvec[i] = exp(-b[0]*x)/(b[1]+b[2]*x) - m_y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==54);
+        assert(fjac.cols()==3);
+        for(int i=0; i<54; i++) {
+            double x = m_x[i];
+            double factor = 1./(b[1]+b[2]*x);
+            double e = exp(-b[0]*x);
+            fjac(i,0) = -x*e*factor;
+            fjac(i,1) = -e*factor*factor;
+            fjac(i,2) = -x*e*factor*factor;
+        }
+        return 0;
+    }
+};
+const double chwirut2_functor::m_x[54] = { 0.500E0, 1.000E0, 1.750E0, 3.750E0, 5.750E0, 0.875E0, 2.250E0, 3.250E0, 5.250E0, 0.750E0, 1.750E0, 2.750E0, 4.750E0, 0.625E0, 1.250E0, 2.250E0, 4.250E0, .500E0, 3.000E0, .750E0, 3.000E0, 1.500E0, 6.000E0, 3.000E0, 6.000E0, 1.500E0, 3.000E0, .500E0, 2.000E0, 4.000E0, .750E0, 2.000E0, 5.000E0, .750E0, 2.250E0, 3.750E0, 5.750E0, 3.000E0, .750E0, 2.500E0, 4.000E0, .750E0, 2.500E0, 4.000E0, .750E0, 2.500E0, 4.000E0, .500E0, 6.000E0, 3.000E0, .500E0, 2.750E0, .500E0, 1.750E0};
+const double chwirut2_functor::m_y[54] = { 92.9000E0 ,57.1000E0 ,31.0500E0 ,11.5875E0 ,8.0250E0 ,63.6000E0 ,21.4000E0 ,14.2500E0 ,8.4750E0 ,63.8000E0 ,26.8000E0 ,16.4625E0 ,7.1250E0 ,67.3000E0 ,41.0000E0 ,21.1500E0 ,8.1750E0 ,81.5000E0 ,13.1200E0 ,59.9000E0 ,14.6200E0 ,32.9000E0 ,5.4400E0 ,12.5600E0 ,5.4400E0 ,32.0000E0 ,13.9500E0 ,75.8000E0 ,20.0000E0 ,10.4200E0 ,59.5000E0 ,21.6700E0 ,8.5500E0 ,62.0000E0 ,20.2000E0 ,7.7600E0 ,3.7500E0 ,11.8100E0 ,54.7000E0 ,23.7000E0 ,11.5500E0 ,61.3000E0 ,17.7000E0 ,8.7400E0 ,59.2000E0 ,16.3000E0 ,8.6200E0 ,81.0000E0 ,4.8700E0 ,14.6200E0 ,81.7000E0 ,17.1700E0 ,81.3000E0 ,28.9000E0  };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/chwirut2.shtml
+void testNistChwirut2(void)
+{
+  const int n=3;
+  LevenbergMarquardtSpace::Status info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 0.1, 0.01, 0.02;
+  // do the computation
+  chwirut2_functor functor;
+  LevenbergMarquardt<chwirut2_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+//   VERIFY_IS_EQUAL(lm.nfev(), 10);
+  VERIFY_IS_EQUAL(lm.njev(), 8);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.1304802941E+02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.6657666537E-01);
+  VERIFY_IS_APPROX(x[1], 5.1653291286E-03);
+  VERIFY_IS_APPROX(x[2], 1.2150007096E-02);
+
+  /*
+   * Second try
+   */
+  x<< 0.15, 0.008, 0.010;
+  // do the computation
+  lm.resetParameters();
+  lm.setFtol(1.E6*NumTraits<double>::epsilon());
+  lm.setXtol(1.E6*NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+//   VERIFY_IS_EQUAL(lm.nfev(), 7);
+  VERIFY_IS_EQUAL(lm.njev(), 6);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.1304802941E+02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.6657666537E-01);
+  VERIFY_IS_APPROX(x[1], 5.1653291286E-03);
+  VERIFY_IS_APPROX(x[2], 1.2150007096E-02);
+}
+
+
+struct misra1a_functor : DenseFunctor<double>
+{
+    misra1a_functor(void) : DenseFunctor<double>(2,14) {}
+    static const double m_x[14];
+    static const double m_y[14];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==2);
+        assert(fvec.size()==14);
+        for(int i=0; i<14; i++) {
+            fvec[i] = b[0]*(1.-exp(-b[1]*m_x[i])) - m_y[i] ;
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==14);
+        assert(fjac.cols()==2);
+        for(int i=0; i<14; i++) {
+            fjac(i,0) = (1.-exp(-b[1]*m_x[i]));
+            fjac(i,1) = (b[0]*m_x[i]*exp(-b[1]*m_x[i]));
+        }
+        return 0;
+    }
+};
+const double misra1a_functor::m_x[14] = { 77.6E0, 114.9E0, 141.1E0, 190.8E0, 239.9E0, 289.0E0, 332.8E0, 378.4E0, 434.8E0, 477.3E0, 536.8E0, 593.1E0, 689.1E0, 760.0E0};
+const double misra1a_functor::m_y[14] = { 10.07E0, 14.73E0, 17.94E0, 23.93E0, 29.61E0, 35.18E0, 40.02E0, 44.82E0, 50.76E0, 55.05E0, 61.01E0, 66.40E0, 75.47E0, 81.78E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/misra1a.shtml
+void testNistMisra1a(void)
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 500., 0.0001;
+  // do the computation
+  misra1a_functor functor;
+  LevenbergMarquardt<misra1a_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 19);
+  VERIFY_IS_EQUAL(lm.njev(), 15);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.2455138894E-01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.3894212918E+02);
+  VERIFY_IS_APPROX(x[1], 5.5015643181E-04);
+
+  /*
+   * Second try
+   */
+  x<< 250., 0.0005;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 5);
+  VERIFY_IS_EQUAL(lm.njev(), 4);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.2455138894E-01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.3894212918E+02);
+  VERIFY_IS_APPROX(x[1], 5.5015643181E-04);
+}
+
+struct hahn1_functor : DenseFunctor<double>
+{
+    hahn1_functor(void) : DenseFunctor<double>(7,236) {}
+    static const double m_x[236];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        static const double m_y[236] = { .591E0 , 1.547E0 , 2.902E0 , 2.894E0 , 4.703E0 , 6.307E0 , 7.03E0  , 7.898E0 , 9.470E0 , 9.484E0 , 10.072E0 , 10.163E0 , 11.615E0 , 12.005E0 , 12.478E0 , 12.982E0 , 12.970E0 , 13.926E0 , 14.452E0 , 14.404E0 , 15.190E0 , 15.550E0 , 15.528E0 , 15.499E0 , 16.131E0 , 16.438E0 , 16.387E0 , 16.549E0 , 16.872E0 , 16.830E0 , 16.926E0 , 16.907E0 , 16.966E0 , 17.060E0 , 17.122E0 , 17.311E0 , 17.355E0 , 17.668E0 , 17.767E0 , 17.803E0 , 17.765E0 , 17.768E0 , 17.736E0 , 17.858E0 , 17.877E0 , 17.912E0 , 18.046E0 , 18.085E0 , 18.291E0 , 18.357E0 , 18.426E0 , 18.584E0 , 18.610E0 , 18.870E0 , 18.795E0 , 19.111E0 , .367E0 , .796E0 , 0.892E0 , 1.903E0 , 2.150E0 , 3.697E0 , 5.870E0 , 6.421E0 , 7.422E0 , 9.944E0 , 11.023E0 , 11.87E0  , 12.786E0 , 14.067E0 , 13.974E0 , 14.462E0 , 14.464E0 , 15.381E0 , 15.483E0 , 15.59E0  , 16.075E0 , 16.347E0 , 16.181E0 , 16.915E0 , 17.003E0 , 16.978E0 , 17.756E0 , 17.808E0 , 17.868E0 , 18.481E0 , 18.486E0 , 19.090E0 , 16.062E0 , 16.337E0 , 16.345E0 ,
+        16.388E0 , 17.159E0 , 17.116E0 , 17.164E0 , 17.123E0 , 17.979E0 , 17.974E0 , 18.007E0 , 17.993E0 , 18.523E0 , 18.669E0 , 18.617E0 , 19.371E0 , 19.330E0 , 0.080E0 , 0.248E0 , 1.089E0 , 1.418E0 , 2.278E0 , 3.624E0 , 4.574E0 , 5.556E0 , 7.267E0 , 7.695E0 , 9.136E0 , 9.959E0 , 9.957E0 , 11.600E0 , 13.138E0 , 13.564E0 , 13.871E0 , 13.994E0 , 14.947E0 , 15.473E0 , 15.379E0 , 15.455E0 , 15.908E0 , 16.114E0 , 17.071E0 , 17.135E0 , 17.282E0 , 17.368E0 , 17.483E0 , 17.764E0 , 18.185E0 , 18.271E0 , 18.236E0 , 18.237E0 , 18.523E0 , 18.627E0 , 18.665E0 , 19.086E0 , 0.214E0 , 0.943E0 , 1.429E0 , 2.241E0 , 2.951E0 , 3.782E0 , 4.757E0 , 5.602E0 , 7.169E0 , 8.920E0 , 10.055E0 , 12.035E0 , 12.861E0 , 13.436E0 , 14.167E0 , 14.755E0 , 15.168E0 , 15.651E0 , 15.746E0 , 16.216E0 , 16.445E0 , 16.965E0 , 17.121E0 , 17.206E0 , 17.250E0 , 17.339E0 , 17.793E0 , 18.123E0 , 18.49E0  , 18.566E0 , 18.645E0 , 18.706E0 , 18.924E0 , 19.1E0   , 0.375E0 , 0.471E0 , 1.504E0 , 2.204E0 , 2.813E0 , 4.765E0 , 9.835E0 , 10.040E0 , 11.946E0 , 
+12.596E0 , 
+13.303E0 , 13.922E0 , 14.440E0 , 14.951E0 , 15.627E0 , 15.639E0 , 15.814E0 , 16.315E0 , 16.334E0 , 16.430E0 , 16.423E0 , 17.024E0 , 17.009E0 , 17.165E0 , 17.134E0 , 17.349E0 , 17.576E0 , 17.848E0 , 18.090E0 , 18.276E0 , 18.404E0 , 18.519E0 , 19.133E0 , 19.074E0 , 19.239E0 , 19.280E0 , 19.101E0 , 19.398E0 , 19.252E0 , 19.89E0  , 20.007E0 , 19.929E0 , 19.268E0 , 19.324E0 , 20.049E0 , 20.107E0 , 20.062E0 , 20.065E0 , 19.286E0 , 19.972E0 , 20.088E0 , 20.743E0 , 20.83E0  , 20.935E0 , 21.035E0 , 20.93E0  , 21.074E0 , 21.085E0 , 20.935E0 };
+
+        //        int called=0; printf("call hahn1_functor with  iflag=%d, called=%d\n", iflag, called); if (iflag==1) called++;
+
+        assert(b.size()==7);
+        assert(fvec.size()==236);
+        for(int i=0; i<236; i++) {
+            double x=m_x[i], xx=x*x, xxx=xx*x;
+            fvec[i] = (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) / (1.+b[4]*x+b[5]*xx+b[6]*xxx) - m_y[i];
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==7);
+        assert(fjac.rows()==236);
+        assert(fjac.cols()==7);
+        for(int i=0; i<236; i++) {
+            double x=m_x[i], xx=x*x, xxx=xx*x;
+            double fact = 1./(1.+b[4]*x+b[5]*xx+b[6]*xxx);
+            fjac(i,0) = 1.*fact;
+            fjac(i,1) = x*fact;
+            fjac(i,2) = xx*fact;
+            fjac(i,3) = xxx*fact;
+            fact = - (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) * fact * fact;
+            fjac(i,4) = x*fact;
+            fjac(i,5) = xx*fact;
+            fjac(i,6) = xxx*fact;
+        }
+        return 0;
+    }
+};
+const double hahn1_functor::m_x[236] = { 24.41E0 , 34.82E0 , 44.09E0 , 45.07E0 , 54.98E0 , 65.51E0 , 70.53E0 , 75.70E0 , 89.57E0 , 91.14E0 , 96.40E0 , 97.19E0 , 114.26E0 , 120.25E0 , 127.08E0 , 133.55E0 , 133.61E0 , 158.67E0 , 172.74E0 , 171.31E0 , 202.14E0 , 220.55E0 , 221.05E0 , 221.39E0 , 250.99E0 , 268.99E0 , 271.80E0 , 271.97E0 , 321.31E0 , 321.69E0 , 330.14E0 , 333.03E0 , 333.47E0 , 340.77E0 , 345.65E0 , 373.11E0 , 373.79E0 , 411.82E0 , 419.51E0 , 421.59E0 , 422.02E0 , 422.47E0 , 422.61E0 , 441.75E0 , 447.41E0 , 448.7E0  , 472.89E0 , 476.69E0 , 522.47E0 , 522.62E0 , 524.43E0 , 546.75E0 , 549.53E0 , 575.29E0 , 576.00E0 , 625.55E0 , 20.15E0 , 28.78E0 , 29.57E0 , 37.41E0 , 39.12E0 , 50.24E0 , 61.38E0 , 66.25E0 , 73.42E0 , 95.52E0 , 107.32E0 , 122.04E0 , 134.03E0 , 163.19E0 , 163.48E0 , 175.70E0 , 179.86E0 , 211.27E0 , 217.78E0 , 219.14E0 , 262.52E0 , 268.01E0 , 268.62E0 , 336.25E0 , 337.23E0 , 339.33E0 , 427.38E0 , 428.58E0 , 432.68E0 , 528.99E0 , 531.08E0 , 628.34E0 , 253.24E0 , 273.13E0 , 273.66E0 ,
+282.10E0 , 346.62E0 , 347.19E0 , 348.78E0 , 351.18E0 , 450.10E0 , 450.35E0 , 451.92E0 , 455.56E0 , 552.22E0 , 553.56E0 , 555.74E0 , 652.59E0 , 656.20E0 , 14.13E0 , 20.41E0 , 31.30E0 , 33.84E0 , 39.70E0 , 48.83E0 , 54.50E0 , 60.41E0 , 72.77E0 , 75.25E0 , 86.84E0 , 94.88E0 , 96.40E0 , 117.37E0 , 139.08E0 , 147.73E0 , 158.63E0 , 161.84E0 , 192.11E0 , 206.76E0 , 209.07E0 , 213.32E0 , 226.44E0 , 237.12E0 , 330.90E0 , 358.72E0 , 370.77E0 , 372.72E0 , 396.24E0 , 416.59E0 , 484.02E0 , 495.47E0 , 514.78E0 , 515.65E0 , 519.47E0 , 544.47E0 , 560.11E0 , 620.77E0 , 18.97E0 , 28.93E0 , 33.91E0 , 40.03E0 , 44.66E0 , 49.87E0 , 55.16E0 , 60.90E0 , 72.08E0 , 85.15E0 , 97.06E0 , 119.63E0 , 133.27E0 , 143.84E0 , 161.91E0 , 180.67E0 , 198.44E0 , 226.86E0 , 229.65E0 , 258.27E0 , 273.77E0 , 339.15E0 , 350.13E0 , 362.75E0 , 371.03E0 , 393.32E0 , 448.53E0 , 473.78E0 , 511.12E0 , 524.70E0 , 548.75E0 , 551.64E0 , 574.02E0 , 623.86E0 , 21.46E0 , 24.33E0 , 33.43E0 , 39.22E0 , 44.18E0 , 55.02E0 , 94.33E0 , 96.44E0 , 118.82E0 , 128.48E0 ,
+141.94E0 , 156.92E0 , 171.65E0 , 190.00E0 , 223.26E0 , 223.88E0 , 231.50E0 , 265.05E0 , 269.44E0 , 271.78E0 , 273.46E0 , 334.61E0 , 339.79E0 , 349.52E0 , 358.18E0 , 377.98E0 , 394.77E0 , 429.66E0 , 468.22E0 , 487.27E0 , 519.54E0 , 523.03E0 , 612.99E0 , 638.59E0 , 641.36E0 , 622.05E0 , 631.50E0 , 663.97E0 , 646.9E0  , 748.29E0 , 749.21E0 , 750.14E0 , 647.04E0 , 646.89E0 , 746.9E0  , 748.43E0 , 747.35E0 , 749.27E0 , 647.61E0 , 747.78E0 , 750.51E0 , 851.37E0 , 845.97E0 , 847.54E0 , 849.93E0 , 851.61E0 , 849.75E0 , 850.98E0 , 848.23E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/hahn1.shtml
+void testNistHahn1(void)
+{
+  const int  n=7;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 10., -1., .05, -.00001, -.05, .001, -.000001;
+  // do the computation
+  hahn1_functor functor;
+  LevenbergMarquardt<hahn1_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 11);
+  VERIFY_IS_EQUAL(lm.njev(), 10);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.5324382854E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.0776351733E+00);
+  VERIFY_IS_APPROX(x[1],-1.2269296921E-01);
+  VERIFY_IS_APPROX(x[2], 4.0863750610E-03);
+  VERIFY_IS_APPROX(x[3],-1.426264e-06); // should be : -1.4262662514E-06
+  VERIFY_IS_APPROX(x[4],-5.7609940901E-03);
+  VERIFY_IS_APPROX(x[5], 2.4053735503E-04);
+  VERIFY_IS_APPROX(x[6],-1.2314450199E-07);
+
+  /*
+   * Second try
+   */
+  x<< .1, -.1, .005, -.000001, -.005, .0001, -.0000001;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+//   VERIFY_IS_EQUAL(lm.nfev(), 11);
+  VERIFY_IS_EQUAL(lm.njev(), 10);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.5324382854E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.077640); // should be :  1.0776351733E+00
+  VERIFY_IS_APPROX(x[1], -0.1226933); // should be : -1.2269296921E-01
+  VERIFY_IS_APPROX(x[2], 0.004086383); // should be : 4.0863750610E-03
+  VERIFY_IS_APPROX(x[3], -1.426277e-06); // should be : -1.4262662514E-06
+  VERIFY_IS_APPROX(x[4],-5.7609940901E-03);
+  VERIFY_IS_APPROX(x[5], 0.00024053772); // should be : 2.4053735503E-04
+  VERIFY_IS_APPROX(x[6], -1.231450e-07); // should be : -1.2314450199E-07
+
+}
+
+struct misra1d_functor : DenseFunctor<double>
+{
+    misra1d_functor(void) : DenseFunctor<double>(2,14) {}
+    static const double x[14];
+    static const double y[14];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==2);
+        assert(fvec.size()==14);
+        for(int i=0; i<14; i++) {
+            fvec[i] = b[0]*b[1]*x[i]/(1.+b[1]*x[i]) - y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==14);
+        assert(fjac.cols()==2);
+        for(int i=0; i<14; i++) {
+            double den = 1.+b[1]*x[i];
+            fjac(i,0) = b[1]*x[i] / den;
+            fjac(i,1) = b[0]*x[i]*(den-b[1]*x[i])/den/den;
+        }
+        return 0;
+    }
+};
+const double misra1d_functor::x[14] = { 77.6E0, 114.9E0, 141.1E0, 190.8E0, 239.9E0, 289.0E0, 332.8E0, 378.4E0, 434.8E0, 477.3E0, 536.8E0, 593.1E0, 689.1E0, 760.0E0};
+const double misra1d_functor::y[14] = { 10.07E0, 14.73E0, 17.94E0, 23.93E0, 29.61E0, 35.18E0, 40.02E0, 44.82E0, 50.76E0, 55.05E0, 61.01E0, 66.40E0, 75.47E0, 81.78E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/misra1d.shtml
+void testNistMisra1d(void)
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 500., 0.0001;
+  // do the computation
+  misra1d_functor functor;
+  LevenbergMarquardt<misra1d_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 9);
+  VERIFY_IS_EQUAL(lm.njev(), 7);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6419295283E-02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 4.3736970754E+02);
+  VERIFY_IS_APPROX(x[1], 3.0227324449E-04);
+
+  /*
+   * Second try
+   */
+  x<< 450., 0.0003;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 4);
+  VERIFY_IS_EQUAL(lm.njev(), 3);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6419295283E-02);
+  // check x
+  VERIFY_IS_APPROX(x[0], 4.3736970754E+02);
+  VERIFY_IS_APPROX(x[1], 3.0227324449E-04);
+}
+
+
+struct lanczos1_functor : DenseFunctor<double>
+{
+    lanczos1_functor(void) : DenseFunctor<double>(6,24) {}
+    static const double x[24];
+    static const double y[24];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==6);
+        assert(fvec.size()==24);
+        for(int i=0; i<24; i++)
+            fvec[i] = b[0]*exp(-b[1]*x[i]) + b[2]*exp(-b[3]*x[i]) + b[4]*exp(-b[5]*x[i])  - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==6);
+        assert(fjac.rows()==24);
+        assert(fjac.cols()==6);
+        for(int i=0; i<24; i++) {
+            fjac(i,0) = exp(-b[1]*x[i]);
+            fjac(i,1) = -b[0]*x[i]*exp(-b[1]*x[i]);
+            fjac(i,2) = exp(-b[3]*x[i]);
+            fjac(i,3) = -b[2]*x[i]*exp(-b[3]*x[i]);
+            fjac(i,4) = exp(-b[5]*x[i]);
+            fjac(i,5) = -b[4]*x[i]*exp(-b[5]*x[i]);
+        }
+        return 0;
+    }
+};
+const double lanczos1_functor::x[24] = { 0.000000000000E+00, 5.000000000000E-02, 1.000000000000E-01, 1.500000000000E-01, 2.000000000000E-01, 2.500000000000E-01, 3.000000000000E-01, 3.500000000000E-01, 4.000000000000E-01, 4.500000000000E-01, 5.000000000000E-01, 5.500000000000E-01, 6.000000000000E-01, 6.500000000000E-01, 7.000000000000E-01, 7.500000000000E-01, 8.000000000000E-01, 8.500000000000E-01, 9.000000000000E-01, 9.500000000000E-01, 1.000000000000E+00, 1.050000000000E+00, 1.100000000000E+00, 1.150000000000E+00 };
+const double lanczos1_functor::y[24] = { 2.513400000000E+00 ,2.044333373291E+00 ,1.668404436564E+00 ,1.366418021208E+00 ,1.123232487372E+00 ,9.268897180037E-01 ,7.679338563728E-01 ,6.388775523106E-01 ,5.337835317402E-01 ,4.479363617347E-01 ,3.775847884350E-01 ,3.197393199326E-01 ,2.720130773746E-01 ,2.324965529032E-01 ,1.996589546065E-01 ,1.722704126914E-01 ,1.493405660168E-01 ,1.300700206922E-01 ,1.138119324644E-01 ,1.000415587559E-01 ,8.833209084540E-02 ,7.833544019350E-02 ,6.976693743449E-02 ,6.239312536719E-02 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/lanczos1.shtml
+void testNistLanczos1(void)
+{
+  const int n=6;
+  LevenbergMarquardtSpace::Status info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1.2, 0.3, 5.6, 5.5, 6.5, 7.6;
+  // do the computation
+  lanczos1_functor functor;
+  LevenbergMarquardt<lanczos1_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
+  VERIFY_IS_EQUAL(lm.nfev(), 79);
+  VERIFY_IS_EQUAL(lm.njev(), 72);
+  // check norm^2
+  VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
+  // check x
+  VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
+  VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
+  VERIFY_IS_APPROX(x[2], 8.6070000013E-01);
+  VERIFY_IS_APPROX(x[3], 3.0000000002E+00);
+  VERIFY_IS_APPROX(x[4], 1.5575999998E+00);
+  VERIFY_IS_APPROX(x[5], 5.0000000001E+00);
+
+  /*
+   * Second try
+   */
+  x<< 0.5, 0.7, 3.6, 4.2, 4., 6.3;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall);
+  VERIFY_IS_EQUAL(lm.nfev(), 9);
+  VERIFY_IS_EQUAL(lm.njev(), 8);
+  // check norm^2
+  VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25);
+  // check x
+  VERIFY_IS_APPROX(x[0], 9.5100000027E-02);
+  VERIFY_IS_APPROX(x[1], 1.0000000001E+00);
+  VERIFY_IS_APPROX(x[2], 8.6070000013E-01);
+  VERIFY_IS_APPROX(x[3], 3.0000000002E+00);
+  VERIFY_IS_APPROX(x[4], 1.5575999998E+00);
+  VERIFY_IS_APPROX(x[5], 5.0000000001E+00);
+
+}
+
+struct rat42_functor : DenseFunctor<double>
+{
+    rat42_functor(void) : DenseFunctor<double>(3,9) {}
+    static const double x[9];
+    static const double y[9];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==9);
+        for(int i=0; i<9; i++) {
+            fvec[i] = b[0] / (1.+exp(b[1]-b[2]*x[i])) - y[i];
+        }
+        return 0;
+    }
+
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==9);
+        assert(fjac.cols()==3);
+        for(int i=0; i<9; i++) {
+            double e = exp(b[1]-b[2]*x[i]);
+            fjac(i,0) = 1./(1.+e);
+            fjac(i,1) = -b[0]*e/(1.+e)/(1.+e);
+            fjac(i,2) = +b[0]*e*x[i]/(1.+e)/(1.+e);
+        }
+        return 0;
+    }
+};
+const double rat42_functor::x[9] = { 9.000E0, 14.000E0, 21.000E0, 28.000E0, 42.000E0, 57.000E0, 63.000E0, 70.000E0, 79.000E0 };
+const double rat42_functor::y[9] = { 8.930E0 ,10.800E0 ,18.590E0 ,22.330E0 ,39.350E0 ,56.110E0 ,61.730E0 ,64.620E0 ,67.080E0 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/ratkowsky2.shtml
+void testNistRat42(void)
+{
+  const int n=3;
+  LevenbergMarquardtSpace::Status info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 100., 1., 0.1;
+  // do the computation
+  rat42_functor functor;
+  LevenbergMarquardt<rat42_functor> lm(functor);
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  VERIFY_IS_EQUAL(lm.nfev(), 10);
+  VERIFY_IS_EQUAL(lm.njev(), 8);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.0565229338E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 7.2462237576E+01);
+  VERIFY_IS_APPROX(x[1], 2.6180768402E+00);
+  VERIFY_IS_APPROX(x[2], 6.7359200066E-02);
+
+  /*
+   * Second try
+   */
+  x<< 75., 2.5, 0.07;
+  // do the computation
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  VERIFY_IS_EQUAL(lm.nfev(), 6);
+  VERIFY_IS_EQUAL(lm.njev(), 5);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.0565229338E+00);
+  // check x
+  VERIFY_IS_APPROX(x[0], 7.2462237576E+01);
+  VERIFY_IS_APPROX(x[1], 2.6180768402E+00);
+  VERIFY_IS_APPROX(x[2], 6.7359200066E-02);
+}
+
+struct MGH10_functor : DenseFunctor<double>
+{
+    MGH10_functor(void) : DenseFunctor<double>(3,16) {}
+    static const double x[16];
+    static const double y[16];
+    int operator()(const VectorXd &b, VectorXd &fvec)
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==16);
+        for(int i=0; i<16; i++)
+            fvec[i] =  b[0] * exp(b[1]/(x[i]+b[2])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac)
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==16);
+        assert(fjac.cols()==3);
+        for(int i=0; i<16; i++) {
+            double factor = 1./(x[i]+b[2]);
+            double e = exp(b[1]*factor);
+            fjac(i,0) = e;
+            fjac(i,1) = b[0]*factor*e;
+            fjac(i,2) = -b[1]*b[0]*factor*factor*e;
+        }
+        return 0;
+    }
+};
+const double MGH10_functor::x[16] = { 5.000000E+01, 5.500000E+01, 6.000000E+01, 6.500000E+01, 7.000000E+01, 7.500000E+01, 8.000000E+01, 8.500000E+01, 9.000000E+01, 9.500000E+01, 1.000000E+02, 1.050000E+02, 1.100000E+02, 1.150000E+02, 1.200000E+02, 1.250000E+02 };
+const double MGH10_functor::y[16] = { 3.478000E+04, 2.861000E+04, 2.365000E+04, 1.963000E+04, 1.637000E+04, 1.372000E+04, 1.154000E+04, 9.744000E+03, 8.261000E+03, 7.030000E+03, 6.005000E+03, 5.147000E+03, 4.427000E+03, 3.820000E+03, 3.307000E+03, 2.872000E+03 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/mgh10.shtml
+void testNistMGH10(void)
+{
+  const int n=3;
+  LevenbergMarquardtSpace::Status info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 2., 400000., 25000.;
+  // do the computation
+  MGH10_functor functor;
+  LevenbergMarquardt<MGH10_functor> lm(functor);
+  info = lm.minimize(x);
+  ++g_test_level;
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  --g_test_level;
+  // was: VERIFY_IS_EQUAL(info, 1);
+
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 5.6096364710E-03);
+  VERIFY_IS_APPROX(x[1], 6.1813463463E+03);
+  VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
+  
+  // check return value
+
+  ++g_test_level;
+  VERIFY_IS_EQUAL(lm.nfev(), 284 );
+  VERIFY_IS_EQUAL(lm.njev(), 249 );
+  --g_test_level;
+  VERIFY(lm.nfev() < 284 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 249 * LM_EVAL_COUNT_TOL);
+
+  /*
+   * Second try
+   */
+  x<< 0.02, 4000., 250.;
+  // do the computation
+  info = lm.minimize(x);
+  ++g_test_level;
+  VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall);
+  // was: VERIFY_IS_EQUAL(info, 1);
+  --g_test_level;
+
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01);
+  // check x
+  VERIFY_IS_APPROX(x[0], 5.6096364710E-03);
+  VERIFY_IS_APPROX(x[1], 6.1813463463E+03);
+  VERIFY_IS_APPROX(x[2], 3.4522363462E+02);
+  
+  // check return value
+  ++g_test_level;
+  VERIFY_IS_EQUAL(lm.nfev(), 126);
+  VERIFY_IS_EQUAL(lm.njev(), 116);
+  --g_test_level;
+  VERIFY(lm.nfev() < 126 * LM_EVAL_COUNT_TOL);
+  VERIFY(lm.njev() < 116 * LM_EVAL_COUNT_TOL);
+}
+
+
+struct BoxBOD_functor : DenseFunctor<double> // residuals and Jacobian of the model b[0]*(1-exp(-b[1]*x)) against the data y
+{
+    BoxBOD_functor(void) : DenseFunctor<double>(2,6) {} // sizes match the asserts below: b has 2 entries, fvec has 6
+    static const double x[6]; // predictor values; defined right after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // writes the 6 residuals (model - y) for parameters b into fvec; always returns 0
+    {
+        static const double y[6] = { 109., 149., 149., 191., 213., 224. }; // observed responses
+        assert(b.size()==2);
+        assert(fvec.size()==6);
+        for(int i=0; i<6; i++)
+            fvec[i] =  b[0]*(1.-exp(-b[1]*x[i])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // writes the 6x2 Jacobian of the residuals into fjac; always returns 0
+    {
+        assert(b.size()==2);
+        assert(fjac.rows()==6);
+        assert(fjac.cols()==2);
+        for(int i=0; i<6; i++) {
+            double e = exp(-b[1]*x[i]);
+            fjac(i,0) = 1.-e; // d(residual i)/d b[0]
+            fjac(i,1) = b[0]*x[i]*e; // d(residual i)/d b[1]
+        }
+        return 0;
+    }
+};
+const double BoxBOD_functor::x[6] = { 1., 2., 3., 5., 7., 10. }; // out-of-class definition of the static predictor table
+
+// http://www.itl.nist.gov/div898/strd/nls/data/boxbod.shtml
+void testNistBoxBOD(void) // minimizes BoxBOD_functor from two starting points and checks the fit, the status and the evaluation counts
+{
+  const int n=2;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1., 1.;
+  // do the computation
+  BoxBOD_functor functor;
+  LevenbergMarquardt<BoxBOD_functor> lm(functor);
+  lm.setFtol(1.E6*NumTraits<double>::epsilon()); // tolerances loosened to 1e6 * machine epsilon for this start
+  lm.setXtol(1.E6*NumTraits<double>::epsilon());
+  lm.setFactor(10);
+  info = lm.minimize(x); // x is overwritten with the solution that is checked below
+
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.1380940889E+02);
+  VERIFY_IS_APPROX(x[1], 5.4723748542E-01);
+  
+  // check return value
+  VERIFY_IS_EQUAL(info, 1); // NOTE(review): 1 presumably is LevenbergMarquardtSpace::RelativeReductionTooSmall (testNistMGH10 replaced the literal 1 with it) - confirm
+  VERIFY(lm.nfev() < 31); // 31
+  VERIFY(lm.njev() < 25); // 25
+
+  /*
+   * Second try
+   */
+  x<< 100., 0.75;
+  // do the computation
+  lm.resetParameters(); // NOTE(review): presumably restores the default solver settings before the tolerances are set again - confirm
+  lm.setFtol(NumTraits<double>::epsilon());
+  lm.setXtol( NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1); 
+  ++g_test_level; // NOTE(review): the exact-count checks run at a raised g_test_level; presumably that makes them non-fatal in main.h - confirm
+  VERIFY_IS_EQUAL(lm.nfev(), 16 );
+  VERIFY_IS_EQUAL(lm.njev(), 15 );
+  --g_test_level;
+  VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL); // bound on the counts at the normal level; LM_EVAL_COUNT_TOL is defined outside this chunk
+  VERIFY(lm.njev() < 15 * LM_EVAL_COUNT_TOL);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 2.1380940889E+02);
+  VERIFY_IS_APPROX(x[1], 5.4723748542E-01);
+}
+
+struct MGH17_functor : DenseFunctor<double> // residuals and Jacobian of the model b[0] + b[1]*exp(-b[3]*x) + b[2]*exp(-b[4]*x) against y
+{
+    MGH17_functor(void) : DenseFunctor<double>(5,33) {} // sizes match the asserts below: b has 5 entries, fvec has 33
+    static const double x[33]; // predictor values; defined after the struct
+    static const double y[33]; // observed responses; defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // writes the 33 residuals (model - y) into fvec; always returns 0
+    {
+        assert(b.size()==5);
+        assert(fvec.size()==33);
+        for(int i=0; i<33; i++)
+            fvec[i] =  b[0] + b[1]*exp(-b[3]*x[i]) +  b[2]*exp(-b[4]*x[i]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // writes the 33x5 Jacobian of the residuals into fjac; always returns 0
+    {
+        assert(b.size()==5);
+        assert(fjac.rows()==33);
+        assert(fjac.cols()==5);
+        for(int i=0; i<33; i++) {
+            fjac(i,0) = 1.; // column j holds d(residual i)/d b[j]
+            fjac(i,1) = exp(-b[3]*x[i]);
+            fjac(i,2) = exp(-b[4]*x[i]);
+            fjac(i,3) = -x[i]*b[1]*exp(-b[3]*x[i]);
+            fjac(i,4) = -x[i]*b[2]*exp(-b[4]*x[i]);
+        }
+        return 0;
+    }
+};
+const double MGH17_functor::x[33] = { 0.000000E+00, 1.000000E+01, 2.000000E+01, 3.000000E+01, 4.000000E+01, 5.000000E+01, 6.000000E+01, 7.000000E+01, 8.000000E+01, 9.000000E+01, 1.000000E+02, 1.100000E+02, 1.200000E+02, 1.300000E+02, 1.400000E+02, 1.500000E+02, 1.600000E+02, 1.700000E+02, 1.800000E+02, 1.900000E+02, 2.000000E+02, 2.100000E+02, 2.200000E+02, 2.300000E+02, 2.400000E+02, 2.500000E+02, 2.600000E+02, 2.700000E+02, 2.800000E+02, 2.900000E+02, 3.000000E+02, 3.100000E+02, 3.200000E+02 };
+const double MGH17_functor::y[33] = { 8.440000E-01, 9.080000E-01, 9.320000E-01, 9.360000E-01, 9.250000E-01, 9.080000E-01, 8.810000E-01, 8.500000E-01, 8.180000E-01, 7.840000E-01, 7.510000E-01, 7.180000E-01, 6.850000E-01, 6.580000E-01, 6.280000E-01, 6.030000E-01, 5.800000E-01, 5.580000E-01, 5.380000E-01, 5.220000E-01, 5.060000E-01, 4.900000E-01, 4.780000E-01, 4.670000E-01, 4.570000E-01, 4.480000E-01, 4.380000E-01, 4.310000E-01, 4.240000E-01, 4.200000E-01, 4.140000E-01, 4.110000E-01, 4.060000E-01 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/mgh17.shtml
+void testNistMGH17(void) // minimizes MGH17_functor from two starting points and checks the fit and the evaluation counts
+{
+  const int n=5;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 50., 150., -100., 1., 2.;
+  // do the computation
+  MGH17_functor functor;
+  LevenbergMarquardt<MGH17_functor> lm(functor);
+  lm.setFtol(NumTraits<double>::epsilon());
+  lm.setXtol(NumTraits<double>::epsilon());
+  lm.setMaxfev(1000); // evaluation limit set to 1000; the nfev check below expects fewer than 700
+  info = lm.minimize(x); // the status of this run is not verified: the check below is commented out
+
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.4648946975E-05);
+  // check x
+  VERIFY_IS_APPROX(x[0], 3.7541005211E-01);
+  VERIFY_IS_APPROX(x[1], 1.9358469127E+00);
+  VERIFY_IS_APPROX(x[2], -1.4646871366E+00);
+  VERIFY_IS_APPROX(x[3], 1.2867534640E-02);
+  VERIFY_IS_APPROX(x[4], 2.2122699662E-02);
+  
+    // check return value
+//   VERIFY_IS_EQUAL(info, 2);  //FIXME Use (lm.info() == Success)
+  VERIFY(lm.nfev() < 700 ); // 602
+  VERIFY(lm.njev() < 600 ); // 545
+
+  /*
+   * Second try
+   */
+  x<< 0.5  ,1.5  ,-1   ,0.01 ,0.02;
+  // do the computation
+  lm.resetParameters(); // NOTE(review): presumably restores the default solver settings; none are set again for this run - confirm
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1); // NOTE(review): 1 presumably is LevenbergMarquardtSpace::RelativeReductionTooSmall (cf. testNistMGH10) - confirm
+  VERIFY_IS_EQUAL(lm.nfev(), 18); // exact evaluation counts are required here
+  VERIFY_IS_EQUAL(lm.njev(), 15);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.4648946975E-05);
+  // check x
+  VERIFY_IS_APPROX(x[0], 3.7541005211E-01);
+  VERIFY_IS_APPROX(x[1], 1.9358469127E+00);
+  VERIFY_IS_APPROX(x[2], -1.4646871366E+00);
+  VERIFY_IS_APPROX(x[3], 1.2867534640E-02);
+  VERIFY_IS_APPROX(x[4], 2.2122699662E-02);
+}
+
+struct MGH09_functor : DenseFunctor<double> // residuals and Jacobian of the rational model b[0]*(x^2+x*b[1])/(x^2+x*b[2]+b[3]) against y
+{
+    MGH09_functor(void) : DenseFunctor<double>(4,11) {} // sizes match the asserts below: b has 4 entries, fvec has 11
+    static const double _x[11]; // predictor values; defined after the struct
+    static const double y[11]; // observed responses; defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // writes the 11 residuals (model - y) into fvec; always returns 0
+    {
+        assert(b.size()==4);
+        assert(fvec.size()==11);
+        for(int i=0; i<11; i++) {
+            double x = _x[i], xx=x*x;
+            fvec[i] = b[0]*(xx+x*b[1])/(xx+x*b[2]+b[3]) - y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // writes the 11x4 Jacobian of the residuals into fjac; always returns 0
+    {
+        assert(b.size()==4);
+        assert(fjac.rows()==11);
+        assert(fjac.cols()==4);
+        for(int i=0; i<11; i++) {
+            double x = _x[i], xx=x*x;
+            double factor = 1./(xx+x*b[2]+b[3]); // reciprocal of the model's denominator
+            fjac(i,0) = (xx+x*b[1]) * factor; // column j holds d(residual i)/d b[j]
+            fjac(i,1) = b[0]*x* factor;
+            fjac(i,2) = - b[0]*(xx+x*b[1]) * x * factor * factor;
+            fjac(i,3) = - b[0]*(xx+x*b[1]) * factor * factor;
+        }
+        return 0;
+    }
+};
+const double MGH09_functor::_x[11] = { 4., 2., 1., 5.E-1 , 2.5E-01, 1.670000E-01, 1.250000E-01,  1.E-01, 8.330000E-02, 7.140000E-02, 6.250000E-02 }; // out-of-class definition of the predictor table
+const double MGH09_functor::y[11] = { 1.957000E-01, 1.947000E-01, 1.735000E-01, 1.600000E-01, 8.440000E-02, 6.270000E-02, 4.560000E-02, 3.420000E-02, 3.230000E-02, 2.350000E-02, 2.460000E-02 }; // out-of-class definition of the response table
+
+// http://www.itl.nist.gov/div898/strd/nls/data/mgh09.shtml
+void testNistMGH09(void) // minimizes MGH09_functor from two starting points; its CALL_SUBTEST in test_levenberg_marquardt() is commented out
+{
+  const int n=4;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 25., 39, 41.5, 39.;
+  // do the computation
+  MGH09_functor functor;
+  LevenbergMarquardt<MGH09_functor> lm(functor);
+  lm.setMaxfev(1000); // evaluation limit set to 1000; the nfev check below expects fewer than 510
+  info = lm.minimize(x);
+
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 3.0750560385E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], 0.1928077089); // should be 1.9280693458E-01
+  VERIFY_IS_APPROX(x[1], 0.19126423573); // should be 1.9128232873E-01
+  VERIFY_IS_APPROX(x[2], 0.12305309914); // should be 1.2305650693E-01
+  VERIFY_IS_APPROX(x[3], 0.13605395375); // should be 1.3606233068E-01
+  // check return value
+  VERIFY_IS_EQUAL(info, 1); // NOTE(review): 1 presumably is LevenbergMarquardtSpace::RelativeReductionTooSmall (cf. testNistMGH10) - confirm
+  VERIFY(lm.nfev() < 510 ); // 490
+  VERIFY(lm.njev() < 400 ); // 376
+
+  /*
+   * Second try
+   */
+  x<< 0.25, 0.39, 0.415, 0.39;
+  // do the computation
+  lm.resetParameters(); // NOTE(review): presumably restores the default solver settings, including the evaluation limit - confirm
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 18); // exact evaluation counts are required here
+  VERIFY_IS_EQUAL(lm.njev(), 16);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 3.0750560385E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], 0.19280781); // should be 1.9280693458E-01
+  VERIFY_IS_APPROX(x[1], 0.19126265); // should be 1.9128232873E-01
+  VERIFY_IS_APPROX(x[2], 0.12305280); // should be 1.2305650693E-01
+  VERIFY_IS_APPROX(x[3], 0.13605322); // should be 1.3606233068E-01
+}
+
+
+
+struct Bennett5_functor : DenseFunctor<double> // residuals and Jacobian of the model b[0]*(b[1]+x)^(-1/b[2]) against y
+{
+    Bennett5_functor(void) : DenseFunctor<double>(3,154) {} // sizes match the asserts below: b has 3 entries, fvec has 154
+    static const double x[154]; // predictor values; defined after the struct
+    static const double y[154]; // observed responses; defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // writes the 154 residuals (model - y) into fvec; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==154);
+        for(int i=0; i<154; i++)
+            fvec[i] = b[0]* pow(b[1]+x[i],-1./b[2]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // writes the 154x3 Jacobian of the residuals into fjac; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==154);
+        assert(fjac.cols()==3);
+        for(int i=0; i<154; i++) {
+            double e = pow(b[1]+x[i],-1./b[2]); // the power term shared by all three derivatives
+            fjac(i,0) = e; // d(residual i)/d b[0]
+            fjac(i,1) = - b[0]*e/b[2]/(b[1]+x[i]); // d(residual i)/d b[1]
+            fjac(i,2) = b[0]*e*log(b[1]+x[i])/b[2]/b[2]; // d(residual i)/d b[2]
+        }
+        return 0;
+    }
+};
+const double Bennett5_functor::x[154] = { 7.447168E0, 8.102586E0, 8.452547E0, 8.711278E0, 8.916774E0, 9.087155E0, 9.232590E0, 9.359535E0, 9.472166E0, 9.573384E0, 9.665293E0, 9.749461E0, 9.827092E0, 9.899128E0, 9.966321E0, 10.029280E0, 10.088510E0, 10.144430E0, 10.197380E0, 10.247670E0, 10.295560E0, 10.341250E0, 10.384950E0, 10.426820E0, 10.467000E0, 10.505640E0, 10.542830E0, 10.578690E0, 10.613310E0, 10.646780E0, 10.679150E0, 10.710520E0, 10.740920E0, 10.770440E0, 10.799100E0, 10.826970E0, 10.854080E0, 10.880470E0, 10.906190E0, 10.931260E0, 10.955720E0, 10.979590E0, 11.002910E0, 11.025700E0, 11.047980E0, 11.069770E0, 11.091100E0, 11.111980E0, 11.132440E0, 11.152480E0, 11.172130E0, 11.191410E0, 11.210310E0, 11.228870E0, 11.247090E0, 11.264980E0, 11.282560E0, 11.299840E0, 11.316820E0, 11.333520E0, 11.349940E0, 11.366100E0, 11.382000E0, 11.397660E0, 11.413070E0, 11.428240E0, 11.443200E0, 11.457930E0, 11.472440E0, 11.486750E0, 11.500860E0, 11.514770E0, 11.528490E0, 11.542020E0, 11.555380E0, 11.568550E0,
+11.581560E0, 11.594420E0, 11.607121E0, 11.619640E0, 11.632000E0, 11.644210E0, 11.656280E0, 11.668200E0, 11.679980E0, 11.691620E0, 11.703130E0, 11.714510E0, 11.725760E0, 11.736880E0, 11.747890E0, 11.758780E0, 11.769550E0, 11.780200E0, 11.790730E0, 11.801160E0, 11.811480E0, 11.821700E0, 11.831810E0, 11.841820E0, 11.851730E0, 11.861550E0, 11.871270E0, 11.880890E0, 11.890420E0, 11.899870E0, 11.909220E0, 11.918490E0, 11.927680E0, 11.936780E0, 11.945790E0, 11.954730E0, 11.963590E0, 11.972370E0, 11.981070E0, 11.989700E0, 11.998260E0, 12.006740E0, 12.015150E0, 12.023490E0, 12.031760E0, 12.039970E0, 12.048100E0, 12.056170E0, 12.064180E0, 12.072120E0, 12.080010E0, 12.087820E0, 12.095580E0, 12.103280E0, 12.110920E0, 12.118500E0, 12.126030E0, 12.133500E0, 12.140910E0, 12.148270E0, 12.155570E0, 12.162830E0, 12.170030E0, 12.177170E0, 12.184270E0, 12.191320E0, 12.198320E0, 12.205270E0, 12.212170E0, 12.219030E0, 12.225840E0, 12.232600E0, 12.239320E0, 12.245990E0, 12.252620E0, 12.259200E0, 12.265750E0, 12.272240E0 };
+const double Bennett5_functor::y[154] = { -34.834702E0 ,-34.393200E0 ,-34.152901E0 ,-33.979099E0 ,-33.845901E0 ,-33.732899E0 ,-33.640301E0 ,-33.559200E0 ,-33.486801E0 ,-33.423100E0 ,-33.365101E0 ,-33.313000E0 ,-33.260899E0 ,-33.217400E0 ,-33.176899E0 ,-33.139198E0 ,-33.101601E0 ,-33.066799E0 ,-33.035000E0 ,-33.003101E0 ,-32.971298E0 ,-32.942299E0 ,-32.916302E0 ,-32.890202E0 ,-32.864101E0 ,-32.841000E0 ,-32.817799E0 ,-32.797501E0 ,-32.774300E0 ,-32.757000E0 ,-32.733799E0 ,-32.716400E0 ,-32.699100E0 ,-32.678799E0 ,-32.661400E0 ,-32.644001E0 ,-32.626701E0 ,-32.612202E0 ,-32.597698E0 ,-32.583199E0 ,-32.568699E0 ,-32.554298E0 ,-32.539799E0 ,-32.525299E0 ,-32.510799E0 ,-32.499199E0 ,-32.487598E0 ,-32.473202E0 ,-32.461601E0 ,-32.435501E0 ,-32.435501E0 ,-32.426800E0 ,-32.412300E0 ,-32.400799E0 ,-32.392101E0 ,-32.380501E0 ,-32.366001E0 ,-32.357300E0 ,-32.348598E0 ,-32.339901E0 ,-32.328400E0 ,-32.319698E0 ,-32.311001E0 ,-32.299400E0 ,-32.290699E0 ,-32.282001E0 ,-32.273300E0 ,-32.264599E0 ,-32.256001E0 ,-32.247299E0
+,-32.238602E0 ,-32.229900E0 ,-32.224098E0 ,-32.215401E0 ,-32.203800E0 ,-32.198002E0 ,-32.189400E0 ,-32.183601E0 ,-32.174900E0 ,-32.169102E0 ,-32.163300E0 ,-32.154598E0 ,-32.145901E0 ,-32.140099E0 ,-32.131401E0 ,-32.125599E0 ,-32.119801E0 ,-32.111198E0 ,-32.105400E0 ,-32.096699E0 ,-32.090900E0 ,-32.088001E0 ,-32.079300E0 ,-32.073502E0 ,-32.067699E0 ,-32.061901E0 ,-32.056099E0 ,-32.050301E0 ,-32.044498E0 ,-32.038799E0 ,-32.033001E0 ,-32.027199E0 ,-32.024300E0 ,-32.018501E0 ,-32.012699E0 ,-32.004002E0 ,-32.001099E0 ,-31.995300E0 ,-31.989500E0 ,-31.983700E0 ,-31.977900E0 ,-31.972099E0 ,-31.969299E0 ,-31.963501E0 ,-31.957701E0 ,-31.951900E0 ,-31.946100E0 ,-31.940300E0 ,-31.937401E0 ,-31.931601E0 ,-31.925800E0 ,-31.922899E0 ,-31.917101E0 ,-31.911301E0 ,-31.908400E0 ,-31.902599E0 ,-31.896900E0 ,-31.893999E0 ,-31.888201E0 ,-31.885300E0 ,-31.882401E0 ,-31.876600E0 ,-31.873699E0 ,-31.867901E0 ,-31.862101E0 ,-31.859200E0 ,-31.856300E0 ,-31.850500E0 ,-31.844700E0 ,-31.841801E0 ,-31.838900E0 ,-31.833099E0 ,-31.830200E0 ,
+-31.827299E0 ,-31.821600E0 ,-31.818701E0 ,-31.812901E0 ,-31.809999E0 ,-31.807100E0 ,-31.801300E0 ,-31.798401E0 ,-31.795500E0 ,-31.789700E0 ,-31.786800E0 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/bennett5.shtml
+void testNistBennett5(void) // minimizes Bennett5_functor from two starting points and checks the status, the exact evaluation counts and the fit
+{
+  const int  n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< -2000., 50., 0.8;
+  // do the computation
+  Bennett5_functor functor;
+  LevenbergMarquardt<Bennett5_functor> lm(functor);
+  lm.setMaxfev(1000); // evaluation limit set to 1000; this run is expected to use 758 function evaluations
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1); // NOTE(review): 1 presumably is LevenbergMarquardtSpace::RelativeReductionTooSmall (cf. testNistMGH10) - confirm
+  VERIFY_IS_EQUAL(lm.nfev(), 758);
+  VERIFY_IS_EQUAL(lm.njev(), 744);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.2404744073E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], -2.5235058043E+03);
+  VERIFY_IS_APPROX(x[1], 4.6736564644E+01);
+  VERIFY_IS_APPROX(x[2], 9.3218483193E-01);
+  /*
+   * Second try
+   */
+  x<< -1500., 45., 0.85;
+  // do the computation
+  lm.resetParameters(); // NOTE(review): presumably restores the default solver settings, including the evaluation limit - confirm
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 203);
+  VERIFY_IS_EQUAL(lm.njev(), 192);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.2404744073E-04);
+  // check x
+  VERIFY_IS_APPROX(x[0], -2523.3007865); // should be -2.5235058043E+03
+  VERIFY_IS_APPROX(x[1], 46.735705771); // should be 4.6736564644E+01);
+  VERIFY_IS_APPROX(x[2], 0.93219881891); // should be 9.3218483193E-01);
+}
+
+struct thurber_functor : DenseFunctor<double> // residuals and Jacobian of a cubic-over-cubic rational model in x against _y
+{
+    thurber_functor(void) : DenseFunctor<double>(7,37) {} // sizes match the asserts below: b has 7 entries, fvec has 37
+    static const double _x[37]; // predictor values; defined after the struct
+    static const double _y[37]; // observed responses; defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // writes the 37 residuals (model - _y) into fvec; always returns 0
+    {
+        //        int called=0; printf("call hahn1_functor with  iflag=%d, called=%d\n", iflag, called); if (iflag==1) called++;
+        assert(b.size()==7);
+        assert(fvec.size()==37);
+        for(int i=0; i<37; i++) {
+            double x=_x[i], xx=x*x, xxx=xx*x;
+            fvec[i] = (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) / (1.+b[4]*x+b[5]*xx+b[6]*xxx) - _y[i];
+        }
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // writes the 37x7 Jacobian of the residuals into fjac; always returns 0
+    {
+        assert(b.size()==7);
+        assert(fjac.rows()==37);
+        assert(fjac.cols()==7);
+        for(int i=0; i<37; i++) {
+            double x=_x[i], xx=x*x, xxx=xx*x;
+            double fact = 1./(1.+b[4]*x+b[5]*xx+b[6]*xxx); // reciprocal of the denominator
+            fjac(i,0) = 1.*fact; // columns 0..3: derivatives w.r.t. the numerator coefficients b[0..3]
+            fjac(i,1) = x*fact;
+            fjac(i,2) = xx*fact;
+            fjac(i,3) = xxx*fact;
+            fact = - (b[0]+b[1]*x+b[2]*xx+b[3]*xxx) * fact * fact; // fact is reused: now -numerator/denominator^2
+            fjac(i,4) = x*fact; // columns 4..6: derivatives w.r.t. the denominator coefficients b[4..6]
+            fjac(i,5) = xx*fact;
+            fjac(i,6) = xxx*fact;
+        }
+        return 0;
+    }
+};
+const double thurber_functor::_x[37] = { -3.067E0, -2.981E0, -2.921E0, -2.912E0, -2.840E0, -2.797E0, -2.702E0, -2.699E0, -2.633E0, -2.481E0, -2.363E0, -2.322E0, -1.501E0, -1.460E0, -1.274E0, -1.212E0, -1.100E0, -1.046E0, -0.915E0, -0.714E0, -0.566E0, -0.545E0, -0.400E0, -0.309E0, -0.109E0, -0.103E0, 0.010E0, 0.119E0, 0.377E0, 0.790E0, 0.963E0, 1.006E0, 1.115E0, 1.572E0, 1.841E0, 2.047E0, 2.200E0 };
+const double thurber_functor::_y[37] = { 80.574E0, 84.248E0, 87.264E0, 87.195E0, 89.076E0, 89.608E0, 89.868E0, 90.101E0, 92.405E0, 95.854E0, 100.696E0, 101.060E0, 401.672E0, 390.724E0, 567.534E0, 635.316E0, 733.054E0, 759.087E0, 894.206E0, 990.785E0, 1090.109E0, 1080.914E0, 1122.643E0, 1178.351E0, 1260.531E0, 1273.514E0, 1288.339E0, 1327.543E0, 1353.863E0, 1414.509E0, 1425.208E0, 1421.384E0, 1442.962E0, 1464.350E0, 1468.705E0, 1447.894E0, 1457.628E0};
+
+// http://www.itl.nist.gov/div898/strd/nls/data/thurber.shtml
+void testNistThurber(void) // minimizes thurber_functor from two starting points and checks the status, the exact evaluation counts and the fit
+{
+  const int n=7;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1000 ,1000 ,400 ,40 ,0.7,0.3,0.0 ;
+  // do the computation
+  thurber_functor functor;
+  LevenbergMarquardt<thurber_functor> lm(functor);
+  lm.setFtol(1.E4*NumTraits<double>::epsilon()); // tolerances set to 1e4 * machine epsilon
+  lm.setXtol(1.E4*NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1); // NOTE(review): 1 presumably is LevenbergMarquardtSpace::RelativeReductionTooSmall (cf. testNistMGH10) - confirm
+  VERIFY_IS_EQUAL(lm.nfev(), 39);
+  VERIFY_IS_EQUAL(lm.njev(), 36);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6427082397E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.2881396800E+03);
+  VERIFY_IS_APPROX(x[1], 1.4910792535E+03);
+  VERIFY_IS_APPROX(x[2], 5.8323836877E+02);
+  VERIFY_IS_APPROX(x[3], 7.5416644291E+01);
+  VERIFY_IS_APPROX(x[4], 9.6629502864E-01);
+  VERIFY_IS_APPROX(x[5], 3.9797285797E-01);
+  VERIFY_IS_APPROX(x[6], 4.9727297349E-02);
+
+  /*
+   * Second try
+   */
+  x<< 1300 ,1500 ,500  ,75   ,1    ,0.4  ,0.05  ;
+  // do the computation
+  lm.resetParameters(); // NOTE(review): presumably restores the default solver settings, hence the tolerances are set again below - confirm
+  lm.setFtol(1.E4*NumTraits<double>::epsilon());
+  lm.setXtol(1.E4*NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 29);
+  VERIFY_IS_EQUAL(lm.njev(), 28);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6427082397E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.2881396800E+03);
+  VERIFY_IS_APPROX(x[1], 1.4910792535E+03);
+  VERIFY_IS_APPROX(x[2], 5.8323836877E+02);
+  VERIFY_IS_APPROX(x[3], 7.5416644291E+01);
+  VERIFY_IS_APPROX(x[4], 9.6629502864E-01);
+  VERIFY_IS_APPROX(x[5], 3.9797285797E-01);
+  VERIFY_IS_APPROX(x[6], 4.9727297349E-02);
+}
+
+struct rat43_functor : DenseFunctor<double> // residuals and Jacobian of the model b[0]*(1+exp(b[1]-b[2]*x))^(-1/b[3]) against y
+{
+    rat43_functor(void) : DenseFunctor<double>(4,15) {} // sizes match the asserts below: b has 4 entries, fvec has 15
+    static const double x[15]; // predictor values; defined after the struct
+    static const double y[15]; // observed responses; defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // writes the 15 residuals (model - y) into fvec; always returns 0
+    {
+        assert(b.size()==4);
+        assert(fvec.size()==15);
+        for(int i=0; i<15; i++)
+            fvec[i] = b[0] * pow(1.+exp(b[1]-b[2]*x[i]),-1./b[3]) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // writes the 15x4 Jacobian of the residuals into fjac; always returns 0
+    {
+        assert(b.size()==4);
+        assert(fjac.rows()==15);
+        assert(fjac.cols()==4);
+        for(int i=0; i<15; i++) {
+            double e = exp(b[1]-b[2]*x[i]);
+            double power = -1./b[3]; // exponent applied to (1+e) in the model
+            fjac(i,0) = pow(1.+e, power); // column j holds d(residual i)/d b[j]
+            fjac(i,1) = power*b[0]*e*pow(1.+e, power-1.);
+            fjac(i,2) = -power*b[0]*e*x[i]*pow(1.+e, power-1.);
+            fjac(i,3) = b[0]*power*power*log(1.+e)*pow(1.+e, power); // power*power equals 1/b[3]^2
+        }
+        return 0;
+    }
+};
+const double rat43_functor::x[15] = { 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15. }; // out-of-class definition of the predictor table
+const double rat43_functor::y[15] = { 16.08, 33.83, 65.80, 97.20, 191.55, 326.20, 386.87, 520.53, 590.03, 651.92, 724.93, 699.56, 689.96, 637.56, 717.41 }; // out-of-class definition of the response table
+
+// http://www.itl.nist.gov/div898/strd/nls/data/ratkowsky3.shtml
+void testNistRat43(void) // minimizes rat43_functor from two starting points and checks the status, the exact evaluation counts and the fit
+{
+  const int n=4;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 100., 10., 1., 1.;
+  // do the computation
+  rat43_functor functor;
+  LevenbergMarquardt<rat43_functor> lm(functor);
+  lm.setFtol(1.E6*NumTraits<double>::epsilon()); // tolerances set to 1e6 * machine epsilon for this start
+  lm.setXtol(1.E6*NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1); // NOTE(review): 1 presumably is LevenbergMarquardtSpace::RelativeReductionTooSmall (cf. testNistMGH10) - confirm
+  VERIFY_IS_EQUAL(lm.nfev(), 27);
+  VERIFY_IS_EQUAL(lm.njev(), 20);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7864049080E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 6.9964151270E+02);
+  VERIFY_IS_APPROX(x[1], 5.2771253025E+00);
+  VERIFY_IS_APPROX(x[2], 7.5962938329E-01);
+  VERIFY_IS_APPROX(x[3], 1.2792483859E+00);
+
+  /*
+   * Second try
+   */
+  x<< 700., 5., 0.75, 1.3;
+  // do the computation
+  lm.resetParameters(); // NOTE(review): presumably restores the default solver settings before the tolerances are set again - confirm
+  lm.setFtol(1.E5*NumTraits<double>::epsilon()); // note: 1e5 here, versus 1e6 in the first try
+  lm.setXtol(1.E5*NumTraits<double>::epsilon());
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 9);
+  VERIFY_IS_EQUAL(lm.njev(), 8);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7864049080E+03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 6.9964151270E+02);
+  VERIFY_IS_APPROX(x[1], 5.2771253025E+00);
+  VERIFY_IS_APPROX(x[2], 7.5962938329E-01);
+  VERIFY_IS_APPROX(x[3], 1.2792483859E+00);
+}
+
+
+
+struct eckerle4_functor : DenseFunctor<double> // residuals and Jacobian of the model b[0]/b[1] * exp(-0.5*((x-b[2])/b[1])^2) against y
+{
+    eckerle4_functor(void) : DenseFunctor<double>(3,35) {} // sizes match the asserts below: b has 3 entries, fvec has 35
+    static const double x[35]; // predictor values; defined after the struct
+    static const double y[35]; // observed responses; defined after the struct
+    int operator()(const VectorXd &b, VectorXd &fvec) // writes the 35 residuals (model - y) into fvec; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fvec.size()==35);
+        for(int i=0; i<35; i++)
+            fvec[i] = b[0]/b[1] * exp(-0.5*(x[i]-b[2])*(x[i]-b[2])/(b[1]*b[1])) - y[i];
+        return 0;
+    }
+    int df(const VectorXd &b, MatrixXd &fjac) // writes the 35x3 Jacobian of the residuals into fjac; always returns 0
+    {
+        assert(b.size()==3);
+        assert(fjac.rows()==35);
+        assert(fjac.cols()==3);
+        for(int i=0; i<35; i++) {
+            double b12 = b[1]*b[1]; // b[1] squared; does not depend on i
+            double e = exp(-0.5*(x[i]-b[2])*(x[i]-b[2])/b12); // the exponential factor of the model
+            fjac(i,0) = e / b[1]; // d(residual i)/d b[0]
+            fjac(i,1) = ((x[i]-b[2])*(x[i]-b[2])/b12-1.) * b[0]*e/b12; // d(residual i)/d b[1]
+            fjac(i,2) = (x[i]-b[2])*e*b[0]/b[1]/b12; // d(residual i)/d b[2]
+        }
+        return 0;
+    }
+};
+const double eckerle4_functor::x[35] = { 400.0, 405.0, 410.0, 415.0, 420.0, 425.0, 430.0, 435.0, 436.5, 438.0, 439.5, 441.0, 442.5, 444.0, 445.5, 447.0, 448.5, 450.0, 451.5, 453.0, 454.5, 456.0, 457.5, 459.0, 460.5, 462.0, 463.5, 465.0, 470.0, 475.0, 480.0, 485.0, 490.0, 495.0, 500.0};
+const double eckerle4_functor::y[35] = { 0.0001575, 0.0001699, 0.0002350, 0.0003102, 0.0004917, 0.0008710, 0.0017418, 0.0046400, 0.0065895, 0.0097302, 0.0149002, 0.0237310, 0.0401683, 0.0712559, 0.1264458, 0.2073413, 0.2902366, 0.3445623, 0.3698049, 0.3668534, 0.3106727, 0.2078154, 0.1164354, 0.0616764, 0.0337200, 0.0194023, 0.0117831, 0.0074357, 0.0022732, 0.0008800, 0.0004579, 0.0002345, 0.0001586, 0.0001143, 0.0000710 };
+
+// http://www.itl.nist.gov/div898/strd/nls/data/eckerle4.shtml
+void testNistEckerle4(void) // minimizes eckerle4_functor from two starting points and checks the status, the exact evaluation counts and the fit
+{
+  const int n=3;
+  int info;
+
+  VectorXd x(n);
+
+  /*
+   * First try
+   */
+  x<< 1., 10., 500.;
+  // do the computation
+  eckerle4_functor functor;
+  LevenbergMarquardt<eckerle4_functor> lm(functor); // no solver setting is changed anywhere in this test
+  info = lm.minimize(x);
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1); // NOTE(review): 1 presumably is LevenbergMarquardtSpace::RelativeReductionTooSmall (cf. testNistMGH10) - confirm
+  VERIFY_IS_EQUAL(lm.nfev(), 18);
+  VERIFY_IS_EQUAL(lm.njev(), 15);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.4635887487E-03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.5543827178);
+  VERIFY_IS_APPROX(x[1], 4.0888321754);
+  VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
+
+  /*
+   * Second try
+   */
+  x<< 1.5, 5., 450.;
+  // do the computation
+  info = lm.minimize(x); // same solver object reused without resetParameters(), unlike the other NIST tests in this file
+
+  // check return value
+  VERIFY_IS_EQUAL(info, 1);
+  VERIFY_IS_EQUAL(lm.nfev(), 7);
+  VERIFY_IS_EQUAL(lm.njev(), 6);
+  // check norm^2
+  VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.4635887487E-03);
+  // check x
+  VERIFY_IS_APPROX(x[0], 1.5543827178);
+  VERIFY_IS_APPROX(x[1], 4.0888321754);
+  VERIFY_IS_APPROX(x[2], 4.5154121844E+02);
+}
+
+void test_levenberg_marquardt() // test entry point: runs every enabled Levenberg-Marquardt sub-test once via CALL_SUBTEST
+{
+    // Tests using the examples provided by (c)minpack
+    CALL_SUBTEST(testLmder1());
+    CALL_SUBTEST(testLmder());
+    CALL_SUBTEST(testLmdif1());
+//     CALL_SUBTEST(testLmstr1());
+//     CALL_SUBTEST(testLmstr());
+    CALL_SUBTEST(testLmdif());
+
+    // NIST tests, level of difficulty = "Lower"
+    CALL_SUBTEST(testNistMisra1a());
+    CALL_SUBTEST(testNistChwirut2());
+
+    // NIST tests, level of difficulty = "Average"
+    CALL_SUBTEST(testNistHahn1());
+    CALL_SUBTEST(testNistMisra1d());
+    CALL_SUBTEST(testNistMGH17());
+    CALL_SUBTEST(testNistLanczos1());
+
+    // NIST tests, level of difficulty = "Higher"
+    CALL_SUBTEST(testNistRat42());
+    CALL_SUBTEST(testNistMGH10());
+    CALL_SUBTEST(testNistBoxBOD());
+//     CALL_SUBTEST(testNistMGH09());
+    CALL_SUBTEST(testNistBennett5());
+    CALL_SUBTEST(testNistThurber());
+    CALL_SUBTEST(testNistRat43());
+    CALL_SUBTEST(testNistEckerle4());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/matrix_exponential.cpp b/uppsrc/plugin/Eigen/unsupported/test/matrix_exponential.cpp
new file mode 100644
index 000000000..50dec083d
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/matrix_exponential.cpp
@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "matrix_functions.h"
+
+double binom(int n, int k) // binomial coefficient "n choose k", accumulated in double
+{
+  double res = 1;
+  for (int i=0; i<k; i++) // loop is skipped when k <= 0, so the result is then 1
+    res = res * (n-k+i+1) / (i+1); // running product of (n-k+i+1)/(i+1) for i = 0..k-1
+  return res;
+}
+
+template <typename T> // scalar exponential in the (value, int) form that the tests below pass to matrixFunction()
+T expfn(T x, int) // the unnamed int argument is ignored
+{
+  return std::exp(x);
+}
+
+template <typename T> // T: real scalar type of the 2x2 matrices under test
+void test2dRotation(double tol) // checks matrixFunction(expfn) and exp() of angle*[0 1; -1 0] against the rotation matrix B, within tol
+{
+  Matrix<T,2,2> A, B, C;
+  T angle;
+
+  A << 0, 1, -1, 0;
+  for (int i=0; i<=20; i++)
+  {
+    angle = static_cast<T>(pow(10, i / 5. - 2)); // angle = 10^(i/5 - 2), i.e. from 1e-2 (i=0) up to 1e2 (i=20)
+    B << std::cos(angle), std::sin(angle), -std::sin(angle), std::cos(angle); // expected result
+
+    C = (angle*A).matrixFunction(expfn);
+    std::cout << "test2dRotation: i = " << i << "   error funm = " << relerr(C, B); // relerr() is declared outside this chunk
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+
+    C = (angle*A).exp();
+    std::cout << "   error expm = " << relerr(C, B) << "\n";
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+  }
+}
+
+template <typename T> // T: real type; the matrices under test are 2x2 std::complex<T>
+void test2dHyperbolicRotation(double tol) // checks matrixFunction(expfn) and exp() of [0 i*a; -i*a 0] against [cosh a, i*sinh a; -i*sinh a, cosh a], within tol
+{
+  Matrix<std::complex<T>,2,2> A, B, C;
+  std::complex<T> imagUnit(0,1);
+  T angle, ch, sh;
+
+  for (int i=0; i<=20; i++)
+  {
+    angle = static_cast<T>((i-10) / 2.0); // angle runs from -5 to 5 in steps of 0.5
+    ch = std::cosh(angle);
+    sh = std::sinh(angle);
+    A << 0, angle*imagUnit, -angle*imagUnit, 0;
+    B << ch, sh*imagUnit, -sh*imagUnit, ch; // expected result
+
+    C = A.matrixFunction(expfn);
+    std::cout << "test2dHyperbolicRotation: i = " << i << "   error funm = " << relerr(C, B); // relerr() is declared outside this chunk
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+
+    C = A.exp();
+    std::cout << "   error expm = " << relerr(C, B) << "\n";
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+  }
+}
+
+template <typename T> // T: scalar type of the dynamic-size matrices under test
+void testPascal(double tol) // for sizes 1..19: A has 1,2,... on its subdiagonal; checks that its exponential matches the binomial matrix B, within tol
+{
+  for (int size=1; size<20; size++)
+  {
+    Matrix<T,Dynamic,Dynamic> A(size,size), B(size,size), C(size,size);
+    A.setZero();
+    for (int i=0; i<size-1; i++)
+      A(i+1,i) = static_cast<T>(i+1);
+    B.setZero();
+    for (int i=0; i<size; i++)
+      for (int j=0; j<=i; j++)
+    B(i,j) = static_cast<T>(binom(i,j)); // body of the inner j-loop (the indentation is misleading): lower triangle gets binom(i,j)
+
+    C = A.matrixFunction(expfn);
+    std::cout << "testPascal: size = " << size << "   error funm = " << relerr(C, B); // relerr() is declared outside this chunk
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+
+    C = A.exp();
+    std::cout << "   error expm = " << relerr(C, B) << "\n";
+    VERIFY(C.isApprox(B, static_cast<T>(tol)));
+  }
+}
+
+template<typename MatrixType> // MatrixType: matrix type under test; m is used only for its dimensions
+void randomTest(const MatrixType& m, double tol)
+{
+  /* checks, g_repeat times, that exp(m1) * exp(-m1) is approximately
+     the identity for a random m1, via matrixFunction(expfn) and via exp()
+  */
+  typename MatrixType::Index rows = m.rows();
+  typename MatrixType::Index cols = m.cols();
+  MatrixType m1(rows, cols), m2(rows, cols), identity = MatrixType::Identity(rows, cols);
+
+  typedef typename NumTraits<typename internal::traits<MatrixType>::Scalar>::Real RealScalar; // tol is cast to this real type for isApprox
+
+  for(int i = 0; i < g_repeat; i++) {
+    m1 = MatrixType::Random(rows, cols);
+
+    m2 = m1.matrixFunction(expfn) * (-m1).matrixFunction(expfn);
+    std::cout << "randomTest: error funm = " << relerr(identity, m2); // relerr() is declared outside this chunk
+    VERIFY(identity.isApprox(m2, static_cast<RealScalar>(tol)));
+
+    m2 = m1.exp() * (-m1).exp();
+    std::cout << "   error expm = " << relerr(identity, m2) << "\n";
+    VERIFY(identity.isApprox(m2, static_cast<RealScalar>(tol)));
+  }
+}
+
+void test_matrix_exponential() // test entry point: runs the tests above for several scalar types and sizes, each with its own tolerance
+{
+  CALL_SUBTEST_2(test2dRotation<double>(1e-13));
+  CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
+  CALL_SUBTEST_8(test2dRotation<long double>(1e-13)); 
+  CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14));
+  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5));
+  CALL_SUBTEST_8(test2dHyperbolicRotation<long double>(1e-14));
+  CALL_SUBTEST_6(testPascal<float>(1e-6));
+  CALL_SUBTEST_5(testPascal<double>(1e-15));
+  CALL_SUBTEST_2(randomTest(Matrix2d(), 1e-13)); // the matrix argument only supplies the type and size to randomTest
+  CALL_SUBTEST_7(randomTest(Matrix<double,3,3,RowMajor>(), 1e-13));
+  CALL_SUBTEST_3(randomTest(Matrix4cd(), 1e-13));
+  CALL_SUBTEST_4(randomTest(MatrixXd(8,8), 1e-13));
+  CALL_SUBTEST_1(randomTest(Matrix2f(), 1e-4));
+  CALL_SUBTEST_5(randomTest(Matrix3cf(), 1e-4));
+  CALL_SUBTEST_1(randomTest(Matrix4f(), 1e-4));
+  CALL_SUBTEST_6(randomTest(MatrixXf(8,8), 1e-4));
+  CALL_SUBTEST_9(randomTest(Matrix<long double,Dynamic,Dynamic>(7,7), 1e-13));
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/matrix_function.cpp b/uppsrc/plugin/Eigen/unsupported/test/matrix_function.cpp
new file mode 100644
index 000000000..005c9c15f
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/matrix_function.cpp
@@ -0,0 +1,227 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/MatrixFunctions>
+
+// Variant of VERIFY_IS_APPROX which uses absolute error instead of
+// relative error; it wraps test_isApprox_abs(a, b) in VERIFY.
+#define VERIFY_IS_APPROX_ABS(a, b) VERIFY(test_isApprox_abs(a, b))
+
+template<typename Type1, typename Type2>
+inline bool test_isApprox_abs(const Type1& a, const Type2& b)  // true iff every coefficient of |a-b| is below test_precision<Type1::RealScalar>()
+{
+  return ((a-b).array().abs() < test_precision<typename Type1::RealScalar>()).all();  // coefficient-wise comparison, reduced with all()
+}
+
+
+// Returns a size-by-size matrix with eigenvalues clustered around 0, 1 and 2.
+template<typename MatrixType>
+MatrixType randomMatrixWithRealEivals(const typename MatrixType::Index size)
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  MatrixType diag = MatrixType::Zero(size, size);
+  for (Index i = 0; i < size; ++i) {
+    diag(i, i) = Scalar(RealScalar(internal::random<int>(0,2)))  // cluster centre drawn with random<int>(0,2) ...
+      + internal::random<Scalar>() * Scalar(RealScalar(0.01));  // ... plus a random perturbation scaled by 0.01
+  }
+  MatrixType A = MatrixType::Random(size, size);
+  HouseholderQR<MatrixType> QRofA(A);
+  return QRofA.householderQ().inverse() * diag * QRofA.householderQ();  // Q^-1 * diag * Q: similarity transform by the Householder Q of a random matrix
+}
+
+template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>  // IsComplex defaults from NumTraits of the scalar type
+struct randomMatrixWithImagEivals
+{
+  // Returns a matrix with eigenvalues clustered around 0 and +/- i.
+  static MatrixType run(const typename MatrixType::Index size);  // declaration only in this primary template
+};
+
+// Partial specialization for real matrices (IsComplex == 0)
+template<typename MatrixType>
+struct randomMatrixWithImagEivals<MatrixType, 0>
+{
+  static MatrixType run(const typename MatrixType::Index size)  // builds a real block-diagonal matrix, then applies a random similarity transform
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    MatrixType diag = MatrixType::Zero(size, size);
+    Index i = 0;
+    while (i < size) {
+      Index randomInt = internal::random<Index>(-1, 1);
+      if (randomInt == 0 || i == size-1) {  // 1x1 block: chosen at random, or forced when only one row is left
+        diag(i, i) = internal::random<Scalar>() * Scalar(0.01);  // small diagonal entry, i.e. an eigenvalue near 0
+        ++i;
+      } else {
+        Scalar alpha = Scalar(randomInt) + internal::random<Scalar>() * Scalar(0.01);  // +/-1 plus a perturbation scaled by 0.01
+        diag(i, i+1) = alpha;  // 2x2 block [0 alpha; -alpha 0], whose eigenvalues are +/- i*alpha
+        diag(i+1, i) = -alpha;
+        i += 2;
+      }
+    }
+    MatrixType A = MatrixType::Random(size, size);
+    HouseholderQR<MatrixType> QRofA(A);
+    return QRofA.householderQ().inverse() * diag * QRofA.householderQ();  // Q^-1 * diag * Q with Q from a random matrix
+  }
+};
+
+// Partial specialization for complex matrices (IsComplex == 1)
+template<typename MatrixType>
+struct randomMatrixWithImagEivals<MatrixType, 1>
+{
+  static MatrixType run(const typename MatrixType::Index size)  // builds a complex diagonal matrix, then applies a random similarity transform
+  {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename MatrixType::RealScalar RealScalar;
+    const Scalar imagUnit(0, 1);
+    MatrixType diag = MatrixType::Zero(size, size);
+    for (Index i = 0; i < size; ++i) {
+      diag(i, i) = Scalar(RealScalar(internal::random<Index>(-1, 1))) * imagUnit  // cluster centre: a random<Index>(-1, 1) multiple of i ...
+        + internal::random<Scalar>() * Scalar(RealScalar(0.01));  // ... plus a random perturbation scaled by 0.01
+    }
+    MatrixType A = MatrixType::Random(size, size);
+    HouseholderQR<MatrixType> QRofA(A);
+    return QRofA.householderQ().inverse() * diag * QRofA.householderQ();  // Q^-1 * diag * Q with Q from a random matrix
+  }
+};
+
+
+template<typename MatrixType>
+void testMatrixExponential(const MatrixType& A)  // Checks A.exp() against the generic matrixFunction() path fed with the exponential stem function.
+{
+  typedef typename internal::traits<MatrixType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef std::complex<RealScalar> ComplexScalar;  // the stem function is instantiated on the complex scalar type
+
+  VERIFY_IS_APPROX(A.exp(), A.matrixFunction(internal::stem_function_exp<ComplexScalar>));
+}
+
+template<typename MatrixType>
+void testMatrixLogarithm(const MatrixType& A)  // Checks that log(exp(X)) reproduces X for a suitably rescaled copy X of A.
+{
+  typedef typename internal::traits<MatrixType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  MatrixType scaledA;
+  RealScalar maxImagPartOfSpectrum = A.eigenvalues().imag().cwiseAbs().maxCoeff();  // largest |Im(lambda)| over the eigenvalues of A
+  if (maxImagPartOfSpectrum >= RealScalar(0.9L * EIGEN_PI))
+    scaledA = A * RealScalar(0.9L * EIGEN_PI) / maxImagPartOfSpectrum;  // rescale so the largest |Im(lambda)| becomes 0.9*pi
+  else
+    scaledA = A;
+
+  // identity X.exp().log() = X only holds if |Im(lambda)| < pi for all eigenvalues of X
+  MatrixType expA = scaledA.exp();
+  MatrixType logExpA = expA.log();
+  VERIFY_IS_APPROX(logExpA, scaledA);
+}
+
+template<typename MatrixType>
+void testHyperbolicFunctions(const MatrixType& A)  // Checks A.sinh() and A.cosh() against their definitions in terms of exp(A) and exp(-A).
+{
+  // Need to use absolute error because of possible cancellation when
+  // adding/subtracting exp(A) and exp(-A).
+  VERIFY_IS_APPROX_ABS(A.sinh(), (A.exp() - (-A).exp()) / 2);  // sinh(A) = (exp(A) - exp(-A)) / 2
+  VERIFY_IS_APPROX_ABS(A.cosh(), (A.exp() + (-A).exp()) / 2);  // cosh(A) = (exp(A) + exp(-A)) / 2
+}
+
+template<typename MatrixType>
+void testGonioFunctions(const MatrixType& A)  // Checks A.sin() and A.cos() against the complex-exponential formulas, using absolute error.
+{
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef std::complex<RealScalar> ComplexScalar;
+  typedef Matrix<ComplexScalar, MatrixType::RowsAtCompileTime, 
+                 MatrixType::ColsAtCompileTime, MatrixType::Options> ComplexMatrix;  // complex matrix with the same compile-time sizes and options as MatrixType
+
+  ComplexScalar imagUnit(0,1);
+  ComplexScalar two(2,0);
+
+  ComplexMatrix Ac = A.template cast<ComplexScalar>();  // work in complex arithmetic even when A is real
+  
+  ComplexMatrix exp_iA = (imagUnit * Ac).exp();  // exp(i*A)
+  ComplexMatrix exp_miA = (-imagUnit * Ac).exp();  // exp(-i*A)
+  
+  ComplexMatrix sinAc = A.sin().template cast<ComplexScalar>();
+  VERIFY_IS_APPROX_ABS(sinAc, (exp_iA - exp_miA) / (two*imagUnit));  // sin(A) = (exp(iA) - exp(-iA)) / (2i)
+  
+  ComplexMatrix cosAc = A.cos().template cast<ComplexScalar>();
+  VERIFY_IS_APPROX_ABS(cosAc, (exp_iA + exp_miA) / 2);  // cos(A) = (exp(iA) + exp(-iA)) / 2
+}
+
+template<typename MatrixType>
+void testMatrix(const MatrixType& A)  // Runs the four matrix-function checks (exp, log, hyperbolic, trigonometric) on one matrix A.
+{
+  testMatrixExponential(A);
+  testMatrixLogarithm(A);
+  testHyperbolicFunctions(A);
+  testGonioFunctions(A);
+}
+
+template<typename MatrixType>
+void testMatrixType(const MatrixType& m)  // m only supplies the matrix type and its row count; the test data is generated here.
+{
+  // Matrices with clustered eigenvalues lead to different code paths
+  // in MatrixFunction.h and are thus useful for testing.
+
+  const Index size = m.rows();
+  for (int i = 0; i < g_repeat; i++) {
+    testMatrix(MatrixType::Random(size, size).eval());  // fully random matrix
+    testMatrix(randomMatrixWithRealEivals<MatrixType>(size));  // eigenvalues clustered around 0, 1 and 2
+    testMatrix(randomMatrixWithImagEivals<MatrixType>::run(size));  // eigenvalues clustered around 0 and +/- i
+  }
+}
+
+template<typename MatrixType>
+void testMapRef(const MatrixType& A)  // A only supplies the matrix type and its row count. No results are verified: the calls just have to compile and run.
+{
+  // Test if passing Ref and Map objects is possible
+  // (Regression test for Bug #1796)
+  Index size = A.rows();
+  MatrixType X; X.setRandom(size, size);
+  MatrixType Y(size,size);
+  Ref<      MatrixType> R(Y);  // writable view of Y
+  Ref<const MatrixType> Rc(X);  // read-only view of X
+  Map<      MatrixType> M(Y.data(), size, size);  // writable map over Y's storage
+  Map<const MatrixType> Mc(X.data(), size, size);  // read-only map over X's storage
+
+  X = X*X; // make sure sqrt is possible
+  Y = X.sqrt();
+  R = Rc.sqrt();
+  M = Mc.sqrt();
+  Y = X.exp();
+  R = Rc.exp();
+  M = Mc.exp();
+  X = Y; // make sure log is possible
+  Y = X.log();
+  R = Rc.log();
+  M = Mc.log();
+
+  Y = X.cos() + Rc.cos() + Mc.cos();  // mixes plain, Ref and Map operands in one expression
+  Y = X.sin() + Rc.sin() + Mc.sin();
+
+  Y = X.cosh() + Rc.cosh() + Mc.cosh();
+  Y = X.sinh() + Rc.sinh() + Mc.sinh();
+}
+
+
+void test_matrix_function()  // Entry point: runs testMatrixType and testMapRef over a selection of fixed-size and dynamic matrix types.
+{
+  CALL_SUBTEST_1(testMatrixType(Matrix<float,1,1>()));  // the argument only conveys type and size to testMatrixType
+  CALL_SUBTEST_2(testMatrixType(Matrix3cf()));
+  CALL_SUBTEST_3(testMatrixType(MatrixXf(8,8)));
+  CALL_SUBTEST_4(testMatrixType(Matrix2d()));
+  CALL_SUBTEST_5(testMatrixType(Matrix<double,5,5,RowMajor>()));
+  CALL_SUBTEST_6(testMatrixType(Matrix4cd()));
+  CALL_SUBTEST_7(testMatrixType(MatrixXd(13,13)));
+
+  CALL_SUBTEST_1(testMapRef(Matrix<float,1,1>()));  // Ref/Map checks reuse the subtest numbers of the matching types above
+  CALL_SUBTEST_2(testMapRef(Matrix3cf()));
+  CALL_SUBTEST_3(testMapRef(MatrixXf(8,8)));
+  CALL_SUBTEST_7(testMapRef(MatrixXd(13,13)));
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/matrix_functions.h b/uppsrc/plugin/Eigen/unsupported/test/matrix_functions.h
new file mode 100644
index 000000000..4e2636404
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/matrix_functions.h
@@ -0,0 +1,67 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/MatrixFunctions>
+
+// For complex matrices, any matrix is fine, so the primary template's run() is a no-op.
+template<typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+struct processTriangularMatrix
+{
+  static void run(MatrixType&, MatrixType&, const MatrixType&)  // leaves all three arguments untouched
+  { }
+};
+
+// For real matrices, make sure none of the eigenvalues are negative.
+template<typename MatrixType>
+struct processTriangularMatrix<MatrixType,0>
+{
+  static void run(MatrixType& m, MatrixType& T, const MatrixType& U)  // modifies T in place and overwrites m with U * T * U^T
+  {
+    const Index size = m.cols();
+
+    for (Index i=0; i < size; ++i) {
+      if (i == size - 1 || T.coeff(i+1,i) == 0)  // last row, or zero subdiagonal entry: T(i,i) is a 1x1 diagonal block
+        T.coeffRef(i,i) = std::abs(T.coeff(i,i));  // replace the entry by its absolute value
+      else
+        ++i;  // non-zero subdiagonal: skip the second row of this 2x2 block as well
+    }
+    m = U * T * U.transpose();  // rebuild m from the modified T
+  }
+};
+
+template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>  // IsComplex defaults from NumTraits of the scalar type
+struct generateTestMatrix;
+
+template <typename MatrixType>
+struct generateTestMatrix<MatrixType,0>  // real scalars
+{
+  static void run(MatrixType& result, typename MatrixType::Index size)  // fills result with a random size-by-size matrix post-processed via its real Schur form
+  {
+    result = MatrixType::Random(size, size);
+    RealSchur<MatrixType> schur(result);
+    MatrixType T = schur.matrixT();  // local copy of T, so it can be modified
+    processTriangularMatrix<MatrixType>::run(result, T, schur.matrixU());  // overwrites result with U * T' * U^T (T' = adjusted T)
+  }
+};
+
+template <typename MatrixType>
+struct generateTestMatrix<MatrixType,1>  // complex scalars
+{
+  static void run(MatrixType& result, typename MatrixType::Index size)  // fills result with a plain random size-by-size matrix
+  {
+    result = MatrixType::Random(size, size);
+  }
+};
+
+template <typename Derived, typename OtherDerived>
+typename Derived::RealScalar relerr(const MatrixBase<Derived>& A, const MatrixBase<OtherDerived>& B)  // relative error between A and B
+{
+  return std::sqrt((A - B).cwiseAbs2().sum() / (std::min)(A.cwiseAbs2().sum(), B.cwiseAbs2().sum()));  // sqrt(sum|A-B|^2 / min(sum|A|^2, sum|B|^2)); (std::min) is parenthesized, presumably to dodge a min() macro
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/matrix_power.cpp b/uppsrc/plugin/Eigen/unsupported/test/matrix_power.cpp
new file mode 100644
index 000000000..7ccfacfdf
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/matrix_power.cpp
@@ -0,0 +1,204 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012, 2013 Chen-Pang He <jdh8@ms63.hinet.net>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "matrix_functions.h"
+
+template<typename T>
+void test2dRotation(const T& tol)  // Checks real powers of a fixed 2x2 matrix against explicitly built rotation matrices; tol is the isApprox tolerance.
+{
+  Matrix<T,2,2> A, B, C;
+  T angle, c, s;
+
+  A << 0, 1, -1, 0;  // equals the matrix B below for angle = pi/2
+  MatrixPower<Matrix<T,2,2> > Apow(A);
+
+  for (int i=0; i<=20; ++i) {
+    angle = std::pow(T(10), (i-10) / T(5.));  // 10^((i-10)/5): from 1e-2 up to 1e2
+    c = std::cos(angle);
+    s = std::sin(angle);
+    B << c, s, -s, c;  // expected result
+
+    C = Apow(std::ldexp(angle,1) / T(EIGEN_PI));  // A^(2*angle/pi)
+    std::cout << "test2dRotation: i = " << i << "   error powerm = " << relerr(C,B) << '\n';
+    VERIFY(C.isApprox(B, tol));
+  }
+}
+
+template<typename T>
+void test2dHyperbolicRotation(const T& tol)  // Checks real powers of a complex 2x2 matrix built from cosh(1), i*sinh(1); tol is the isApprox tolerance.
+{
+  Matrix<std::complex<T>,2,2> A, B, C;
+  T angle, ch = std::cosh((T)1);
+  std::complex<T> ish(0, std::sinh((T)1));  // i * sinh(1)
+
+  A << ch, ish, -ish, ch;  // same form as B below, for angle = 1
+  MatrixPower<Matrix<std::complex<T>,2,2> > Apow(A);
+
+  for (int i=0; i<=20; ++i) {
+    angle = std::ldexp(static_cast<T>(i-10), -1);  // (i-10)/2: from -5 to 5 in steps of 0.5
+    ch = std::cosh(angle);
+    ish = std::complex<T>(0, std::sinh(angle));
+    B << ch, ish, -ish, ch;  // expected result
+
+    C = Apow(angle);  // A^angle
+    std::cout << "test2dHyperbolicRotation: i = " << i << "   error powerm = " << relerr(C,B) << '\n';
+    VERIFY(C.isApprox(B, tol));
+  }
+}
+
+template<typename T>
+void test3dRotation(const T& tol)  // Checks that the rotation by `angle` about a random axis equals the unit-angle rotation raised to the power `angle`.
+{
+  Matrix<T,3,1> v;
+  T angle;
+
+  for (int i=0; i<=20; ++i) {
+    v = Matrix<T,3,1>::Random();
+    v.normalize();  // random unit axis
+    angle = std::pow(T(10), (i-10) / T(5.));  // 10^((i-10)/5): from 1e-2 up to 1e2
+    VERIFY(AngleAxis<T>(angle, v).matrix().isApprox(AngleAxis<T>(1,v).matrix().pow(angle), tol));
+  }
+}
+
+template<typename MatrixType>
+void testGeneral(const MatrixType& m, const typename MatrixType::RealScalar& tol)  // Checks three power identities on generated matrices; m only supplies type and row count.
+{
+  typedef typename MatrixType::RealScalar RealScalar;
+  MatrixType m1, m2, m3, m4, m5;
+  RealScalar x, y;
+
+  for (int i=0; i < g_repeat; ++i) {
+    generateTestMatrix<MatrixType>::run(m1, m.rows());
+    MatrixPower<MatrixType> mpow(m1);
+
+    x = internal::random<RealScalar>();
+    y = internal::random<RealScalar>();
+    m2 = mpow(x);  // m1^x
+    m3 = mpow(y);  // m1^y
+
+    m4 = mpow(x+y);
+    m5.noalias() = m2 * m3;
+    VERIFY(m4.isApprox(m5, tol));  // m1^(x+y) == m1^x * m1^y
+
+    m4 = mpow(x*y);
+    m5 = m2.pow(y);
+    VERIFY(m4.isApprox(m5, tol));  // m1^(x*y) == (m1^x)^y
+
+    m4 = (std::abs(x) * m1).pow(y);
+    m5 = std::pow(std::abs(x), y) * m3;
+    VERIFY(m4.isApprox(m5, tol));  // (|x| * m1)^y == |x|^y * m1^y
+  }
+}
+
+template<typename MatrixType>
+void testSingular(const MatrixType& m_const, const typename MatrixType::RealScalar& tol)  // Checks fractional powers (1/2, 1/4, 1/8) of matrices whose first column is zero.
+{
+  // we need to pass by reference in order to prevent errors with
+  // MSVC for aligned data types ...
+  MatrixType& m = const_cast<MatrixType&>(m_const);  // the argument is overwritten below despite the const signature
+
+  const int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex;
+  typedef typename internal::conditional<IsComplex, TriangularView<MatrixType,Upper>, const MatrixType&>::type TriangularType;  // complex: upper-triangular view of T; real: T as-is
+  typename internal::conditional< IsComplex, ComplexSchur<MatrixType>, RealSchur<MatrixType> >::type schur;  // Schur solver matching the scalar kind
+  MatrixType T;
+
+  for (int i=0; i < g_repeat; ++i) {
+    m.setRandom();
+    m.col(0).fill(0);  // a zero column makes m singular
+
+    schur.compute(m);
+    T = schur.matrixT();
+    const MatrixType& U = schur.matrixU();
+    processTriangularMatrix<MatrixType>::run(m, T, U);  // real case: adjusts T and rebuilds m; complex case: no-op
+    MatrixPower<MatrixType> mpow(m);
+
+    T = T.sqrt();  // T now holds the square root of the Schur factor
+    VERIFY(mpow(0.5L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+
+    T = T.sqrt();  // fourth root
+    VERIFY(mpow(0.25L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+
+    T = T.sqrt();  // eighth root
+    VERIFY(mpow(0.125L).isApprox(U * (TriangularType(T) * U.adjoint()), tol));
+  }
+}
+
+template<typename MatrixType>
+void testLogThenExp(const MatrixType& m_const, const typename MatrixType::RealScalar& tol)  // Checks m^x == exp(x * log(m)) for a random scalar x.
+{
+  // we need to pass by reference in order to prevent errors with
+  // MSVC for aligned data types ...
+  MatrixType& m = const_cast<MatrixType&>(m_const);  // the argument is overwritten below despite the const signature
+
+  typedef typename MatrixType::Scalar Scalar;
+  Scalar x;
+
+  for (int i=0; i < g_repeat; ++i) {
+    generateTestMatrix<MatrixType>::run(m, m.rows());  // regenerate m in place, keeping its row count
+    x = internal::random<Scalar>();  // exponent has the matrix scalar type
+    VERIFY(m.pow(x).isApprox((x * m.log()).exp(), tol));
+  }
+}
+
+typedef Matrix<double,3,3,RowMajor>         Matrix3dRowMajor;
+typedef Matrix<long double,3,3>             Matrix3e;
+typedef Matrix<long double,Dynamic,Dynamic> MatrixXe;
+ 
+void test_matrix_power()  // Entry point: runs every matrix-power check over several scalar types and sizes; the numeric argument is the tolerance.
+{
+  CALL_SUBTEST_2(test2dRotation<double>(1e-13));
+  CALL_SUBTEST_1(test2dRotation<float>(2e-5));  // was 1e-5, relaxed for clang 2.8 / linux / x86-64
+  CALL_SUBTEST_9(test2dRotation<long double>(1e-13L));
+  CALL_SUBTEST_2(test2dHyperbolicRotation<double>(1e-14));
+  CALL_SUBTEST_1(test2dHyperbolicRotation<float>(1e-5));
+  CALL_SUBTEST_9(test2dHyperbolicRotation<long double>(1e-14L));
+
+  CALL_SUBTEST_10(test3dRotation<double>(1e-13));
+  CALL_SUBTEST_11(test3dRotation<float>(1e-5));
+  CALL_SUBTEST_12(test3dRotation<long double>(1e-13L));
+
+  CALL_SUBTEST_2(testGeneral(Matrix2d(),         1e-13));  // the matrix argument only supplies type and size
+  CALL_SUBTEST_7(testGeneral(Matrix3dRowMajor(), 1e-13));
+  CALL_SUBTEST_3(testGeneral(Matrix4cd(),        1e-13));
+  CALL_SUBTEST_4(testGeneral(MatrixXd(8,8),      2e-12));
+  CALL_SUBTEST_1(testGeneral(Matrix2f(),         1e-4));
+  CALL_SUBTEST_5(testGeneral(Matrix3cf(),        1e-4));
+  CALL_SUBTEST_8(testGeneral(Matrix4f(),         1e-4));
+  CALL_SUBTEST_6(testGeneral(MatrixXf(2,2),      1e-3)); // see bug 614
+  CALL_SUBTEST_9(testGeneral(MatrixXe(7,7),      1e-13L));
+  CALL_SUBTEST_10(testGeneral(Matrix3d(),        1e-13));
+  CALL_SUBTEST_11(testGeneral(Matrix3f(),        1e-4));
+  CALL_SUBTEST_12(testGeneral(Matrix3e(),        1e-13L));
+
+  CALL_SUBTEST_2(testSingular(Matrix2d(),         1e-13));  // same type/tolerance table as testGeneral above
+  CALL_SUBTEST_7(testSingular(Matrix3dRowMajor(), 1e-13));
+  CALL_SUBTEST_3(testSingular(Matrix4cd(),        1e-13));
+  CALL_SUBTEST_4(testSingular(MatrixXd(8,8),      2e-12));
+  CALL_SUBTEST_1(testSingular(Matrix2f(),         1e-4));
+  CALL_SUBTEST_5(testSingular(Matrix3cf(),        1e-4));
+  CALL_SUBTEST_8(testSingular(Matrix4f(),         1e-4));
+  CALL_SUBTEST_6(testSingular(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_9(testSingular(MatrixXe(7,7),      1e-13L));
+  CALL_SUBTEST_10(testSingular(Matrix3d(),        1e-13));
+  CALL_SUBTEST_11(testSingular(Matrix3f(),        1e-4));
+  CALL_SUBTEST_12(testSingular(Matrix3e(),        1e-13L));
+
+  CALL_SUBTEST_2(testLogThenExp(Matrix2d(),         1e-13));  // same type/tolerance table as testGeneral above
+  CALL_SUBTEST_7(testLogThenExp(Matrix3dRowMajor(), 1e-13));
+  CALL_SUBTEST_3(testLogThenExp(Matrix4cd(),        1e-13));
+  CALL_SUBTEST_4(testLogThenExp(MatrixXd(8,8),      2e-12));
+  CALL_SUBTEST_1(testLogThenExp(Matrix2f(),         1e-4));
+  CALL_SUBTEST_5(testLogThenExp(Matrix3cf(),        1e-4));
+  CALL_SUBTEST_8(testLogThenExp(Matrix4f(),         1e-4));
+  CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2),      1e-3));
+  CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7),      1e-13L));
+  CALL_SUBTEST_10(testLogThenExp(Matrix3d(),        1e-13));
+  CALL_SUBTEST_11(testLogThenExp(Matrix3f(),        1e-4));
+  CALL_SUBTEST_12(testLogThenExp(Matrix3e(),        1e-13L));
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/matrix_square_root.cpp b/uppsrc/plugin/Eigen/unsupported/test/matrix_square_root.cpp
new file mode 100644
index 000000000..ea541e1ea
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/matrix_square_root.cpp
@@ -0,0 +1,31 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "matrix_functions.h"
+
+template<typename MatrixType>
+void testMatrixSqrt(const MatrixType& m)  // Checks that sqrt(A) * sqrt(A) reproduces A; m only supplies type and row count.
+{
+  MatrixType A;
+  generateTestMatrix<MatrixType>::run(A, m.rows());  // generated test matrix with m.rows() rows and columns
+  MatrixType sqrtA = A.sqrt();
+  VERIFY_IS_APPROX(sqrtA * sqrtA, A);
+}
+
+void test_matrix_square_root()  // Entry point: repeats the square-root check g_repeat times for each listed matrix type.
+{
+  for (int i = 0; i < g_repeat; i++) {
+    CALL_SUBTEST_1(testMatrixSqrt(Matrix3cf()));
+    CALL_SUBTEST_2(testMatrixSqrt(MatrixXcd(12,12)));
+    CALL_SUBTEST_3(testMatrixSqrt(Matrix4f()));
+    CALL_SUBTEST_4(testMatrixSqrt(Matrix<double,Dynamic,Dynamic,RowMajor>(9, 9)));
+    CALL_SUBTEST_5(testMatrixSqrt(Matrix<float,1,1>()));  // both 1x1 cases share subtest 5
+    CALL_SUBTEST_5(testMatrixSqrt(Matrix<std::complex<float>,1,1>()));
+  }
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/minres.cpp b/uppsrc/plugin/Eigen/unsupported/test/minres.cpp
new file mode 100644
index 000000000..8b300b78a
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/minres.cpp
@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
+// Copyright (C) 2011 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+#include <cmath>
+
+#include "../../test/sparse_solver.h"
+#include <Eigen/IterativeSolvers>
+
+template<typename T> void test_minres_T()  // Runs the shared SPD sparse-solver check on MINRES for several triangular-part / preconditioner combinations.
+{
+  // Identity preconditioner
+  MINRES<SparseMatrix<T>, Lower, IdentityPreconditioner    > minres_colmajor_lower_I;
+  MINRES<SparseMatrix<T>, Upper, IdentityPreconditioner    > minres_colmajor_upper_I;
+
+  // Diagonal preconditioner
+  MINRES<SparseMatrix<T>, Lower, DiagonalPreconditioner<T> > minres_colmajor_lower_diag;
+  MINRES<SparseMatrix<T>, Upper, DiagonalPreconditioner<T> > minres_colmajor_upper_diag;
+  MINRES<SparseMatrix<T>, Lower|Upper, DiagonalPreconditioner<T> > minres_colmajor_uplo_diag;
+  
+  // call tests for SPD matrix (check_sparse_spd_solving comes from the included sparse_solver.h)
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_I) );
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_I) );
+    
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_lower_diag)  );
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_upper_diag)  );
+  CALL_SUBTEST( check_sparse_spd_solving(minres_colmajor_uplo_diag)  );
+    
+  // TODO: symmetric semi-definite matrix (not covered yet)
+  // TODO: symmetric indefinite matrix (not covered yet)
+
+}
+
+void test_minres()  // Entry point: only the double instantiation is run.
+{
+  CALL_SUBTEST_1(test_minres_T<double>());
+//  CALL_SUBTEST_2(test_minres_T<std::complex<double> >());  // complex variant is disabled
+
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/mpreal/mpreal.h b/uppsrc/plugin/Eigen/unsupported/test/mpreal/mpreal.h
new file mode 100644
index 000000000..8404f1ff8
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/mpreal/mpreal.h
@@ -0,0 +1,3104 @@
+/*
+    MPFR C++: Multi-precision floating point number class for C++.
+    Based on MPFR library:    http://mpfr.org
+
+    Project homepage:    http://www.holoborodko.com/pavel/mpfr
+    Contact e-mail:      pavel@holoborodko.com
+
+    Copyright (c) 2008-2015 Pavel Holoborodko
+
+    Contributors:
+    Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman,
+    Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen,
+    Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng,
+    Alexei Zubanov, Jauhien Piatlicki, Victor Berger, John Westwood,
+    Petr Aleksandrov, Orion Poplawski, Charles Karney, Arash Partow,
+    Rodney James, Jorge Leitao.
+
+    Licensing:
+    (A) MPFR C++ is under GNU General Public License ("GPL").
+
+    (B) Non-free licenses may also be purchased from the author, for users who
+        do not want their programs protected by the GPL.
+
+        The non-free licenses are for users that wish to use MPFR C++ in
+        their products but are unwilling to release their software
+        under the GPL (which would require them to release source code
+        and allow free redistribution).
+
+        Such users can purchase an unlimited-use license from the author.
+        Contact us for more details.
+
+    GNU General Public License ("GPL") copyright permissions statement:
+    **************************************************************************
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef __MPREAL_H__
+#define __MPREAL_H__
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <cfloat>
+#include <cmath>
+#include <cstring>
+#include <limits>
+#include <complex>
+#include <algorithm>
+
+// Options
+#define MPREAL_HAVE_MSVC_DEBUGVIEW              // Enable Debugger Visualizer for "Debug" builds in MSVC.
+#define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS  // Enable extended std::numeric_limits<mpfr::mpreal> specialization.
+                                                // Meaning that "digits", "round_style" and similar members are defined as functions, not constants.
+                                                // See std::numeric_limits<mpfr::mpreal> at the end of the file for more information.
+
+// Library version
+#define MPREAL_VERSION_MAJOR 3
+#define MPREAL_VERSION_MINOR 6
+#define MPREAL_VERSION_PATCHLEVEL 2
+#define MPREAL_VERSION_STRING "3.6.2"
+
+// Pick an implementation of IsInf(x) per compiler, detected using signatures from http://predef.sourceforge.net/
+#if defined(__GNUC__)
+    #define IsInf(x) (isinf)(x)                 // GNU C++/Intel ICC compiler on Linux; the parentheses around isinf suppress function-like macro expansion
+#elif defined(_MSC_VER)                         // Microsoft Visual C++
+    #define IsInf(x) (!_finite(x))              // NOTE(review): presumably also true for NaN, unlike isinf - confirm against the _finite documentation
+#else
+    #define IsInf(x) (std::isinf)(x)              // any other compiler: just hope for C99 conformance
+#endif
+
+// A Clang feature extension to determine compiler features.
+#ifndef __has_feature
+    #define __has_feature(x) 0
+#endif
+
+// Detect support for r-value references (move semantic). Borrowed from Eigen.
+#if (__has_feature(cxx_rvalue_references) || \
+       defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
+      (defined(_MSC_VER) && _MSC_VER >= 1600))
+
+    #define MPREAL_HAVE_MOVE_SUPPORT
+
+    // Use the _mpfr_d field of the mpfr_t structure as an "initialized" flag: non-zero means initialized, and setting it to 0 marks the value as uninitialized
+    #define mpfr_is_initialized(x)      (0 != (x)->_mpfr_d)
+    #define mpfr_set_uninitialized(x)   ((x)->_mpfr_d = 0 )
+#endif
+
+// Detect support for explicit converters.
+#if (__has_feature(cxx_explicit_conversions) || \
+       (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
+       (defined(_MSC_VER) && _MSC_VER >= 1800))
+
+    #define MPREAL_HAVE_EXPLICIT_CONVERTERS
+#endif
+
+#define MPFR_USE_INTMAX_T   // Enable 64-bit integer types - should be defined before mpfr.h
+
+#if defined(MPREAL_HAVE_MSVC_DEBUGVIEW) && defined(_MSC_VER) && defined(_DEBUG)
+    #define MPREAL_MSVC_DEBUGVIEW_CODE     DebugView = toString();
+    #define MPREAL_MSVC_DEBUGVIEW_DATA     std::string DebugView;
+#else
+    #define MPREAL_MSVC_DEBUGVIEW_CODE
+    #define MPREAL_MSVC_DEBUGVIEW_DATA
+#endif
+
+#include <mpfr.h>
+
+#if (MPFR_VERSION < MPFR_VERSION_NUM(3,0,0))
+    #include <cstdlib>                          // Needed for random()
+#endif
+
+// Less important options
+#define MPREAL_DOUBLE_BITS_OVERFLOW -1          // Triggers overflow exception during conversion to double if mpreal
+                                                // cannot fit in MPREAL_DOUBLE_BITS_OVERFLOW bits
+                                                // = -1 disables overflow checks (default)
+
+// Fast replacement for mpfr_set_zero(x, +1):
+// (a) uses low-level data members, might not be compatible with new versions of MPFR
+// (b) sign is not set, add (x)->_mpfr_sign = 1;
+#define mpfr_set_zero_fast(x)  ((x)->_mpfr_exp = __MPFR_EXP_ZERO)
+
+#if defined(__GNUC__)
+  #define MPREAL_PERMISSIVE_EXPR __extension__
+#else
+  #define MPREAL_PERMISSIVE_EXPR
+#endif
+
+namespace mpfr {
+
+class mpreal {
+private:
+    mpfr_t mp;
+
+public:
+
+    // Get default rounding mode & precision
+    inline static mp_rnd_t   get_default_rnd()    {    return (mp_rnd_t)(mpfr_get_default_rounding_mode());       }
+    inline static mp_prec_t  get_default_prec()   {    return mpfr_get_default_prec();                            }
+
+    // Constructors && type conversions
+    mpreal();
+    mpreal(const mpreal& u);
+    mpreal(const mpf_t u);
+    mpreal(const mpz_t u,                  mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const mpq_t u,                  mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const double u,                 mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const long double u,            mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const unsigned long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const long long int u,          mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const unsigned long int u,      mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const unsigned int u,           mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const long int u,               mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const int u,                    mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
+
+    // Construct mpreal from mpfr_t structure.
+    // shared = true avoids a deep copy: the new mpreal and 'u' then share the same data & pointers.
+    mpreal(const mpfr_t  u, bool shared = false);
+
+    mpreal(const char* s,             mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
+    mpreal(const std::string& s,      mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
+
+    ~mpreal();
+
+#ifdef MPREAL_HAVE_MOVE_SUPPORT
+    mpreal& operator=(mpreal&& v);
+    mpreal(mpreal&& u);
+#endif
+
+    // Operations
+    // =
+    // +, -, *, /, ++, --, <<, >>
+    // *=, +=, -=, /=,
+    // <, >, ==, <=, >=
+
+    // =
+    mpreal& operator=(const mpreal& v);
+    mpreal& operator=(const mpf_t v);
+    mpreal& operator=(const mpz_t v);
+    mpreal& operator=(const mpq_t v);
+    mpreal& operator=(const long double v);
+    mpreal& operator=(const double v);
+    mpreal& operator=(const unsigned long int v);
+    mpreal& operator=(const unsigned long long int v);
+    mpreal& operator=(const long long int v);
+    mpreal& operator=(const unsigned int v);
+    mpreal& operator=(const long int v);
+    mpreal& operator=(const int v);
+    mpreal& operator=(const char* s);
+    mpreal& operator=(const std::string& s);
+    template <typename real_t> mpreal& operator= (const std::complex<real_t>& z);
+
+    // +
+    mpreal& operator+=(const mpreal& v);
+    mpreal& operator+=(const mpf_t v);
+    mpreal& operator+=(const mpz_t v);
+    mpreal& operator+=(const mpq_t v);
+    mpreal& operator+=(const long double u);
+    mpreal& operator+=(const double u);
+    mpreal& operator+=(const unsigned long int u);
+    mpreal& operator+=(const unsigned int u);
+    mpreal& operator+=(const long int u);
+    mpreal& operator+=(const int u);
+
+    mpreal& operator+=(const long long int  u);
+    mpreal& operator+=(const unsigned long long int u);
+    mpreal& operator-=(const long long int  u);
+    mpreal& operator-=(const unsigned long long int u);
+    mpreal& operator*=(const long long int  u);
+    mpreal& operator*=(const unsigned long long int u);
+    mpreal& operator/=(const long long int  u);
+    mpreal& operator/=(const unsigned long long int u);
+
+    const mpreal operator+() const;
+    mpreal& operator++ ();
+    const mpreal  operator++ (int);
+
+    // - : in-place subtraction, unary minus, "scalar - mpreal" friends and decrement (NOTE(review): no mpf_t overload, unlike +=)
+    mpreal& operator-=(const mpreal& v);
+    mpreal& operator-=(const mpz_t v);
+    mpreal& operator-=(const mpq_t v);
+    mpreal& operator-=(const long double u);
+    mpreal& operator-=(const double u);
+    mpreal& operator-=(const unsigned long int u);
+    mpreal& operator-=(const unsigned int u);
+    mpreal& operator-=(const long int u);
+    mpreal& operator-=(const int u);
+    const mpreal operator-() const;
+    friend const mpreal operator-(const unsigned long int b, const mpreal& a);
+    friend const mpreal operator-(const unsigned int b,      const mpreal& a);
+    friend const mpreal operator-(const long int b,          const mpreal& a);
+    friend const mpreal operator-(const int b,               const mpreal& a);
+    friend const mpreal operator-(const double b,            const mpreal& a);
+    mpreal& operator-- ();
+    const mpreal  operator-- (int);
+
+    // * : in-place multiplication, one overload per supported right-hand operand type (no mpf_t overload is declared)
+    mpreal& operator*=(const mpreal& v);
+    mpreal& operator*=(const mpz_t v);
+    mpreal& operator*=(const mpq_t v);
+    mpreal& operator*=(const long double v);
+    mpreal& operator*=(const double v);
+    mpreal& operator*=(const unsigned long int v);
+    mpreal& operator*=(const unsigned int v);
+    mpreal& operator*=(const long int v);
+    mpreal& operator*=(const int v);
+
+    // / : in-place division plus "scalar / mpreal" friends (no mpf_t overload is declared)
+    mpreal& operator/=(const mpreal& v);
+    mpreal& operator/=(const mpz_t v);
+    mpreal& operator/=(const mpq_t v);
+    mpreal& operator/=(const long double v);
+    mpreal& operator/=(const double v);
+    mpreal& operator/=(const unsigned long int v);
+    mpreal& operator/=(const unsigned int v);
+    mpreal& operator/=(const long int v);
+    mpreal& operator/=(const int v);
+    friend const mpreal operator/(const unsigned long int b, const mpreal& a);
+    friend const mpreal operator/(const unsigned int b,      const mpreal& a);
+    friend const mpreal operator/(const long int b,          const mpreal& a);
+    friend const mpreal operator/(const int b,               const mpreal& a);
+    friend const mpreal operator/(const double b,            const mpreal& a);
+
+    // <<= : fast multiplication by 2^u, for each built-in integer width of u
+    mpreal& operator<<=(const unsigned long int u);
+    mpreal& operator<<=(const unsigned int u);
+    mpreal& operator<<=(const long int u);
+    mpreal& operator<<=(const int u);
+
+    // >>= : fast division by 2^u, for each built-in integer width of u
+    mpreal& operator>>=(const unsigned long int u);
+    mpreal& operator>>=(const unsigned int u);
+    mpreal& operator>>=(const long int u);
+    mpreal& operator>>=(const int u);
+
+    // Named conversions to built-in types; integer targets default to GMP_RNDZ, floating-point targets to GMP_RNDN
+    bool               toBool      (                        )    const;
+    long               toLong      (mp_rnd_t mode = GMP_RNDZ)    const;
+    unsigned long      toULong     (mp_rnd_t mode = GMP_RNDZ)    const;
+    long long          toLLong     (mp_rnd_t mode = GMP_RNDZ)    const;
+    unsigned long long toULLong    (mp_rnd_t mode = GMP_RNDZ)    const;
+    float              toFloat     (mp_rnd_t mode = GMP_RNDN)    const;
+    double             toDouble    (mp_rnd_t mode = GMP_RNDN)    const;
+    long double        toLDouble   (mp_rnd_t mode = GMP_RNDN)    const;
+
+#if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)   // explicit casts; each one forwards to the matching to*() method
+    explicit operator bool() const               { return this->toBool();                        }
+    explicit operator int() const                { return static_cast<int>(this->toLong());      }  // narrowed from long
+    explicit operator long() const               { return this->toLong();                        }
+    explicit operator long long() const          { return this->toLLong();                       }
+    explicit operator unsigned() const           { return static_cast<unsigned>(this->toULong()); } // narrowed from unsigned long
+    explicit operator unsigned long() const      { return this->toULong();                       }
+    explicit operator unsigned long long() const { return this->toULLong();                      }
+    explicit operator float() const              { return this->toFloat();                       }
+    explicit operator double() const             { return this->toDouble();                      }
+    explicit operator long double() const        { return this->toLDouble();                     }
+#endif
+
+    // Raw handle accessors, so that an mpreal can be passed directly to mpfr_* functions
+    ::mpfr_ptr    mpfr_ptr();                 // non-const object: handle of type ::mpfr_ptr
+    ::mpfr_srcptr mpfr_ptr()    const;        // const object: handle of type ::mpfr_srcptr
+    ::mpfr_srcptr mpfr_srcptr() const;        // explicitly named ::mpfr_srcptr accessor
+
+    // Convert the value to a string with n significant digits in base b, rounded by mode.
+    // n = -1 (the default) requests the maximum available digits.
+    std::string toString(int n = -1, int b = 10, mp_rnd_t mode = mpreal::get_default_rnd()) const;
+    // Format-string overload, declared only for MPFR >= 2.4.0 (presumably an MPFR printf-style format; TODO confirm)
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
+    std::string toString(const std::string& format) const;
+#endif
+    // Stream helper taking and returning the stream (presumably writes the value to os; TODO confirm)
+    std::ostream& output(std::ostream& os) const;
+
+    // Math functions (friend declarations; most take the rounding mode as their last parameter)
+    friend const mpreal sqr (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal sqrt(const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal sqrt(const unsigned long int v, mp_rnd_t rnd_mode);
+    friend const mpreal cbrt(const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal root(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
+    friend const mpreal pow (const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
+    friend const mpreal pow (const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode);
+    friend const mpreal pow (const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode);
+    friend const mpreal pow (const mpreal& a, const long int b, mp_rnd_t rnd_mode);
+    friend const mpreal pow (const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode);
+    friend const mpreal pow (const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode);
+    friend const mpreal fabs(const mpreal& v, mp_rnd_t rnd_mode);
+    // Absolute value, dim, scaling by powers of two, comparison of magnitudes
+    friend const mpreal abs(const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
+    friend inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
+    friend inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
+    friend inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
+    friend inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
+    friend int cmpabs(const mpreal& a,const mpreal& b);
+
+    friend const mpreal log  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal log2 (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal logb (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal log10(const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal exp  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal exp2 (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal exp10(const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal log1p(const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal expm1(const mpreal& v, mp_rnd_t rnd_mode);
+    // Trigonometric family; sin_cos fills both s and c and returns an int
+    friend const mpreal cos(const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal sin(const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal tan(const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal sec(const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal csc(const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal cot(const mpreal& v, mp_rnd_t rnd_mode);
+    friend int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
+    // Inverse trigonometric family
+    friend const mpreal acos  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal asin  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal atan  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode);
+    friend const mpreal acot  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal asec  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal acsc  (const mpreal& v, mp_rnd_t rnd_mode);
+    // Hyperbolic family and its inverses
+    friend const mpreal cosh  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal sinh  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal tanh  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal sech  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal csch  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal coth  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal acosh (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal asinh (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal atanh (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal acoth (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal asech (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal acsch (const mpreal& v, mp_rnd_t rnd_mode);
+
+    friend const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
+    // fac_ui is the only function in this group that also takes the result precision
+    friend const mpreal fac_ui (unsigned long int v,  mp_prec_t prec, mp_rnd_t rnd_mode);
+    friend const mpreal eint   (const mpreal& v, mp_rnd_t rnd_mode);
+    // Remaining friends: gamma/zeta/erf/Bessel families, fused operations, agm, sum, sgn
+    friend const mpreal gamma    (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal tgamma   (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal lngamma  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal lgamma   (const mpreal& v, int *signp, mp_rnd_t rnd_mode);
+    friend const mpreal zeta     (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal erf      (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal erfc     (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal besseljn (long n, const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal bessely0 (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal bessely1 (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal fma      (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
+    friend const mpreal fms      (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
+    friend const mpreal agm      (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode);
+    friend const mpreal sum      (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t rnd_mode);
+    friend int sgn(const mpreal& v); // returns -1 or +1
+
+// Declared only when built against MPFR >= 2.4.0
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
+    friend int          sinh_cosh   (mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal li2         (const mpreal& v,                       mp_rnd_t rnd_mode);
+    friend const mpreal fmod        (const mpreal& x, const mpreal& y,      mp_rnd_t rnd_mode);
+    friend const mpreal rec_sqrt    (const mpreal& v,                       mp_rnd_t rnd_mode);
+
+    // MATLAB's semantic equivalents
+    friend const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Remainder after division
+    friend const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Modulus after division
+#endif
+// Declared only when built against MPFR >= 3.0.0
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
+    friend const mpreal digamma (const mpreal& v,        mp_rnd_t rnd_mode);
+    friend const mpreal ai      (const mpreal& v,        mp_rnd_t rnd_mode);
+    friend const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode);     // use gmp_randinit_default() to init state, gmp_randclear() to clear
+#endif
+// Declared only when built against MPFR >= 3.1.0
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
+    friend const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode);     // use gmp_randinit_default() to init state, gmp_randclear() to clear
+    friend const mpreal grandom (unsigned int seed);
+#endif
+
+    // Uniformly distributed random number in [0,1], generated with the
+    // Mersenne-Twister algorithm by default.
+    // The parameter sets the seed, e.g.: random((unsigned)time(NULL))
+    // See urandom() for more precise control.
+    friend const mpreal random(unsigned int seed);
+
+    // Splits an mpreal value into its fractional and integer parts.
+    // Returns the fractional part and stores the integer part in n.
+    friend const mpreal modf(const mpreal& v, mpreal& n);
+
+    // Constants
+    // Remember to call mpfr_free_cache() in every thread that uses these constant functions.
+    friend const mpreal const_log2      (mp_prec_t prec, mp_rnd_t rnd_mode);
+    friend const mpreal const_pi        (mp_prec_t prec, mp_rnd_t rnd_mode);
+    friend const mpreal const_euler     (mp_prec_t prec, mp_rnd_t rnd_mode);
+    friend const mpreal const_catalan   (mp_prec_t prec, mp_rnd_t rnd_mode);
+
+    // Returns +inf if sign >= 0, otherwise -inf
+    friend const mpreal const_infinity(int sign, mp_prec_t prec);
+
+    // Stream output / input
+    friend std::ostream& operator<<(std::ostream& os, const mpreal& v);
+    friend std::istream& operator>>(std::istream& is, mpreal& v);
+
+    // Integer-related functions (ceil/floor/round/trunc take no rounding mode; the rint_* variants do)
+    friend const mpreal rint (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal ceil (const mpreal& v);
+    friend const mpreal floor(const mpreal& v);
+    friend const mpreal round(const mpreal& v);
+    friend const mpreal trunc(const mpreal& v);
+    friend const mpreal rint_ceil   (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal rint_floor  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal rint_round  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal rint_trunc  (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal frac        (const mpreal& v, mp_rnd_t rnd_mode);
+    friend const mpreal remainder   (         const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
+    friend const mpreal remquo      (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
+
+    // Miscellaneous functions
+    friend const mpreal nexttoward (const mpreal& x, const mpreal& y);
+    friend const mpreal nextabove  (const mpreal& x);
+    friend const mpreal nextbelow  (const mpreal& x);
+
+    // use gmp_randinit_default() to init state, gmp_randclear() to clear
+    friend const mpreal urandomb (gmp_randstate_t& state);
+
+// MPFR <= 2.4.2 specifics (the guard below is inclusive of 2.4.2)
+#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
+    friend const mpreal random2 (mp_size_t size, mp_exp_t exp);
+#endif
+
+    // Instance checkers; the parenthesised names keep same-named function-like macros from expanding here
+    friend bool (isnan)    (const mpreal& v);
+    friend bool (isinf)    (const mpreal& v);
+    friend bool (isfinite) (const mpreal& v);
+
+    friend bool isnum    (const mpreal& v);
+    friend bool iszero   (const mpreal& v);
+    friend bool isint    (const mpreal& v);
+    // isregular is declared only for MPFR >= 3.0.0
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
+    friend bool isregular(const mpreal& v);
+#endif
+
+    // Set/get per-instance precision
+    inline mp_prec_t    get_prec() const;
+    inline void         set_prec(mp_prec_t prec, mp_rnd_t rnd_mode = get_default_rnd());    // Change precision with rounding mode
+
+    // Aliases for get_prec(), set_prec() - needed for compatibility with std::complex<mpreal> interface
+    inline mpreal&      setPrecision(int Precision, mp_rnd_t RoundingMode = get_default_rnd());
+    inline int          getPrecision() const;
+
+    // Set mpreal to +/- inf, NaN, +/-0, or change its sign; each setter returns mpreal&
+    mpreal&        setInf  (int Sign = +1);
+    mpreal&        setNan  ();
+    mpreal&        setZero (int Sign = +1);
+    mpreal&        setSign (int Sign, mp_rnd_t RoundingMode = get_default_rnd());
+
+    // Exponent access and range handling (NOTE(review): get_exp() is declared non-const)
+    mp_exp_t get_exp();
+    int set_exp(mp_exp_t e);
+    int check_range  (int t, mp_rnd_t rnd_mode = get_default_rnd());
+    int subnormalize (int t, mp_rnd_t rnd_mode = get_default_rnd());
+
+    // Check used by the double constructor and operator=(double) when MPREAL_DOUBLE_BITS_OVERFLOW > -1
+    inline bool fits_in_bits(double x, int n);
+
+    // Set global (static) defaults for precision and rounding mode
+    static void            set_default_prec(mp_prec_t prec);
+    static void            set_default_rnd(mp_rnd_t rnd_mode);
+    // Static accessors for the exponent range (emin / emax) and its limits
+    static mp_exp_t  get_emin (void);
+    static mp_exp_t  get_emax (void);
+    static mp_exp_t  get_emin_min (void);
+    static mp_exp_t  get_emin_max (void);
+    static mp_exp_t  get_emax_min (void);
+    static mp_exp_t  get_emax_max (void);
+    static int       set_emin (mp_exp_t exp);
+    static int       set_emax (mp_exp_t exp);
+
+    // Efficient swapping of two mpreal values - needed for std algorithms
+    friend void swap(mpreal& x, mpreal& y);
+    // fmax / fmin of two values, with an explicit rounding mode
+    friend const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
+    friend const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
+
+private:
+    // Human-friendly debug preview in Visual Studio.
+    // Put one of these lines:
+    //
+    // mpfr::mpreal=<DebugView>                              ; Show value only
+    // mpfr::mpreal=<DebugView>, <mp[0]._mpfr_prec,u>bits    ; Show value & precision
+    //
+    // at the beginning of
+    // [Visual Studio Installation Folder]\Common7\Packages\Debugger\autoexp.dat
+    MPREAL_MSVC_DEBUGVIEW_DATA
+
+    // Releases the given handle; when MPREAL_HAVE_MOVE_SUPPORT is defined it first checks mpfr_is_initialized().
+    void clear(::mpfr_ptr);
+};
+
+//////////////////////////////////////////////////////////////////////////
+// Exception thrown when a double is rejected by the MPREAL_DOUBLE_BITS_OVERFLOW check.
+class conversion_overflow : public std::exception {
+public:
+    std::string why() const { return "inexact conversion from floating point"; }  // const, so it works through catch(const conversion_overflow&)
+};
+
+//////////////////////////////////////////////////////////////////////////
+// Constructors & converters
+// Default constructor: creates mp number and initializes it to 0.
+inline mpreal::mpreal()
+{   // Default construction: default-precision storage, value set to zero.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_init2(self, mpreal::get_default_prec());
+    mpfr_set_zero_fast(self);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const mpreal& u)
+{   // Copy construction: initialise with u's precision, then copy u's value.
+    ::mpfr_srcptr const src = u.mpfr_srcptr();
+    mpfr_init2(mpfr_ptr(), mpfr_get_prec(src));
+    mpfr_set  (mpfr_ptr(), src, mpreal::get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+#ifdef MPREAL_HAVE_MOVE_SUPPORT
+inline mpreal::mpreal(mpreal&& other)
+{   // Move construction: mark our handle uninitialized, then swap it with other's.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_set_uninitialized(self);           // after the swap "other" holds no pointer to actual data
+    mpfr_swap(self, other.mpfr_ptr());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal& mpreal::operator=(mpreal&& other)
+{   // Move assignment: exchange handles with other, which ends up holding our previous contents.
+    ::mpfr_ptr const donor = other.mpfr_ptr();
+    mpfr_swap(mpfr_ptr(), donor);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+#endif
+
+inline mpreal::mpreal(const mpfr_t  u, bool shared)
+{
+    // shared == true : bitwise-copy the mpfr_t struct itself (nothing is allocated here)
+    // shared == false: initialise own storage with u's precision and copy u's value
+    ::mpfr_ptr const self = mpfr_ptr();
+    if(!shared)
+    {
+        mpfr_init2(self, mpfr_get_prec(u));
+        mpfr_set  (self, u, mpreal::get_default_rnd());
+    }
+    else std::memcpy(self, u, sizeof(mpfr_t));
+
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const mpf_t u)
+{   // Build from a GMP mpf_t, using the precision reported by mpf_get_prec(u).
+    const mp_prec_t bits = (mp_prec_t) mpf_get_prec(u); // (gmp: mp_bitcnt_t) unsigned long -> long (mpfr: mp_prec_t)
+    mpfr_init2(mpfr_ptr(), bits);
+    mpfr_set_f(mpfr_ptr(), u, mpreal::get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const mpz_t u, mp_prec_t prec, mp_rnd_t mode)
+{   // Build from a GMP integer: initialise with precision prec, then store u rounded by mode.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_init2(self, prec);
+    mpfr_set_z(self, u, mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const mpq_t u, mp_prec_t prec, mp_rnd_t mode)
+{   // Build from a GMP rational: initialise with precision prec, then store u rounded by mode.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_init2(self, prec);
+    mpfr_set_q(self, u, mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const double u, mp_prec_t prec, mp_rnd_t mode)
+{
+    // Build from a double rounded by mode into prec bits. When MPREAL_DOUBLE_BITS_OVERFLOW
+    // is enabled (> -1), a value failing fits_in_bits() raises conversion_overflow.
+    mpfr_init2(mpfr_ptr(), prec);
+#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
+    if(!fits_in_bits(u, MPREAL_DOUBLE_BITS_OVERFLOW))
+    {
+        clear(mpfr_ptr());  // ~mpreal() never runs for a throwing constructor, so free the storage here
+        throw conversion_overflow();
+    }
+#endif
+    mpfr_set_d(mpfr_ptr(), u, mode);
+
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const long double u, mp_prec_t prec, mp_rnd_t mode)
+{   // Build from a long double: initialise with precision prec, then store u rounded by mode.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_init2(self, prec);
+    mpfr_set_ld(self, u, mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const unsigned long long int u, mp_prec_t prec, mp_rnd_t mode)
+{   // Build from an unsigned long long via mpfr_set_uj.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_init2(self, prec);
+    mpfr_set_uj(self, u, mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const long long int u, mp_prec_t prec, mp_rnd_t mode)
+{   // Build from a long long via mpfr_set_sj.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_init2(self, prec);
+    mpfr_set_sj(self, u, mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode)
+{   // Build from an unsigned long via mpfr_set_ui.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_init2(self, prec);
+    mpfr_set_ui(self, u, mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode)
+{   // Build from an unsigned int; the value is passed on to mpfr_set_ui.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_init2(self, prec);
+    mpfr_set_ui(self, u, mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode)
+{   // Build from a long via mpfr_set_si.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_init2(self, prec);
+    mpfr_set_si(self, u, mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode)
+{   // Build from an int; the value is passed on to mpfr_set_si.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_init2(self, prec);
+    mpfr_set_si(self, u, mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
+{   // Build from a C string in the given base; the mpfr_set_str result is not inspected.
+    ::mpfr_ptr const self = mpfr_ptr();
+    mpfr_init2(self, prec);
+    mpfr_set_str(self, s, base, mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
+{   // Build from a std::string in the given base; the mpfr_set_str result is not inspected.
+    const char* const text = s.c_str();
+    mpfr_init2(mpfr_ptr(), prec);
+    mpfr_set_str(mpfr_ptr(), text, base, mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+inline void mpreal::clear(::mpfr_ptr x)
+{   // Release x; with move support enabled, handles that are not initialized are left alone.
+#ifdef MPREAL_HAVE_MOVE_SUPPORT
+    if(!(mpfr_is_initialized(x))) return;
+#endif
+    mpfr_clear(x);
+}
+
+inline mpreal::~mpreal()
+{   // Destruction delegates to clear() on this object's own handle.
+    this->clear(this->mpfr_ptr());
+}
+
+// Helper namespace: holds the result_type trait that restricts the arithmetic operator templates to supported operand types.
+namespace internal{
+
+    // result_type<T>::type is defined only for the operand types listed below, so SFINAE removes the
+    // arithmetic operator templates for any other T (needed for expression-template libraries, like Eigen).
+    // TODO: Do the same for boolean operators.
+    template <typename ArgumentType> struct result_type {};
+    struct yields_mpreal { typedef mpreal type; };   // shared by every specialization below
+    template <> struct result_type<mpreal>              : yields_mpreal {};
+    template <> struct result_type<mpz_t>               : yields_mpreal {};
+    template <> struct result_type<mpq_t>               : yields_mpreal {};
+    template <> struct result_type<long double>         : yields_mpreal {};
+    template <> struct result_type<double>              : yields_mpreal {};
+    template <> struct result_type<unsigned long int>   : yields_mpreal {};
+    template <> struct result_type<unsigned int>        : yields_mpreal {};
+    template <> struct result_type<long int>            : yields_mpreal {};
+    template <> struct result_type<int>                 : yields_mpreal {};
+    template <> struct result_type<long long>           : yields_mpreal {};
+    template <> struct result_type<unsigned long long>  : yields_mpreal {};
+}
+
+// + Addition (available only for operand types that internal::result_type lists)
+template <typename Rhs>
+inline const typename internal::result_type<Rhs>::type
+    operator+(const mpreal& lhs, const Rhs& rhs){ mpreal sum(lhs); sum += rhs; return sum; }
+
+template <typename Lhs>
+inline const typename internal::result_type<Lhs>::type
+    operator+(const Lhs& lhs, const mpreal& rhs){ mpreal sum(rhs); sum += lhs; return sum; }  // operands swapped: starts from rhs
+
+// - Subtraction (available only for operand types that internal::result_type lists)
+template <typename Rhs>
+inline const typename internal::result_type<Rhs>::type
+    operator-(const mpreal& lhs, const Rhs& rhs){ mpreal diff(lhs); diff -= rhs; return diff; }
+
+template <typename Lhs>
+inline const typename internal::result_type<Lhs>::type
+    operator-(const Lhs& lhs, const mpreal& rhs){ mpreal diff(lhs); diff -= rhs; return diff; }  // lhs is first converted to mpreal
+
+// * Multiplication (available only for operand types that internal::result_type lists)
+template <typename Rhs>
+inline const typename internal::result_type<Rhs>::type
+    operator*(const mpreal& lhs, const Rhs& rhs){ mpreal product(lhs); product *= rhs; return product; }
+
+template <typename Lhs>
+inline const typename internal::result_type<Lhs>::type
+    operator*(const Lhs& lhs, const mpreal& rhs){ mpreal product(rhs); product *= lhs; return product; }  // operands swapped: starts from rhs
+
+// / Division (available only for operand types that internal::result_type lists)
+template <typename Rhs>
+inline const typename internal::result_type<Rhs>::type
+    operator/(const mpreal& lhs, const Rhs& rhs){ mpreal quotient(lhs); quotient /= rhs; return quotient; }
+
+template <typename Lhs>
+inline const typename internal::result_type<Lhs>::type
+    operator/(const Lhs& lhs, const mpreal& rhs){ mpreal quotient(lhs); quotient /= rhs; return quotient; }  // lhs is first converted to mpreal
+
+//////////////////////////////////////////////////////////////////////////
+// sqrt overloads for built-in argument types; the rounding mode defaults to mpreal::get_default_rnd()
+const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal sqrt(const long int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal sqrt(const int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal sqrt(const long double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal sqrt(const double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+
+// abs: supplies the default rounding mode for the friend declared inside mpreal
+inline const mpreal abs(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd());
+
+//////////////////////////////////////////////////////////////////////////
+// pow: mixed-type overloads, grouped by the type of the base; every one returns const mpreal
+const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+
+const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+
+const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+
+const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+
+const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+
+const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+
+const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+
+const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
+
+inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());   // these four supply the
+inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());            // default rounding mode
+inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());   // for the friends declared
+inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());            // inside mpreal
+
+//////////////////////////////////////////////////////////////////////////
+// Estimate machine epsilon for the given precision.
+// Returns the smallest eps such that 1.0 + eps != 1.0
+inline mpreal machine_epsilon(mp_prec_t prec = mpreal::get_default_prec());
+
+// Returns the smallest eps such that x + eps != x (relative machine epsilon)
+inline mpreal machine_epsilon(const mpreal& x);
+
+// Give the max & min values for the required precision:
+// minval is 'safe', meaning 1 / minval does not overflow
+// maxval is 'safe', meaning 1 / maxval does not underflow
+inline mpreal minval(mp_prec_t prec = mpreal::get_default_prec());
+inline mpreal maxval(mp_prec_t prec = mpreal::get_default_prec());
+
+// 'Dirty' equality check 1: |a-b| < min{|a|,|b|} * eps
+inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps);
+
+// 'Dirty' equality check 2: |a-b| < min{|a|,|b|} * eps( min{|a|,|b|} )
+inline bool isEqualFuzzy(const mpreal& a, const mpreal& b);
+
+// 'Bitwise' equality check:
+//  a and b may be at most maxUlps representable binary numbers apart.
+inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps);
+
+//////////////////////////////////////////////////////////////////////////
+// Convert a precision in bits to decimal digits and vice versa:
+//    bits   = ceil(digits*log[2](10))
+//    digits = floor(bits*log[10](2))
+
+inline mp_prec_t digits2bits(int d);
+inline int       bits2digits(mp_prec_t b);
+
+//////////////////////////////////////////////////////////////////////////
+// min, max (names parenthesised so that function-like min/max macros do not expand here)
+const mpreal (max)(const mpreal& x, const mpreal& y);
+const mpreal (min)(const mpreal& x, const mpreal& y);
+
+//////////////////////////////////////////////////////////////////////////
+// Implementation
+//////////////////////////////////////////////////////////////////////////
+
+//////////////////////////////////////////////////////////////////////////
+// Operators - Assignment
+inline mpreal& mpreal::operator=(const mpreal& v)
+{
+    // Self-assignment leaves the object untouched.
+    if (this == &v) return *this;
+    // On a precision mismatch the storage is released and re-created with v's
+    // precision before the value is copied.
+    const mp_prec_t wanted = mpfr_get_prec(v.mpfr_srcptr());
+    if(mpfr_get_prec(mpfr_srcptr()) != wanted)
+    {
+        clear(mpfr_ptr());
+        mpfr_init2(mpfr_ptr(), wanted);
+    }
+
+    mpfr_set(mpfr_ptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
+
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Assigns a GMP floating-point value (mpf_t) using the default rounding mode.
+inline mpreal& mpreal::operator=(const mpf_t v) {
+    mpfr_set_f(mpfr_ptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+
+    return *this;
+}
+
+// Assigns a GMP integer (mpz_t) using the default rounding mode.
+inline mpreal& mpreal::operator=(const mpz_t v) {
+    mpfr_set_z(mpfr_ptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+
+    return *this;
+}
+
+// Assigns a GMP rational (mpq_t) using the default rounding mode.
+inline mpreal& mpreal::operator=(const mpq_t v) {
+    mpfr_set_q(mpfr_ptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+
+    return *this;
+}
+
+// Assigns a long double using the default rounding mode.
+inline mpreal& mpreal::operator=(const long double v) {
+    mpfr_set_ld(mpfr_ptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+
+    return *this;
+}
+
+// Assigns a double using the default rounding mode. When the
+// MPREAL_DOUBLE_BITS_OVERFLOW check is compiled in (> -1), a value rejected by
+// fits_in_bits() throws conversion_overflow and the target is left untouched.
+inline mpreal& mpreal::operator=(const double v) {
+#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
+    // Guard clause: refuse the value before anything is written.
+    if (!fits_in_bits(v, MPREAL_DOUBLE_BITS_OVERFLOW))
+        throw conversion_overflow();
+#endif
+
+    mpfr_set_d(mpfr_ptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+
+    return *this;
+}
+
+// Assigns an unsigned long using the default rounding mode.
+inline mpreal& mpreal::operator=(const unsigned long int v) {
+    mpfr_set_ui(mpfr_ptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+
+    return *this;
+}
+
+// Assigns an unsigned int (passed to mpfr_set_ui) using the default rounding mode.
+inline mpreal& mpreal::operator=(const unsigned int v) {
+    mpfr_set_ui(mpfr_ptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+
+    return *this;
+}
+
+// Assigns an unsigned long long via mpfr_set_uj, using the default rounding mode.
+inline mpreal& mpreal::operator=(const unsigned long long int v) {
+    mpfr_set_uj(mpfr_ptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+
+    return *this;
+}
+
+// Assigns a long long via mpfr_set_sj, using the default rounding mode.
+inline mpreal& mpreal::operator=(const long long int v) {
+    mpfr_set_sj(mpfr_ptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+
+    return *this;
+}
+
+// Assigns a long using the default rounding mode.
+inline mpreal& mpreal::operator=(const long int v) {
+    mpfr_set_si(mpfr_ptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+
+    return *this;
+}
+
+// Assigns an int (passed to mpfr_set_si) using the default rounding mode.
+inline mpreal& mpreal::operator=(const int v) {
+    mpfr_set_si(mpfr_ptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+
+    return *this;
+}
+
+// Parses s as a base-10 number into a temporary that has the precision of the
+// target variable. The target is only updated when mpfr_set_str() returns 0;
+// otherwise it keeps its previous value. For more precise control over base,
+// precision and rounding use the other converters:
+//
+//        mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
+//        mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
+inline mpreal& mpreal::operator=(const char* s) {
+    mpfr_t parsed;
+    mpfr_init2(parsed, mpfr_get_prec(mpfr_srcptr()));
+
+    const bool ok = (mpfr_set_str(parsed, s, 10, get_default_rnd()) == 0);
+    if (ok) {
+        mpfr_set(mpfr_ptr(), parsed, get_default_rnd());
+        MPREAL_MSVC_DEBUGVIEW_CODE;
+    }
+
+    // The temporary is released on both the success and the failure path.
+    clear(parsed);
+
+    return *this;
+}
+
+// Parses the contents of s as a base-10 number into a temporary that has the
+// precision of the target variable. The target is only updated when
+// mpfr_set_str() returns 0; otherwise it keeps its previous value. For more
+// precise control over base, precision and rounding use the other converters:
+//
+//        mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
+//        mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
+inline mpreal& mpreal::operator=(const std::string& s) {
+    mpfr_t parsed;
+    mpfr_init2(parsed, mpfr_get_prec(mpfr_srcptr()));
+
+    const bool ok = (mpfr_set_str(parsed, s.c_str(), 10, get_default_rnd()) == 0);
+    if (ok) {
+        mpfr_set(mpfr_ptr(), parsed, get_default_rnd());
+        MPREAL_MSVC_DEBUGVIEW_CODE;
+    }
+
+    // The temporary is released on both the success and the failure path.
+    clear(parsed);
+
+    return *this;
+}
+
+// Assigns the real part of z; the imaginary part is not used.
+template <typename real_t>
+inline mpreal& mpreal::operator= (const std::complex<real_t>& z) {
+    return this->operator=(z.real());
+}
+
+//////////////////////////////////////////////////////////////////////////
+// + Addition
+// Adds v in place using the default rounding mode.
+inline mpreal& mpreal::operator+=(const mpreal& v) {
+    mpfr_add(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Adds a GMP float in place by first converting it to a temporary mpreal.
+inline mpreal& mpreal::operator+=(const mpf_t u) {
+    *this += mpreal(u);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Adds a GMP integer in place using the default rounding mode.
+inline mpreal& mpreal::operator+=(const mpz_t u) {
+    mpfr_add_z(mpfr_ptr(), mpfr_srcptr(), u, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Adds a GMP rational in place using the default rounding mode.
+inline mpreal& mpreal::operator+=(const mpq_t u) {
+    mpfr_add_q(mpfr_ptr(), mpfr_srcptr(), u, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Adds a long double in place by first converting it to a temporary mpreal.
+inline mpreal& mpreal::operator+= (const long double u) {
+    *this += mpreal(u);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Adds a double in place. MPFR older than 2.4.0 goes through a temporary
+// mpreal; newer versions call mpfr_add_d() directly.
+inline mpreal& mpreal::operator+= (const double u) {
+#if (MPFR_VERSION < MPFR_VERSION_NUM(2,4,0))
+    *this += mpreal(u);
+#else
+    mpfr_add_d(mpfr_ptr(), mpfr_srcptr(), u, get_default_rnd());
+#endif
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Adds an unsigned long in place using the default rounding mode.
+inline mpreal& mpreal::operator+=(const unsigned long int u) {
+    mpfr_add_ui(mpfr_ptr(), mpfr_srcptr(), u, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Adds an unsigned int in place (passed to mpfr_add_ui).
+inline mpreal& mpreal::operator+=(const unsigned int u) {
+    mpfr_add_ui(mpfr_ptr(), mpfr_srcptr(), u, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Adds a long in place using the default rounding mode.
+inline mpreal& mpreal::operator+=(const long int u) {
+    mpfr_add_si(mpfr_ptr(), mpfr_srcptr(), u, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Adds an int in place (passed to mpfr_add_si).
+inline mpreal& mpreal::operator+=(const int u) {
+    mpfr_add_si(mpfr_ptr(), mpfr_srcptr(), u, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+// long long / unsigned long long compound assignments: u is converted to a temporary mpreal and the mpreal overload does the work.
+inline mpreal& mpreal::operator+=(const long long int u)         {    *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator+=(const unsigned long long int u){    *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator-=(const long long int  u)        {    *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator-=(const unsigned long long int u){    *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator*=(const long long int  u)        {    *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator*=(const unsigned long long int u){    *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator/=(const long long int  u)        {    *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+inline mpreal& mpreal::operator/=(const unsigned long long int u){    *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this;    }
+
+inline const mpreal mpreal::operator+()const    {    return mpreal(*this); }  // unary plus: returns a copy of *this
+
+// Sum a + b; the result object is constructed with the larger of the two operand precisions.
+inline const mpreal operator+(const mpreal& a, const mpreal& b) {
+    mpreal sum(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr())));
+    mpfr_add(sum.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
+    return sum;
+}
+
+// Pre-increment: adds 1 and returns *this.
+inline mpreal& mpreal::operator++() {
+    return *this += 1;
+}
+
+// Post-increment: adds 1 and returns a copy holding the previous value.
+inline const mpreal mpreal::operator++ (int) {
+    mpreal previous(*this);
+    *this += 1;
+    return previous;
+}
+
+// Pre-decrement: subtracts 1 and returns *this.
+inline mpreal& mpreal::operator--() {
+    return *this -= 1;
+}
+
+// Post-decrement: subtracts 1 and returns a copy holding the previous value.
+inline const mpreal mpreal::operator-- (int) {
+    mpreal previous(*this);
+    *this -= 1;
+    return previous;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// - Subtraction
+// Subtracts v in place using the default rounding mode.
+inline mpreal& mpreal::operator-=(const mpreal& v) {
+    mpfr_sub(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Subtracts a GMP integer in place using the default rounding mode.
+inline mpreal& mpreal::operator-=(const mpz_t v) {
+    mpfr_sub_z(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Subtracts a GMP rational in place using the default rounding mode.
+inline mpreal& mpreal::operator-=(const mpq_t v) {
+    mpfr_sub_q(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Subtracts a long double in place by first converting it to a temporary mpreal.
+inline mpreal& mpreal::operator-=(const long double v) {
+    *this -= mpreal(v);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Subtracts a double in place. MPFR older than 2.4.0 goes through a temporary
+// mpreal; newer versions call mpfr_sub_d() directly.
+inline mpreal& mpreal::operator-=(const double v) {
+#if (MPFR_VERSION < MPFR_VERSION_NUM(2,4,0))
+    *this -= mpreal(v);
+#else
+    mpfr_sub_d(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+#endif
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Subtracts an unsigned long in place using the default rounding mode.
+inline mpreal& mpreal::operator-=(const unsigned long int v) {
+    mpfr_sub_ui(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Subtracts an unsigned int in place (passed to mpfr_sub_ui).
+inline mpreal& mpreal::operator-=(const unsigned int v) {
+    mpfr_sub_ui(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Subtracts a long in place using the default rounding mode.
+inline mpreal& mpreal::operator-=(const long int v) {
+    mpfr_sub_si(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Subtracts an int in place (passed to mpfr_sub_si).
+inline mpreal& mpreal::operator-=(const int v) {
+    mpfr_sub_si(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Unary minus: returns a negated copy of *this.
+inline const mpreal mpreal::operator-()const {
+    mpreal negated(*this);
+    mpfr_neg(negated.mpfr_ptr(), negated.mpfr_srcptr(), get_default_rnd());
+    return negated;
+}
+
+// Difference a - b; the result object is constructed with the larger of the two operand precisions.
+inline const mpreal operator-(const mpreal& a, const mpreal& b) {
+    mpreal diff(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr())));
+    mpfr_sub(diff.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
+    return diff;
+}
+
+// Difference b - a for a double minuend; the result is constructed with the precision of a.
+inline const mpreal operator-(const double  b, const mpreal& a) {
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
+    mpreal result(0, mpfr_get_prec(a.mpfr_srcptr()));
+    mpfr_d_sub(result.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
+    return result;
+#else
+    mpreal result(b, mpfr_get_prec(a.mpfr_srcptr()));
+    result -= a;
+    return result;
+#endif
+}
+
+// Difference b - a for an unsigned long minuend; the result is constructed with the precision of a.
+inline const mpreal operator-(const unsigned long int b, const mpreal& a) {
+    mpreal result(0, mpfr_get_prec(a.mpfr_srcptr()));
+    mpfr_ui_sub(result.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
+    return result;
+}
+
+// Difference b - a for an unsigned int minuend (passed to mpfr_ui_sub).
+inline const mpreal operator-(const unsigned int b, const mpreal& a) {
+    mpreal result(0, mpfr_get_prec(a.mpfr_srcptr()));
+    mpfr_ui_sub(result.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
+    return result;
+}
+
+// Difference b - a for a long minuend; the result is constructed with the precision of a.
+inline const mpreal operator-(const long int b, const mpreal& a) {
+    mpreal result(0, mpfr_get_prec(a.mpfr_srcptr()));
+    mpfr_si_sub(result.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
+    return result;
+}
+
+// Difference b - a for an int minuend (passed to mpfr_si_sub).
+inline const mpreal operator-(const int b, const mpreal& a) {
+    mpreal result(0, mpfr_get_prec(a.mpfr_srcptr()));
+    mpfr_si_sub(result.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
+    return result;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// * Multiplication
+// Multiplies by v in place using the default rounding mode.
+inline mpreal& mpreal::operator*= (const mpreal& v) {
+    mpfr_mul(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Multiplies by a GMP integer in place using the default rounding mode.
+inline mpreal& mpreal::operator*=(const mpz_t v) {
+    mpfr_mul_z(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Multiplies by a GMP rational in place using the default rounding mode.
+inline mpreal& mpreal::operator*=(const mpq_t v) {
+    mpfr_mul_q(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Multiplies by a long double in place by first converting it to a temporary mpreal.
+inline mpreal& mpreal::operator*=(const long double v) {
+    *this *= mpreal(v);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Multiplies by a double in place; MPFR older than 2.4.0 goes through a temporary mpreal.
+inline mpreal& mpreal::operator*=(const double v) {
+#if (MPFR_VERSION < MPFR_VERSION_NUM(2,4,0))
+    *this *= mpreal(v);
+#else
+    mpfr_mul_d(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+#endif
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Multiplies by an unsigned long in place using the default rounding mode.
+inline mpreal& mpreal::operator*=(const unsigned long int v) {
+    mpfr_mul_ui(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Multiplies by an unsigned int in place (passed to mpfr_mul_ui).
+inline mpreal& mpreal::operator*=(const unsigned int v) {
+    mpfr_mul_ui(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Multiplies by a long in place using the default rounding mode.
+inline mpreal& mpreal::operator*=(const long int v) {
+    mpfr_mul_si(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Multiplies by an int in place (passed to mpfr_mul_si).
+inline mpreal& mpreal::operator*=(const int v) {
+    mpfr_mul_si(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Product a * b; the result object is constructed with the larger of the two operand precisions.
+inline const mpreal operator*(const mpreal& a, const mpreal& b) {
+    mpreal product(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr())));
+    mpfr_mul(product.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
+    return product;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// / Division
+// Divides by v in place using the default rounding mode.
+inline mpreal& mpreal::operator/=(const mpreal& v) {
+    mpfr_div(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides by a GMP integer in place using the default rounding mode.
+inline mpreal& mpreal::operator/=(const mpz_t v) {
+    mpfr_div_z(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides by a GMP rational in place using the default rounding mode.
+inline mpreal& mpreal::operator/=(const mpq_t v) {
+    mpfr_div_q(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides by a long double in place by first converting it to a temporary mpreal.
+inline mpreal& mpreal::operator/=(const long double v) {
+    *this /= mpreal(v);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides by a double in place; MPFR older than 2.4.0 goes through a temporary mpreal.
+inline mpreal& mpreal::operator/=(const double v) {
+#if (MPFR_VERSION < MPFR_VERSION_NUM(2,4,0))
+    *this /= mpreal(v);
+#else
+    mpfr_div_d(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+#endif
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides by an unsigned long in place using the default rounding mode.
+inline mpreal& mpreal::operator/=(const unsigned long int v) {
+    mpfr_div_ui(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides by an unsigned int in place (passed to mpfr_div_ui).
+inline mpreal& mpreal::operator/=(const unsigned int v) {
+    mpfr_div_ui(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides by a long in place using the default rounding mode.
+inline mpreal& mpreal::operator/=(const long int v) {
+    mpfr_div_si(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides by an int in place (passed to mpfr_div_si).
+inline mpreal& mpreal::operator/=(const int v) {
+    mpfr_div_si(mpfr_ptr(), mpfr_srcptr(), v, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Quotient a / b; the result object is constructed with the larger of the two operand precisions.
+inline const mpreal operator/(const mpreal& a, const mpreal& b) {
+    mpreal quotient(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr())));
+    mpfr_div(quotient.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd());
+    return quotient;
+}
+
+// Quotient b / a for an unsigned long numerator; the result is constructed with the precision of a.
+inline const mpreal operator/(const unsigned long int b, const mpreal& a) {
+    mpreal quotient(0, mpfr_get_prec(a.mpfr_srcptr()));
+    mpfr_ui_div(quotient.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
+    return quotient;
+}
+
+// Quotient b / a for an unsigned int numerator (passed to mpfr_ui_div).
+inline const mpreal operator/(const unsigned int b, const mpreal& a) {
+    mpreal quotient(0, mpfr_get_prec(a.mpfr_srcptr()));
+    mpfr_ui_div(quotient.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
+    return quotient;
+}
+
+// Quotient b / a for a long numerator; the result is constructed with the precision of a.
+inline const mpreal operator/(const long int b, const mpreal& a) {
+    mpreal quotient(0, mpfr_get_prec(a.mpfr_srcptr()));
+    mpfr_si_div(quotient.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
+    return quotient;
+}
+
+// Quotient b / a for an int numerator (passed to mpfr_si_div).
+inline const mpreal operator/(const int b, const mpreal& a) {
+    mpreal quotient(0, mpfr_get_prec(a.mpfr_srcptr()));
+    mpfr_si_div(quotient.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
+    return quotient;
+}
+
+// Quotient b / a for a double numerator; the result is constructed with the precision of a.
+inline const mpreal operator/(const double  b, const mpreal& a) {
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
+    mpreal x(0, mpfr_get_prec(a.mpfr_srcptr()));
+    mpfr_d_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd());
+    return x;
+#else
+    mpreal x(b, mpfr_get_prec(a.mpfr_srcptr()));  // seed with the numerator b (it was 0, which made this branch return 0 / a)
+    x /= a;
+    return x;
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Shifts operators - Multiplication/Division by power of 2
+// Multiplies in place by 2^u (unsigned long exponent).
+inline mpreal& mpreal::operator<<=(const unsigned long int u) {
+    mpfr_mul_2ui(mpfr_ptr(), mpfr_srcptr(), u, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Multiplies in place by 2^u (unsigned int exponent, widened to unsigned long).
+inline mpreal& mpreal::operator<<=(const unsigned int u) {
+    mpfr_mul_2ui(mpfr_ptr(), mpfr_srcptr(), static_cast<unsigned long int>(u), get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Multiplies in place by 2^u (long exponent).
+inline mpreal& mpreal::operator<<=(const long int u) {
+    mpfr_mul_2si(mpfr_ptr(), mpfr_srcptr(), u, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Multiplies in place by 2^u (int exponent, widened to long).
+inline mpreal& mpreal::operator<<=(const int u) {
+    mpfr_mul_2si(mpfr_ptr(), mpfr_srcptr(), static_cast<long int>(u), get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides in place by 2^u (unsigned long exponent).
+inline mpreal& mpreal::operator>>=(const unsigned long int u) {
+    mpfr_div_2ui(mpfr_ptr(), mpfr_srcptr(), u, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides in place by 2^u (unsigned int exponent, widened to unsigned long).
+inline mpreal& mpreal::operator>>=(const unsigned int u) {
+    mpfr_div_2ui(mpfr_ptr(), mpfr_srcptr(), static_cast<unsigned long int>(u), get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides in place by 2^u (long exponent).
+inline mpreal& mpreal::operator>>=(const long int u) {
+    mpfr_div_2si(mpfr_ptr(), mpfr_srcptr(), u, get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Divides in place by 2^u (int exponent, widened to long).
+inline mpreal& mpreal::operator>>=(const int u) {
+    mpfr_div_2si(mpfr_ptr(), mpfr_srcptr(), static_cast<long int>(u), get_default_rnd());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// v * 2^k for an unsigned long k; forwards to mul_2ui().
+inline const mpreal operator<<(const mpreal& v, const unsigned long int k) {
+    return mul_2ui(v, k);
+}
+
+// v * 2^k for an unsigned int k, widened to unsigned long for mul_2ui().
+inline const mpreal operator<<(const mpreal& v, const unsigned int k) {
+    return mul_2ui(v, static_cast<unsigned long int>(k));
+}
+
+// v * 2^k for a long k; forwards to mul_2si().
+inline const mpreal operator<<(const mpreal& v, const long int k) {
+    return mul_2si(v, k);
+}
+
+// v * 2^k for an int k, widened to long for mul_2si().
+inline const mpreal operator<<(const mpreal& v, const int k) {
+    return mul_2si(v, static_cast<long int>(k));
+}
+
+// v / 2^k for an unsigned long k; forwards to div_2ui().
+inline const mpreal operator>>(const mpreal& v, const unsigned long int k) {
+    return div_2ui(v, k);
+}
+
+// v / 2^k for a long k; forwards to div_2si().
+inline const mpreal operator>>(const mpreal& v, const long int k) {
+    return div_2si(v, k);
+}
+
+// v / 2^k for an unsigned int k, widened to unsigned long for div_2ui().
+inline const mpreal operator>>(const mpreal& v, const unsigned int k) {
+    return div_2ui(v, static_cast<unsigned long int>(k));
+}
+
+// v / 2^k for an int k, widened to long for div_2si().
+inline const mpreal operator>>(const mpreal& v, const int k) {
+    return div_2si(v, static_cast<long int>(k));
+}
+
+// mul_2ui: returns v * 2^k (unsigned exponent), rounded with rnd_mode.
+// The result starts as a copy of v.
+inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode) {
+    mpreal scaled(v);
+    mpfr_mul_2ui(scaled.mpfr_ptr(), v.mpfr_srcptr(), k, rnd_mode);
+    return scaled;
+}
+
+// mul_2si: returns v * 2^k (signed exponent), rounded with rnd_mode.
+// The result starts as a copy of v.
+inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode) {
+    mpreal scaled(v);
+    mpfr_mul_2si(scaled.mpfr_ptr(), v.mpfr_srcptr(), k, rnd_mode);
+    return scaled;
+}
+
+// div_2ui: returns v / 2^k (unsigned exponent), rounded with rnd_mode.
+inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode) {
+    mpreal scaled(v);
+    mpfr_div_2ui(scaled.mpfr_ptr(), v.mpfr_srcptr(), k, rnd_mode);
+    return scaled;
+}
+
+// div_2si: returns v / 2^k (signed exponent), rounded with rnd_mode.
+inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode) {
+    mpreal scaled(v);
+    mpfr_div_2si(scaled.mpfr_ptr(), v.mpfr_srcptr(), k, rnd_mode);
+    return scaled;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Relational operators
+//
+// In the mixed-type overloads a NaN operand (a, or b for double / long double)
+// makes <, <=, >, >= and == return false; != is !(a == b) and so returns true.
+//
+// WARNING: the double-NaN checks
+//     isnan(b) =  (b != b)
+//     isnan(b) = !(b == b)  (the form used in the code below)
+// are guaranteed to work only in IEEE math mode. Be cautious with compiler
+// options that break strict IEEE compliance (e.g. -ffast-math in GCC);
+// use std::isnan instead (C++11).
+
+inline bool operator >  (const mpreal& a, const mpreal& b           ){  return (mpfr_greater_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );            }
+inline bool operator >  (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 );                 }
+inline bool operator >  (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 );                 }
+inline bool operator >  (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 );                 }
+inline bool operator >  (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 );                 }
+inline bool operator >  (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) > 0 );    }
+inline bool operator >  (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) > 0 );    }
+
+inline bool operator >= (const mpreal& a, const mpreal& b           ){  return (mpfr_greaterequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );       }
+inline bool operator >= (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 );                }
+// inline bool operator >= (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 );                }
+inline bool operator >= (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 );                }
+inline bool operator >= (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 );                }
+inline bool operator >= (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) >= 0 );   }
+inline bool operator >= (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) >= 0 );   }
+
+inline bool operator <  (const mpreal& a, const mpreal& b           ){  return (mpfr_less_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );               }
+inline bool operator <  (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 );                 }
+inline bool operator <  (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 );                 }
+inline bool operator <  (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 );                 }
+inline bool operator <  (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 );                 }
+inline bool operator <  (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) < 0 );    }
+inline bool operator <  (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) < 0 );    }
+
+inline bool operator <= (const mpreal& a, const mpreal& b           ){  return (mpfr_lessequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );          }
+inline bool operator <= (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 );                }
+inline bool operator <= (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 );                }
+inline bool operator <= (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 );                }
+inline bool operator <= (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 );                }
+inline bool operator <= (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) <= 0 );   }
+inline bool operator <= (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) <= 0 );   }
+
+inline bool operator == (const mpreal& a, const mpreal& b           ){  return (mpfr_equal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 );              }
+inline bool operator == (const mpreal& a, const unsigned long int b ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 );                }
+inline bool operator == (const mpreal& a, const unsigned int b      ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 );                }
+inline bool operator == (const mpreal& a, const long int b          ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 );                }
+inline bool operator == (const mpreal& a, const int b               ){  return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 );                }
+inline bool operator == (const mpreal& a, const long double b       ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 );   }
+inline bool operator == (const mpreal& a, const double b            ){  return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 );   }
+
+inline bool operator != (const mpreal& a, const mpreal& b           ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const unsigned long int b ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const unsigned int b      ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const long int b          ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const int b               ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const long double b       ){  return !(a == b);  }
+inline bool operator != (const mpreal& a, const double b            ){  return !(a == b);  }
+// Classification predicates: thin wrappers that turn the int result of the matching mpfr_*_p() call into a bool.
+inline bool (isnan)    (const mpreal& op){    return (mpfr_nan_p    (op.mpfr_srcptr()) != 0 );    }
+inline bool (isinf)    (const mpreal& op){    return (mpfr_inf_p    (op.mpfr_srcptr()) != 0 );    }
+inline bool (isfinite) (const mpreal& op){    return (mpfr_number_p (op.mpfr_srcptr()) != 0 );    }
+inline bool iszero   (const mpreal& op){    return (mpfr_zero_p   (op.mpfr_srcptr()) != 0 );    }
+inline bool isint    (const mpreal& op){    return (mpfr_integer_p(op.mpfr_srcptr()) != 0 );    }
+
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
+inline bool isregular(const mpreal& op){    return (mpfr_regular_p(op.mpfr_srcptr()));}  // only compiled for MPFR >= 3.0.0
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+// Type converters: toBool() is true for every value that is not zero; the others forward to the matching mpfr_get_*() call with the given rounding mode.
+inline bool               mpreal::toBool   (             )  const    {    return  mpfr_zero_p (mpfr_srcptr()) == 0;     }
+inline long               mpreal::toLong   (mp_rnd_t mode)  const    {    return  mpfr_get_si (mpfr_srcptr(), mode);    }
+inline unsigned long      mpreal::toULong  (mp_rnd_t mode)  const    {    return  mpfr_get_ui (mpfr_srcptr(), mode);    }
+inline float              mpreal::toFloat  (mp_rnd_t mode)  const    {    return  mpfr_get_flt(mpfr_srcptr(), mode);    }
+inline double             mpreal::toDouble (mp_rnd_t mode)  const    {    return  mpfr_get_d  (mpfr_srcptr(), mode);    }
+inline long double        mpreal::toLDouble(mp_rnd_t mode)  const    {    return  mpfr_get_ld (mpfr_srcptr(), mode);    }
+inline long long          mpreal::toLLong  (mp_rnd_t mode)  const    {    return  mpfr_get_sj (mpfr_srcptr(), mode);    }
+inline unsigned long long mpreal::toULLong (mp_rnd_t mode)  const    {    return  mpfr_get_uj (mpfr_srcptr(), mode);    }
+// Raw handle accessors: all three return the mp member; only the non-const mpfr_ptr() yields a mutable pointer.
+inline ::mpfr_ptr     mpreal::mpfr_ptr()             { return mp; }
+inline ::mpfr_srcptr  mpreal::mpfr_ptr()    const    { return mp; }
+inline ::mpfr_srcptr  mpreal::mpfr_srcptr() const    { return mp; }
+
+// Formats t through an ostringstream after applying the stream manipulator f.
+template <class T>
+inline std::string toString(T t, std::ios_base & (*f)(std::ios_base&)) {
+    std::ostringstream buffer;
+    buffer << f << t;
+    return buffer.str();
+}
+
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
+
+// Formats the value with mpfr_asprintf() using the caller-supplied format
+// string. Returns an empty string when format is empty or the call fails.
+inline std::string mpreal::toString(const std::string& format) const {
+    std::string text;
+    if (format.empty())
+        return text;
+
+    char* raw = NULL;
+    const int status = mpfr_asprintf(&raw, format.c_str(), mpfr_srcptr());
+    if (status >= 0) {
+        // Copy the formatted text out of raw, then release it.
+        text = std::string(raw);
+        mpfr_free_str(raw);
+    }
+
+    return text;
+}
+
+#endif
+// Converts the value to text. n >= 0 is the digit count requested; n < 0 picks a count derived from the precision (MPFR >= 2.4.0 branch).
+inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
+{
+    // TODO: Add extended format specification (f, e, rounding mode) as it is done in the output operator
+    (void)b;
+    (void)mode;
+    // b and mode are consumed only by the MPFR < 2.4.0 branch; the void casts presumably silence unused-parameter warnings.
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
+    // Build a "%.<digits>RNg" format string and delegate to toString(format).
+    std::ostringstream format;
+
+    int digits = (n >= 0) ? n : 1 + bits2digits(mpfr_get_prec(mpfr_srcptr()));
+
+    format << "%." << digits << "RNg";
+
+    return toString(format.str());
+
+#else
+    // Legacy path: format by hand from the digit string returned by mpfr_get_str().
+    char *s, *ns = NULL;
+    size_t slen, nslen;
+    mp_exp_t exp;
+    std::string out;
+    // Infinity, zero and NaN are returned as fixed strings.
+    if(mpfr_inf_p(mp))
+    {
+        if(mpfr_sgn(mp)>0) return "+Inf";
+        else               return "-Inf";
+    }
+
+    if(mpfr_zero_p(mp)) return "0";
+    if(mpfr_nan_p(mp))  return "NaN";
+    // s is requested with digit count 0, ns with max(0,n) digits; the shorter of the two is kept below (ns on ties).
+    s  = mpfr_get_str(NULL, &exp, b, 0, mp, mode);
+    ns = mpfr_get_str(NULL, &exp, b, (std::max)(0,n), mp, mode);
+
+    if(s!=NULL && ns!=NULL)
+    {
+        slen  = strlen(s);
+        nslen = strlen(ns);
+        if(nslen<=slen)
+        {
+            mpfr_free_str(s);
+            s = ns;
+            slen = nslen;
+        }
+        else {
+            mpfr_free_str(ns);
+        }
+
+        // Positional notation when the decimal point falls inside the digit string
+        if (exp>0 && static_cast<size_t>(exp)<slen)
+        {
+            if(s[0]=='-')
+            {
+                // Strip trailing zeros, stopping at the last integer digit (s[0] is the sign, so that digit is s[exp])
+                char* ptr = s+slen-1;
+                while (*ptr=='0' && ptr>s+exp) ptr--;
+
+                if(ptr==s+exp) out = std::string(s,exp+1);
+                else           out = std::string(s,exp+1)+'.'+std::string(s+exp+1,ptr-(s+exp+1)+1);
+
+                // i.e. sign + integer digits, then '.' + the fraction digits that remain
+            }
+            else
+            {
+                // Strip trailing zeros, stopping at the last integer digit (s[exp-1])
+                char* ptr = s+slen-1;
+                while (*ptr=='0' && ptr>s+exp-1) ptr--;
+
+                if(ptr==s+exp-1) out = std::string(s,exp);
+                else             out = std::string(s,exp)+'.'+std::string(s+exp,ptr-(s+exp)+1);
+
+                // i.e. integer digits, then '.' + the fraction digits that remain
+            }
+
+        }else{ // exp<=0 || exp>=slen: scientific notation with one leading digit
+            if(s[0]=='-')
+            {
+                // Strip trailing zeros, keeping the sign and the first digit
+                char* ptr = s+slen-1;
+                while (*ptr=='0' && ptr>s+1) ptr--;
+
+                if(ptr==s+1) out = std::string(s,2);
+                else         out = std::string(s,2)+'.'+std::string(s+2,ptr-(s+2)+1);
+
+                // i.e. sign + first digit, then '.' + the digits that remain
+            }
+            else
+            {
+                // Strip trailing zeros, keeping the first digit
+                char* ptr = s+slen-1;
+                while (*ptr=='0' && ptr>s) ptr--;
+
+                if(ptr==s) out = std::string(s,1);
+                else       out = std::string(s,1)+'.'+std::string(s+1,ptr-(s+1)+1);
+
+                // i.e. first digit, then '.' + the digits that remain
+            }
+
+            // Append the exponent: it is decremented to match the one-leading-digit form and omitted when it becomes zero
+            if(--exp)
+            {
+                if(exp>0) out += "e+"+mpfr::toString<mp_exp_t>(exp,std::dec);
+                else       out += "e"+mpfr::toString<mp_exp_t>(exp,std::dec);
+            }
+        }
+
+        mpfr_free_str(s);
+        return out;
+    }else{ // NOTE(review): if only one of s / ns is NULL, the other one is not freed on this path
+        return "conversion error!";
+    }
+#endif
+}
+
+
+//////////////////////////////////////////////////////////////////////////
+// I/O
+// Writes the value to `os`. The stream's showpos flag, precision and floatfield
+// (fixed / scientific / neither) are translated into an MPFR printf-style
+// conversion; nothing is written when mpfr_asprintf reports an error.
+inline std::ostream& mpreal::output(std::ostream& os) const
+{
+    const std::ios::fmtflags streamFlags = os.flags();
+    const std::ios::fmtflags floatMode   = streamFlags & std::ios::floatfield;
+
+    // Assemble the conversion specification, e.g. "%+.10R*f".
+    std::ostringstream spec;
+    spec << '%';
+    if (streamFlags & std::ios::showpos) spec << '+';
+
+    if (os.precision() < 0)
+    {
+        spec << "R*e";
+    }
+    else
+    {
+        char conversion = 'g';
+        if      (floatMode == std::ios::fixed)      conversion = 'f';
+        else if (floatMode == std::ios::scientific) conversion = 'e';
+
+        spec << '.' << os.precision() << "R*" << conversion;
+    }
+
+    // The rounding mode is passed ahead of the number, matching the "R*" in the specification.
+    const std::string specText = spec.str();
+    char *text = NULL;
+    const int written = mpfr_asprintf(&text, specText.c_str(),
+                                      mpfr::mpreal::get_default_rnd(),
+                                      mpfr_srcptr());
+    if (written >= 0)
+    {
+        os << std::string(text);
+        mpfr_free_str(text);
+    }
+    return os;
+}
+
+// Stream insertion for mpreal: forwards to mpreal::output().
+inline std::ostream& operator<<(std::ostream& os, const mpreal& v)
+{
+    return v.output(os);
+}
+
+// Stream extraction for mpreal: reads one whitespace-delimited token and parses
+// it as a base-10 number with the default rounding mode.
+//   - If no token can be read, v is left untouched (the stream already carries
+//     the failure state set by the string extraction).
+//   - If the token is not a valid number (mpfr_set_str returns non-zero),
+//     failbit is set on the stream so callers can detect the error.
+inline std::istream& operator>>(std::istream &is, mpreal& v)
+{
+    // TODO: use cout::hexfloat and other flags to setup base
+    std::string tmp;
+    if (!(is >> tmp))
+        return is;
+
+    if (mpfr_set_str(v.mpfr_ptr(), tmp.c_str(), 10, mpreal::get_default_rnd()) != 0)
+        is.setstate(std::ios::failbit);
+
+    return is;
+}
+
+//////////////////////////////////////////////////////////////////////////
+//     Bits - decimal digits relation
+//        bits   = ceil(digits*log[2](10))
+//        digits = floor(bits*log[10](2))
+
+// Number of bits corresponding to d decimal digits: ceil(d * log2(10)).
+inline mp_prec_t digits2bits(int d)
+{
+    const double kLog2Of10 = 3.3219280948873624;
+    const double bitsNeeded = std::ceil(d * kLog2Of10);
+
+    return static_cast<mp_prec_t>(bitsNeeded);
+}
+
+// Number of decimal digits corresponding to b bits: floor(b * log10(2)).
+inline int bits2digits(mp_prec_t b)
+{
+    const double kLog10Of2 = 0.30102999566398119;
+    const double wholeDigits = std::floor(b * kLog10Of2);
+
+    return static_cast<int>(wholeDigits);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Set/Get number properties
+// Sign of op as reported by mpfr_sgn.
+inline int sgn(const mpreal& op)
+{
+    return mpfr_sgn(op.mpfr_srcptr());
+}
+
+// Sets the sign of this number in place: negative when sign < 0, positive otherwise
+// (a sign of 0 therefore yields a positive value). Returns *this.
+inline mpreal& mpreal::setSign(int sign, mp_rnd_t RoundingMode)
+{
+    const int makeNegative = (sign < 0) ? 1 : 0;
+    mpfr_setsign(mpfr_ptr(), mpfr_srcptr(), makeNegative, RoundingMode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Precision of this number (mpfr_get_prec), converted to int.
+inline int mpreal::getPrecision() const
+{
+    return int(mpfr_get_prec(mpfr_srcptr()));
+}
+
+// Changes the precision with mpfr_prec_round, using RoundingMode for the current value.
+// Returns *this.
+inline mpreal& mpreal::setPrecision(int Precision, mp_rnd_t RoundingMode)
+{
+    mpfr_prec_round(mpfr_ptr(), Precision, RoundingMode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Sets this number to infinity via mpfr_set_inf; `sign` is forwarded unchanged. Returns *this.
+inline mpreal& mpreal::setInf(int sign)
+{
+    mpfr_set_inf(mpfr_ptr(), sign);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Sets this number to NaN via mpfr_set_nan. Returns *this.
+inline mpreal& mpreal::setNan()
+{
+    mpfr_set_nan(mpfr_ptr());
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Sets this number to zero with the given sign. Returns *this.
+// MPFR >= 3.0.0 uses mpfr_set_zero directly; older versions assign 0 and then
+// apply the sign through setSign().
+inline mpreal& mpreal::setZero(int sign)
+{
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
+    mpfr_set_zero(mpfr_ptr(), sign);
+#else
+    mpfr_set_si(mpfr_ptr(), 0, (mpfr_get_default_rounding_mode)());
+    setSign(sign);
+#endif
+
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return *this;
+}
+
+// Precision of this number as returned by mpfr_get_prec.
+inline mp_prec_t mpreal::get_prec() const
+{
+    return mpfr_get_prec(mpfr_srcptr());
+}
+
+// Changes the precision with mpfr_prec_round, using rnd_mode for the current value.
+inline void mpreal::set_prec(mp_prec_t prec, mp_rnd_t rnd_mode)
+{
+    mpfr_prec_round(mpfr_ptr(),prec,rnd_mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+}
+
+// Exponent of this number as returned by mpfr_get_exp.
+inline mp_exp_t mpreal::get_exp ()
+{
+    return mpfr_get_exp(mpfr_srcptr());
+}
+
+// Sets the exponent through mpfr_set_exp and returns that call's status code.
+inline int mpreal::set_exp (mp_exp_t e)
+{
+    int x = mpfr_set_exp(mpfr_ptr(), e);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return x;
+}
+
+// Splits x into a significand (returned) and an exponent stored in *exp.
+// MPFR >= 3.1.0 delegates to mpfr_frexp; older versions read the exponent of the
+// copy with mpfr_get_exp and reset the copy's exponent to 0 (`mode` is unused there).
+inline const mpreal frexp(const mpreal& x, mp_exp_t* exp, mp_rnd_t mode = mpreal::get_default_rnd())
+{
+    mpreal y(x);
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
+    mpfr_frexp(exp,y.mpfr_ptr(),x.mpfr_srcptr(),mode);
+#else
+    *exp = mpfr_get_exp(y.mpfr_srcptr());
+    mpfr_set_exp(y.mpfr_ptr(),0);
+#endif
+    return y;
+}
+
+// Returns v * 2^exp, computed in a copy of v with mpfr_mul_2si.
+inline const mpreal ldexp(const mpreal& v, mp_exp_t exp)
+{
+    mpreal scaled(v);
+
+    // Only the exponent changes, so the operation is exact and the rounding mode is irrelevant.
+    mpfr_mul_2si(scaled.mpfr_ptr(), scaled.mpfr_srcptr(), exp, mpreal::get_default_rnd());
+    return scaled;
+}
+
+// Alias of ldexp(v, exp).
+inline const mpreal scalbn(const mpreal& v, mp_exp_t exp)
+{
+    return ldexp(v, exp);
+}
+
+// Machine epsilon at precision `prec`: evaluates machine_epsilon(const mpreal&) at 1.
+inline mpreal machine_epsilon(mp_prec_t prec)
+{
+    /* the smallest eps such that 1 + eps != 1 */
+    return machine_epsilon(mpreal(1, prec));
+}
+
+// Distance from |x| to the next representable number above it.
+inline mpreal machine_epsilon(const mpreal& x)
+{
+    /* the smallest eps such that x + eps != x */
+    if( !(x < 0) )
+        return nextabove( x) - x;
+
+    // Negative x: work on -x, so the difference is written as a sum.
+    return nextabove(-x) + x;
+}
+
+// Smallest 'safe' positive value at precision prec: 1 / minval does not overflow.
+inline mpreal minval(mp_prec_t prec)
+{
+    /* min = 1/2 * 2^emin = 2^(emin - 1) */
+    // The shift count is parenthesised to make the intended grouping explicit.
+    return mpreal(1, prec) << (mpreal::get_emin() - 1);
+}
+
+// Largest 'safe' value at precision prec: 1 / maxval does not underflow.
+inline mpreal maxval(mp_prec_t prec)
+{
+    /* max = (1 - eps) * 2^emax, eps is machine epsilon */
+    const mpreal one(1, prec);
+    const mpreal justBelowOne = one - machine_epsilon(prec);
+    return justBelowOne << mpreal::get_emax();
+}
+
+// True when a and b differ by at most maxUlps machine epsilons, where the epsilon
+// is taken at the larger of |a| and |b|.
+inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps)
+{
+    const mpreal largerMagnitude = (max)(abs(a), abs(b));
+    const mpreal tolerance       = machine_epsilon(largerMagnitude) * maxUlps;
+    return abs(a - b) <= tolerance;
+}
+
+// True when |a - b| <= eps (absolute tolerance).
+inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps)
+{
+    return abs(a - b) <= eps;
+}
+
+// Fuzzy comparison with an automatic tolerance: the machine epsilon taken at
+// max(1, min(|a|, |b|)).
+inline bool isEqualFuzzy(const mpreal& a, const mpreal& b)
+{
+    const mpreal smallerMagnitude = (min)(abs(a), abs(b));
+    const mpreal tolerance        = machine_epsilon((max)(1, smallerMagnitude));
+    return isEqualFuzzy(a, b, tolerance);
+}
+
+//////////////////////////////////////////////////////////////////////////
+// C++11 sign functions.
+// Returns x with its sign bit replaced by the sign bit of y (mpfr_setsign).
+// The result is created with the precision of x.
+inline mpreal copysign(const mpreal& x, const  mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    const int signOfY = mpfr_signbit(y.mpfr_srcptr());
+
+    mpreal result(0, mpfr_get_prec(x.mpfr_srcptr()));
+    mpfr_setsign(result.mpfr_ptr(), x.mpfr_srcptr(), signOfY, rnd_mode);
+    return result;
+}
+
+// Result of mpfr_signbit for x, converted to bool.
+inline bool signbit(const mpreal& x)
+{
+    return mpfr_signbit(x.mpfr_srcptr());
+}
+
+// Splits v into fractional and integral parts: the fractional part (mpfr_frac)
+// is returned and the truncated value (mpfr_trunc) is stored in n.
+inline const mpreal modf(const mpreal& v, mpreal& n)
+{
+    mpreal fractional(v);
+
+    // rounding is not important since we are using the same number
+    mpfr_frac (fractional.mpfr_ptr(), fractional.mpfr_srcptr(), mpreal::get_default_rnd());
+    mpfr_trunc(n.mpfr_ptr(), v.mpfr_srcptr());
+    return fractional;
+}
+
+// Forwards to mpfr_check_range for this number and returns its result.
+inline int mpreal::check_range (int t, mp_rnd_t rnd_mode)
+{
+    return mpfr_check_range(mpfr_ptr(),t,rnd_mode);
+}
+
+// Forwards to mpfr_subnormalize for this number and returns its result.
+inline int mpreal::subnormalize (int t,mp_rnd_t rnd_mode)
+{
+    int r = mpfr_subnormalize(mpfr_ptr(),t,rnd_mode);
+    MPREAL_MSVC_DEBUGVIEW_CODE;
+    return r;
+}
+
+// Exponent-range accessors. Each function below is a direct forward to the
+// MPFR function of the same name (mpfr_get_emin, mpfr_set_emin, ...).
+
+// Current minimum exponent.
+inline mp_exp_t mpreal::get_emin (void)
+{
+    return mpfr_get_emin();
+}
+
+// Sets the minimum exponent; returns the status reported by mpfr_set_emin.
+inline int mpreal::set_emin (mp_exp_t exp)
+{
+    return mpfr_set_emin(exp);
+}
+
+// Current maximum exponent.
+inline mp_exp_t mpreal::get_emax (void)
+{
+    return mpfr_get_emax();
+}
+
+// Sets the maximum exponent; returns the status reported by mpfr_set_emax.
+inline int mpreal::set_emax (mp_exp_t exp)
+{
+    return mpfr_set_emax(exp);
+}
+
+// Bounds reported by MPFR for the values accepted as emin / emax.
+inline mp_exp_t mpreal::get_emin_min (void)
+{
+    return mpfr_get_emin_min();
+}
+
+inline mp_exp_t mpreal::get_emin_max (void)
+{
+    return mpfr_get_emin_max();
+}
+
+inline mp_exp_t mpreal::get_emax_min (void)
+{
+    return mpfr_get_emax_min();
+}
+
+inline mp_exp_t mpreal::get_emax_max (void)
+{
+    return mpfr_get_emax_max();
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Mathematical Functions
+//////////////////////////////////////////////////////////////////////////
+// Body shared by the unary math wrappers: expects an argument named `x` and a
+// rounding mode named `r` in the enclosing function, creates a result with the
+// precision of x, calls mpfr_<f>(result, x, r) and returns the result.
+#define MPREAL_UNARY_MATH_FUNCTION_BODY(f)                    \
+        mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));          \
+        mpfr_##f(y.mpfr_ptr(), x.mpfr_srcptr(), r);           \
+        return y;
+
+// Square of x (mpfr_sqr).
+inline const mpreal sqr  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
+{   MPREAL_UNARY_MATH_FUNCTION_BODY(sqr );    }
+
+// Square root of x (mpfr_sqrt).
+inline const mpreal sqrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
+{   MPREAL_UNARY_MATH_FUNCTION_BODY(sqrt);    }
+
+// Square root of an unsigned integer (mpfr_sqrt_ui); the result is a
+// default-constructed mpreal filled in by MPFR.
+inline const mpreal sqrt(const unsigned long int x, mp_rnd_t r)
+{
+    mpreal rootValue;
+    mpfr_sqrt_ui(rootValue.mpfr_ptr(), x, r);
+    return rootValue;
+}
+
+// unsigned int overload: widens v and forwards to sqrt(unsigned long int, mp_rnd_t).
+inline const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode)
+{
+    return sqrt(static_cast<unsigned long int>(v),rnd_mode);
+}
+
+// long int overload: NaN for negative input, otherwise the unsigned long overload.
+inline const mpreal sqrt(const long int v, mp_rnd_t rnd_mode)
+{
+    if (v < 0)
+        return mpreal().setNan(); // NaN
+
+    return sqrt(static_cast<unsigned long int>(v),rnd_mode);
+}
+
+// int overload: NaN for negative input, otherwise the unsigned long overload.
+inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode)
+{
+    if (v < 0)
+        return mpreal().setNan(); // NaN
+
+    return sqrt(static_cast<unsigned long int>(v),rnd_mode);
+}
+
+// k-th root of x (mpfr_root); the result has the precision of x.
+inline const mpreal root(const mpreal& x, unsigned long int k, mp_rnd_t r = mpreal::get_default_rnd())
+{
+    const mp_prec_t precision = mpfr_get_prec(x.mpfr_srcptr());
+
+    mpreal kthRoot(0, precision);
+    mpfr_root(kthRoot.mpfr_ptr(), x.mpfr_srcptr(), k, r);
+    return kthRoot;
+}
+
+// Positive difference of a and b (mpfr_dim); the result has the precision of a.
+inline const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t r = mpreal::get_default_rnd())
+{
+    const mp_prec_t precision = mpfr_get_prec(a.mpfr_srcptr());
+
+    mpreal difference(0, precision);
+    mpfr_dim(difference.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), r);
+    return difference;
+}
+
+// Comparison result of mpfr_cmpabs(a, b).
+inline int cmpabs(const mpreal& a,const mpreal& b)
+{
+    // Both operands are read-only, so both are accessed through mpfr_srcptr().
+    const int ordering = mpfr_cmpabs(a.mpfr_srcptr(), b.mpfr_srcptr());
+    return ordering;
+}
+
+// Computes sine and cosine of v in one call (mpfr_sin_cos), storing them in s and c.
+// Returns the status value of mpfr_sin_cos.
+inline int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    return mpfr_sin_cos(s.mpfr_ptr(), c.mpfr_ptr(), v.mpfr_srcptr(), rnd_mode);
+}
+
+// Floating-point overloads: convert v to mpreal and forward to sqrt(const mpreal&, mp_rnd_t).
+inline const mpreal sqrt  (const long double v, mp_rnd_t rnd_mode)    {   return sqrt(mpreal(v),rnd_mode);    }
+inline const mpreal sqrt  (const double v, mp_rnd_t rnd_mode)         {   return sqrt(mpreal(v),rnd_mode);    }
+
+// Elementary functions. Each one expands MPREAL_UNARY_MATH_FUNCTION_BODY(f), i.e.
+// it returns mpfr_<f>(x) at the precision of x using rounding mode r.
+// fabs and abs both map to mpfr_abs.
+inline const mpreal cbrt  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cbrt );    }
+inline const mpreal fabs  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(abs  );    }
+inline const mpreal abs   (const mpreal& x, mp_rnd_t r)                             {   MPREAL_UNARY_MATH_FUNCTION_BODY(abs  );    }
+inline const mpreal log   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log  );    }
+inline const mpreal log2  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log2 );    }
+inline const mpreal log10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log10);    }
+inline const mpreal exp   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp  );    }
+inline const mpreal exp2  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp2 );    }
+inline const mpreal exp10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp10);    }
+inline const mpreal cos   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cos  );    }
+inline const mpreal sin   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sin  );    }
+inline const mpreal tan   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(tan  );    }
+inline const mpreal sec   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sec  );    }
+inline const mpreal csc   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(csc  );    }
+inline const mpreal cot   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cot  );    }
+inline const mpreal acos  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(acos );    }
+inline const mpreal asin  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(asin );    }
+inline const mpreal atan  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(atan );    }
+
+// Computes log2(|x|). NOTE(review): the result is not rounded to an integer, unlike the C library's logb — confirm this is intended.
+inline const mpreal logb  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   return log2 (abs(x),r);                    }
+
+// Inverse reciprocal functions, each expressed through the corresponding inverse
+// function applied to 1/v (e.g. acot(v) = atan(1/v)).
+inline const mpreal acot  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return atan (1/v, r);                      }
+inline const mpreal asec  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return acos (1/v, r);                      }
+inline const mpreal acsc  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return asin (1/v, r);                      }
+inline const mpreal acoth (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return atanh(1/v, r);                      }
+inline const mpreal asech (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return acosh(1/v, r);                      }
+inline const mpreal acsch (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return asinh(1/v, r);                      }
+
+// Hyperbolic functions and their inverses; each expands
+// MPREAL_UNARY_MATH_FUNCTION_BODY(f) and therefore calls mpfr_<f> at the precision of x.
+inline const mpreal cosh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cosh );    }
+inline const mpreal sinh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sinh );    }
+inline const mpreal tanh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(tanh );    }
+inline const mpreal sech  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sech );    }
+inline const mpreal csch  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(csch );    }
+inline const mpreal coth  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(coth );    }
+inline const mpreal acosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(acosh);    }
+inline const mpreal asinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(asinh);    }
+inline const mpreal atanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(atanh);    }
+
+// Special functions; each expands MPREAL_UNARY_MATH_FUNCTION_BODY(f).
+// gamma and tgamma both map to mpfr_gamma; the bessel* wrappers map to
+// mpfr_j0 / mpfr_j1 / mpfr_y0 / mpfr_y1.
+inline const mpreal log1p   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log1p  );    }
+inline const mpreal expm1   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(expm1  );    }
+inline const mpreal eint    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(eint   );    }
+inline const mpreal gamma   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
+inline const mpreal tgamma  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
+inline const mpreal lngamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(lngamma);    }
+inline const mpreal zeta    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(zeta   );    }
+inline const mpreal erf     (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(erf    );    }
+inline const mpreal erfc    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(erfc   );    }
+inline const mpreal besselj0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(j0     );    }
+inline const mpreal besselj1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(j1     );    }
+inline const mpreal bessely0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(y0     );    }
+inline const mpreal bessely1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(y1     );    }
+
+// Two-argument arctangent (mpfr_atan2); the result uses the larger of the two
+// operand precisions.
+inline const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    const int widerPrecision = (std::max)(y.getPrecision(), x.getPrecision());
+
+    mpreal angle(0, widerPrecision);
+    mpfr_atan2(angle.mpfr_ptr(), y.mpfr_srcptr(), x.mpfr_srcptr(), rnd_mode);
+    return angle;
+}
+
+// Euclidean norm of (x, y) via mpfr_hypot; the result uses the larger of the
+// two operand precisions.
+inline const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    const int widerPrecision = (std::max)(y.getPrecision(), x.getPrecision());
+
+    mpreal norm(0, widerPrecision);
+    mpfr_hypot(norm.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
+    return norm;
+}
+
+// Remainder of x / y as computed by mpfr_remainder; the result uses the larger
+// of the two operand precisions.
+inline const mpreal remainder (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    const int widerPrecision = (std::max)(y.getPrecision(), x.getPrecision());
+
+    mpreal rest(0, widerPrecision);
+    mpfr_remainder(rest.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
+    return rest;
+}
+
+// Remainder of x / y as computed by mpfr_remquo, which also writes quotient
+// information to *q; the result uses the larger of the two operand precisions.
+inline const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    const int widerPrecision = (std::max)(y.getPrecision(), x.getPrecision());
+
+    mpreal rest(0, widerPrecision);
+    mpfr_remquo(rest.mpfr_ptr(), q, x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
+    return rest;
+}
+
+// Factorial of v (mpfr_fac_ui), computed at precision prec.
+inline const mpreal fac_ui (unsigned long int v, mp_prec_t prec     = mpreal::get_default_prec(),
+                                           mp_rnd_t  rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal factorial(0, prec);
+    mpfr_fac_ui(factorial.mpfr_ptr(), v, rnd_mode);
+    return factorial;
+}
+
+
+// Wrapper over mpfr_lgamma. The sign output of mpfr_lgamma is written to *signp
+// when signp is non-null and to a discarded local otherwise.
+inline const mpreal lgamma (const mpreal& v, int *signp = 0, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal result(v);
+    int unusedSign;
+    int *signTarget = signp ? signp : &unusedSign;
+
+    mpfr_lgamma(result.mpfr_ptr(), signTarget, v.mpfr_srcptr(), rnd_mode);
+
+    return result;
+}
+
+
+// Wrapper over mpfr_jn with order n; the result has the precision of x.
+inline const mpreal besseljn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
+{
+    mpreal value(0, x.getPrecision());
+    mpfr_jn(value.mpfr_ptr(), n, x.mpfr_srcptr(), r);
+    return value;
+}
+
+// Wrapper over mpfr_yn with order n; the result has the precision of x.
+inline const mpreal besselyn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
+{
+    mpreal value(0, x.getPrecision());
+    mpfr_yn(value.mpfr_ptr(), n, x.mpfr_srcptr(), r);
+    return value;
+}
+
+// Fused multiply-add via mpfr_fma(v1, v2, v3); the result is given the largest
+// precision among the three operands.
+inline const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    const mp_prec_t widest = (std::max)((std::max)(v1.get_prec(), v2.get_prec()), v3.get_prec());
+
+    mpreal result;
+    result.set_prec(widest);
+
+    mpfr_fma(result.mp, v1.mp, v2.mp, v3.mp, rnd_mode);
+    return result;
+}
+
+// Fused multiply-subtract via mpfr_fms(v1, v2, v3); the result is given the
+// largest precision among the three operands.
+inline const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    const mp_prec_t widest = (std::max)((std::max)(v1.get_prec(), v2.get_prec()), v3.get_prec());
+
+    mpreal result;
+    result.set_prec(widest);
+
+    mpfr_fms(result.mp, v1.mp, v2.mp, v3.mp, rnd_mode);
+    return result;
+}
+
+// Arithmetic-geometric mean of v1 and v2 (mpfr_agm); the result is given the
+// larger of the two operand precisions.
+inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    const mp_prec_t widest = (std::max)(v1.get_prec(), v2.get_prec());
+
+    mpreal mean;
+    mean.set_prec(widest);
+
+    mpfr_agm(mean.mp, v1.mp, v2.mp, rnd_mode);
+
+    return mean;
+}
+
+// Sums the n elements of tab with mpfr_sum. The status value returned by
+// mpfr_sum is stored in `status`; the result is a default-constructed mpreal.
+inline const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t mode = mpreal::get_default_rnd())
+{
+    // mpfr_sum takes an array of pointers to its operands, so collect them first.
+    mpfr_srcptr *operands = new mpfr_srcptr[n];
+
+    unsigned long int idx = 0;
+    while (idx < n)
+    {
+        operands[idx] = tab[idx].mpfr_srcptr();
+        ++idx;
+    }
+
+    mpreal total;
+    status = mpfr_sum(total.mpfr_ptr(), (mpfr_ptr*)operands, n, mode);
+
+    delete [] operands;
+    return total;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// MPFR 2.4.0 Specifics
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
+
+// Computes sinh and cosh of v in one call (mpfr_sinh_cosh), storing them in s and c.
+// Returns the status value of mpfr_sinh_cosh.
+inline int sinh_cosh(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    return mpfr_sinh_cosh(s.mp,c.mp,v.mp,rnd_mode);
+}
+
+// Wrapper over mpfr_li2 at the precision of x (see MPREAL_UNARY_MATH_FUNCTION_BODY).
+inline const mpreal li2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
+{
+    MPREAL_UNARY_MATH_FUNCTION_BODY(li2);
+}
+
+// Remainder with truncated quotient; implemented as a forward to fmod(x, y, rnd_mode).
+inline const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    /*  R = rem(X,Y) if Y != 0, returns X - n * Y where n = trunc(X/Y). */
+    return fmod(x, y, rnd_mode);
+}
+
+// Modulus with floored quotient. rnd_mode is accepted for interface symmetry
+// but is not used by the computation.
+inline const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    (void)rnd_mode;
+
+    /*
+
+    m = mod(x,y) if y != 0, returns x - n*y where n = floor(x/y)
+
+    The following are true by convention:
+    - mod(x,0) is x
+    - mod(x,x) is 0
+    - mod(x,y) for x != y and y != 0 has the same sign as y.
+
+    */
+
+    // Conventional special cases first.
+    if (iszero(y)) return x;
+    if (x == y)    return 0;
+
+    const mpreal flooredQuotient = floor(x / y);
+    mpreal result = x - flooredQuotient * y;
+
+    result.setSign(sgn(y)); // make sure result has the same sign as Y
+
+    return result;
+}
+
+// Wrapper over mpfr_fmod(x, y); the result is given the larger of the two
+// operand precisions.
+inline const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    const mp_prec_t widest = (std::max)(y.get_prec(), x.get_prec());
+
+    mpreal rest;
+    rest.set_prec(widest);
+
+    mpfr_fmod(rest.mp, x.mp, y.mp, rnd_mode);
+
+    return rest;
+}
+
+// Reciprocal square root of v (mpfr_rec_sqrt), computed into a copy of v.
+inline const mpreal rec_sqrt(const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal reciprocalRoot(v);
+    mpfr_rec_sqrt(reciprocalRoot.mp, v.mp, rnd_mode);
+    return reciprocalRoot;
+}
+#endif //  MPFR 2.4.0 Specifics
+
+//////////////////////////////////////////////////////////////////////////
+// MPFR 3.0.0 Specifics
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
+// Wrappers over mpfr_digamma and mpfr_ai, compiled only for MPFR >= 3.0.0.
+inline const mpreal digamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(digamma);     }
+inline const mpreal ai      (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(ai);          }
+#endif // MPFR 3.0.0 Specifics
+
+//////////////////////////////////////////////////////////////////////////
+// Constants
+// Constant produced by mpfr_const_log2 at precision p, rounded with r.
+inline const mpreal const_log2 (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
+{
+    mpreal constant(0, p);
+    mpfr_const_log2(constant.mpfr_ptr(), r);
+    return constant;
+}
+
+// Constant produced by mpfr_const_pi at precision p, rounded with r.
+inline const mpreal const_pi (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
+{
+    mpreal constant(0, p);
+    mpfr_const_pi(constant.mpfr_ptr(), r);
+    return constant;
+}
+
+// Constant produced by mpfr_const_euler at precision p, rounded with r.
+inline const mpreal const_euler (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
+{
+    mpreal constant(0, p);
+    mpfr_const_euler(constant.mpfr_ptr(), r);
+    return constant;
+}
+
+// Constant produced by mpfr_const_catalan at precision p, rounded with r.
+inline const mpreal const_catalan (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
+{
+    mpreal constant(0, p);
+    mpfr_const_catalan(constant.mpfr_ptr(), r);
+    return constant;
+}
+
+// Infinity at precision p; `sign` is forwarded unchanged to mpfr_set_inf.
+inline const mpreal const_infinity (int sign = 1, mp_prec_t p = mpreal::get_default_prec())
+{
+    mpreal infinity(0, p);
+    mpfr_set_inf(infinity.mpfr_ptr(), sign);
+    return infinity;
+}
+
+//////////////////////////////////////////////////////////////////////////
+// Integer Related Functions
+// Rounding to an integer value. Each function copies v and overwrites the copy
+// with the result of the MPFR function of the same name.
+
+// mpfr_ceil applied to v.
+inline const mpreal ceil(const mpreal& v)
+{
+    mpreal rounded(v);
+    mpfr_ceil(rounded.mp, v.mp);
+    return rounded;
+}
+
+// mpfr_floor applied to v.
+inline const mpreal floor(const mpreal& v)
+{
+    mpreal rounded(v);
+    mpfr_floor(rounded.mp, v.mp);
+    return rounded;
+}
+
+// mpfr_round applied to v.
+inline const mpreal round(const mpreal& v)
+{
+    mpreal rounded(v);
+    mpfr_round(rounded.mp, v.mp);
+    return rounded;
+}
+
+// mpfr_trunc applied to v.
+inline const mpreal trunc(const mpreal& v)
+{
+    mpreal rounded(v);
+    mpfr_trunc(rounded.mp, v.mp);
+    return rounded;
+}
+
+// mpfr_rint* / mpfr_frac wrappers; each expands MPREAL_UNARY_MATH_FUNCTION_BODY(f),
+// so the result has the precision of x and r is passed as the rounding mode.
+inline const mpreal rint       (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint      );     }
+inline const mpreal rint_ceil  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_ceil );     }
+inline const mpreal rint_floor (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_floor);     }
+inline const mpreal rint_round (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_round);     }
+inline const mpreal rint_trunc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_trunc);     }
+inline const mpreal frac       (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(frac      );     }
+
+//////////////////////////////////////////////////////////////////////////
+// Miscellaneous Functions
+// swap exchanges the two numbers via mpfr_swap; max / min return a copy of the
+// operand selected by operator> / operator< (y is returned when the comparison is false).
+// The names are parenthesised so that function-like max/min macros are not expanded.
+inline void         swap (mpreal& a, mpreal& b)            {    mpfr_swap(a.mp,b.mp);   }
+inline const mpreal (max)(const mpreal& x, const mpreal& y){    return (x>y?x:y);       }
+inline const mpreal (min)(const mpreal& x, const mpreal& y){    return (x<y?x:y);       }
+
+// Larger of x and y as selected by mpfr_max.
+// The result is created with the larger of the two operand precisions (the same
+// rule atan2 / hypot / remainder use), so the selected operand is not rounded
+// down to the default precision when it carries more bits than the default.
+inline const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal a(0, (std::max)(x.getPrecision(), y.getPrecision()));
+    mpfr_max(a.mp,x.mp,y.mp,rnd_mode);
+    return a;
+}
+
+// Smaller of x and y as selected by mpfr_min.
+// The result is created with the larger of the two operand precisions (the same
+// rule atan2 / hypot / remainder use), so the selected operand is not rounded
+// down to the default precision when it carries more bits than the default.
+inline const mpreal fmin(const mpreal& x, const mpreal& y,  mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal a(0, (std::max)(x.getPrecision(), y.getPrecision()));
+    mpfr_min(a.mp,x.mp,y.mp,rnd_mode);
+    return a;
+}
+
+// Neighbouring representable values. Each function copies x and lets MPFR step
+// the copy in place.
+
+// Copy of x stepped by mpfr_nexttoward in the direction of y.
+inline const mpreal nexttoward (const mpreal& x, const mpreal& y)
+{
+    mpreal neighbour(x);
+    mpfr_nexttoward(neighbour.mp, y.mp);
+    return neighbour;
+}
+
+// Copy of x stepped by mpfr_nextabove.
+inline const mpreal nextabove  (const mpreal& x)
+{
+    mpreal neighbour(x);
+    mpfr_nextabove(neighbour.mp);
+    return neighbour;
+}
+
+// Copy of x stepped by mpfr_nextbelow.
+inline const mpreal nextbelow  (const mpreal& x)
+{
+    mpreal neighbour(x);
+    mpfr_nextbelow(neighbour.mp);
+    return neighbour;
+}
+
+// Random number produced by mpfr_urandomb from the caller-supplied GMP random
+// state; the result is a default-constructed mpreal.
+inline const mpreal urandomb (gmp_randstate_t& state)
+{
+    mpreal x;
+    mpfr_urandomb(x.mpfr_ptr(),state);
+    return x;
+}
+
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
+// Random number produced by mpfr_urandom (MPFR >= 3.0.0 only) from the
+// caller-supplied GMP random state, rounded with rnd_mode.
+inline const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal x;
+    mpfr_urandom(x.mpfr_ptr(), state, rnd_mode);
+    return x;
+}
+#endif
+
+#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
+// Wrapper over mpfr_random2, compiled only for MPFR <= 2.4.2.
+inline const mpreal random2 (mp_size_t size, mp_exp_t exp)
+{
+    mpreal x;
+    mpfr_random2(x.mpfr_ptr(),size,exp);
+    return x;
+}
+#endif
+
+// Uniformly distributed random number generation
+// a = random(seed); <- initialization & first random number generation
+// a = random();     <- next random numbers generation
+// seed != 0
+//
+// MPFR >= 3.0.0: a function-local static GMP state is created on the first call
+// (seeded with 0), re-seeded whenever a non-zero seed is passed, and then handed
+// to mpfr::urandom. Older MPFR: falls back to std::srand / std::rand scaled by RAND_MAX.
+// NOTE(review): the static state is shared by all callers and no locking is
+// visible here — confirm before using from several threads.
+inline const mpreal random(unsigned int seed = 0)
+{
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
+    static gmp_randstate_t state;
+    static bool initialize = true;
+
+    if(initialize)
+    {
+        gmp_randinit_default(state);
+        gmp_randseed_ui(state,0);
+        initialize = false;
+    }
+
+    if(seed != 0)    gmp_randseed_ui(state,seed);
+
+    return mpfr::urandom(state);
+#else
+    if(seed != 0)    std::srand(seed);
+    return mpfr::mpreal(std::rand()/(double)RAND_MAX);
+#endif
+
+}
+
+#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
+
+// Random number produced by mpfr_grandom from the caller-supplied GMP random
+// state; NULL is passed for mpfr_grandom's second output, so only one value is produced.
+inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal x;
+    mpfr_grandom(x.mpfr_ptr(), NULL, state, rnd_mode);
+    return x;
+}
+
+// Convenience overload of grandom using a function-local static GMP state:
+// created and seeded with 0 on the first call, re-seeded whenever a non-zero
+// seed is passed.
+// NOTE(review): the static state is shared by all callers and no locking is
+// visible here — confirm before using from several threads.
+inline const mpreal grandom(unsigned int seed = 0)
+{
+    static gmp_randstate_t state;
+    static bool initialize = true;
+
+    if(initialize)
+    {
+        gmp_randinit_default(state);
+        gmp_randseed_ui(state,0);
+        initialize = false;
+    }
+
+    if(seed != 0) gmp_randseed_ui(state,seed);
+
+    return mpfr::grandom(state);
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+// Set/Get global properties
+// Sets MPFR's default precision (mpfr_set_default_prec).
+inline void mpreal::set_default_prec(mp_prec_t prec)
+{
+    mpfr_set_default_prec(prec);
+}
+
+// Sets MPFR's default rounding mode (mpfr_set_default_rounding_mode).
+inline void mpreal::set_default_rnd(mp_rnd_t rnd_mode)
+{
+    mpfr_set_default_rounding_mode(rnd_mode);
+}
+
+// True when x is infinite, or when its frexp mantissa scaled by 2^n has no
+// fractional part (i.e. the mantissa needs at most n bits).
+inline bool mpreal::fits_in_bits(double x, int n)
+{
+    if (IsInf(x)) return true;
+
+    int    exponent;      // written by frexp, not used further
+    double integralPart;  // written by modf, not used further
+
+    const double mantissa   = std::frexp(x, &exponent);
+    const double fractional = std::modf(std::ldexp(mantissa, n), &integralPart);
+    return fractional == 0.0;
+}
+
+// pow with an mpreal base. Each overload copies a and raises the copy in place
+// with the MPFR function matching the exponent type.
+
+// mpreal exponent (mpfr_pow).
+inline const mpreal pow(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal power(a);
+    mpfr_pow(power.mp, power.mp, b.mp, rnd_mode);
+    return power;
+}
+
+// mpz_t exponent (mpfr_pow_z).
+inline const mpreal pow(const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal power(a);
+    mpfr_pow_z(power.mp, power.mp, b, rnd_mode);
+    return power;
+}
+
+// unsigned long exponent (mpfr_pow_ui).
+inline const mpreal pow(const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal power(a);
+    mpfr_pow_ui(power.mp, power.mp, b, rnd_mode);
+    return power;
+}
+
+// unsigned int exponent: widens b and forwards to the unsigned long overload.
+inline const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode)
+{
+    return pow(a,static_cast<unsigned long int>(b),rnd_mode);
+}
+
+// long exponent: copies a and raises the copy in place with mpfr_pow_si.
+inline const mpreal pow(const mpreal& a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal power(a);
+    mpfr_pow_si(power.mp, power.mp, b, rnd_mode);
+    return power;
+}
+
+// int exponent: widens b and forwards to the long overload.
+inline const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode)
+{
+    return pow(a,static_cast<long int>(b),rnd_mode);
+}
+
+// long double exponent: converts b to mpreal and forwards to pow(mpreal, mpreal).
+inline const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode)
+{
+    return pow(a,mpreal(b),rnd_mode);
+}
+
+// double exponent: converts b to mpreal and forwards to pow(mpreal, mpreal).
+inline const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode)
+{
+    return pow(a,mpreal(b),rnd_mode);
+}
+
+// unsigned long base, mpreal exponent (mpfr_ui_pow). The result object is
+// constructed from a and then overwritten by MPFR.
+inline const mpreal pow(const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
+{
+    mpreal power(a);
+    mpfr_ui_pow(power.mp, a, b.mp, rnd_mode);
+    return power;
+}
+
+// unsigned int base: widens a and forwards to the unsigned long overload.
+inline const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode)
+{
+    return pow(static_cast<unsigned long int>(a),b,rnd_mode);
+}
+
+// Signed integer bases: negative values are converted to mpreal first;
+// non-negative values take the unsigned long overload.
+inline const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode)
+{
+    if (a < 0)
+        return pow(mpreal(a),b,rnd_mode);
+
+    return pow(static_cast<unsigned long int>(a),b,rnd_mode);
+}
+
+inline const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode)
+{
+    if (a < 0)
+        return pow(mpreal(a),b,rnd_mode);
+
+    return pow(static_cast<unsigned long int>(a),b,rnd_mode);
+}
+
+inline const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),b,rnd_mode); // a^b: the long double base is converted to an mpreal temporary, then pow(mpreal, mpreal)
+}
+
+inline const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),b,rnd_mode); // a^b: the double base is converted to an mpreal temporary, then pow(mpreal, mpreal)
+}
+
+// pow unsigned long int
+inline const mpreal pow(const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode)
+{
+    mpreal result(a);                              // destination object, constructed from the integer base
+    mpfr_ui_pow_ui(result.mp,a,b,rnd_mode);        // result = a^b with both operands unsigned long
+    return result;
+}
+
+inline const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode)
+{
+    return pow(a,static_cast<unsigned long int>(b),rnd_mode); // widen b, then the (unsigned long, unsigned long) overload: mpfr_ui_pow_ui
+}
+
+inline const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode)
+{
+    if(b<=0)   return pow(a,mpreal(b),rnd_mode);                     // zero or negative exponent: mpfr_ui_pow
+    return pow(a,static_cast<unsigned long int>(b),rnd_mode);        // positive exponent: mpfr_ui_pow_ui
+}
+
+inline const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode)
+{
+    if(b<=0)   return pow(a,mpreal(b),rnd_mode);                     // zero or negative exponent: mpfr_ui_pow
+    return pow(a,static_cast<unsigned long int>(b),rnd_mode);        // positive exponent: mpfr_ui_pow_ui
+}
+
+inline const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode)
+{
+    return pow(a,mpreal(b),rnd_mode); // exponent converted to mpreal, then the (unsigned long, mpreal) overload: mpfr_ui_pow
+}
+
+inline const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode)
+{
+    return pow(a,mpreal(b),rnd_mode); // exponent converted to mpreal, then the (unsigned long, mpreal) overload: mpfr_ui_pow
+}
+
+// pow unsigned int
+inline const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode)
+{
+    return pow(static_cast<unsigned long int>(a),b,rnd_mode); // widen a, then the (unsigned long, unsigned long) overload: mpfr_ui_pow_ui
+}
+
+inline const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode)
+{
+    return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); // widen both operands, then mpfr_ui_pow_ui
+}
+
+inline const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode)
+{
+    const unsigned long int base = a;   // widened once; b>0 uses mpfr_ui_pow_ui, otherwise mpfr_ui_pow
+    return (b>0) ? pow(base,static_cast<unsigned long int>(b),rnd_mode) : pow(base,mpreal(b),rnd_mode);
+}
+
+inline const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode)
+{
+    const unsigned long int base = a;   // widened once; b>0 uses mpfr_ui_pow_ui, otherwise mpfr_ui_pow
+    return (b>0) ? pow(base,static_cast<unsigned long int>(b),rnd_mode) : pow(base,mpreal(b),rnd_mode);
+}
+
+inline const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode)
+{
+    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); // widen a, convert b to mpreal, then mpfr_ui_pow
+}
+
+inline const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode)
+{
+    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); // widen a, convert b to mpreal, then mpfr_ui_pow
+}
+
+// pow long int
+inline const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode)
+{
+    if (a<=0) return pow(mpreal(a),b,rnd_mode);                   // zero or negative base: mpfr_pow_ui on an mpreal base
+    return pow(static_cast<unsigned long int>(a),b,rnd_mode);     // positive base: mpfr_ui_pow_ui
+}
+
+inline const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode)
+{
+    if (a<=0) return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode);                       // zero or negative base: mpfr_pow_ui
+    return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode);         // positive base: mpfr_ui_pow_ui
+}
+
+inline const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode)
+{
+    if (a<=0)
+    {
+        return pow(mpreal(a),b,rnd_mode); // zero or negative base: mpfr_pow_si
+    }
+    // The base is strictly positive from here on, so it can be passed as unsigned long.
+    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
+    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
+}
+
+inline const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode)
+{
+    if (a<=0)
+    {
+        return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // zero or negative base: mpfr_pow_si
+    }
+    // The base is strictly positive from here on, so it can be passed as unsigned long.
+    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
+    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
+}
+
+inline const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode)
+{
+    if (a<0)    return pow(mpreal(a),mpreal(b),rnd_mode);                   // negative base: mpfr_pow on two mpreal values
+    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode);       // non-negative base: mpfr_ui_pow
+}
+
+inline const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode)
+{
+    if (a<0)    return pow(mpreal(a),mpreal(b),rnd_mode);                   // negative base: mpfr_pow on two mpreal values
+    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode);       // non-negative base: mpfr_ui_pow
+}
+
+// pow int
+inline const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode)
+{
+    if (a<=0) return pow(mpreal(a),b,rnd_mode);                   // zero or negative base: mpfr_pow_ui on an mpreal base
+    return pow(static_cast<unsigned long int>(a),b,rnd_mode);     // positive base: mpfr_ui_pow_ui
+}
+
+inline const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode)
+{
+    if (a<=0) return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode);                       // zero or negative base: mpfr_pow_ui
+    return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode);         // positive base: mpfr_ui_pow_ui
+}
+
+inline const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode)
+{
+    if (a<=0)
+    {
+        return pow(mpreal(a),b,rnd_mode); // zero or negative base: mpfr_pow_si
+    }
+    // The base is strictly positive from here on, so it can be passed as unsigned long.
+    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
+    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
+}
+
+inline const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode)
+{
+    if (a<=0)
+    {
+        return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // zero or negative base: mpfr_pow_si
+    }
+    // The base is strictly positive from here on, so it can be passed as unsigned long.
+    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
+    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
+}
+
+inline const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode)
+{
+    if (a<0)    return pow(mpreal(a),mpreal(b),rnd_mode);                   // negative base: mpfr_pow on two mpreal values
+    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode);       // non-negative base: mpfr_ui_pow
+}
+
+inline const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode)
+{
+    if (a<0)    return pow(mpreal(a),mpreal(b),rnd_mode);                   // negative base: mpfr_pow on two mpreal values
+    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode);       // non-negative base: mpfr_ui_pow
+}
+
+// pow long double
+inline const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),mpreal(b),rnd_mode); // both operands converted to mpreal temporaries, then pow(mpreal, mpreal)
+}
+
+inline const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),b,rnd_mode); // base converted to mpreal, then the (mpreal, unsigned long) overload: mpfr_pow_ui
+}
+
+inline const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); // base converted to mpreal, exponent widened: mpfr_pow_ui
+}
+
+inline const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),b,rnd_mode); // base converted to mpreal, then the (mpreal, long) overload: mpfr_pow_si
+}
+
+inline const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // base converted to mpreal, exponent widened: mpfr_pow_si
+}
+
+inline const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),mpreal(b),rnd_mode); // both operands converted to mpreal temporaries, then pow(mpreal, mpreal)
+}
+
+inline const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),b,rnd_mode); // base converted to mpreal, then the (mpreal, unsigned long) overload: mpfr_pow_ui
+}
+
+inline const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); // base converted to mpreal, exponent widened: mpfr_pow_ui
+}
+
+inline const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),b,rnd_mode); // base converted to mpreal, then the (mpreal, long) overload: mpfr_pow_si
+}
+
+inline const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode)
+{
+    return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // base converted to mpreal, exponent widened: mpfr_pow_si
+}
+} // End of mpfr namespace
+
+// Explicit specialization of std::swap for mpreal numbers
+// Thus standard algorithms will use efficient version of swap (due to Koenig lookup)
+// Non-throwing swap C++ idiom: http://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Non-throwing_swap
+namespace std
+{
+  // Only specializations of existing std templates are added here: std::swap and std::numeric_limits for mpfr::mpreal.
+    template <>
+    inline void swap(mpfr::mpreal& x, mpfr::mpreal& y)
+    {
+        return mpfr::swap(x, y);    // forwards to mpfr::swap (returning a void expression from a void function)
+    }
+
+    template<>
+    class numeric_limits<mpfr::mpreal>
+    {
+    public:
+        static const bool is_specialized    = true;
+        static const bool is_signed         = true;
+        static const bool is_integer        = false;
+        static const bool is_exact          = false;
+        static const int  radix             = 2;
+
+        static const bool has_infinity      = true;
+        static const bool has_quiet_NaN     = true;
+        static const bool has_signaling_NaN = true;        // NOTE(review): signaling_NaN() below returns the same value as quiet_NaN()
+
+        static const bool is_iec559         = true;        // = IEEE 754
+        static const bool is_bounded        = true;
+        static const bool is_modulo         = false;
+        static const bool traps             = true;
+        static const bool tinyness_before   = true;
+
+        static const float_denorm_style has_denorm  = denorm_absent;
+
+        inline static mpfr::mpreal (min)    (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::minval(precision);  }
+        inline static mpfr::mpreal (max)    (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::maxval(precision);  }
+        inline static mpfr::mpreal lowest   (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return -mpfr::maxval(precision);  }
+
+        // Machine epsilon at the given precision, as computed by mpfr::machine_epsilon(precision)
+        inline static mpfr::mpreal epsilon(mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::machine_epsilon(precision); }
+
+        // Machine epsilon relative to the value x, as computed by mpfr::machine_epsilon(x)
+        inline static mpfr::mpreal epsilon(const mpfr::mpreal& x) {  return mpfr::machine_epsilon(x);  }
+
+        inline static mpfr::mpreal round_error(mp_prec_t precision = mpfr::mpreal::get_default_prec())
+        {
+            mp_rnd_t r = mpfr::mpreal::get_default_rnd();                    // depends on the current default rounding mode
+
+            if(r == GMP_RNDN)  return mpfr::mpreal(0.5, precision);          // round-to-nearest: 0.5
+            else               return mpfr::mpreal(1.0, precision);          // any other rounding mode: 1.0
+        }
+
+        inline static const mpfr::mpreal infinity()         { return mpfr::const_infinity();     }
+        inline static const mpfr::mpreal quiet_NaN()        { return mpfr::mpreal().setNan();    }
+        inline static const mpfr::mpreal signaling_NaN()    { return mpfr::mpreal().setNan();    }
+        inline static const mpfr::mpreal denorm_min()       { return (min)();                    }
+
+        // Please note: the exponent range is not fixed in MPFR; the values below use the MPFR default range macros.
+        static const int min_exponent = MPFR_EMIN_DEFAULT;
+        static const int max_exponent = MPFR_EMAX_DEFAULT;
+        MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811);
+        MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811);
+
+#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
+
+        // The following members should be constants according to the standard, but they can vary in MPFR,
+        // so they are defined as functions here.
+        //
+        // This is the preferable way to specialize std::numeric_limits<mpfr::mpreal>.
+        // But it is incompatible with the standard std::numeric_limits and might not work with other libraries, e.g. boost.
+        // See below for the compatible implementation.
+        inline static float_round_style round_style()
+        {
+            mp_rnd_t r = mpfr::mpreal::get_default_rnd();
+
+            switch (r)
+            {
+            case GMP_RNDN: return round_to_nearest;
+            case GMP_RNDZ: return round_toward_zero;
+            case GMP_RNDU: return round_toward_infinity;
+            case GMP_RNDD: return round_toward_neg_infinity;
+            default: return round_indeterminate;
+            }
+        }
+
+        inline static int digits()                        {    return int(mpfr::mpreal::get_default_prec());    }
+        inline static int digits(const mpfr::mpreal& x)   {    return x.getPrecision();                         }
+
+        inline static int digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
+        {
+            return mpfr::bits2digits(precision);
+        }
+
+        inline static int digits10(const mpfr::mpreal& x)
+        {
+            return mpfr::bits2digits(x.getPrecision());
+        }
+
+        inline static int max_digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
+        {
+            return digits10(precision);    // NOTE(review): equals digits10 here, while the static fallback below uses digits10 + 1 — confirm intended
+        }
+#else
+        // Digits and round_style are NOT constants when it comes to mpreal.
+        // If possible, please use the functions digits() and round_style() from the
+        // MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS branch above.
+        // These (default) values are preserved for compatibility with existing libraries, e.g. boost.
+        // Change them according to your application.
+        //
+        // For example, if you use 256 bits of precision uniformly in your program, then:
+        // digits       = 256
+        // digits10     = 77
+        // max_digits10 = 78
+        //
+        // Approximate formula for decimal digits is: digits10 = floor(log10(2) * digits). See bits2digits() for more details.
+
+        static const std::float_round_style round_style = round_to_nearest;
+        static const int digits       = 53;
+        static const int digits10     = 15;
+        static const int max_digits10 = 16;
+#endif
+    };
+
+}
+
+#endif /* __MPREAL_H__ */
diff --git a/uppsrc/plugin/Eigen/unsupported/test/mpreal_support.cpp b/uppsrc/plugin/Eigen/unsupported/test/mpreal_support.cpp
new file mode 100644
index 000000000..685e7ea45
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/mpreal_support.cpp
@@ -0,0 +1,65 @@
+#include "main.h"
+#include <Eigen/MPRealSupport>
+#include <Eigen/LU>
+#include <Eigen/Eigenvalues>
+#include <sstream>
+
+using namespace mpfr;
+using namespace Eigen;
+
+void test_mpreal_support()
+{
+  // set precision to 256 bits (double has only 53 bits)
+  mpreal::set_default_prec(256);
+  typedef Matrix<mpreal,Eigen::Dynamic,Eigen::Dynamic> MatrixXmp;
+  typedef Matrix<std::complex<mpreal>,Eigen::Dynamic,Eigen::Dynamic> MatrixXcmp;
+
+  std::cerr << "epsilon =         " << NumTraits<mpreal>::epsilon() << "\n";
+  std::cerr << "dummy_precision = " << NumTraits<mpreal>::dummy_precision() << "\n";
+  std::cerr << "highest =         " << NumTraits<mpreal>::highest() << "\n";
+  std::cerr << "lowest =          " << NumTraits<mpreal>::lowest() << "\n";
+  std::cerr << "digits10 =        " << NumTraits<mpreal>::digits10() << "\n";
+
+  for(int i = 0; i < g_repeat; i++) {
+    int s = Eigen::internal::random<int>(1,100);   // random square size for this repetition
+    MatrixXmp A = MatrixXmp::Random(s,s);
+    MatrixXmp B = MatrixXmp::Random(s,s);
+    MatrixXmp S = A.adjoint() * A;                 // self-adjoint input for the Cholesky and eigenvalue checks
+    MatrixXmp X;
+    MatrixXcmp Ac = MatrixXcmp::Random(s,s);
+    MatrixXcmp Bc = MatrixXcmp::Random(s,s);
+    MatrixXcmp Sc = Ac.adjoint() * Ac;             // complex counterpart of S
+    MatrixXcmp Xc;
+    
+    // Basic checks: coefficient-wise functions and norms on mpreal matrices
+    VERIFY_IS_APPROX(A.real(), A);
+    VERIFY(Eigen::internal::isApprox(A.array().abs2().sum(), A.squaredNorm()));
+    VERIFY_IS_APPROX(A.array().exp(),         exp(A.array()));
+    VERIFY_IS_APPROX(A.array().abs2().sqrt(), A.array().abs());
+    VERIFY_IS_APPROX(A.array().sin(),         sin(A.array()));
+    VERIFY_IS_APPROX(A.array().cos(),         cos(A.array()));
+
+    // Cholesky: solve S*X = B (real, then complex) and check the product reproduces the right-hand side
+    X = S.selfadjointView<Lower>().llt().solve(B);
+    VERIFY_IS_APPROX((S.selfadjointView<Lower>()*X).eval(),B);
+
+    Xc = Sc.selfadjointView<Lower>().llt().solve(Bc);
+    VERIFY_IS_APPROX((Sc.selfadjointView<Lower>()*Xc).eval(),Bc);
+    
+    // partial LU: solve A*X = B and check the product reproduces B
+    X = A.lu().solve(B);
+    VERIFY_IS_APPROX((A*X).eval(),B);
+
+    // symmetric eigenvalues: check S*V == V*diag(lambda) with a tolerance of 1e3 * dummy_precision
+    SelfAdjointEigenSolver<MatrixXmp> eig(S);
+    VERIFY_IS_EQUAL(eig.info(), Success);
+    VERIFY( (S.selfadjointView<Lower>() * eig.eigenvectors()).isApprox(eig.eigenvectors() * eig.eigenvalues().asDiagonal(), NumTraits<mpreal>::dummy_precision()*1e3) );
+  }
+  
+  {
+    MatrixXmp A(8,3); A.setRandom();
+    // test output (interesting things happen in this code); the streamed text itself is not inspected
+    std::stringstream stream;
+    stream << A;
+  }
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/openglsupport.cpp b/uppsrc/plugin/Eigen/unsupported/test/openglsupport.cpp
new file mode 100644
index 000000000..5f6343427
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/openglsupport.cpp
@@ -0,0 +1,333 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <main.h>
+#include <iostream>
+#include <GL/glew.h>
+#include <Eigen/OpenGLSupport>
+#include <GL/glut.h>
+using namespace Eigen;
+
+
+// VERIFY_MATRIX(CODE,REF): loads the identity, runs CODE, reads GL_MODELVIEW_MATRIX back into a 4x4 column-major float
+// matrix and checks it against REF cast to float (assumes the modelview stack is the current matrix mode — TODO confirm).
+#define VERIFY_MATRIX(CODE,REF) { \
+    glLoadIdentity(); \
+    CODE; \
+    Matrix<float,4,4,ColMajor> m; m.setZero(); \
+    glGet(GL_MODELVIEW_MATRIX, m); \
+    if(!(REF).cast<float>().isApprox(m)) { \
+      std::cerr << "Expected:\n" << ((REF).cast<float>()) << "\n" << "got\n" << m << "\n\n"; \
+    } \
+    VERIFY_IS_APPROX((REF).cast<float>(), m); \
+  }
+// VERIFY_UNIFORM(SUFFIX,NAME,TYPE): uploads a random TYPE to uniform NAME of prg_id (taken from the enclosing scope), reads it back with glGetUniform##SUFFIX and compares.
+#define VERIFY_UNIFORM(SUFFIX,NAME,TYPE) { \
+    TYPE value; value.setRandom(); \
+    TYPE data; \
+    int loc = glGetUniformLocation(prg_id, #NAME); \
+    VERIFY((loc!=-1) && "uniform not found"); \
+    glUniform(loc,value); \
+    EIGEN_CAT(glGetUniform,SUFFIX)(prg_id,loc,data.data()); \
+    if(!value.isApprox(data)) { \
+      std::cerr << "Expected:\n" << value << "\n" << "got\n" << data << "\n\n"; \
+    } \
+    VERIFY_IS_APPROX(value, data); \
+  }
+// VERIFY_UNIFORMi(NAME,TYPE): integer variant; the random value is passed through float and back, and the read-back uses glGetUniformiv with a GLint* cast.
+#define VERIFY_UNIFORMi(NAME,TYPE) { \
+    TYPE value = TYPE::Random().eval().cast<float>().cast<TYPE::Scalar>(); \
+    TYPE data; \
+    int loc = glGetUniformLocation(prg_id, #NAME); \
+    VERIFY((loc!=-1) && "uniform not found"); \
+    glUniform(loc,value); \
+    glGetUniformiv(prg_id,loc,(GLint*)data.data()); \
+    if(!value.isApprox(data)) { \
+      std::cerr << "Expected:\n" << value << "\n" << "got\n" << data << "\n\n"; \
+    } \
+    VERIFY_IS_APPROX(value, data); \
+  }
+  
+void printInfoLog(GLuint objectID)
+{
+    // Prints the info log of the program object objectID to std::cerr, if GL reports a non-empty one.
+    int logLength, written;
+    glGetProgramiv(objectID,GL_INFO_LOG_LENGTH, &logLength);
+    if(logLength <= 0)
+        return;
+
+    GLchar *logText = new GLchar[logLength];
+    glGetProgramInfoLog(objectID, logLength, &written, logText);
+    if (written>0)
+      std::cerr << "Shader info : \n" << logText << std::endl;
+    delete[] logText;
+}
+
+GLint createShader(const char* vtx, const char* frg)
+{
+  // Builds a program from one vertex and one fragment source, makes it current and returns its id.
+  const GLint program        = glCreateProgram();
+  const GLint vertexShader   = glCreateShader(GL_VERTEX_SHADER);
+  const GLint fragmentShader = glCreateShader(GL_FRAGMENT_SHADER);
+  GLint status;
+  
+  // Vertex stage: upload the single source string and compile; a failure is reported but not fatal.
+  glShaderSource(vertexShader, 1, &vtx, 0);
+  glCompileShader(vertexShader);
+  glGetShaderiv(vertexShader,GL_COMPILE_STATUS,&status);
+  if(!status)
+    std::cerr << "vtx compilation failed\n";
+  
+  // Fragment stage: same steps.
+  glShaderSource(fragmentShader, 1, &frg, 0);
+  glCompileShader(fragmentShader);
+  glGetShaderiv(fragmentShader,GL_COMPILE_STATUS,&status);
+  if(!status)
+    std::cerr << "frg compilation failed\n";
+  
+  // Link both stages; the program info log is printed whether or not linking succeeded.
+  glAttachShader(program, vertexShader);
+  glAttachShader(program, fragmentShader);
+  glLinkProgram(program);
+  glGetProgramiv(program,GL_LINK_STATUS,&status);
+  if(!status)
+    std::cerr << "linking failed\n";
+  printInfoLog(program);
+  
+  // The program is bound before returning, so callers can set uniforms on it right away.
+  glUseProgram(program);
+  return program;
+}
+
+
+void test_openglsupport()
+{ // Exercises the Eigen/OpenGLSupport wrappers on a live GL context and compares what GL reports with Eigen's own results.
+  int argc = 0;
+  glutInit(&argc, 0);
+  glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH);
+  glutInitWindowPosition (0,0);
+  glutInitWindowSize(10, 10);
+
+  if(glutCreateWindow("Eigen") <= 0)
+  {
+    std::cerr << "Error: Unable to create GLUT Window.\n";
+    exit(1);
+  }
+  
+  glewExperimental = GL_TRUE;
+  if(glewInit() != GLEW_OK)
+  {
+    std::cerr << "Warning: Failed to initialize GLEW\n";
+  }
+
+  Vector3f v3f = Vector3f::Random();   // initialised so glVertex never reads indeterminate coefficients
+  Matrix3f rot = Matrix3f::Random();   // likewise for the matrix used in the product expression below
+  glBegin(GL_POINTS);
+  // glVertex on a plain vector and on two expression types; nothing is verified here beyond compiling and running.
+  glVertex(v3f);
+  glVertex(2*v3f+v3f);
+  glVertex(rot*v3f);
+  
+  glEnd();
+  
+  // 4x4 matrices
+  Matrix4f mf44; mf44.setRandom();
+  VERIFY_MATRIX(glLoadMatrix(mf44), mf44);
+  VERIFY_MATRIX(glMultMatrix(mf44), mf44);
+  Matrix4d md44; md44.setRandom();
+  VERIFY_MATRIX(glLoadMatrix(md44), md44);
+  VERIFY_MATRIX(glMultMatrix(md44), md44);
+  
+  // Quaternion
+  Quaterniond qd(AngleAxisd(internal::random<double>(), Vector3d::Random()));
+  VERIFY_MATRIX(glRotate(qd), Projective3d(qd).matrix());
+  
+  Quaternionf qf(AngleAxisf(internal::random<float>(), Vector3f::Random()));   // angle drawn as float to match AngleAxisf
+  VERIFY_MATRIX(glRotate(qf), Projective3f(qf).matrix());
+  
+  // 3D Transform
+  Transform<float,3,AffineCompact> acf3; acf3.matrix().setRandom();
+  VERIFY_MATRIX(glLoadMatrix(acf3), Projective3f(acf3).matrix());
+  VERIFY_MATRIX(glMultMatrix(acf3), Projective3f(acf3).matrix());
+  
+  Transform<float,3,Affine> af3(acf3);
+  VERIFY_MATRIX(glLoadMatrix(af3), Projective3f(af3).matrix());
+  VERIFY_MATRIX(glMultMatrix(af3), Projective3f(af3).matrix());
+  
+  Transform<float,3,Projective> pf3; pf3.matrix().setRandom();
+  VERIFY_MATRIX(glLoadMatrix(pf3), Projective3f(pf3).matrix());
+  VERIFY_MATRIX(glMultMatrix(pf3), Projective3f(pf3).matrix());
+  
+  Transform<double,3,AffineCompact> acd3; acd3.matrix().setRandom();
+  VERIFY_MATRIX(glLoadMatrix(acd3), Projective3d(acd3).matrix());
+  VERIFY_MATRIX(glMultMatrix(acd3), Projective3d(acd3).matrix());
+  
+  Transform<double,3,Affine> ad3(acd3);
+  VERIFY_MATRIX(glLoadMatrix(ad3), Projective3d(ad3).matrix());
+  VERIFY_MATRIX(glMultMatrix(ad3), Projective3d(ad3).matrix());
+  
+  Transform<double,3,Projective> pd3; pd3.matrix().setRandom();
+  VERIFY_MATRIX(glLoadMatrix(pd3), Projective3d(pd3).matrix());
+  VERIFY_MATRIX(glMultMatrix(pd3), Projective3d(pd3).matrix());
+  
+  // translations (2D and 3D); a 2D vector is compared against the 3D translation with z = 0
+  {
+    Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 0;
+    VERIFY_MATRIX(glTranslate(vf2), Projective3f(Translation3f(vf23)).matrix());
+    Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 0;
+    VERIFY_MATRIX(glTranslate(vd2), Projective3d(Translation3d(vd23)).matrix());
+    
+    Vector3f vf3; vf3.setRandom();
+    VERIFY_MATRIX(glTranslate(vf3), Projective3f(Translation3f(vf3)).matrix());
+    Vector3d vd3; vd3.setRandom();
+    VERIFY_MATRIX(glTranslate(vd3), Projective3d(Translation3d(vd3)).matrix());
+    
+    Translation<float,3> tf3; tf3.vector().setRandom();
+    VERIFY_MATRIX(glTranslate(tf3), Projective3f(tf3).matrix());
+    
+    Translation<double,3> td3;  td3.vector().setRandom();
+    VERIFY_MATRIX(glTranslate(td3), Projective3d(td3).matrix());
+  }
+  
+  // scaling (2D and 3D); a 2D vector is compared against the 3D scaling with z = 1
+  {
+    Vector2f vf2; vf2.setRandom(); Vector3f vf23; vf23 << vf2, 1;
+    VERIFY_MATRIX(glScale(vf2), Projective3f(Scaling(vf23)).matrix());
+    Vector2d vd2; vd2.setRandom(); Vector3d vd23; vd23 << vd2, 1;
+    VERIFY_MATRIX(glScale(vd2), Projective3d(Scaling(vd23)).matrix());
+    
+    Vector3f vf3; vf3.setRandom();
+    VERIFY_MATRIX(glScale(vf3), Projective3f(Scaling(vf3)).matrix());
+    Vector3d vd3; vd3.setRandom();
+    VERIFY_MATRIX(glScale(vd3), Projective3d(Scaling(vd3)).matrix());
+    
+    UniformScaling<float> usf(internal::random<float>());
+    VERIFY_MATRIX(glScale(usf), Projective3f(usf).matrix());
+    
+    UniformScaling<double> usd(internal::random<double>());
+    VERIFY_MATRIX(glScale(usd), Projective3d(usd).matrix());
+  }
+  
+  // uniform: each section is guarded at run time (GLEW_*) and at compile time (GL_* macro)
+  {
+    const char* vtx = "void main(void) { gl_Position = gl_Vertex; }\n";
+    
+    if(GLEW_VERSION_2_0)
+    {
+      #ifdef GL_VERSION_2_0
+      const char* frg = ""
+        "uniform vec2 v2f;\n"
+        "uniform vec3 v3f;\n"
+        "uniform vec4 v4f;\n"
+        "uniform ivec2 v2i;\n"
+        "uniform ivec3 v3i;\n"
+        "uniform ivec4 v4i;\n"
+        "uniform mat2 m2f;\n"
+        "uniform mat3 m3f;\n"
+        "uniform mat4 m4f;\n"
+        "void main(void) { gl_FragColor = vec4(v2f[0]+v3f[0]+v4f[0])+vec4(v2i[0]+v3i[0]+v4i[0])+vec4(m2f[0][0]+m3f[0][0]+m4f[0][0]); }\n";
+        
+      GLint prg_id = createShader(vtx,frg);
+      
+      VERIFY_UNIFORM(fv,v2f, Vector2f);
+      VERIFY_UNIFORM(fv,v3f, Vector3f);
+      VERIFY_UNIFORM(fv,v4f, Vector4f);
+      VERIFY_UNIFORMi(v2i, Vector2i);
+      VERIFY_UNIFORMi(v3i, Vector3i);
+      VERIFY_UNIFORMi(v4i, Vector4i);
+      VERIFY_UNIFORM(fv,m2f, Matrix2f);
+      VERIFY_UNIFORM(fv,m3f, Matrix3f);
+      VERIFY_UNIFORM(fv,m4f, Matrix4f);
+      #endif
+    }
+    else
+      std::cerr << "Warning: opengl 2.0 was not tested\n";
+    
+    if(GLEW_VERSION_2_1)
+    {
+      #ifdef GL_VERSION_2_1
+      const char* frg = "#version 120\n"
+        "uniform mat2x3 m23f;\n"
+        "uniform mat3x2 m32f;\n"
+        "uniform mat2x4 m24f;\n"
+        "uniform mat4x2 m42f;\n"
+        "uniform mat3x4 m34f;\n"
+        "uniform mat4x3 m43f;\n"
+        "void main(void) { gl_FragColor = vec4(m23f[0][0]+m32f[0][0]+m24f[0][0]+m42f[0][0]+m34f[0][0]+m43f[0][0]); }\n";
+        
+      GLint prg_id = createShader(vtx,frg);
+      
+      typedef Matrix<float,2,3> Matrix23f;
+      typedef Matrix<float,3,2> Matrix32f;
+      typedef Matrix<float,2,4> Matrix24f;
+      typedef Matrix<float,4,2> Matrix42f;
+      typedef Matrix<float,3,4> Matrix34f;
+      typedef Matrix<float,4,3> Matrix43f;
+      
+      VERIFY_UNIFORM(fv,m23f, Matrix23f);
+      VERIFY_UNIFORM(fv,m32f, Matrix32f);
+      VERIFY_UNIFORM(fv,m24f, Matrix24f);
+      VERIFY_UNIFORM(fv,m42f, Matrix42f);
+      VERIFY_UNIFORM(fv,m34f, Matrix34f);
+      VERIFY_UNIFORM(fv,m43f, Matrix43f);
+      #endif
+    }
+    else
+      std::cerr << "Warning: opengl 2.1 was not tested\n";
+    
+    if(GLEW_VERSION_3_0)
+    {
+      #ifdef GL_VERSION_3_0
+      const char* frg = "#version 150\n"
+        "uniform uvec2 v2ui;\n"
+        "uniform uvec3 v3ui;\n"
+        "uniform uvec4 v4ui;\n"
+        "out vec4 data;\n"
+        "void main(void) { data = vec4(v2ui[0]+v3ui[0]+v4ui[0]); }\n";
+        
+      GLint prg_id = createShader(vtx,frg);
+      
+      typedef Matrix<unsigned int,2,1> Vector2ui;
+      typedef Matrix<unsigned int,3,1> Vector3ui;
+      typedef Matrix<unsigned int,4,1> Vector4ui;
+      
+      VERIFY_UNIFORMi(v2ui, Vector2ui);
+      VERIFY_UNIFORMi(v3ui, Vector3ui);
+      VERIFY_UNIFORMi(v4ui, Vector4ui);
+      #endif
+    }
+    else
+      std::cerr << "Warning: opengl 3.0 was not tested\n";
+    
+    #ifdef GLEW_ARB_gpu_shader_fp64
+    if(GLEW_ARB_gpu_shader_fp64)
+    {
+      #ifdef GL_ARB_gpu_shader_fp64
+      const char* frg = "#version 150\n"
+        "uniform dvec2 v2d;\n"
+        "uniform dvec3 v3d;\n"
+        "uniform dvec4 v4d;\n"
+        "out vec4 data;\n"
+        "void main(void) { data = vec4(v2d[0]+v3d[0]+v4d[0]); }\n";
+        
+      GLint prg_id = createShader(vtx,frg);
+      
+      VERIFY_UNIFORM(dv,v2d, Vector2d);
+      VERIFY_UNIFORM(dv,v3d, Vector3d);
+      VERIFY_UNIFORM(dv,v4d, Vector4d);
+      #endif
+    }
+    else
+      std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n";
+    #else
+      std::cerr << "Warning: GLEW_ARB_gpu_shader_fp64 was not tested\n";
+    #endif
+  }
+  
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/polynomialsolver.cpp b/uppsrc/plugin/Eigen/unsupported/test/polynomialsolver.cpp
new file mode 100644
index 000000000..db8ad7dda
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/polynomialsolver.cpp
@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/Polynomials>
+#include <iostream>
+#include <algorithm>
+
+using namespace std;
+
+namespace Eigen {
+namespace internal {
+template<int Size>  // Size: a compile-time length, or Dynamic
+struct increment_if_fixed_size  // compile-time helper: ret is Size+1, except that Dynamic stays Dynamic
+{
+  enum {
+    ret = (Size == Dynamic) ? Dynamic : Size+1
+  };
+};
+} // namespace internal
+} // namespace Eigen
+
+template<typename PolynomialType>
+PolynomialType polyder(const PolynomialType& p)  // derivative of p, returned with the same number of coefficients as p
+{
+  typedef typename PolynomialType::Scalar Scalar;
+  PolynomialType res(p.size());
+  for(Index i=1; i<p.size(); ++i)
+    res[i-1] = p[i]*Scalar(i);  // d/dx (p[i]*x^i) = i*p[i]*x^(i-1): index i holds the coefficient of x^i
+  res[p.size()-1] = 0.;  // the derivative has one coefficient fewer; zero the unused top slot
+  return res;
+}
+
+template<int Deg, typename POLYNOMIAL, typename SOLVER>
+bool aux_evalSolver( const POLYNOMIAL& pols, SOLVER& psolve )  // checks psolve's roots of pols; returns true when no two root moduli are approximately equal
+{
+  typedef typename POLYNOMIAL::Scalar Scalar;
+  typedef typename POLYNOMIAL::RealScalar RealScalar;
+
+  typedef typename SOLVER::RootsType    RootsType;
+  typedef Matrix<RealScalar,Deg,1>      EvalRootsType;
+
+  const Index deg = pols.size()-1;
+
+  // Test template constructor from coefficient vector
+  SOLVER solve_constr (pols);
+
+  psolve.compute( pols );
+  const RootsType& roots( psolve.roots() );
+  EvalRootsType evr( deg );  // absolute value of the polynomial at each root
+  POLYNOMIAL pols_der = polyder(pols);
+  EvalRootsType der( deg );  // absolute value of the derivative at each root, clamped below by 1
+  for( int i=0; i<roots.size(); ++i ){
+    evr[i] = std::abs( poly_eval( pols, roots[i] ) );
+    der[i] = numext::maxi(RealScalar(1.), std::abs( poly_eval( pols_der, roots[i] ) ));
+  }
+
+  // We need to divide by the magnitude of the derivative because,
+  // with a high derivative, a very small error in the value of the root
+  // yields a very large error in the polynomial evaluation.
+  bool evalToZero = (evr.cwiseQuotient(der)).isZero( test_precision<Scalar>() );
+  if( !evalToZero )
+  {
+    cerr << "WRONG root: " << endl;
+    cerr << "Polynomial: " << pols.transpose() << endl;
+    cerr << "Roots found: " << roots.transpose() << endl;
+    cerr << "Abs value of the polynomial at the roots: " << evr.transpose() << endl;
+    cerr << endl;
+  }
+
+  std::vector<RealScalar> rootModuli( roots.size() );
+  Map< EvalRootsType > aux( &rootModuli[0], roots.size() );  // Eigen view writing straight into the std::vector storage
+  aux = roots.array().abs();
+  std::sort( rootModuli.begin(), rootModuli.end() );
+  bool distinctModuli=true;
+  for( size_t i=1; i<rootModuli.size() && distinctModuli; ++i )  // once sorted, only neighbours need comparing
+  {
+    if( internal::isApprox( rootModuli[i], rootModuli[i-1] ) ){
+      distinctModuli = false; }
+  }
+  VERIFY( evalToZero || !distinctModuli );  // a failed residual check is tolerated only when some moduli coincide
+
+  return distinctModuli;
+}
+
+
+
+
+
+
+
+template<int Deg, typename POLYNOMIAL>
+void evalSolver( const POLYNOMIAL& pols )  // runs aux_evalSolver on pols with a default-constructed PolynomialSolver
+{
+  typedef typename POLYNOMIAL::Scalar Scalar;
+
+  typedef PolynomialSolver<Scalar, Deg > PolynomialSolverType;
+
+  PolynomialSolverType psolve;
+  aux_evalSolver<Deg, POLYNOMIAL, PolynomialSolverType>( pols, psolve );  // the returned "distinct moduli" flag is ignored here
+}
+
+
+
+
+template< int Deg, typename POLYNOMIAL, typename ROOTS, typename REAL_ROOTS >  // roots: all expected roots; real_roots: the expected real ones
+void evalSolverSugarFunction( const POLYNOMIAL& pols, const ROOTS& roots, const REAL_ROOTS& real_roots )  // checks the solver's convenience accessors against the expected roots
+{
+  using std::sqrt;
+  typedef typename POLYNOMIAL::Scalar Scalar;
+  typedef typename POLYNOMIAL::RealScalar RealScalar;
+
+  typedef PolynomialSolver<Scalar, Deg >              PolynomialSolverType;
+
+  PolynomialSolverType psolve;
+  if( aux_evalSolver<Deg, POLYNOMIAL, PolynomialSolverType>( pols, psolve ) )  // the accessor checks only run when the root moduli are distinct
+  {
+    //It is supposed that
+    // 1) the roots found are correct
+    // 2) the roots have distinct moduli
+
+    //Test realRoots
+    std::vector< RealScalar > calc_realRoots;
+    psolve.realRoots( calc_realRoots,  test_precision<RealScalar>());
+    VERIFY_IS_EQUAL( calc_realRoots.size() , (size_t)real_roots.size() );
+
+    const RealScalar psPrec = sqrt( test_precision<RealScalar>() );  // looser tolerance used for all comparisons below
+
+    for( size_t i=0; i<calc_realRoots.size(); ++i )
+    {
+      bool found = false;
+      for( size_t j=0; j<(size_t)real_roots.size() && !found; ++j )  // bounded by the container being indexed
+      {
+        if( internal::isApprox( calc_realRoots[i], real_roots[j], psPrec ) ){
+          found = true; }
+      }
+      VERIFY( found );  // every computed real root must match some expected real root
+    }
+
+    //Test greatestRoot
+    VERIFY( internal::isApprox( roots.array().abs().maxCoeff(),
+          abs( psolve.greatestRoot() ), psPrec ) );
+
+    //Test smallestRoot
+    VERIFY( internal::isApprox( roots.array().abs().minCoeff(),
+          abs( psolve.smallestRoot() ), psPrec ) );
+
+    bool hasRealRoot;  // output flag filled in by each *RealRoot accessor below
+    //Test absGreatestRealRoot
+    RealScalar r = psolve.absGreatestRealRoot( hasRealRoot );
+    VERIFY( hasRealRoot == (real_roots.size() > 0 ) );
+    if( hasRealRoot ){
+      VERIFY( internal::isApprox( real_roots.array().abs().maxCoeff(), abs(r), psPrec ) );  }
+
+    //Test absSmallestRealRoot
+    r = psolve.absSmallestRealRoot( hasRealRoot );
+    VERIFY( hasRealRoot == (real_roots.size() > 0 ) );
+    if( hasRealRoot ){
+      VERIFY( internal::isApprox( real_roots.array().abs().minCoeff(), abs( r ), psPrec ) ); }
+
+    //Test greatestRealRoot
+    r = psolve.greatestRealRoot( hasRealRoot );
+    VERIFY( hasRealRoot == (real_roots.size() > 0 ) );
+    if( hasRealRoot ){
+      VERIFY( internal::isApprox( real_roots.array().maxCoeff(), r, psPrec ) ); }
+
+    //Test smallestRealRoot
+    r = psolve.smallestRealRoot( hasRealRoot );
+    VERIFY( hasRealRoot == (real_roots.size() > 0 ) );
+    if( hasRealRoot ){
+      VERIFY( internal::isApprox( real_roots.array().minCoeff(), r, psPrec ) ); }
+  }
+}
+
+
+template<typename _Scalar, int _Deg>  // _Deg: compile-time degree, or Dynamic
+void polynomialsolver(int deg)  // deg: run-time degree; exercises the solver on three kinds of polynomial
+{
+  typedef typename NumTraits<_Scalar>::Real RealScalar;
+  typedef internal::increment_if_fixed_size<_Deg>     Dim;
+  typedef Matrix<_Scalar,Dim::ret,1>                  PolynomialType;  // deg+1 coefficients
+  typedef Matrix<_Scalar,_Deg,1>                      EvalRootsType;
+  typedef Matrix<RealScalar,_Deg,1>                   RealRootsType;
+
+  cout << "Standard cases" << endl;  // random coefficients
+  PolynomialType pols = PolynomialType::Random(deg+1);
+  evalSolver<_Deg,PolynomialType>( pols );
+
+  cout << "Hard cases" << endl;  // all deg roots set to the same random value
+  _Scalar multipleRoot = internal::random<_Scalar>();
+  EvalRootsType allRoots = EvalRootsType::Constant(deg,multipleRoot);
+  roots_to_monicPolynomial( allRoots, pols );
+  evalSolver<_Deg,PolynomialType>( pols );
+
+  cout << "Test sugar" << endl;  // random real roots, checked through the convenience accessors
+  RealRootsType realRoots = RealRootsType::Random(deg);
+  roots_to_monicPolynomial( realRoots, pols );
+  evalSolverSugarFunction<_Deg>(
+      pols,
+      realRoots.template cast <std::complex<RealScalar> >().eval(),
+      realRoots );
+}
+
+void test_polynomialsolver()  // test entry point: repeats every subtest g_repeat times
+{
+  for(int i = 0; i < g_repeat; i++)
+  {
+    CALL_SUBTEST_1( (polynomialsolver<float,1>(1)) );  // fixed-size degrees 1 to 8
+    CALL_SUBTEST_2( (polynomialsolver<double,2>(2)) );
+    CALL_SUBTEST_3( (polynomialsolver<double,3>(3)) );
+    CALL_SUBTEST_4( (polynomialsolver<float,4>(4)) );
+    CALL_SUBTEST_5( (polynomialsolver<double,5>(5)) );
+    CALL_SUBTEST_6( (polynomialsolver<float,6>(6)) );
+    CALL_SUBTEST_7( (polynomialsolver<float,7>(7)) );
+    CALL_SUBTEST_8( (polynomialsolver<double,8>(8)) );
+
+    CALL_SUBTEST_9( (polynomialsolver<float,Dynamic>(  // dynamic size, random degree in [9,13]
+            internal::random<int>(9,13)
+            )) );
+    CALL_SUBTEST_10((polynomialsolver<double,Dynamic>(
+            internal::random<int>(9,13)
+            )) );
+    CALL_SUBTEST_11((polynomialsolver<float,Dynamic>(1)) );  // dynamic size at degree 1
+    CALL_SUBTEST_12((polynomialsolver<std::complex<double>,Dynamic>(internal::random<int>(2,13))) );  // complex coefficients
+  }
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/polynomialutils.cpp b/uppsrc/plugin/Eigen/unsupported/test/polynomialutils.cpp
new file mode 100644
index 000000000..5fc968402
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/polynomialutils.cpp
@@ -0,0 +1,113 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Manuel Yguel <manuel.yguel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include <unsupported/Eigen/Polynomials>
+#include <iostream>
+
+using namespace std;
+
+namespace Eigen {
+namespace internal {
+template<int Size>  // Size: a compile-time length, or Dynamic
+struct increment_if_fixed_size  // compile-time helper: ret is Size+1, except that Dynamic stays Dynamic
+{
+  enum {
+    ret = (Size == Dynamic) ? Dynamic : Size+1
+  };
+};
+} // namespace internal
+} // namespace Eigen
+
+template<typename _Scalar, int _Deg>  // _Deg: compile-time degree, or Dynamic
+void realRoots_to_monicPolynomial_test(int deg)  // builds a polynomial from random roots and checks it vanishes at each of them
+{
+  typedef internal::increment_if_fixed_size<_Deg>            Dim;
+  typedef Matrix<_Scalar,Dim::ret,1>                  PolynomialType;
+  typedef Matrix<_Scalar,_Deg,1>                      EvalRootsType;
+
+  PolynomialType pols(deg+1);
+  EvalRootsType roots = EvalRootsType::Random(deg);
+  roots_to_monicPolynomial( roots, pols );
+
+  EvalRootsType evr( deg );  // |pols(root)| for every root
+  for( int i=0; i<roots.size(); ++i ){
+    evr[i] = std::abs( poly_eval( pols, roots[i] ) ); }
+
+  bool evalToZero = evr.isZero( test_precision<_Scalar>() );
+  if( !evalToZero ){
+    cerr << evr.transpose() << endl; }  // print the residuals before failing
+  VERIFY( evalToZero );
+}
+
+template<typename _Scalar> void realRoots_to_monicPolynomial_scalar()  // runs realRoots_to_monicPolynomial_test for one scalar type over several degrees
+{
+  CALL_SUBTEST_2( (realRoots_to_monicPolynomial_test<_Scalar,2>(2)) );  // fixed-size degrees
+  CALL_SUBTEST_3( (realRoots_to_monicPolynomial_test<_Scalar,3>(3)) );
+  CALL_SUBTEST_4( (realRoots_to_monicPolynomial_test<_Scalar,4>(4)) );
+  CALL_SUBTEST_5( (realRoots_to_monicPolynomial_test<_Scalar,5>(5)) );
+  CALL_SUBTEST_6( (realRoots_to_monicPolynomial_test<_Scalar,6>(6)) );
+  CALL_SUBTEST_7( (realRoots_to_monicPolynomial_test<_Scalar,7>(7)) );
+  CALL_SUBTEST_8( (realRoots_to_monicPolynomial_test<_Scalar,17>(17)) );
+
+  CALL_SUBTEST_9( (realRoots_to_monicPolynomial_test<_Scalar,Dynamic>(  // dynamic size, random degree in [18,26]
+          internal::random<int>(18,26) )) );
+}
+
+
+
+
+template<typename _Scalar, int _Deg>  // _Deg: compile-time degree, or Dynamic
+void CauchyBounds(int deg)  // checks that the Cauchy bounds enclose the moduli of randomly chosen roots
+{
+  typedef internal::increment_if_fixed_size<_Deg>            Dim;
+  typedef Matrix<_Scalar,Dim::ret,1>                  PolynomialType;
+  typedef Matrix<_Scalar,_Deg,1>                      EvalRootsType;
+
+  PolynomialType pols(deg+1);
+  EvalRootsType roots = EvalRootsType::Random(deg);
+  roots_to_monicPolynomial( roots, pols );
+  _Scalar M = cauchy_max_bound( pols );
+  _Scalar m = cauchy_min_bound( pols );
+  _Scalar Max = roots.array().abs().maxCoeff();  // largest root modulus
+  _Scalar min = roots.array().abs().minCoeff();  // smallest root modulus
+  bool eval = (M >= Max) && (m <= min);  // require m <= min and Max <= M
+  if( !eval )
+  {
+    cerr << "Roots: " << roots << endl;
+    cerr << "Bounds: (" << m << ", " << M << ")" << endl;
+    cerr << "Min,Max: (" << min << ", " << Max << ")" << endl;
+  }
+  VERIFY( eval );
+}
+
+template<typename _Scalar> void CauchyBounds_scalar()  // runs CauchyBounds for one scalar type over several degrees
+{
+  CALL_SUBTEST_2( (CauchyBounds<_Scalar,2>(2)) );  // fixed-size degrees
+  CALL_SUBTEST_3( (CauchyBounds<_Scalar,3>(3)) );
+  CALL_SUBTEST_4( (CauchyBounds<_Scalar,4>(4)) );
+  CALL_SUBTEST_5( (CauchyBounds<_Scalar,5>(5)) );
+  CALL_SUBTEST_6( (CauchyBounds<_Scalar,6>(6)) );
+  CALL_SUBTEST_7( (CauchyBounds<_Scalar,7>(7)) );
+  CALL_SUBTEST_8( (CauchyBounds<_Scalar,17>(17)) );
+
+  CALL_SUBTEST_9( (CauchyBounds<_Scalar,Dynamic>(  // dynamic size, random degree in [18,26]
+          internal::random<int>(18,26) )) );
+}
+
+void test_polynomialutils()  // test entry point: repeats both test families g_repeat times
+{
+  for(int i = 0; i < g_repeat; i++)
+  {
+    realRoots_to_monicPolynomial_scalar<double>();
+    realRoots_to_monicPolynomial_scalar<float>();
+    CauchyBounds_scalar<double>();
+    CauchyBounds_scalar<float>();
+  }
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/sparse_extra.cpp b/uppsrc/plugin/Eigen/unsupported/test/sparse_extra.cpp
new file mode 100644
index 000000000..7a049c870
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/sparse_extra.cpp
@@ -0,0 +1,147 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+
+// import basic and product tests for deprecated DynamicSparseMatrix
+#define EIGEN_NO_DEPRECATED_WARNING
+#include "sparse_product.cpp"
+#include "sparse_basic.cpp"
+#include <Eigen/SparseExtra>
+
+template<typename SetterType,typename DenseType, typename Scalar, int Options>  // SetterType: setter class constructed on sm
+bool test_random_setter(SparseMatrix<Scalar,Options>& sm, const DenseType& ref, const std::vector<Vector2i>& nonzeroCoords)  // refills sm from ref in random order; returns sm.isApprox(ref)
+{
+  {  // inner scope: w is destroyed before sm is compared (NOTE(review): presumably the setter commits to sm on destruction -- confirm against RandomSetter docs)
+    sm.setZero();
+    SetterType w(sm);
+    std::vector<Vector2i> remaining = nonzeroCoords;
+    while(!remaining.empty())
+    {
+      int i = internal::random<int>(0,static_cast<int>(remaining.size())-1);  // pick a random coordinate not yet written
+      w(remaining[i].x(),remaining[i].y()) = ref.coeff(remaining[i].x(),remaining[i].y());
+      remaining[i] = remaining.back();  // remove entry i by overwriting it with the last one, then popping
+      remaining.pop_back();
+    }
+  }
+  return sm.isApprox(ref);
+}
+
+template<typename SetterType,typename DenseType, typename T>  // SetterType is not used by this overload
+bool test_random_setter(DynamicSparseMatrix<T>& sm, const DenseType& ref, const std::vector<Vector2i>& nonzeroCoords)  // refills sm from ref in random order via coeffRef; returns sm.isApprox(ref)
+{
+  sm.setZero();
+  std::vector<Vector2i> remaining = nonzeroCoords;
+  while(!remaining.empty())
+  {
+    int i = internal::random<int>(0,static_cast<int>(remaining.size())-1);  // pick a random coordinate not yet written
+    sm.coeffRef(remaining[i].x(),remaining[i].y()) = ref.coeff(remaining[i].x(),remaining[i].y());
+    remaining[i] = remaining.back();  // remove entry i by overwriting it with the last one, then popping
+    remaining.pop_back();
+  }
+  return sm.isApprox(ref);
+}
+
+template<typename SparseMatrixType> void sparse_extra(const SparseMatrixType& ref)  // ref only supplies the dimensions and the matrix type under test
+{
+  const Index rows = ref.rows();
+  const Index cols = ref.cols();
+  typedef typename SparseMatrixType::Scalar Scalar;
+  enum { Flags = SparseMatrixType::Flags };
+
+  double density = (std::max)(8./(rows*cols), 0.01);  // density passed to initSparse, never below 0.01
+  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
+  typedef Matrix<Scalar,Dynamic,1> DenseVector;
+  Scalar eps = 1e-6;
+
+  SparseMatrixType m(rows, cols);
+  DenseMatrix refMat = DenseMatrix::Zero(rows, cols);  // dense reference kept in step with m
+  DenseVector vec1 = DenseVector::Random(rows);  // not read again in this function
+
+  std::vector<Vector2i> zeroCoords;
+  std::vector<Vector2i> nonzeroCoords;
+  initSparse<Scalar>(density, refMat, m, 0, &zeroCoords, &nonzeroCoords);
+
+  if (zeroCoords.size()==0 || nonzeroCoords.size()==0)  // the checks below need at least one zero and one nonzero entry
+    return;
+
+  // test coeff and coeffRef
+  for (int i=0; i<(int)zeroCoords.size(); ++i)
+  {
+    VERIFY_IS_MUCH_SMALLER_THAN( m.coeff(zeroCoords[i].x(),zeroCoords[i].y()), eps );
+    if(internal::is_same<SparseMatrixType,SparseMatrix<Scalar,Flags> >::value)
+      VERIFY_RAISES_ASSERT( m.coeffRef(zeroCoords[0].x(),zeroCoords[0].y()) = 5 );  // NOTE(review): always uses zeroCoords[0], not [i] -- confirm this is intended
+  }
+  VERIFY_IS_APPROX(m, refMat);
+
+  m.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);  // overwrite one stored entry in both matrices
+  refMat.coeffRef(nonzeroCoords[0].x(), nonzeroCoords[0].y()) = Scalar(5);
+
+  VERIFY_IS_APPROX(m, refMat);
+
+  // random setter
+//   {
+//     m.setZero();
+//     VERIFY_IS_NOT_APPROX(m, refMat);
+//     SparseSetter<SparseMatrixType, RandomAccessPattern> w(m);
+//     std::vector<Vector2i> remaining = nonzeroCoords;
+//     while(!remaining.empty())
+//     {
+//       int i = internal::random<int>(0,remaining.size()-1);
+//       w->coeffRef(remaining[i].x(),remaining[i].y()) = refMat.coeff(remaining[i].x(),remaining[i].y());
+//       remaining[i] = remaining.back();
+//       remaining.pop_back();
+//     }
+//   }
+//   VERIFY_IS_APPROX(m, refMat);
+
+    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdMapTraits> >(m,refMat,nonzeroCoords) ));  // std::map-backed setter is always tested
+    #ifdef EIGEN_UNORDERED_MAP_SUPPORT
+    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, StdUnorderedMapTraits> >(m,refMat,nonzeroCoords) ));
+    #endif
+    #ifdef _DENSE_HASH_MAP_H_
+    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleDenseHashMapTraits> >(m,refMat,nonzeroCoords) ));
+    #endif
+    #ifdef _SPARSE_HASH_MAP_H_
+    VERIFY(( test_random_setter<RandomSetter<SparseMatrixType, GoogleSparseHashMapTraits> >(m,refMat,nonzeroCoords) ));
+    #endif
+
+
+  // test RandomSetter
+  /*{
+    SparseMatrixType m1(rows,cols), m2(rows,cols);
+    DenseMatrix refM1 = DenseMatrix::Zero(rows, rows);
+    initSparse<Scalar>(density, refM1, m1);
+    {
+      Eigen::RandomSetter<SparseMatrixType > setter(m2);
+      for (int j=0; j<m1.outerSize(); ++j)
+        for (typename SparseMatrixType::InnerIterator i(m1,j); i; ++i)
+          setter(i.index(), j) = i.value();
+    }
+    VERIFY_IS_APPROX(m1, m2);
+  }*/
+
+
+}
+
+void test_sparse_extra()  // test entry point: repeats every subtest g_repeat times
+{
+  for(int i = 0; i < g_repeat; i++) {
+    int s = Eigen::internal::random<int>(1,50);  // random square size shared by the subtests below
+    CALL_SUBTEST_1( sparse_extra(SparseMatrix<double>(8, 8)) );
+    CALL_SUBTEST_2( sparse_extra(SparseMatrix<std::complex<double> >(s, s)) );
+    CALL_SUBTEST_1( sparse_extra(SparseMatrix<double>(s, s)) );
+
+    CALL_SUBTEST_3( sparse_extra(DynamicSparseMatrix<double>(s, s)) );
+//    CALL_SUBTEST_3(( sparse_basic(DynamicSparseMatrix<double>(s, s)) ));
+//    CALL_SUBTEST_3(( sparse_basic(DynamicSparseMatrix<double,ColMajor,long int>(s, s)) ));
+
+    CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, ColMajor> >()) );  // sparse_product comes from the included sparse_product.cpp
+    CALL_SUBTEST_3( (sparse_product<DynamicSparseMatrix<float, RowMajor> >()) );
+  }
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/special_functions.cpp b/uppsrc/plugin/Eigen/unsupported/test/special_functions.cpp
new file mode 100644
index 000000000..057fb3e92
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/special_functions.cpp
@@ -0,0 +1,345 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+#include "../Eigen/SpecialFunctions"
+
+template<typename X, typename Y>
+void verify_component_wise(const X& x, const Y& y)  // compares x to the reference y entry by entry, with special handling of NaN and infinities
+{
+  for(Index i=0; i<x.size(); ++i)
+  {
+    if((numext::isfinite)(y(i)))
+      VERIFY_IS_APPROX( x(i), y(i) );  // finite reference: approximate comparison
+    else if((numext::isnan)(y(i)))
+      VERIFY((numext::isnan)(x(i)));  // NaN reference: x must be NaN too
+    else
+      VERIFY_IS_EQUAL( x(i), y(i) );  // infinite reference: must match exactly
+  }
+}
+
+template<typename ArrayType> void array_special_functions()  // checks the array special functions against identities and tabulated reference values
+{
+  using std::abs;
+  using std::sqrt;
+  typedef typename ArrayType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  Scalar plusinf = std::numeric_limits<Scalar>::infinity();
+  Scalar nan = std::numeric_limits<Scalar>::quiet_NaN();
+
+  Index rows = internal::random<Index>(1,30);
+  Index cols = 1;
+
+  // API
+  {
+    ArrayType m1 = ArrayType::Random(rows,cols);
+#if EIGEN_HAS_C99_MATH
+    VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1));  // member and free-function forms must agree
+    VERIFY_IS_APPROX(m1.digamma(), digamma(m1));
+    VERIFY_IS_APPROX(m1.erf(), erf(m1));
+    VERIFY_IS_APPROX(m1.erfc(), erfc(m1));
+#endif  // EIGEN_HAS_C99_MATH
+  }
+
+
+#if EIGEN_HAS_C99_MATH
+  // check special functions (comparing against numpy implementation)
+  if (!NumTraits<Scalar>::IsComplex)
+  {
+
+    {
+      ArrayType m1 = ArrayType::Random(rows,cols);
+      ArrayType m2 = ArrayType::Random(rows,cols);
+
+      // Test various properties of igamma & igammac.  These are normalized
+      // gamma integrals where
+      //   igammac(a, x) = Gamma(a, x) / Gamma(a)
+      //   igamma(a, x) = gamma(a, x) / Gamma(a)
+      // where Gamma and gamma are considered the standard unnormalized
+      // upper and lower incomplete gamma functions, respectively.
+      ArrayType a = m1.abs() + 2;
+      ArrayType x = m2.abs() + 2;
+      ArrayType zero = ArrayType::Zero(rows, cols);
+      ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0));
+      ArrayType a_m1 = a - one;
+      ArrayType Gamma_a_x = Eigen::igammac(a, x) * a.lgamma().exp();
+      ArrayType Gamma_a_m1_x = Eigen::igammac(a_m1, x) * a_m1.lgamma().exp();
+      ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp();
+      ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp();
+
+      // Gamma(a, 0) == Gamma(a)
+      VERIFY_IS_APPROX(Eigen::igammac(a, zero), one);
+
+      // Gamma(a, x) + gamma(a, x) == Gamma(a)
+      VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp());
+
+      // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x)
+      VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp());
+
+      // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x)
+      VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp());
+    }
+
+    {
+      // Check exact values of igamma and igammac against a third party calculation.
+      Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+      Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)};
+
+      // location i*6+j corresponds to a_s[i], x_s[j].
+      Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan},
+                              {0.0, 0.6321205588285578, 0.7768698398515702,
+                              0.9816843611112658, 9.999500016666262e-05, 1.0},
+                              {0.0, 0.4275932955291202, 0.608374823728911,
+                              0.9539882943107686, 7.522076445089201e-07, 1.0},
+                              {0.0, 0.01898815687615381, 0.06564245437845008,
+                              0.5665298796332909, 4.166333347221828e-18, 1.0},
+                              {0.0, 0.9999780593618628, 0.9999899967080838,
+                              0.9999996219837988, 0.9991370418689945, 1.0},
+                              {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}};
+      Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan},
+                              {1.0, 0.36787944117144233, 0.22313016014842982,
+                                0.018315638888734182, 0.9999000049998333, 0.0},
+                              {1.0, 0.5724067044708798, 0.3916251762710878,
+                                0.04601170568923136, 0.9999992477923555, 0.0},
+                              {1.0, 0.9810118431238462, 0.9343575456215499,
+                                0.4334701203667089, 1.0, 0.0},
+                              {1.0, 2.1940638138146658e-05, 1.0003291916285e-05,
+                                3.7801620118431334e-07, 0.0008629581310054535,
+                                0.0},
+                              {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}};
+      for (int i = 0; i < 6; ++i) {
+        for (int j = 0; j < 6; ++j) {
+          if ((std::isnan)(igamma_s[i][j])) {  // NaN never compares approximately equal, so NaN references are tested separately
+            VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j])));
+          } else {
+            VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]);
+          }
+
+          if ((std::isnan)(igammac_s[i][j])) {
+            VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j])));
+          } else {
+            VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]);
+          }
+        }
+      }
+    }
+  }
+#endif  // EIGEN_HAS_C99_MATH
+
+  // Check the zeta function against scipy.special.zeta
+  {
+    ArrayType x(7), q(7), res(7), ref(7);
+    x << 1.5,   4, 10.5, 10000.5,    3, 1,        0.9;
+    q << 2,   1.5,    3,  1.0001, -2.5, 1.2345, 1.2345;
+    ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );  // sanity check: the reference must compare equal to itself
+    CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); );
+  }
+
+  // digamma
+  {
+    ArrayType x(7), res(7), ref(7);
+    x << 1, 1.5, 4, -10.5, 10000.5, 0, -1;
+    ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+
+    CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); );
+    CALL_SUBTEST( res = digamma(x);  verify_component_wise(res, ref); );
+  }
+
+
+#if EIGEN_HAS_C99_MATH
+  {  // polygamma(n, x) against tabulated reference values
+    ArrayType n(11), x(11), res(11), ref(11);
+    n << 1, 1,    1, 1.5,   17,   31,   28,    8, 42, 147, 170;
+    x << 2, 3, 25.5, 1.5,  4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64;
+    ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927;
+    CALL_SUBTEST( verify_component_wise(ref, ref); );
+
+    if(sizeof(RealScalar)>=8) {  // double
+      // Reason for commented line: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232
+      //       CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res, ref); );
+      CALL_SUBTEST( res = polygamma(n,x);  verify_component_wise(res, ref); );
+    }
+    else {  // narrower scalar: only the first 8 entries are compared
+      //       CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res.head(8), ref.head(8)); );
+      CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res.head(8), ref.head(8)); );
+    }
+  }
+#endif
+
+#if EIGEN_HAS_C99_MATH
+  {
+    // Inputs and ground truth generated with scipy via:
+    //   a = np.logspace(-3, 3, 5) - 1e-3
+    //   b = np.logspace(-3, 3, 5) - 1e-3
+    //   x = np.linspace(-0.1, 1.1, 5)
+    //   (full_a, full_b, full_x) = np.vectorize(lambda a, b, x: (a, b, x))(*np.ix_(a, b, x))
+    //   full_a = full_a.flatten().tolist()  # same for full_b, full_x
+    //   v = scipy.special.betainc(full_a, full_b, full_x).flatten().tolist()
+    //
+    // Note in Eigen, we call betainc with arguments in the order (x, a, b).
+    ArrayType a(125);
+    ArrayType b(125);
+    ArrayType x(125);
+    ArrayType v(125);
+    ArrayType res(125);
+
+    a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999,
+        999.999, 999.999, 999.999;
+
+    b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999,
+        0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999,
+        999.999, 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.999, 0.999, 0.999, 0.999,
+        0.999, 31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379,
+        0.03062277660168379, 0.03062277660168379, 0.03062277660168379,
+        0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999,
+        31.62177660168379, 31.62177660168379, 31.62177660168379,
+        31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999,
+        999.999, 999.999;
+
+    x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+        0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+        0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1,
+        -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8,
+        1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2,
+        0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1,
+        0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5,
+        0.8, 1.1;
+
+    v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+        nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
+        nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan,
+        0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan,
+        0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan,
+        0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan,
+        nan, nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256,
+        0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001,
+        0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403,
+        0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999,
+        0.9999999999999999, nan, nan, nan, nan, nan, nan, nan,
+        1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06,
+        nan, nan, 7.864342668429763e-23, 3.015969667594166e-10,
+        0.0008598571564165444, nan, nan, 6.031987710123844e-08,
+        0.5000000000000007, 0.9999999396801229, nan, nan, 0.9999999999999999,
+        0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan,
+        nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan,
+        0.0, 9.275871147869727e-302, 1.2232913026152827e-97, nan, nan, 0.0,
+        3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan,
+        2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan;
+
+    CALL_SUBTEST(res = betainc(a, b, x);
+                 verify_component_wise(res, v););
+  }
+
+  // Test various properties of betainc
+  {
+    ArrayType m1 = ArrayType::Random(32);
+    ArrayType m2 = ArrayType::Random(32);
+    ArrayType m3 = ArrayType::Random(32);
+    ArrayType one = ArrayType::Constant(32, Scalar(1.0));
+    const Scalar eps = std::numeric_limits<Scalar>::epsilon();
+    ArrayType a = (m1 * 4.0).exp();
+    ArrayType b = (m2 * 4.0).exp();
+    ArrayType x = m3.abs();
+
+    // betainc(a, 1, x) == x**a
+    CALL_SUBTEST(
+        ArrayType test = betainc(a, one, x);
+        ArrayType expected = x.pow(a);
+        verify_component_wise(test, expected););
+
+    // betainc(1, b, x) == 1 - (1 - x)**b
+    CALL_SUBTEST(
+        ArrayType test = betainc(one, b, x);
+        ArrayType expected = one - (one - x).pow(b);
+        verify_component_wise(test, expected););
+
+    // betainc(a, b, x) == 1 - betainc(b, a, 1-x)
+    CALL_SUBTEST(
+        ArrayType test = betainc(a, b, x) + betainc(b, a, one - x);
+        ArrayType expected = one;
+        verify_component_wise(test, expected););
+
+    // betainc(a+1, b, x) = betainc(a, b, x) - x**a * (1 - x)**b / (a * beta(a, b))
+    CALL_SUBTEST(
+        ArrayType num = x.pow(a) * (one - x).pow(b);
+        ArrayType denom = a * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp();
+        // Add eps to rhs and lhs so that component-wise test doesn't result in
+        // nans when both outputs are zeros.
+        ArrayType expected = betainc(a, b, x) - num / denom + eps;
+        ArrayType test = betainc(a + one, b, x) + eps;
+        if (sizeof(Scalar) >= 8) { // double
+          verify_component_wise(test, expected);
+        } else {
+          // Reason for limited test: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232
+          verify_component_wise(test.head(8), expected.head(8));
+        });
+
+    // betainc(a, b+1, x) = betainc(a, b, x) + x**a * (1 - x)**b / (b * beta(a, b))
+    CALL_SUBTEST(
+        // Add eps to rhs and lhs so that component-wise test doesn't result in
+        // nans when both outputs are zeros.
+        ArrayType num = x.pow(a) * (one - x).pow(b);
+        ArrayType denom = b * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp();
+        ArrayType expected = betainc(a, b, x) + num / denom + eps;
+        ArrayType test = betainc(a, b + one, x) + eps;
+        verify_component_wise(test, expected););
+  }
+#endif
+}
+
+void test_special_functions()  // test entry point: runs the checks once for float arrays and once for double arrays
+{
+  CALL_SUBTEST_1(array_special_functions<ArrayXf>());
+  CALL_SUBTEST_2(array_special_functions<ArrayXd>());
+}
diff --git a/uppsrc/plugin/Eigen/unsupported/test/splines.cpp b/uppsrc/plugin/Eigen/unsupported/test/splines.cpp
new file mode 100644
index 000000000..3be020434
--- /dev/null
+++ b/uppsrc/plugin/Eigen/unsupported/test/splines.cpp
@@ -0,0 +1,281 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010-2011 Hauke Heibel <heibel@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <unsupported/Eigen/Splines>
+
+namespace Eigen {
+  // Template arguments below are Spline<Scalar, Dim, Degree>.
+  // Let's do some explicit instantiations and thus
+  // force the compilation of all spline functions...
+  template class Spline<double, 2, Dynamic>;
+  template class Spline<double, 3, Dynamic>;
+  // double, Dim 2, compile-time degrees 2 through 5
+  template class Spline<double, 2, 2>;
+  template class Spline<double, 2, 3>;
+  template class Spline<double, 2, 4>;
+  template class Spline<double, 2, 5>;
+  // float, Dim 2 and 3, Dynamic degree
+  template class Spline<float, 2, Dynamic>;
+  template class Spline<float, 3, Dynamic>;
+  // float, Dim 3, compile-time degrees 2 through 5
+  template class Spline<float, 3, 2>;
+  template class Spline<float, 3, 3>;
+  template class Spline<float, 3, 4>;
+  template class Spline<float, 3, 5>;
+
+}
+
+Spline<double, 2, Dynamic> closed_spline2d() // builds the closed 2D reference spline from hard-coded data
+{
+  RowVectorXd knots(12); // 12 knots; the first and the last value each appear four times
+  knots << 0,
+    0,
+    0,
+    0,
+    0.867193179093898,
+    1.660330955342408,
+    2.605084834823134,
+    3.484154586374428,
+    4.252699478956276,
+    4.252699478956276,
+    4.252699478956276,
+    4.252699478956276;
+  // 8 control points, entered one per row; the last row repeats the first one
+  MatrixXd ctrls(8,2);
+  ctrls << -0.370967741935484,   0.236842105263158,
+    -0.231401860693277,   0.442245185027632,
+    0.344361228532831,   0.773369994120753,
+    0.828990216203802,   0.106550882647595,
+    0.407270163678382,  -1.043452922172848,
+    -0.488467813584053,  -0.390098582530090,
+    -0.494657189446427,   0.054804824897884,
+    -0.370967741935484,   0.236842105263158;
+  ctrls.transposeInPlace(); // 8x2 -> 2x8, so every control point becomes a column
+
+  return Spline<double, 2, Dynamic>(knots, ctrls);
+}
+
+/* create a reference 3D spline from hard-coded knots and control points */
+Spline<double, 3, Dynamic> spline3d()
+{
+  RowVectorXd knots(11); // 11 knots in [0,1]; 0 and 1 each appear three times
+  knots << 0,
+    0,
+    0,
+    0.118997681558377,
+    0.162611735194631,
+    0.498364051982143,
+    0.655098003973841,
+    0.679702676853675,
+    1.000000000000000,
+    1.000000000000000,
+    1.000000000000000;
+  // 8 control points, entered one per row
+  MatrixXd ctrls(8,3);
+  ctrls <<    0.959743958516081,   0.340385726666133,   0.585267750979777,
+    0.223811939491137,   0.751267059305653,   0.255095115459269,
+    0.505957051665142,   0.699076722656686,   0.890903252535799,
+    0.959291425205444,   0.547215529963803,   0.138624442828679,
+    0.149294005559057,   0.257508254123736,   0.840717255983663,
+    0.254282178971531,   0.814284826068816,   0.243524968724989,
+    0.929263623187228,   0.349983765984809,   0.196595250431208,
+    0.251083857976031,   0.616044676146639,   0.473288848902729;
+  ctrls.transposeInPlace(); // 8x3 -> 3x8, so every control point becomes a column
+
+  return Spline<double, 3, Dynamic>(knots, ctrls);
+}
+
+/* compares evaluations of the reference 3D spline against known results */
+void eval_spline3d()
+{
+  Spline3d spline = spline3d();
+  // the 10 parameter values at which the spline is evaluated
+  RowVectorXd u(10);
+  u << 0.351659507062997,
+    0.830828627896291,
+    0.585264091152724,
+    0.549723608291140,
+    0.917193663829810,
+    0.285839018820374,
+    0.757200229110721,
+    0.753729094278495,
+    0.380445846975357,
+    0.567821640725221;
+  // expected points, entered one row per entry of u
+  MatrixXd pts(10,3);
+  pts << 0.707620811535916,   0.510258911240815,   0.417485437023409,
+    0.603422256426978,   0.529498282727551,   0.270351549348981,
+    0.228364197569334,   0.423745615677815,   0.637687289287490,
+    0.275556796335168,   0.350856706427970,   0.684295784598905,
+    0.514519311047655,   0.525077224890754,   0.351628308305896,
+    0.724152914315666,   0.574461155457304,   0.469860285484058,
+    0.529365063753288,   0.613328702656816,   0.237837040141739,
+    0.522469395136878,   0.619099658652895,   0.237139665242069,
+    0.677357023849552,   0.480655768435853,   0.422227610314397,
+    0.247046593173758,   0.380604672404750,   0.670065791405019;
+  pts.transposeInPlace();
+  // column i of pts is the expected value at u(i); the distance must stay below 1e-14
+  for (int i=0; i<u.size(); ++i)
+  {
+    Vector3d pt = spline(u(i));
+    VERIFY( (pt - pts.col(i)).norm() < 1e-14 );
+  }
+}
+
+/* compares evaluations on corner cases: the 3D spline is evaluated at its own knots */
+void eval_spline3d_onbrks()
+{
+  Spline3d spline = spline3d();
+  // evaluate exactly at the knot values, repeated knots included
+  RowVectorXd u = spline.knots();
+  // expected points, one row per knot; the first/last three rows equal the first/last control point of spline3d()
+  MatrixXd pts(11,3);
+  pts <<    0.959743958516081,   0.340385726666133,   0.585267750979777,
+    0.959743958516081,   0.340385726666133,   0.585267750979777,
+    0.959743958516081,   0.340385726666133,   0.585267750979777,
+    0.430282980289940,   0.713074680056118,   0.720373307943349,
+    0.558074875553060,   0.681617921034459,   0.804417124839942,
+    0.407076008291750,   0.349707710518163,   0.617275937419545,
+    0.240037008286602,   0.738739390398014,   0.324554153129411,
+    0.302434111480572,   0.781162443963899,   0.240177089094644,
+    0.251083857976031,   0.616044676146639,   0.473288848902729,
+    0.251083857976031,   0.616044676146639,   0.473288848902729,
+    0.251083857976031,   0.616044676146639,   0.473288848902729;
+  pts.transposeInPlace();
+  // column i of pts is the expected value at u(i); the distance must stay below 1e-14
+  for (int i=0; i<u.size(); ++i)
+  {
+    Vector3d pt = spline(u(i));
+    VERIFY( (pt - pts.col(i)).norm() < 1e-14 );
+  }
+}
+
+void eval_closed_spline2d() // compares evaluations of the closed 2D spline against known results
+{
+  Spline2d spline = closed_spline2d();
+  // 12 parameter values, running from the first knot value (0) to the last one
+  RowVectorXd u(12);
+  u << 0,
+    0.332457030395796,
+    0.356467130532952,
+    0.453562180176215,
+    0.648017921874804,
+    0.973770235555003,
+    1.882577647219307,
+    2.289408593930498,
+    3.511951429883045,
+    3.884149321369450,
+    4.236261590369414,
+    4.252699478956276;
+  // expected points, one row per entry of u; the first and the last row are identical
+  MatrixXd pts(12,2);
+  pts << -0.370967741935484,   0.236842105263158,
+    -0.152576775123250,   0.448975001279334,
+    -0.133417538277668,   0.461615613865667,
+    -0.053199060826740,   0.507630360006299,
+    0.114249591147281,   0.570414135097409,
+    0.377810316891987,   0.560497102875315,
+    0.665052120135908,  -0.157557441109611,
+    0.516006487053228,  -0.559763292174825,
+    -0.379486035348887,  -0.331959640488223,
+    -0.462034726249078,  -0.039105670080824,
+    -0.378730600917982,   0.225127015099919,
+    -0.370967741935484,   0.236842105263158;
+  pts.transposeInPlace();
+  // column i of pts is the expected value at u(i); the distance must stay below 1e-14
+  for (int i=0; i<u.size(); ++i)
+  {
+    Vector2d pt = spline(u(i));
+    VERIFY( (pt - pts.col(i)).norm() < 1e-14 );
+  }
+}
+
+void check_global_interpolation2d() // checks that interpolating splines pass through the input points
+{
+  typedef Spline2d::PointType PointType;
+  typedef Spline2d::KnotVectorType KnotVectorType;
+  typedef Spline2d::ControlPointVectorType ControlPointVectorType;
+  // 100 random 2D points, stored one per column
+  ControlPointVectorType points = ControlPointVectorType::Random(2,100);
+
+  KnotVectorType chord_lengths; // knot parameters
+  Eigen::ChordLengths(points, chord_lengths);
+
+  // interpolation without knot parameters; still evaluated at chord_lengths(i) below
+  {
+    const Spline2d spline = SplineFitting<Spline2d>::Interpolate(points,3);  
+
+    for (Eigen::DenseIndex i=0; i<points.cols(); ++i)
+    {
+      PointType pt = spline( chord_lengths(i) );
+      PointType ref = points.col(i);
+      VERIFY( (pt - ref).matrix().norm() < 1e-14 );
+    }
+  }
+
+  // interpolation with given knot parameters (chord_lengths passed explicitly)
+  {
+    const Spline2d spline = SplineFitting<Spline2d>::Interpolate(points,3,chord_lengths);  
+
+    for (Eigen::DenseIndex i=0; i<points.cols(); ++i)
+    {
+      PointType pt = spline( chord_lengths(i) );
+      PointType ref = points.col(i);
+      VERIFY( (pt - ref).matrix().norm() < 1e-14 );
+    }
+  }
+}
+
+void check_global_interpolation_with_derivatives2d() // checks interpolation of points plus derivatives
+{
+  typedef Spline2d::PointType PointType;
+  typedef Spline2d::KnotVectorType KnotVectorType;
+
+  const Eigen::DenseIndex numPoints = 100;
+  const unsigned int dimension = 2;
+  const unsigned int degree = 3;
+  // random sample points, one column per point
+  ArrayXXd points = ArrayXXd::Random(dimension, numPoints);
+
+  KnotVectorType knots;
+  Eigen::ChordLengths(points, knots);
+  // one random derivative per point, also stored column-wise
+  ArrayXXd derivatives = ArrayXXd::Random(dimension, numPoints);
+  VectorXd derivativeIndices(numPoints);
+  // derivativeIndices = 0, 1, ..., numPoints-1
+  for (Eigen::DenseIndex i = 0; i < numPoints; ++i)
+      derivativeIndices(i) = static_cast<double>(i);
+
+  const Spline2d spline = SplineFitting<Spline2d>::InterpolateWithDerivatives(
+    points, derivatives, derivativeIndices, degree);  
+  // at knots(i) the spline must reproduce points.col(i) and derivatives.col(i)
+  for (Eigen::DenseIndex i = 0; i < points.cols(); ++i)
+  {
+    PointType point = spline(knots(i));
+    PointType referencePoint = points.col(i);
+    VERIFY_IS_APPROX(point, referencePoint);
+    PointType derivative = spline.derivatives(knots(i), 1).col(1); // column 1 is compared with the given derivative
+    PointType referenceDerivative = derivatives.col(i);
+    VERIFY_IS_APPROX(derivative, referenceDerivative);
+  }
+}
+
+void test_splines() // runs every spline check of this file g_repeat times
+{
+  for (int i = 0; i < g_repeat; ++i)
+  {
+    CALL_SUBTEST( eval_spline3d() ); // fixed reference data, 3D
+    CALL_SUBTEST( eval_spline3d_onbrks() ); // evaluation at the knots
+    CALL_SUBTEST( eval_closed_spline2d() ); // fixed reference data, closed 2D spline
+    CALL_SUBTEST( check_global_interpolation2d() ); // random points
+    CALL_SUBTEST( check_global_interpolation_with_derivatives2d() ); // random points and derivatives
+  }
+}