From aa63fef21cf9c0d1fd5708aca2c17171cee83fc0 Mon Sep 17 00:00:00 2001 From: koldo Date: Fri, 23 Oct 2020 09:15:00 +0000 Subject: [PATCH] Eigen: Updated to 3.3.8 git-svn-id: svn://ultimatepp.org/upp/trunk@15292 f0d560ea-af0d-0410-9eb7-867de7ffcac7 --- uppsrc/plugin/Eigen/Eigen.h | 2 + uppsrc/plugin/Eigen/Eigen/Core | 358 +- uppsrc/plugin/Eigen/Eigen/Geometry | 6 +- uppsrc/plugin/Eigen/Eigen/KLUSupport | 41 - uppsrc/plugin/Eigen/Eigen/OrderingMethods | 3 + uppsrc/plugin/Eigen/Eigen/PaStiXSupport | 1 - uppsrc/plugin/Eigen/Eigen/Sparse | 2 + uppsrc/plugin/Eigen/Eigen/SparseCholesky | 8 + uppsrc/plugin/Eigen/Eigen/SparseLU | 4 - uppsrc/plugin/Eigen/Eigen/src/Cholesky/LDLT.h | 59 +- uppsrc/plugin/Eigen/Eigen/src/Cholesky/LLT.h | 56 +- .../Eigen/src/CholmodSupport/CholmodSupport.h | 137 +- .../Eigen/Eigen/src/Core/ArithmeticSequence.h | 413 -- uppsrc/plugin/Eigen/Eigen/src/Core/Array.h | 100 +- .../plugin/Eigen/Eigen/src/Core/ArrayBase.h | 2 +- .../Eigen/Eigen/src/Core/ArrayWrapper.h | 2 +- uppsrc/plugin/Eigen/Eigen/src/Core/Assign.h | 2 +- .../Eigen/Eigen/src/Core/AssignEvaluator.h | 63 +- .../plugin/Eigen/Eigen/src/Core/Assign_MKL.h | 20 +- uppsrc/plugin/Eigen/Eigen/src/Core/Block.h | 60 +- .../Eigen/Eigen/src/Core/BooleanRedux.h | 56 +- .../Eigen/Eigen/src/Core/CommaInitializer.h | 10 +- .../Eigen/Eigen/src/Core/CoreEvaluators.h | 386 +- .../Eigen/Eigen/src/Core/CoreIterators.h | 5 - .../Eigen/Eigen/src/Core/CwiseBinaryOp.h | 29 +- .../Eigen/Eigen/src/Core/CwiseNullaryOp.h | 82 +- .../Eigen/Eigen/src/Core/CwiseUnaryView.h | 2 +- .../plugin/Eigen/Eigen/src/Core/DenseBase.h | 87 +- .../Eigen/Eigen/src/Core/DenseCoeffsBase.h | 12 +- .../Eigen/Eigen/src/Core/DenseStorage.h | 60 +- uppsrc/plugin/Eigen/Eigen/src/Core/Diagonal.h | 10 +- .../Eigen/Eigen/src/Core/DiagonalMatrix.h | 52 +- .../Eigen/Eigen/src/Core/DiagonalProduct.h | 2 +- uppsrc/plugin/Eigen/Eigen/src/Core/Dot.h | 16 +- .../plugin/Eigen/Eigen/src/Core/EigenBase.h | 3 +- uppsrc/plugin/Eigen/Eigen/src/Core/Fuzzy.h | 6 +- .../Eigen/Eigen/src/Core/GeneralProduct.h | 24 +- .../Eigen/Eigen/src/Core/GenericPacketMath.h | 374 +- .../Eigen/Eigen/src/Core/GlobalFunctions.h | 68 +- uppsrc/plugin/Eigen/Eigen/src/Core/IO.h | 47 +- .../plugin/Eigen/Eigen/src/Core/IndexedView.h | 207 - uppsrc/plugin/Eigen/Eigen/src/Core/Inverse.h | 7 +- uppsrc/plugin/Eigen/Eigen/src/Core/Map.h | 6 +- .../Eigen/Eigen/src/Core/MathFunctions.h | 541 +-- .../Eigen/Eigen/src/Core/MathFunctionsImpl.h | 39 +- uppsrc/plugin/Eigen/Eigen/src/Core/Matrix.h | 138 +- .../plugin/Eigen/Eigen/src/Core/MatrixBase.h | 21 +- .../plugin/Eigen/Eigen/src/Core/NestByValue.h | 71 +- uppsrc/plugin/Eigen/Eigen/src/Core/NoAlias.h | 7 +- .../plugin/Eigen/Eigen/src/Core/NumTraits.h | 47 +- .../Eigen/src/Core/PartialReduxEvaluator.h | 232 -- .../Eigen/Eigen/src/Core/PermutationMatrix.h | 34 +- .../Eigen/Eigen/src/Core/PlainObjectBase.h | 148 +- uppsrc/plugin/Eigen/Eigen/src/Core/Product.h | 19 +- .../Eigen/Eigen/src/Core/ProductEvaluators.h | 164 +- uppsrc/plugin/Eigen/Eigen/src/Core/Random.h | 2 +- uppsrc/plugin/Eigen/Eigen/src/Core/Redux.h | 330 +- uppsrc/plugin/Eigen/Eigen/src/Core/Ref.h | 2 - .../plugin/Eigen/Eigen/src/Core/Replicate.h | 4 +- uppsrc/plugin/Eigen/Eigen/src/Core/Reshaped.h | 453 --- .../Eigen/Eigen/src/Core/ReturnByValue.h | 2 +- uppsrc/plugin/Eigen/Eigen/src/Core/Reverse.h | 16 +- .../Eigen/Eigen/src/Core/SelfAdjointView.h | 17 +- uppsrc/plugin/Eigen/Eigen/src/Core/Solve.h | 4 +- .../Eigen/Eigen/src/Core/SolveTriangular.h | 2 +- .../plugin/Eigen/Eigen/src/Core/SolverBase.h | 44 +- .../plugin/Eigen/Eigen/src/Core/StableNorm.h | 117 +- .../Eigen/Eigen/src/Core/StlIterators.h | 331 -- uppsrc/plugin/Eigen/Eigen/src/Core/Swap.h | 9 +- .../plugin/Eigen/Eigen/src/Core/Transpose.h | 109 +- .../Eigen/Eigen/src/Core/Transpositions.h | 41 +- .../Eigen/Eigen/src/Core/TriangularMatrix.h | 48 +- .../plugin/Eigen/Eigen/src/Core/VectorBlock.h | 10 +- .../Eigen/Eigen/src/Core/VectorwiseOp.h | 203 +- uppsrc/plugin/Eigen/Eigen/src/Core/Visitor.h | 36 - .../Eigen/Eigen/src/Core/arch/AVX/Complex.h | 94 +- .../Eigen/src/Core/arch/AVX/MathFunctions.h | 353 +- .../Eigen/src/Core/arch/AVX/PacketMath.h | 719 +--- .../Eigen/src/Core/arch/AVX/TypeCasting.h | 40 +- .../Eigen/src/Core/arch/AVX512/Complex.h | 447 --- .../src/Core/arch/AVX512/MathFunctions.h | 153 +- .../Eigen/src/Core/arch/AVX512/PacketMath.h | 834 ++-- .../Eigen/src/Core/arch/AVX512/TypeCasting.h | 47 - .../Eigen/src/Core/arch/AltiVec/Complex.h | 76 +- .../src/Core/arch/AltiVec/MathFunctions.h | 270 +- .../Eigen/src/Core/arch/AltiVec/PacketMath.h | 1579 ++------ .../Eigen/Eigen/src/Core/arch/CUDA/Complex.h | 6 +- .../src/Core/arch/{Default => CUDA}/Half.h | 158 +- .../Core/arch/{GPU => CUDA}/MathFunctions.h | 20 +- .../Eigen/src/Core/arch/CUDA/PacketMath.h | 333 ++ .../Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 1124 ++++++ .../Eigen/src/Core/arch/CUDA/TypeCasting.h | 212 ++ .../arch/Default/GenericPacketMathFunctions.h | 655 ---- .../Default/GenericPacketMathFunctionsFwd.h | 69 - .../Eigen/src/Core/arch/Default/Settings.h | 2 +- .../Eigen/src/Core/arch/Default/TypeCasting.h | 77 - .../Eigen/src/Core/arch/GPU/PacketMath.h | 1786 --------- .../Eigen/src/Core/arch/GPU/TypeCasting.h | 80 - .../src/Core/arch/HIP/hcc/math_constants.h | 23 - .../Eigen/Eigen/src/Core/arch/MSA/Complex.h | 720 ---- .../Eigen/src/Core/arch/MSA/MathFunctions.h | 387 -- .../Eigen/src/Core/arch/MSA/PacketMath.h | 1237 ------ .../Eigen/Eigen/src/Core/arch/NEON/Complex.h | 505 +-- .../Eigen/src/Core/arch/NEON/MathFunctions.h | 88 +- .../Eigen/src/Core/arch/NEON/PacketMath.h | 3358 ++--------------- .../Eigen/src/Core/arch/NEON/TypeCasting.h | 278 -- .../Eigen/Eigen/src/Core/arch/SSE/Complex.h | 81 +- .../Eigen/src/Core/arch/SSE/MathFunctions.h | 460 ++- .../Eigen/src/Core/arch/SSE/PacketMath.h | 796 ++-- .../Eigen/src/Core/arch/SSE/TypeCasting.h | 58 - .../Eigen/src/Core/arch/SYCL/InteropHeaders.h | 229 -- .../Eigen/src/Core/arch/SYCL/MathFunctions.h | 289 -- .../Eigen/src/Core/arch/SYCL/PacketMath.h | 670 ---- .../src/Core/arch/SYCL/SyclMemoryModel.h | 694 ---- .../Eigen/src/Core/arch/SYCL/TypeCasting.h | 85 - .../Eigen/src/Core/arch/ZVector/Complex.h | 401 +- .../src/Core/arch/ZVector/MathFunctions.h | 112 +- .../Eigen/src/Core/arch/ZVector/PacketMath.h | 885 ++--- .../src/Core/functors/AssignmentFunctors.h | 13 +- .../Eigen/src/Core/functors/BinaryFunctors.h | 93 +- .../Eigen/src/Core/functors/NullaryFunctors.h | 55 +- .../Eigen/src/Core/functors/UnaryFunctors.h | 298 +- .../Core/products/GeneralBlockPanelKernel.h | 1613 +++----- .../src/Core/products/GeneralMatrixMatrix.h | 32 +- .../products/GeneralMatrixMatrixTriangular.h | 8 +- .../GeneralMatrixMatrixTriangular_BLAS.h | 2 +- .../src/Core/products/GeneralMatrixVector.h | 865 +++-- .../Eigen/src/Core/products/Parallelizer.h | 36 +- .../Core/products/SelfadjointMatrixMatrix.h | 29 +- .../Core/products/SelfadjointMatrixVector.h | 14 +- .../src/Core/products/SelfadjointProduct.h | 2 +- .../Core/products/SelfadjointRank2Update.h | 5 +- .../Core/products/TriangularMatrixMatrix.h | 6 +- .../Core/products/TriangularSolverMatrix.h | 4 +- .../Core/products/TriangularSolverVector.h | 21 +- .../Eigen/Eigen/src/Core/util/BlasUtil.h | 77 +- .../src/Core/util/ConfigureVectorization.h | 483 --- .../Eigen/Eigen/src/Core/util/Constants.h | 15 +- .../src/Core/util/DisableStupidWarnings.h | 13 +- .../Eigen/src/Core/util/ForwardDeclarations.h | 25 +- .../Eigen/src/Core/util/IndexedViewHelper.h | 186 - .../Eigen/src/Core/util/IntegralConstant.h | 272 -- .../Eigen/Eigen/src/Core/util/MKL_support.h | 9 +- .../plugin/Eigen/Eigen/src/Core/util/Macros.h | 724 ++-- .../plugin/Eigen/Eigen/src/Core/util/Memory.h | 215 +- .../plugin/Eigen/Eigen/src/Core/util/Meta.h | 248 +- .../Eigen/src/Core/util/ReshapedHelper.h | 51 - .../Eigen/Eigen/src/Core/util/StaticAssert.h | 10 +- .../Eigen/Eigen/src/Core/util/SymbolicIndex.h | 293 -- .../Eigen/Eigen/src/Core/util/XprHelper.h | 38 +- .../src/Eigenvalues/ComplexEigenSolver.h | 2 +- .../Eigen/src/Eigenvalues/ComplexSchur.h | 2 +- .../Eigen/Eigen/src/Eigenvalues/EigenSolver.h | 4 +- .../GeneralizedSelfAdjointEigenSolver.h | 2 +- .../src/Eigenvalues/HessenbergDecomposition.h | 2 +- .../src/Eigenvalues/MatrixBaseEigenvalues.h | 4 +- .../Eigen/Eigen/src/Eigenvalues/RealQZ.h | 15 +- .../Eigen/Eigen/src/Eigenvalues/RealSchur.h | 15 +- .../src/Eigenvalues/SelfAdjointEigenSolver.h | 32 +- .../src/Eigenvalues/Tridiagonalization.h | 9 +- .../Eigen/Eigen/src/Geometry/AlignedBox.h | 2 +- .../Eigen/Eigen/src/Geometry/Hyperplane.h | 2 +- .../Eigen/Eigen/src/Geometry/OrthoMethods.h | 5 +- .../Eigen/src/Geometry/ParametrizedLine.h | 39 +- .../Eigen/Eigen/src/Geometry/Quaternion.h | 17 +- .../plugin/Eigen/Eigen/src/Geometry/Scaling.h | 26 +- .../Eigen/Eigen/src/Geometry/Transform.h | 62 +- .../Eigen/Eigen/src/Geometry/Translation.h | 12 +- .../Eigen/src/Geometry/arch/Geometry_SSE.h | 51 +- .../Eigen/src/Householder/BlockHouseholder.h | 11 +- .../Eigen/Eigen/src/Householder/Householder.h | 12 +- .../src/Householder/HouseholderSequence.h | 147 +- .../src/IterativeLinearSolvers/BiCGSTAB.h | 30 +- .../ConjugateGradient.h | 27 +- .../IncompleteCholesky.h | 12 +- .../IterativeLinearSolvers/IncompleteLUT.h | 13 +- .../IterativeSolverBase.h | 56 +- .../LeastSquareConjugateGradient.h | 22 +- .../IterativeLinearSolvers/SolveWithGuess.h | 2 +- uppsrc/plugin/Eigen/Eigen/src/Jacobi/Jacobi.h | 39 +- .../Eigen/Eigen/src/KLUSupport/KLUSupport.h | 358 -- .../plugin/Eigen/Eigen/src/LU/Determinant.h | 54 +- uppsrc/plugin/Eigen/Eigen/src/LU/FullPivLU.h | 66 +- .../plugin/Eigen/Eigen/src/LU/InverseImpl.h | 2 - .../plugin/Eigen/Eigen/src/LU/PartialPivLU.h | 111 +- .../Eigen/Eigen/src/OrderingMethods/Amd.h | 24 +- .../Eigen/src/OrderingMethods/Eigen_Colamd.h | 574 ++- .../Eigen/src/OrderingMethods/Ordering.h | 16 +- .../Eigen/src/PaStiXSupport/PaStiXSupport.h | 2 +- .../Eigen/src/PardisoSupport/PardisoSupport.h | 19 +- .../Eigen/Eigen/src/QR/ColPivHouseholderQR.h | 61 +- .../src/QR/CompleteOrthogonalDecomposition.h | 127 +- .../Eigen/Eigen/src/QR/FullPivHouseholderQR.h | 81 +- .../plugin/Eigen/Eigen/src/QR/HouseholderQR.h | 71 +- .../src/SPQRSupport/SuiteSparseQRSupport.h | 28 +- uppsrc/plugin/Eigen/Eigen/src/SVD/BDCSVD.h | 155 +- uppsrc/plugin/Eigen/Eigen/src/SVD/JacobiSVD.h | 3 +- uppsrc/plugin/Eigen/Eigen/src/SVD/SVDBase.h | 69 +- .../Eigen/src/SVD/UpperBidiagonalization.h | 6 +- .../src/SparseCholesky/SimplicialCholesky.h | 14 +- .../SparseCholesky/SimplicialCholesky_impl.h | 47 +- .../Eigen/src/SparseCore/CompressedStorage.h | 16 - .../Eigen/Eigen/src/SparseCore/SparseAssign.h | 108 +- .../Eigen/Eigen/src/SparseCore/SparseBlock.h | 72 +- .../src/SparseCore/SparseCompressedBase.h | 51 +- .../src/SparseCore/SparseCwiseBinaryOp.h | 14 +- .../Eigen/src/SparseCore/SparseDenseProduct.h | 38 +- .../Eigen/Eigen/src/SparseCore/SparseMatrix.h | 126 +- .../Eigen/src/SparseCore/SparseMatrixBase.h | 17 +- .../Eigen/src/SparseCore/SparseProduct.h | 2 +- .../Eigen/Eigen/src/SparseCore/SparseRef.h | 14 +- .../src/SparseCore/SparseSelfAdjointView.h | 4 +- .../Eigen/Eigen/src/SparseCore/SparseUtil.h | 8 - .../Eigen/Eigen/src/SparseCore/SparseVector.h | 2 +- .../Eigen/Eigen/src/SparseLU/SparseLU.h | 16 +- .../Eigen/src/SparseLU/SparseLU_Memory.h | 2 +- .../src/SparseLU/SparseLU_SupernodalMatrix.h | 4 +- .../Eigen/src/SparseLU/SparseLU_column_dfs.h | 4 +- .../Eigen/src/SparseLU/SparseLU_gemm_kernel.h | 2 +- .../Eigen/src/SparseLU/SparseLU_panel_bmod.h | 2 +- .../Eigen/Eigen/src/SparseQR/SparseQR.h | 25 +- .../Eigen/Eigen/src/StlSupport/StdDeque.h | 10 +- .../Eigen/Eigen/src/StlSupport/StdList.h | 4 +- .../Eigen/Eigen/src/StlSupport/StdVector.h | 4 +- .../Eigen/src/SuperLUSupport/SuperLUSupport.h | 2 +- .../Eigen/src/UmfPackSupport/UmfPackSupport.h | 214 +- uppsrc/plugin/Eigen/Eigen/src/misc/lapacke.h | 9 +- .../Eigen/src/plugins/ArrayCwiseBinaryOps.h | 28 +- .../Eigen/src/plugins/ArrayCwiseUnaryOps.h | 114 +- .../Eigen/Eigen/src/plugins/BlockMethods.h | 869 ++--- .../Eigen/src/plugins/CommonCwiseUnaryOps.h | 57 - .../Eigen/src/plugins/IndexedViewMethods.h | 262 -- .../Eigen/Eigen/src/plugins/ReshapedMethods.h | 149 - .../plugin/Eigen/srcdoc.tpp/Eigen_en-us.tpp | 5 +- .../plugin/Eigen/unsupported/CMakeLists.txt | 9 + .../Eigen/unsupported/Eigen/AdolcForward | 2 +- .../Eigen/unsupported/Eigen/AlignedVector3 | 12 +- .../Eigen/unsupported/Eigen/ArpackSupport | 8 +- .../plugin/Eigen/unsupported/Eigen/AutoDiff | 6 - uppsrc/plugin/Eigen/unsupported/Eigen/BVH | 6 +- .../Eigen/unsupported/Eigen/CMakeLists.txt | 32 + .../Eigen/unsupported/Eigen/CXX11/Tensor | 61 +- .../unsupported/Eigen/CXX11/TensorSymmetry | 6 +- .../Eigen/unsupported/Eigen/CXX11/ThreadPool | 18 +- .../Eigen/CXX11/src/Tensor/README.md | 217 +- .../Eigen/CXX11/src/Tensor/Tensor.h | 29 +- .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 70 +- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 90 +- .../Eigen/CXX11/src/Tensor/TensorBase.h | 205 +- .../Eigen/CXX11/src/Tensor/TensorBlock.h | 1559 -------- .../CXX11/src/Tensor/TensorBroadcasting.h | 766 +--- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 218 +- .../CXX11/src/Tensor/TensorConcatenation.h | 57 +- .../CXX11/src/Tensor/TensorContraction.h | 609 +-- .../src/Tensor/TensorContractionBlocking.h | 39 +- .../CXX11/src/Tensor/TensorContractionCuda.h | 1393 ++++++- .../CXX11/src/Tensor/TensorContractionGpu.h | 1413 ------- .../src/Tensor/TensorContractionMapper.h | 246 +- .../CXX11/src/Tensor/TensorContractionSycl.h | 1650 -------- .../src/Tensor/TensorContractionThreadPool.h | 1576 +++----- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 234 +- .../CXX11/src/Tensor/TensorConvolution.h | 172 +- .../CXX11/src/Tensor/TensorConvolutionSycl.h | 544 --- .../Eigen/CXX11/src/Tensor/TensorCostModel.h | 10 +- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 82 +- .../Eigen/CXX11/src/Tensor/TensorDevice.h | 67 - .../Eigen/CXX11/src/Tensor/TensorDeviceCuda.h | 337 +- .../CXX11/src/Tensor/TensorDeviceDefault.h | 33 +- .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 360 -- .../Eigen/CXX11/src/Tensor/TensorDeviceSycl.h | 1078 +----- .../CXX11/src/Tensor/TensorDeviceThreadPool.h | 347 +- .../Eigen/CXX11/src/Tensor/TensorDimensions.h | 160 +- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 105 +- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 605 +-- .../Eigen/CXX11/src/Tensor/TensorExecutor.h | 699 +--- .../Eigen/CXX11/src/Tensor/TensorExpr.h | 23 +- .../Eigen/CXX11/src/Tensor/TensorFFT.h | 82 +- .../Eigen/CXX11/src/Tensor/TensorFixedSize.h | 9 +- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 189 +- .../src/Tensor/TensorForwardDeclarations.h | 90 +- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 138 +- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 145 +- .../src/Tensor/TensorGpuHipCudaDefines.h | 93 - .../src/Tensor/TensorGpuHipCudaUndefines.h | 40 - .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 142 +- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 203 +- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 24 +- .../CXX11/src/Tensor/TensorInitializer.h | 4 +- .../Eigen/CXX11/src/Tensor/TensorIntDiv.h | 28 +- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 30 +- .../Eigen/CXX11/src/Tensor/TensorMacros.h | 41 +- .../Eigen/CXX11/src/Tensor/TensorMap.h | 92 +- .../Eigen/CXX11/src/Tensor/TensorMeta.h | 123 +- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 479 +-- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 371 +- .../Eigen/CXX11/src/Tensor/TensorPatch.h | 28 +- .../Eigen/CXX11/src/Tensor/TensorRandom.h | 154 +- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 386 +- .../CXX11/src/Tensor/TensorReductionCuda.h | 750 +++- .../CXX11/src/Tensor/TensorReductionGpu.h | 967 ----- .../CXX11/src/Tensor/TensorReductionSycl.h | 746 +--- .../Eigen/CXX11/src/Tensor/TensorRef.h | 31 +- .../Eigen/CXX11/src/Tensor/TensorReverse.h | 220 +- .../Eigen/CXX11/src/Tensor/TensorScan.h | 526 +-- .../Eigen/CXX11/src/Tensor/TensorScanSycl.h | 512 --- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 289 +- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 38 +- .../Eigen/CXX11/src/Tensor/TensorSycl.h | 82 + .../TensorSyclConvertToDeviceExpression.h | 121 + .../src/Tensor/TensorSyclExprConstructor.h | 239 ++ .../src/Tensor/TensorSyclExtractAccessor.h | 204 + .../src/Tensor/TensorSyclExtractFunctors.h | 177 + .../CXX11/src/Tensor/TensorSyclLeafCount.h | 114 + .../src/Tensor/TensorSyclPlaceHolderExpr.h | 181 + .../Eigen/CXX11/src/Tensor/TensorSyclRun.h | 70 + .../Eigen/CXX11/src/Tensor/TensorSyclTuple.h | 237 ++ .../Eigen/CXX11/src/Tensor/TensorTrace.h | 303 -- .../Eigen/CXX11/src/Tensor/TensorTraits.h | 44 +- .../Eigen/CXX11/src/Tensor/TensorUInt128.h | 1 - .../CXX11/src/Tensor/TensorVolumePatch.h | 59 +- .../TensorSymmetry/util/TemplateGroupTheory.h | 2 +- .../Eigen/CXX11/src/ThreadPool/Barrier.h | 67 - .../Eigen/CXX11/src/ThreadPool/EventCount.h | 196 +- .../src/ThreadPool/NonBlockingThreadPool.h | 398 +- .../Eigen/CXX11/src/ThreadPool/RunQueue.h | 98 +- .../CXX11/src/ThreadPool/SimpleThreadPool.h | 154 + .../Eigen/CXX11/src/ThreadPool/ThreadCancel.h | 23 - .../CXX11/src/ThreadPool/ThreadEnvironment.h | 2 - .../Eigen/CXX11/src/ThreadPool/ThreadLocal.h | 289 +- .../src/ThreadPool/ThreadPoolInterface.h | 15 - .../Eigen/CXX11/src/util/CXX11Meta.h | 93 +- .../Eigen/CXX11/src/util/CXX11Workarounds.h | 6 +- .../Eigen/CXX11/src/util/EmulateArray.h | 54 +- .../Eigen/CXX11/src/util/EmulateCXX11Meta.h | 311 ++ .../Eigen/CXX11/src/util/MaxSizeVector.h | 51 +- .../Eigen/unsupported/Eigen/EulerAngles | 8 +- uppsrc/plugin/Eigen/unsupported/Eigen/FFT | 7 +- .../Eigen/unsupported/Eigen/IterativeSolvers | 10 +- .../unsupported/Eigen/LevenbergMarquardt | 16 +- .../Eigen/unsupported/Eigen/MPRealSupport | 6 +- .../Eigen/unsupported/Eigen/MatrixFunctions | 10 +- .../Eigen/unsupported/Eigen/MoreVectorization | 2 +- .../unsupported/Eigen/NonLinearOptimization | 44 +- .../Eigen/unsupported/Eigen/NumericalDiff | 2 +- .../Eigen/unsupported/Eigen/OpenGLSupport | 4 +- .../Eigen/unsupported/Eigen/Polynomials | 8 +- uppsrc/plugin/Eigen/unsupported/Eigen/Skyline | 6 +- .../Eigen/unsupported/Eigen/SpecialFunctions | 29 +- uppsrc/plugin/Eigen/unsupported/Eigen/Splines | 4 - .../Eigen/src/AutoDiff/AutoDiffScalar.h | 30 +- .../Eigen/unsupported/Eigen/src/BVH/KdBVH.h | 2 +- .../ArpackSelfAdjointEigenSolver.h | 4 +- .../Eigen/src/EulerAngles/CMakeLists.txt | 4 +- .../Eigen/src/EulerAngles/EulerAngles.h | 261 +- .../Eigen/src/EulerAngles/EulerSystem.h | 197 +- .../Eigen/src/FFT/ei_kissfft_impl.h | 4 +- .../IterativeSolvers/ConstrainedConjGrad.h | 4 +- .../Eigen/src/IterativeSolvers/DGMRES.h | 61 +- .../Eigen/src/IterativeSolvers/GMRES.h | 36 +- .../Eigen/src/IterativeSolvers/MINRES.h | 38 +- .../Eigen/src/IterativeSolvers/Scaling.h | 6 - .../Eigen/src/LevenbergMarquardt/LMqrsolv.h | 2 +- .../LevenbergMarquardt/LevenbergMarquardt.h | 6 +- .../src/MatrixFunctions/MatrixExponential.h | 5 +- .../src/MatrixFunctions/MatrixFunction.h | 29 +- .../src/MatrixFunctions/MatrixLogarithm.h | 20 +- .../Eigen/src/MatrixFunctions/MatrixPower.h | 20 +- .../src/MatrixFunctions/MatrixSquareRoot.h | 12 +- .../Eigen/src/NonLinearOptimization/qrsolv.h | 2 +- .../Eigen/src/NonLinearOptimization/r1updt.h | 2 +- .../Eigen/src/Polynomials/Companion.h | 4 +- .../Eigen/src/Skyline/SkylineInplaceLU.h | 4 +- .../Eigen/src/Skyline/SkylineMatrix.h | 18 +- .../Eigen/src/Skyline/SkylineMatrixBase.h | 2 +- .../Eigen/src/Skyline/SkylineStorage.h | 2 +- .../src/SparseExtra/DynamicSparseMatrix.h | 8 +- .../Eigen/src/SparseExtra/MarketIO.h | 91 +- .../Eigen/src/SparseExtra/RandomSetter.h | 6 +- .../BesselFunctionsArrayAPI.h | 286 -- .../BesselFunctionsFunctors.h | 357 -- .../SpecialFunctions/BesselFunctionsHalf.h | 66 - .../SpecialFunctions/BesselFunctionsImpl.h | 1959 ---------- .../BesselFunctionsPacketMath.h | 130 - .../SpecialFunctions/HipVectorCompatibility.h | 67 - .../SpecialFunctionsArrayAPI.h | 55 +- .../SpecialFunctionsFunctors.h | 140 +- .../SpecialFunctions/SpecialFunctionsHalf.h | 11 - .../SpecialFunctions/SpecialFunctionsImpl.h | 1028 ++--- .../SpecialFunctionsPacketMath.h | 23 +- .../arch/CUDA/CudaSpecialFunctions.h | 165 + .../arch/GPU/GpuSpecialFunctions.h | 369 -- .../unsupported/Eigen/src/Splines/Spline.h | 2 +- .../Eigen/src/Splines/SplineFitting.h | 11 +- .../unsupported/Eigen/src/Splines/SplineFwd.h | 2 +- uppsrc/plugin/Eigen/unsupported/README.txt | 2 +- .../Eigen/unsupported/bench/bench_svd.cpp | 123 + .../Eigen/unsupported/doc/CMakeLists.txt | 4 + .../plugin/Eigen/unsupported/doc/Overview.dox | 28 + .../unsupported/doc/eigendoxy_layout.xml.in | 177 + .../unsupported/doc/examples/BVH_Example.cpp | 50 + .../unsupported/doc/examples/CMakeLists.txt | 20 + .../unsupported/doc/examples/EulerAngles.cpp | 46 + .../Eigen/unsupported/doc/examples/FFT.cpp | 118 + .../doc/examples/MatrixExponential.cpp | 16 + .../doc/examples/MatrixFunction.cpp | 23 + .../doc/examples/MatrixLogarithm.cpp | 15 + .../unsupported/doc/examples/MatrixPower.cpp | 16 + .../doc/examples/MatrixPower_optimal.cpp | 17 + .../unsupported/doc/examples/MatrixSine.cpp | 20 + .../unsupported/doc/examples/MatrixSinh.cpp | 20 + .../doc/examples/MatrixSquareRoot.cpp | 16 + .../doc/examples/PolynomialSolver1.cpp | 53 + .../doc/examples/PolynomialUtils1.cpp | 20 + .../unsupported/doc/snippets/CMakeLists.txt | 26 + uppsrc/plugin/Eigen/unsupported/test/BVH.cpp | 222 ++ .../Eigen/unsupported/test/CMakeLists.txt | 263 ++ .../Eigen/unsupported/test/EulerAngles.cpp | 208 + uppsrc/plugin/Eigen/unsupported/test/FFT.cpp | 2 + uppsrc/plugin/Eigen/unsupported/test/FFTW.cpp | 262 ++ .../test/NonLinearOptimization.cpp | 1849 +++++++++ .../Eigen/unsupported/test/NumericalDiff.cpp | 114 + .../Eigen/unsupported/test/alignedvector3.cpp | 84 + .../Eigen/unsupported/test/autodiff.cpp | 387 ++ .../unsupported/test/autodiff_scalar.cpp | 101 + .../unsupported/test/cxx11_eventcount.cpp | 142 + .../Eigen/unsupported/test/cxx11_meta.cpp | 357 ++ .../test/cxx11_non_blocking_thread_pool.cpp | 107 + .../Eigen/unsupported/test/cxx11_runqueue.cpp | 235 ++ .../unsupported/test/cxx11_tensor_argmax.cpp | 294 ++ .../test/cxx11_tensor_argmax_cuda.cu | 251 ++ .../unsupported/test/cxx11_tensor_assign.cpp | 370 ++ .../test/cxx11_tensor_broadcast_sycl.cpp | 74 + .../test/cxx11_tensor_broadcasting.cpp | 194 + .../test/cxx11_tensor_cast_float16_cuda.cu | 79 + .../unsupported/test/cxx11_tensor_casts.cpp | 115 + .../test/cxx11_tensor_chipping.cpp | 425 +++ .../test/cxx11_tensor_comparisons.cpp | 84 + .../test/cxx11_tensor_complex_cuda.cu | 150 + .../cxx11_tensor_complex_cwise_ops_cuda.cu | 94 + .../test/cxx11_tensor_concatenation.cpp | 137 + .../unsupported/test/cxx11_tensor_const.cpp | 62 + .../test/cxx11_tensor_contract_cuda.cu | 213 ++ .../test/cxx11_tensor_contraction.cpp | 545 +++ .../test/cxx11_tensor_convolution.cpp | 149 + .../unsupported/test/cxx11_tensor_cuda.cu | 1284 +++++++ .../test/cxx11_tensor_custom_index.cpp | 100 + .../test/cxx11_tensor_custom_op.cpp | 111 + .../unsupported/test/cxx11_tensor_device.cu | 387 ++ .../test/cxx11_tensor_device_sycl.cpp | 31 + .../test/cxx11_tensor_dimension.cpp | 69 + .../unsupported/test/cxx11_tensor_empty.cpp | 40 + .../unsupported/test/cxx11_tensor_expr.cpp | 314 ++ .../unsupported/test/cxx11_tensor_fft.cpp | 273 ++ .../test/cxx11_tensor_fixed_size.cpp | 261 ++ .../test/cxx11_tensor_forced_eval.cpp | 79 + .../test/cxx11_tensor_forced_eval_sycl.cpp | 70 + .../test/cxx11_tensor_generator.cpp | 91 + .../unsupported/test/cxx11_tensor_ifft.cpp | 154 + .../test/cxx11_tensor_image_patch.cpp | 757 ++++ .../test/cxx11_tensor_index_list.cpp | 386 ++ .../test/cxx11_tensor_inflation.cpp | 81 + .../unsupported/test/cxx11_tensor_intdiv.cpp | 147 + .../unsupported/test/cxx11_tensor_io.cpp | 136 + .../test/cxx11_tensor_layout_swap.cpp | 61 + .../unsupported/test/cxx11_tensor_lvalue.cpp | 42 + .../unsupported/test/cxx11_tensor_map.cpp | 277 ++ .../unsupported/test/cxx11_tensor_math.cpp | 46 + .../test/cxx11_tensor_mixed_indices.cpp | 53 + .../test/cxx11_tensor_morphing.cpp | 485 +++ .../test/cxx11_tensor_notification.cpp | 81 + .../test/cxx11_tensor_of_complex.cpp | 103 + .../test/cxx11_tensor_of_const_values.cpp | 105 + .../test/cxx11_tensor_of_float16_cuda.cu | 491 +++ .../test/cxx11_tensor_of_strings.cpp | 152 + .../unsupported/test/cxx11_tensor_padding.cpp | 93 + .../unsupported/test/cxx11_tensor_patch.cpp | 172 + .../unsupported/test/cxx11_tensor_random.cpp | 78 + .../test/cxx11_tensor_random_cuda.cu | 85 + .../test/cxx11_tensor_reduction.cpp | 508 +++ .../test/cxx11_tensor_reduction_cuda.cu | 154 + .../test/cxx11_tensor_reduction_sycl.cpp | 138 + .../unsupported/test/cxx11_tensor_ref.cpp | 248 ++ .../unsupported/test/cxx11_tensor_reverse.cpp | 190 + .../test/cxx11_tensor_roundings.cpp | 62 + .../unsupported/test/cxx11_tensor_scan.cpp | 110 + .../test/cxx11_tensor_scan_cuda.cu | 76 + .../test/cxx11_tensor_shuffling.cpp | 228 ++ .../unsupported/test/cxx11_tensor_simple.cpp | 327 ++ .../test/cxx11_tensor_striding.cpp | 119 + .../unsupported/test/cxx11_tensor_sugar.cpp | 81 + .../unsupported/test/cxx11_tensor_sycl.cpp | 159 + .../test/cxx11_tensor_symmetry.cpp | 818 ++++ .../test/cxx11_tensor_thread_pool.cpp | 373 ++ .../unsupported/test/cxx11_tensor_uint128.cpp | 160 + .../test/cxx11_tensor_volume_patch.cpp | 112 + .../plugin/Eigen/unsupported/test/dgmres.cpp | 31 + .../Eigen/unsupported/test/forward_adolc.cpp | 141 + .../plugin/Eigen/unsupported/test/gmres.cpp | 31 + .../unsupported/test/kronecker_product.cpp | 252 ++ .../unsupported/test/levenberg_marquardt.cpp | 1477 ++++++++ .../unsupported/test/matrix_exponential.cpp | 141 + .../unsupported/test/matrix_function.cpp | 227 ++ .../Eigen/unsupported/test/matrix_functions.h | 67 + .../Eigen/unsupported/test/matrix_power.cpp | 204 + .../unsupported/test/matrix_square_root.cpp | 31 + .../plugin/Eigen/unsupported/test/minres.cpp | 44 + .../Eigen/unsupported/test/mpreal/mpreal.h | 3104 +++++++++++++++ .../Eigen/unsupported/test/mpreal_support.cpp | 65 + .../Eigen/unsupported/test/openglsupport.cpp | 333 ++ .../unsupported/test/polynomialsolver.cpp | 232 ++ .../unsupported/test/polynomialutils.cpp | 113 + .../Eigen/unsupported/test/sparse_extra.cpp | 147 + .../unsupported/test/special_functions.cpp | 345 ++ .../plugin/Eigen/unsupported/test/splines.cpp | 281 ++ 514 files changed, 44279 insertions(+), 50937 deletions(-) delete mode 100644 uppsrc/plugin/Eigen/Eigen/KLUSupport delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/ArithmeticSequence.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/IndexedView.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/PartialReduxEvaluator.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/Reshaped.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/StlIterators.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/Complex.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h rename uppsrc/plugin/Eigen/Eigen/src/Core/arch/{Default => CUDA}/Half.h (79%) rename uppsrc/plugin/Eigen/Eigen/src/Core/arch/{GPU => CUDA}/MathFunctions.h (82%) create mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMath.h create mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h create mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/TypeCasting.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/PacketMath.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/TypeCasting.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/Complex.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/MathFunctions.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/PacketMath.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/TypeCasting.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/PacketMath.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/util/ConfigureVectorization.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/util/IndexedViewHelper.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/util/IntegralConstant.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/util/ReshapedHelper.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/Core/util/SymbolicIndex.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/KLUSupport/KLUSupport.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/plugins/IndexedViewMethods.h delete mode 100644 uppsrc/plugin/Eigen/Eigen/src/plugins/ReshapedMethods.h create mode 100644 uppsrc/plugin/Eigen/unsupported/CMakeLists.txt create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CMakeLists.txt delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclConvertToDeviceExpression.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExprConstructor.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractAccessor.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclExtractFunctors.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclLeafCount.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclPlaceHolderExpr.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclRun.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorSyclTuple.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/Barrier.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/ThreadPool/ThreadCancel.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/util/EmulateCXX11Meta.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h create mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h delete mode 100644 uppsrc/plugin/Eigen/unsupported/Eigen/src/SpecialFunctions/arch/GPU/GpuSpecialFunctions.h create mode 100644 uppsrc/plugin/Eigen/unsupported/bench/bench_svd.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/CMakeLists.txt create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/Overview.dox create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/eigendoxy_layout.xml.in create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/BVH_Example.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/CMakeLists.txt create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/EulerAngles.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/FFT.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixExponential.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixFunction.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixLogarithm.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixPower.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixPower_optimal.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSine.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSinh.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/MatrixSquareRoot.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/PolynomialSolver1.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/examples/PolynomialUtils1.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/doc/snippets/CMakeLists.txt create mode 100644 uppsrc/plugin/Eigen/unsupported/test/BVH.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/CMakeLists.txt create mode 100644 uppsrc/plugin/Eigen/unsupported/test/EulerAngles.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/FFT.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/FFTW.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/NonLinearOptimization.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/NumericalDiff.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/alignedvector3.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/autodiff.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/autodiff_scalar.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_eventcount.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_meta.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_non_blocking_thread_pool.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_runqueue.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_argmax.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_argmax_cuda.cu create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_assign.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_broadcast_sycl.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_broadcasting.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_cast_float16_cuda.cu create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_casts.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_chipping.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_comparisons.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_complex_cuda.cu create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_complex_cwise_ops_cuda.cu create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_concatenation.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_const.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_contract_cuda.cu create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_contraction.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_convolution.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_cuda.cu create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_custom_index.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_custom_op.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_device.cu create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_device_sycl.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_dimension.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_empty.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_expr.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_fft.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_fixed_size.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_forced_eval.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_generator.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_ifft.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_image_patch.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_index_list.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_inflation.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_intdiv.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_io.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_layout_swap.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_lvalue.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_map.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_math.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_mixed_indices.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_morphing.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_notification.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_complex.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_const_values.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_float16_cuda.cu create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_of_strings.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_padding.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_patch.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_random.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_random_cuda.cu create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction_cuda.cu create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reduction_sycl.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_ref.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_reverse.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_roundings.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_scan.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_scan_cuda.cu create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_shuffling.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_simple.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_striding.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_sugar.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_sycl.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_symmetry.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_thread_pool.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_uint128.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/cxx11_tensor_volume_patch.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/dgmres.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/forward_adolc.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/gmres.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/kronecker_product.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/levenberg_marquardt.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/matrix_exponential.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/matrix_function.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/matrix_functions.h create mode 100644 uppsrc/plugin/Eigen/unsupported/test/matrix_power.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/matrix_square_root.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/minres.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/mpreal/mpreal.h create mode 100644 uppsrc/plugin/Eigen/unsupported/test/mpreal_support.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/openglsupport.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/polynomialsolver.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/polynomialutils.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/sparse_extra.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/special_functions.cpp create mode 100644 uppsrc/plugin/Eigen/unsupported/test/splines.cpp diff --git a/uppsrc/plugin/Eigen/Eigen.h b/uppsrc/plugin/Eigen/Eigen.h index 2c0c1ba91..bc6cc86cf 100644 --- a/uppsrc/plugin/Eigen/Eigen.h +++ b/uppsrc/plugin/Eigen/Eigen.h @@ -9,6 +9,8 @@ #ifndef _DEBUG #define EIGEN_NO_DEBUG +#else +#define EIGEN_INITIALIZE_MATRICES_BY_NAN #endif #define eigen_assert(x) ASSERT(x) diff --git a/uppsrc/plugin/Eigen/Eigen/Core b/uppsrc/plugin/Eigen/Eigen/Core index 688361d46..ac7c5b300 100644 --- a/uppsrc/plugin/Eigen/Eigen/Core +++ b/uppsrc/plugin/Eigen/Eigen/Core @@ -14,26 +14,79 @@ // first thing Eigen does: stop the compiler from committing suicide #include "src/Core/util/DisableStupidWarnings.h" -// then include this file where all our macros are defined. It's really important to do it first because -// it's where we do all the compiler/OS/arch detections and define most defaults. -#include "src/Core/util/Macros.h" - -// This detects SSE/AVX/NEON/etc. and configure alignment settings -#include "src/Core/util/ConfigureVectorization.h" - -// We need cuda_runtime.h/hip_runtime.h to ensure that -// the EIGEN_USING_STD_MATH macro works properly on the device side -#if defined(EIGEN_CUDACC) - #include -#elif defined(EIGEN_HIPCC) - #include +#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) + #define EIGEN_CUDACC __CUDACC__ #endif +#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) + #define EIGEN_CUDA_ARCH __CUDA_ARCH__ +#endif + +#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) +#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) +#elif defined(__CUDACC_VER__) +#define EIGEN_CUDACC_VER __CUDACC_VER__ +#else +#define EIGEN_CUDACC_VER 0 +#endif + +// Handle NVCC/CUDA/SYCL +#if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__) + // Do not try asserts on CUDA and SYCL! + #ifndef EIGEN_NO_DEBUG + #define EIGEN_NO_DEBUG + #endif + + #ifdef EIGEN_INTERNAL_DEBUGGING + #undef EIGEN_INTERNAL_DEBUGGING + #endif + + #ifdef EIGEN_EXCEPTIONS + #undef EIGEN_EXCEPTIONS + #endif + + // All functions callable from CUDA code must be qualified with __device__ + #ifdef __CUDACC__ + // Do not try to vectorize on CUDA and SYCL! + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif + + #define EIGEN_DEVICE_FUNC __host__ __device__ + // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro + // works properly on the device side + #include + #else + #define EIGEN_DEVICE_FUNC + #endif + +#else + #define EIGEN_DEVICE_FUNC + +#endif + +// When compiling CUDA device code with NVCC, pull in math functions from the +// global namespace. In host mode, and when device doee with clang, use the +// std versions. +#if defined(__CUDA_ARCH__) && defined(__NVCC__) + #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; +#else + #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; +#endif + +#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(__CUDA_ARCH__) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) + #define EIGEN_EXCEPTIONS +#endif #ifdef EIGEN_EXCEPTIONS #include #endif +// then include this file where all our macros are defined. It's really important to do it first because +// it's where we do all the alignment settings (platform detection and honoring the user's will if he +// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization. +#include "src/Core/util/Macros.h" + // Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3) // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details. #if EIGEN_COMP_MINGW && EIGEN_GNUC_AT_LEAST(4,6) @@ -46,9 +99,163 @@ // and inclusion of their respective header files #include "src/Core/util/MKL_support.h" +// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into +// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks +#if EIGEN_MAX_ALIGN_BYTES==0 + #ifndef EIGEN_DONT_VECTORIZE + #define EIGEN_DONT_VECTORIZE + #endif +#endif -#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) - #define EIGEN_HAS_GPU_FP16 +#if EIGEN_COMP_MSVC + #include // for _aligned_malloc -- need it regardless of whether vectorization is enabled + #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later + // Remember that usage of defined() in a #define is undefined by the standard. + // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. + #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 + #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER + #endif + #endif +#else + // Remember that usage of defined() in a #define is undefined by the standard + #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) + #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC + #endif +#endif + +#ifndef EIGEN_DONT_VECTORIZE + + #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) + + // Defines symbols for compile-time detection of which instructions are + // used. + // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_SSE + #define EIGEN_VECTORIZE_SSE2 + + // Detect sse3/ssse3/sse4: + // gcc and icc defines __SSE3__, ... + // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you + // want to force the use of those instructions with msvc. + #ifdef __SSE3__ + #define EIGEN_VECTORIZE_SSE3 + #endif + #ifdef __SSSE3__ + #define EIGEN_VECTORIZE_SSSE3 + #endif + #ifdef __SSE4_1__ + #define EIGEN_VECTORIZE_SSE4_1 + #endif + #ifdef __SSE4_2__ + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #ifdef __AVX__ + #define EIGEN_VECTORIZE_AVX + #define EIGEN_VECTORIZE_SSE3 + #define EIGEN_VECTORIZE_SSSE3 + #define EIGEN_VECTORIZE_SSE4_1 + #define EIGEN_VECTORIZE_SSE4_2 + #endif + #ifdef __AVX2__ + #define EIGEN_VECTORIZE_AVX2 + #endif + #ifdef __FMA__ + #define EIGEN_VECTORIZE_FMA + #endif + #if defined(__AVX512F__) && defined(EIGEN_ENABLE_AVX512) + #define EIGEN_VECTORIZE_AVX512 + #define EIGEN_VECTORIZE_AVX2 + #define EIGEN_VECTORIZE_AVX + #define EIGEN_VECTORIZE_FMA + #ifdef __AVX512DQ__ + #define EIGEN_VECTORIZE_AVX512DQ + #endif + #ifdef __AVX512ER__ + #define EIGEN_VECTORIZE_AVX512ER + #endif + #endif + + // include files + + // This extern "C" works around a MINGW-w64 compilation issue + // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354 + // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do). + // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations + // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know; + // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too. + // notice that since these are C headers, the extern "C" is theoretically needed anyways. + extern "C" { + // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. + // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus: + #if EIGEN_COMP_ICC >= 1110 + #include + #else + #include + #include + #include + #ifdef EIGEN_VECTORIZE_SSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSSE3 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_1 + #include + #endif + #ifdef EIGEN_VECTORIZE_SSE4_2 + #include + #endif + #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512) + #include + #endif + #endif + } // end extern "C" + #elif defined __VSX__ + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_VSX + #include + // We need to #undef all these ugly tokens defined in + // => use __vector instead of vector + #undef bool + #undef vector + #undef pixel + #elif defined __ALTIVEC__ + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_ALTIVEC + #include + // We need to #undef all these ugly tokens defined in + // => use __vector instead of vector + #undef bool + #undef vector + #undef pixel + #elif (defined __ARM_NEON) || (defined __ARM_NEON__) + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_NEON + #include + #elif (defined __s390x__ && defined __VEC__) + #define EIGEN_VECTORIZE + #define EIGEN_VECTORIZE_ZVECTOR + #include + #endif +#endif + +#if defined(__F16C__) && !defined(EIGEN_COMP_CLANG) + // We can use the optimized fp16 to float and float to fp16 conversion routines + #define EIGEN_HAS_FP16_C +#endif + +#if defined __CUDACC__ + #define EIGEN_VECTORIZE_CUDA + #include + #if EIGEN_CUDACC_VER >= 70500 + #define EIGEN_HAS_CUDA_FP16 + #endif +#endif + +#if defined EIGEN_HAS_CUDA_FP16 + #include + #include #endif #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE) @@ -83,10 +290,6 @@ // for min/max: #include -#if EIGEN_HAS_CXX11 -#include -#endif - // for std::is_nothrow_move_assignable #ifdef EIGEN_INCLUDE_TYPE_TRAITS #include @@ -102,25 +305,38 @@ #include #endif -#if defined(EIGEN_USE_SYCL) - #undef min - #undef max - #undef isnan - #undef isinf - #undef isfinite - #include - #include - #include - #include - #include - #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0 - #define EIGEN_SYCL_LOCAL_THREAD_DIM0 16 - #endif - #ifndef EIGEN_SYCL_LOCAL_THREAD_DIM1 - #define EIGEN_SYCL_LOCAL_THREAD_DIM1 16 - #endif -#endif +/** \brief Namespace containing all symbols from the %Eigen library. */ +namespace Eigen { +inline static const char *SimdInstructionSetsInUse(void) { +#if defined(EIGEN_VECTORIZE_AVX512) + return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_AVX) + return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_2) + return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; +#elif defined(EIGEN_VECTORIZE_SSE4_1) + return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; +#elif defined(EIGEN_VECTORIZE_SSSE3) + return "SSE, SSE2, SSE3, SSSE3"; +#elif defined(EIGEN_VECTORIZE_SSE3) + return "SSE, SSE2, SSE3"; +#elif defined(EIGEN_VECTORIZE_SSE2) + return "SSE, SSE2"; +#elif defined(EIGEN_VECTORIZE_ALTIVEC) + return "AltiVec"; +#elif defined(EIGEN_VECTORIZE_VSX) + return "VSX"; +#elif defined(EIGEN_VECTORIZE_NEON) + return "ARM NEON"; +#elif defined(EIGEN_VECTORIZE_ZVECTOR) + return "S390X ZVECTOR"; +#else + return "None"; +#endif +} + +} // end namespace Eigen #if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API || defined EIGEN2_SUPPORT // This will generate an error message: @@ -129,7 +345,7 @@ namespace Eigen { -// we use size_t frequently and we'll never remember to prepend it with std:: every time just to +// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to // ensure QNX/QCC support using std::size_t; // gcc 4.6.0 wants std:: for ptrdiff_t @@ -153,85 +369,60 @@ using std::ptrdiff_t; #include "src/Core/util/StaticAssert.h" #include "src/Core/util/XprHelper.h" #include "src/Core/util/Memory.h" -#include "src/Core/util/IntegralConstant.h" -#include "src/Core/util/SymbolicIndex.h" #include "src/Core/NumTraits.h" #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" #include "src/Core/MathFunctionsImpl.h" #include "src/Core/arch/Default/ConjHelper.h" -// Generic half float support -#include "src/Core/arch/Default/Half.h" -#include "src/Core/arch/Default/TypeCasting.h" -#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h" #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" - #include "src/Core/arch/SSE/TypeCasting.h" - #include "src/Core/arch/SSE/Complex.h" - #include "src/Core/arch/AVX/PacketMath.h" - #include "src/Core/arch/AVX/TypeCasting.h" - #include "src/Core/arch/AVX/Complex.h" - #include "src/Core/arch/AVX512/PacketMath.h" - #include "src/Core/arch/AVX512/TypeCasting.h" - #include "src/Core/arch/AVX512/Complex.h" #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/AVX/PacketMath.h" #include "src/Core/arch/AVX/MathFunctions.h" + #include "src/Core/arch/AVX512/PacketMath.h" #include "src/Core/arch/AVX512/MathFunctions.h" #elif defined EIGEN_VECTORIZE_AVX // Use AVX for floats and doubles, SSE for integers #include "src/Core/arch/SSE/PacketMath.h" - #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/SSE/Complex.h" - #include "src/Core/arch/AVX/PacketMath.h" - #include "src/Core/arch/AVX/TypeCasting.h" - #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/SSE/MathFunctions.h" + #include "src/Core/arch/AVX/PacketMath.h" #include "src/Core/arch/AVX/MathFunctions.h" + #include "src/Core/arch/AVX/Complex.h" + #include "src/Core/arch/AVX/TypeCasting.h" + #include "src/Core/arch/SSE/TypeCasting.h" #elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" - #include "src/Core/arch/SSE/TypeCasting.h" #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/SSE/Complex.h" + #include "src/Core/arch/SSE/TypeCasting.h" #elif defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX) #include "src/Core/arch/AltiVec/PacketMath.h" #include "src/Core/arch/AltiVec/MathFunctions.h" #include "src/Core/arch/AltiVec/Complex.h" #elif defined EIGEN_VECTORIZE_NEON #include "src/Core/arch/NEON/PacketMath.h" - #include "src/Core/arch/NEON/TypeCasting.h" #include "src/Core/arch/NEON/MathFunctions.h" #include "src/Core/arch/NEON/Complex.h" #elif defined EIGEN_VECTORIZE_ZVECTOR #include "src/Core/arch/ZVector/PacketMath.h" #include "src/Core/arch/ZVector/MathFunctions.h" #include "src/Core/arch/ZVector/Complex.h" -#elif defined EIGEN_VECTORIZE_MSA - #include "src/Core/arch/MSA/PacketMath.h" - #include "src/Core/arch/MSA/MathFunctions.h" - #include "src/Core/arch/MSA/Complex.h" #endif -#if defined EIGEN_VECTORIZE_GPU - #include "src/Core/arch/GPU/PacketMath.h" - #include "src/Core/arch/GPU/MathFunctions.h" - #include "src/Core/arch/GPU/TypeCasting.h" -#endif +// Half float support +#include "src/Core/arch/CUDA/Half.h" +#include "src/Core/arch/CUDA/PacketMathHalf.h" +#include "src/Core/arch/CUDA/TypeCasting.h" -#if defined(EIGEN_USE_SYCL) - #include "src/Core/arch/SYCL/SyclMemoryModel.h" - #include "src/Core/arch/SYCL/InteropHeaders.h" -#if !defined(EIGEN_DONT_VECTORIZE_SYCL) - #include "src/Core/arch/SYCL/PacketMath.h" - #include "src/Core/arch/SYCL/MathFunctions.h" - #include "src/Core/arch/SYCL/TypeCasting.h" -#endif +#if defined EIGEN_VECTORIZE_CUDA + #include "src/Core/arch/CUDA/PacketMath.h" + #include "src/Core/arch/CUDA/MathFunctions.h" #endif #include "src/Core/arch/Default/Settings.h" -// This file provides generic implementations valid for scalar as well -#include "src/Core/arch/Default/GenericPacketMathFunctions.h" #include "src/Core/functors/TernaryFunctors.h" #include "src/Core/functors/BinaryFunctors.h" @@ -242,16 +433,9 @@ using std::ptrdiff_t; // Specialized functors to enable the processing of complex numbers // on CUDA devices -#ifdef EIGEN_CUDACC #include "src/Core/arch/CUDA/Complex.h" -#endif -#include "src/Core/util/IndexedViewHelper.h" -#include "src/Core/util/ReshapedHelper.h" -#include "src/Core/ArithmeticSequence.h" -#ifndef EIGEN_NO_IO - #include "src/Core/IO.h" -#endif +#include "src/Core/IO.h" #include "src/Core/DenseCoeffsBase.h" #include "src/Core/DenseBase.h" #include "src/Core/MatrixBase.h" @@ -292,8 +476,6 @@ using std::ptrdiff_t; #include "src/Core/Ref.h" #include "src/Core/Block.h" #include "src/Core/VectorBlock.h" -#include "src/Core/IndexedView.h" -#include "src/Core/Reshaped.h" #include "src/Core/Transpose.h" #include "src/Core/DiagonalMatrix.h" #include "src/Core/Diagonal.h" @@ -333,12 +515,10 @@ using std::ptrdiff_t; #include "src/Core/BooleanRedux.h" #include "src/Core/Select.h" #include "src/Core/VectorwiseOp.h" -#include "src/Core/PartialReduxEvaluator.h" #include "src/Core/Random.h" #include "src/Core/Replicate.h" #include "src/Core/Reverse.h" #include "src/Core/ArrayWrapper.h" -#include "src/Core/StlIterators.h" #ifdef EIGEN_USE_BLAS #include "src/Core/products/GeneralMatrixMatrix_BLAS.h" diff --git a/uppsrc/plugin/Eigen/Eigen/Geometry b/uppsrc/plugin/Eigen/Eigen/Geometry index 16b4bd6e1..da88c03bb 100644 --- a/uppsrc/plugin/Eigen/Eigen/Geometry +++ b/uppsrc/plugin/Eigen/Eigen/Geometry @@ -49,8 +49,9 @@ #include "src/Geometry/AlignedBox.h" #include "src/Geometry/Umeyama.h" -// Use the SSE optimized version whenever possible. -#if defined EIGEN_VECTORIZE_SSE +// Use the SSE optimized version whenever possible. At the moment the +// SSE version doesn't compile when AVX is enabled +#if defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX #include "src/Geometry/arch/Geometry_SSE.h" #endif @@ -58,3 +59,4 @@ #endif // EIGEN_GEOMETRY_MODULE_H /* vim: set filetype=cpp et sw=2 ts=2 ai: */ + diff --git a/uppsrc/plugin/Eigen/Eigen/KLUSupport b/uppsrc/plugin/Eigen/Eigen/KLUSupport deleted file mode 100644 index b23d90535..000000000 --- a/uppsrc/plugin/Eigen/Eigen/KLUSupport +++ /dev/null @@ -1,41 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_KLUSUPPORT_MODULE_H -#define EIGEN_KLUSUPPORT_MODULE_H - -#include - -#include - -extern "C" { -#include -#include - } - -/** \ingroup Support_modules - * \defgroup KLUSupport_Module KLUSupport module - * - * This module provides an interface to the KLU library which is part of the suitesparse package. - * It provides the following factorization class: - * - class KLU: a sparse LU factorization, well-suited for circuit simulation. - * - * \code - * #include - * \endcode - * - * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must be linked to the klu library and its dependencies. - * The dependencies depend on how umfpack has been compiled. - * For a cmake based project, you can use our FindKLU.cmake module to help you in this task. - * - */ - -#include "src/KLUSupport/KLUSupport.h" - -#include - -#endif // EIGEN_KLUSUPPORT_MODULE_H diff --git a/uppsrc/plugin/Eigen/Eigen/OrderingMethods b/uppsrc/plugin/Eigen/Eigen/OrderingMethods index 29691a62b..d8ea36193 100644 --- a/uppsrc/plugin/Eigen/Eigen/OrderingMethods +++ b/uppsrc/plugin/Eigen/Eigen/OrderingMethods @@ -63,7 +63,10 @@ * \endcode */ +#ifndef EIGEN_MPL2_ONLY #include "src/OrderingMethods/Amd.h" +#endif + #include "src/OrderingMethods/Ordering.h" #include "src/Core/util/ReenableStupidWarnings.h" diff --git a/uppsrc/plugin/Eigen/Eigen/PaStiXSupport b/uppsrc/plugin/Eigen/Eigen/PaStiXSupport index 234619acc..de3a63b4d 100644 --- a/uppsrc/plugin/Eigen/Eigen/PaStiXSupport +++ b/uppsrc/plugin/Eigen/Eigen/PaStiXSupport @@ -36,7 +36,6 @@ extern "C" { * \endcode * * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies. - * This wrapper resuires PaStiX version 5.x compiled without MPI support. * The dependencies depend on how PaSTiX has been compiled. * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task. * diff --git a/uppsrc/plugin/Eigen/Eigen/Sparse b/uppsrc/plugin/Eigen/Eigen/Sparse index a2ef7a665..136e681a1 100644 --- a/uppsrc/plugin/Eigen/Eigen/Sparse +++ b/uppsrc/plugin/Eigen/Eigen/Sparse @@ -25,7 +25,9 @@ #include "SparseCore" #include "OrderingMethods" +#ifndef EIGEN_MPL2_ONLY #include "SparseCholesky" +#endif #include "SparseLU" #include "SparseQR" #include "IterativeLinearSolvers" diff --git a/uppsrc/plugin/Eigen/Eigen/SparseCholesky b/uppsrc/plugin/Eigen/Eigen/SparseCholesky index d2b1f1276..b6a320c40 100644 --- a/uppsrc/plugin/Eigen/Eigen/SparseCholesky +++ b/uppsrc/plugin/Eigen/Eigen/SparseCholesky @@ -30,8 +30,16 @@ * \endcode */ +#ifdef EIGEN_MPL2_ONLY +#error The SparseCholesky module has nothing to offer in MPL2 only mode +#endif + #include "src/SparseCholesky/SimplicialCholesky.h" + +#ifndef EIGEN_MPL2_ONLY #include "src/SparseCholesky/SimplicialCholesky_impl.h" +#endif + #include "src/Core/util/ReenableStupidWarnings.h" #endif // EIGEN_SPARSECHOLESKY_MODULE_H diff --git a/uppsrc/plugin/Eigen/Eigen/SparseLU b/uppsrc/plugin/Eigen/Eigen/SparseLU index 37c4a5c5a..38b38b531 100644 --- a/uppsrc/plugin/Eigen/Eigen/SparseLU +++ b/uppsrc/plugin/Eigen/Eigen/SparseLU @@ -23,8 +23,6 @@ // Ordering interface #include "OrderingMethods" -#include "src/Core/util/DisableStupidWarnings.h" - #include "src/SparseLU/SparseLU_gemm_kernel.h" #include "src/SparseLU/SparseLU_Structs.h" @@ -45,6 +43,4 @@ #include "src/SparseLU/SparseLU_Utils.h" #include "src/SparseLU/SparseLU.h" -#include "src/Core/util/ReenableStupidWarnings.h" - #endif // EIGEN_SPARSELU_MODULE_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LDLT.h b/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LDLT.h index 67e97ffb8..15ccf24f1 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LDLT.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LDLT.h @@ -16,15 +16,6 @@ namespace Eigen { namespace internal { - template struct traits > - : traits<_MatrixType> - { - typedef MatrixXpr XprKind; - typedef SolverStorage StorageKind; - typedef int StorageIndex; - enum { Flags = 0 }; - }; - template struct LDLT_Traits; // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef @@ -57,19 +48,20 @@ namespace internal { * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT */ template class LDLT - : public SolverBase > { public: typedef _MatrixType MatrixType; - typedef SolverBase Base; - friend class SolverBase; - - EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT) enum { + RowsAtCompileTime = MatrixType::RowsAtCompileTime, + ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, UpLo = _UpLo }; + typedef typename MatrixType::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 + typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix TmpMatrixType; typedef Transpositions TranspositionType; @@ -188,7 +180,6 @@ template class LDLT return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign; } - #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A. * * This function also supports in-place solves using the syntax x = decompositionObject.solve(x) . @@ -206,8 +197,13 @@ template class LDLT */ template inline const Solve - solve(const MatrixBase& b) const; - #endif + solve(const MatrixBase& b) const + { + eigen_assert(m_isInitialized && "LDLT is not initialized."); + eigen_assert(m_matrix.rows()==b.rows() + && "LDLT::solve(): invalid number of rows of the right hand side matrix b"); + return Solve(*this, b.derived()); + } template bool solveInPlace(MatrixBase &bAndX) const; @@ -251,7 +247,7 @@ template class LDLT /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, + * \returns \c Success if computation was succesful, * \c NumericalIssue if the factorization failed because of a zero pivot. */ ComputationInfo info() const @@ -262,10 +258,8 @@ template class LDLT #ifndef EIGEN_PARSED_BY_DOXYGEN template + EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; - - template - void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -566,22 +560,14 @@ template template void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - _solve_impl_transposed(rhs, dst); -} - -template -template -void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const -{ + eigen_assert(rhs.rows() == rows()); // dst = P b dst = m_transpositions * rhs; // dst = L^-1 (P b) - // dst = L^-*T (P b) - matrixL().template conjugateIf().solveInPlace(dst); + matrixL().solveInPlace(dst); - // dst = D^-* (L^-1 P b) - // dst = D^-1 (L^-*T P b) + // dst = D^-1 (L^-1 P b) // more precisely, use pseudo-inverse of D (see bug 241) using std::abs; const typename Diagonal::RealReturnType vecD(vectorD()); @@ -593,6 +579,7 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. // Using numeric_limits::min() gives us more robustness to denormals. RealScalar tolerance = (std::numeric_limits::min)(); + for (Index i = 0; i < vecD.size(); ++i) { if(abs(vecD(i)) > tolerance) @@ -601,12 +588,10 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType dst.row(i).setZero(); } - // dst = L^-* (D^-* L^-1 P b) - // dst = L^-T (D^-1 L^-*T P b) - matrixL().transpose().template conjugateIf().solveInPlace(dst); + // dst = L^-T (D^-1 L^-1 P b) + matrixU().solveInPlace(dst); - // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b - // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b + // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b dst = m_transpositions.transpose() * dst; } #endif diff --git a/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LLT.h b/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LLT.h index 5876966e6..e1624d21b 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LLT.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Cholesky/LLT.h @@ -13,16 +13,6 @@ namespace Eigen { namespace internal{ - -template struct traits > - : traits<_MatrixType> -{ - typedef MatrixXpr XprKind; - typedef SolverStorage StorageKind; - typedef int StorageIndex; - enum { Flags = 0 }; -}; - template struct LLT_Traits; } @@ -64,17 +54,18 @@ template struct LLT_Traits; * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ template class LLT - : public SolverBase > { public: typedef _MatrixType MatrixType; - typedef SolverBase Base; - friend class SolverBase; - - EIGEN_GENERIC_PUBLIC_INTERFACE(LLT) enum { + RowsAtCompileTime = MatrixType::RowsAtCompileTime, + ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; + typedef typename MatrixType::Scalar Scalar; + typedef typename NumTraits::Real RealScalar; + typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 + typedef typename MatrixType::StorageIndex StorageIndex; enum { PacketSize = internal::packet_traits::size, @@ -109,7 +100,7 @@ template class LLT compute(matrix.derived()); } - /** \brief Constructs a LLT factorization from a given matrix + /** \brief Constructs a LDLT factorization from a given matrix * * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when * \c MatrixType is a Eigen::Ref. @@ -138,7 +129,6 @@ template class LLT return Traits::getL(m_matrix); } - #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A. * * Since this LLT class assumes anyway that the matrix A is invertible, the solution @@ -151,8 +141,13 @@ template class LLT */ template inline const Solve - solve(const MatrixBase& b) const; - #endif + solve(const MatrixBase& b) const + { + eigen_assert(m_isInitialized && "LLT is not initialized."); + eigen_assert(m_matrix.rows()==b.rows() + && "LLT::solve(): invalid number of rows of the right hand side matrix b"); + return Solve(*this, b.derived()); + } template void solveInPlace(const MatrixBase &bAndX) const; @@ -185,7 +180,7 @@ template class LLT /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, + * \returns \c Success if computation was succesful, * \c NumericalIssue if the matrix.appears not to be positive definite. */ ComputationInfo info() const @@ -205,14 +200,12 @@ template class LLT inline Index cols() const { return m_matrix.cols(); } template - LLT & rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); + LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1); #ifndef EIGEN_PARSED_BY_DOXYGEN template + EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; - - template - void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -466,7 +459,7 @@ LLT& LLT::compute(const EigenBase */ template template -LLT<_MatrixType,_UpLo> & LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) +LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType); eigen_assert(v.size()==m_matrix.cols()); @@ -484,17 +477,8 @@ template template void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const { - _solve_impl_transposed(rhs, dst); -} - -template -template -void LLT<_MatrixType,_UpLo>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const -{ - dst = rhs; - - matrixL().template conjugateIf().solveInPlace(dst); - matrixU().template conjugateIf().solveInPlace(dst); + dst = rhs; + solveInPlace(dst); } #endif diff --git a/uppsrc/plugin/Eigen/Eigen/src/CholmodSupport/CholmodSupport.h b/uppsrc/plugin/Eigen/Eigen/src/CholmodSupport/CholmodSupport.h index adaf52858..571972023 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/CholmodSupport/CholmodSupport.h +++ b/uppsrc/plugin/Eigen/Eigen/src/CholmodSupport/CholmodSupport.h @@ -10,7 +10,7 @@ #ifndef EIGEN_CHOLMODSUPPORT_H #define EIGEN_CHOLMODSUPPORT_H -namespace Eigen { +namespace Eigen { namespace internal { @@ -32,7 +32,7 @@ template<> struct cholmod_configure_matrix > { } }; -// Other scalar types are not yet supported by Cholmod +// Other scalar types are not yet suppotred by Cholmod // template<> struct cholmod_configure_matrix { // template // static void run(CholmodType& mat) { @@ -79,12 +79,12 @@ cholmod_sparse viewAsCholmod(Ref > res.dtype = 0; res.stype = -1; - + if (internal::is_same<_StorageIndex,int>::value) { res.itype = CHOLMOD_INT; } - else if (internal::is_same<_StorageIndex,SuiteSparse_long>::value) + else if (internal::is_same<_StorageIndex,long>::value) { res.itype = CHOLMOD_LONG; } @@ -95,9 +95,9 @@ cholmod_sparse viewAsCholmod(Ref > // setup res.xtype internal::cholmod_configure_matrix<_Scalar>::run(res); - + res.stype = 0; - + return res; } @@ -121,12 +121,9 @@ template cholmod_sparse viewAsCholmod(const SparseSelfAdjointView, UpLo>& mat) { cholmod_sparse res = viewAsCholmod(Ref >(mat.matrix().const_cast_derived())); - + if(UpLo==Upper) res.stype = 1; if(UpLo==Lower) res.stype = -1; - // swap stype for rowmajor matrices (only works for real matrices) - EIGEN_STATIC_ASSERT((_Options & RowMajorBit) == 0 || NumTraits<_Scalar>::IsComplex == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); - if(_Options & RowMajorBit) res.stype *=-1; return res; } @@ -162,44 +159,6 @@ MappedSparseMatrix viewAsEigen(cholmod_sparse& cm) static_cast(cm.p), static_cast(cm.i),static_cast(cm.x) ); } -namespace internal { - -// template specializations for int and long that call the correct cholmod method - -#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \ - template inline ret cm_ ## name (cholmod_common &Common) { return cholmod_ ## name (&Common); } \ - template<> inline ret cm_ ## name (cholmod_common &Common) { return cholmod_l_ ## name (&Common); } - -#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \ - template inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_ ## name (&a1, &Common); } \ - template<> inline ret cm_ ## name (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); } - -EIGEN_CHOLMOD_SPECIALIZE0(int, start) -EIGEN_CHOLMOD_SPECIALIZE0(int, finish) - -EIGEN_CHOLMOD_SPECIALIZE1(int, free_factor, cholmod_factor*, L) -EIGEN_CHOLMOD_SPECIALIZE1(int, free_dense, cholmod_dense*, X) -EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A) - -EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A) - -template inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_solve (sys, &L, &B, &Common); } -template<> inline cholmod_dense* cm_solve (int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common &Common) { return cholmod_l_solve (sys, &L, &B, &Common); } - -template inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve (sys, &L, &B, &Common); } -template<> inline cholmod_sparse* cm_spsolve (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); } - -template -inline int cm_factorize_p (cholmod_sparse* A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p (A, beta, fset, fsize, L, &Common); } -template<> -inline int cm_factorize_p (cholmod_sparse* A, double beta[2], SuiteSparse_long* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); } - -#undef EIGEN_CHOLMOD_SPECIALIZE0 -#undef EIGEN_CHOLMOD_SPECIALIZE1 - -} // namespace internal - - enum CholmodMode { CholmodAuto, CholmodSimplicialLLt, CholmodSupernodalLLt, CholmodLDLt }; @@ -236,7 +195,7 @@ class CholmodBase : public SparseSolverBase { EIGEN_STATIC_ASSERT((internal::is_same::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY); m_shiftOffset[0] = m_shiftOffset[1] = 0.0; - internal::cm_start(m_cholmod); + cholmod_start(&m_cholmod); } explicit CholmodBase(const MatrixType& matrix) @@ -244,23 +203,23 @@ class CholmodBase : public SparseSolverBase { EIGEN_STATIC_ASSERT((internal::is_same::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY); m_shiftOffset[0] = m_shiftOffset[1] = 0.0; - internal::cm_start(m_cholmod); + cholmod_start(&m_cholmod); compute(matrix); } ~CholmodBase() { if(m_cholmodFactor) - internal::cm_free_factor(m_cholmodFactor, m_cholmod); - internal::cm_finish(m_cholmod); + cholmod_free_factor(&m_cholmodFactor, &m_cholmod); + cholmod_finish(&m_cholmod); } - + inline StorageIndex cols() const { return internal::convert_index(m_cholmodFactor->n); } inline StorageIndex rows() const { return internal::convert_index(m_cholmodFactor->n); } - + /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, + * \returns \c Success if computation was succesful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const @@ -276,29 +235,29 @@ class CholmodBase : public SparseSolverBase factorize(matrix); return derived(); } - + /** Performs a symbolic decomposition on the sparsity pattern of \a matrix. * * This function is particularly useful when solving for several problems having the same structure. - * + * * \sa factorize() */ void analyzePattern(const MatrixType& matrix) { if(m_cholmodFactor) { - internal::cm_free_factor(m_cholmodFactor, m_cholmod); + cholmod_free_factor(&m_cholmodFactor, &m_cholmod); m_cholmodFactor = 0; } cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView()); - m_cholmodFactor = internal::cm_analyze(A, m_cholmod); - + m_cholmodFactor = cholmod_analyze(&A, &m_cholmod); + this->m_isInitialized = true; this->m_info = Success; m_analysisIsOk = true; m_factorizationIsOk = false; } - + /** Performs a numeric decomposition of \a matrix * * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed. @@ -309,17 +268,17 @@ class CholmodBase : public SparseSolverBase { eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView()); - internal::cm_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, m_cholmod); + cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod); // If the factorization failed, minor is the column at which it did. On success minor == n. this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue); m_factorizationIsOk = true; } - + /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations. * See the Cholmod user guide for details. */ cholmod_common& cholmod() { return m_cholmod; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal */ template @@ -329,23 +288,22 @@ class CholmodBase : public SparseSolverBase const Index size = m_cholmodFactor->n; EIGEN_UNUSED_VARIABLE(size); eigen_assert(size==b.rows()); - - // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref. + + // Cholmod needs column-major stoarge without inner-stride, which corresponds to the default behavior of Ref. Ref > b_ref(b.derived()); cholmod_dense b_cd = viewAsCholmod(b_ref); - cholmod_dense* x_cd = internal::cm_solve(CHOLMOD_A, *m_cholmodFactor, b_cd, m_cholmod); + cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod); if(!x_cd) { this->m_info = NumericalIssue; return; } // TODO optimize this copy by swapping when possible (be careful with alignment, etc.) - // NOTE Actually, the copy can be avoided by calling cholmod_solve2 instead of cholmod_solve dest = Matrix::Map(reinterpret_cast(x_cd->x),b.rows(),b.cols()); - internal::cm_free_dense(x_cd, m_cholmod); + cholmod_free_dense(&x_cd, &m_cholmod); } - + /** \internal */ template void _solve_impl(const SparseMatrixBase &b, SparseMatrixBase &dest) const @@ -358,20 +316,19 @@ class CholmodBase : public SparseSolverBase // note: cs stands for Cholmod Sparse Ref > b_ref(b.const_cast_derived()); cholmod_sparse b_cs = viewAsCholmod(b_ref); - cholmod_sparse* x_cs = internal::cm_spsolve(CHOLMOD_A, *m_cholmodFactor, b_cs, m_cholmod); + cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod); if(!x_cs) { this->m_info = NumericalIssue; return; } // TODO optimize this copy by swapping when possible (be careful with alignment, etc.) - // NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's sparse solver) dest.derived() = viewAsEigen(*x_cs); - internal::cm_free_sparse(x_cs, m_cholmod); + cholmod_free_sparse(&x_cs, &m_cholmod); } #endif // EIGEN_PARSED_BY_DOXYGEN - - + + /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization. * * During the numerical factorization, an offset term is added to the diagonal coefficients:\n @@ -386,7 +343,7 @@ class CholmodBase : public SparseSolverBase m_shiftOffset[0] = double(offset); return derived(); } - + /** \returns the determinant of the underlying matrix from the current factorization */ Scalar determinant() const { @@ -441,7 +398,7 @@ class CholmodBase : public SparseSolverBase template void dumpMemory(Stream& /*s*/) {} - + protected: mutable cholmod_common m_cholmod; cholmod_factor* m_cholmodFactor; @@ -478,11 +435,11 @@ class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimpl { typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSimplicialLLT() : Base() { init(); } CholmodSimplicialLLT(const MatrixType& matrix) : Base() @@ -529,11 +486,11 @@ class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimp { typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSimplicialLDLT() : Base() { init(); } CholmodSimplicialLDLT(const MatrixType& matrix) : Base() @@ -578,11 +535,11 @@ class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSuper { typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodSupernodalLLT() : Base() { init(); } CholmodSupernodalLLT(const MatrixType& matrix) : Base() @@ -629,11 +586,11 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom { typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base; using Base::m_cholmod; - + public: - + typedef _MatrixType MatrixType; - + CholmodDecomposition() : Base() { init(); } CholmodDecomposition(const MatrixType& matrix) : Base() @@ -643,7 +600,7 @@ class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecom } ~CholmodDecomposition() {} - + void setMode(CholmodMode mode) { switch(mode) diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/ArithmeticSequence.h b/uppsrc/plugin/Eigen/Eigen/src/Core/ArithmeticSequence.h deleted file mode 100644 index b6200fac1..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/ArithmeticSequence.h +++ /dev/null @@ -1,413 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2017 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_ARITHMETIC_SEQUENCE_H -#define EIGEN_ARITHMETIC_SEQUENCE_H - -namespace Eigen { - -namespace internal { - -#if (!EIGEN_HAS_CXX11) || !((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) -template struct aseq_negate {}; - -template<> struct aseq_negate { - typedef Index type; -}; - -template struct aseq_negate > { - typedef FixedInt<-N> type; -}; - -// Compilation error in the following case: -template<> struct aseq_negate > {}; - -template::value, - bool SizeIsSymbolic =symbolic::is_symbolic::value> -struct aseq_reverse_first_type { - typedef Index type; -}; - -template -struct aseq_reverse_first_type { - typedef symbolic::AddExpr > >, - symbolic::ValueExpr > - > type; -}; - -template -struct aseq_reverse_first_type_aux { - typedef Index type; -}; - -template -struct aseq_reverse_first_type_aux::type> { - typedef FixedInt<(SizeType::value-1)*IncrType::value> type; -}; - -template -struct aseq_reverse_first_type { - typedef typename aseq_reverse_first_type_aux::type Aux; - typedef symbolic::AddExpr > type; -}; - -template -struct aseq_reverse_first_type { - typedef symbolic::AddExpr > >, - symbolic::ValueExpr >, - symbolic::ValueExpr<> > type; -}; -#endif - -// Helper to cleanup the type of the increment: -template struct cleanup_seq_incr { - typedef typename cleanup_index_type::type type; -}; - -} - -//-------------------------------------------------------------------------------- -// seq(first,last,incr) and seqN(first,size,incr) -//-------------------------------------------------------------------------------- - -template > -class ArithmeticSequence; - -template -ArithmeticSequence::type, - typename internal::cleanup_index_type::type, - typename internal::cleanup_seq_incr::type > -seqN(FirstType first, SizeType size, IncrType incr); - -/** \class ArithmeticSequence - * \ingroup Core_Module - * - * This class represents an arithmetic progression \f$ a_0, a_1, a_2, ..., a_{n-1}\f$ defined by - * its \em first value \f$ a_0 \f$, its \em size (aka length) \em n, and the \em increment (aka stride) - * that is equal to \f$ a_{i+1}-a_{i}\f$ for any \em i. - * - * It is internally used as the return type of the Eigen::seq and Eigen::seqN functions, and as the input arguments - * of DenseBase::operator()(const RowIndices&, const ColIndices&), and most of the time this is the - * only way it is used. - * - * \tparam FirstType type of the first element, usually an Index, - * but internally it can be a symbolic expression - * \tparam SizeType type representing the size of the sequence, usually an Index - * or a compile time integral constant. Internally, it can also be a symbolic expression - * \tparam IncrType type of the increment, can be a runtime Index, or a compile time integral constant (default is compile-time 1) - * - * \sa Eigen::seq, Eigen::seqN, DenseBase::operator()(const RowIndices&, const ColIndices&), class IndexedView - */ -template -class ArithmeticSequence -{ -public: - ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {} - ArithmeticSequence(FirstType first, SizeType size, IncrType incr) : m_first(first), m_size(size), m_incr(incr) {} - - enum { - SizeAtCompileTime = internal::get_fixed_value::value, - IncrAtCompileTime = internal::get_fixed_value::value - }; - - /** \returns the size, i.e., number of elements, of the sequence */ - Index size() const { return m_size; } - - /** \returns the first element \f$ a_0 \f$ in the sequence */ - Index first() const { return m_first; } - - /** \returns the value \f$ a_i \f$ at index \a i in the sequence. */ - Index operator[](Index i) const { return m_first + i * m_incr; } - - const FirstType& firstObject() const { return m_first; } - const SizeType& sizeObject() const { return m_size; } - const IncrType& incrObject() const { return m_incr; } - -protected: - FirstType m_first; - SizeType m_size; - IncrType m_incr; - -public: - -#if EIGEN_HAS_CXX11 && ((!EIGEN_COMP_GNUC) || EIGEN_COMP_GNUC>=48) - auto reverse() const -> decltype(Eigen::seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr)) { - return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); - } -#else -protected: - typedef typename internal::aseq_negate::type ReverseIncrType; - typedef typename internal::aseq_reverse_first_type::type ReverseFirstType; -public: - ArithmeticSequence - reverse() const { - return seqN(m_first+(m_size+fix<-1>())*m_incr,m_size,-m_incr); - } -#endif -}; - -/** \returns an ArithmeticSequence starting at \a first, of length \a size, and increment \a incr - * - * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */ -template -ArithmeticSequence::type,typename internal::cleanup_index_type::type,typename internal::cleanup_seq_incr::type > -seqN(FirstType first, SizeType size, IncrType incr) { - return ArithmeticSequence::type,typename internal::cleanup_index_type::type,typename internal::cleanup_seq_incr::type>(first,size,incr); -} - -/** \returns an ArithmeticSequence starting at \a first, of length \a size, and unit increment - * - * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) */ -template -ArithmeticSequence::type,typename internal::cleanup_index_type::type > -seqN(FirstType first, SizeType size) { - return ArithmeticSequence::type,typename internal::cleanup_index_type::type>(first,size); -} - -#ifdef EIGEN_PARSED_BY_DOXYGEN - -/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a incr - * - * It is essentially an alias to: - * \code - * seqN(f, (l-f+incr)/incr, incr); - * \endcode - * - * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) - */ -template -auto seq(FirstType f, LastType l, IncrType incr); - -/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment - * - * It is essentially an alias to: - * \code - * seqN(f,l-f+1); - * \endcode - * - * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) - */ -template -auto seq(FirstType f, LastType l); - -#else // EIGEN_PARSED_BY_DOXYGEN - -#if EIGEN_HAS_CXX11 -template -auto seq(FirstType f, LastType l) -> decltype(seqN(typename internal::cleanup_index_type::type(f), - ( typename internal::cleanup_index_type::type(l) - - typename internal::cleanup_index_type::type(f)+fix<1>()))) -{ - return seqN(typename internal::cleanup_index_type::type(f), - (typename internal::cleanup_index_type::type(l) - -typename internal::cleanup_index_type::type(f)+fix<1>())); -} - -template -auto seq(FirstType f, LastType l, IncrType incr) - -> decltype(seqN(typename internal::cleanup_index_type::type(f), - ( typename internal::cleanup_index_type::type(l) - - typename internal::cleanup_index_type::type(f)+typename internal::cleanup_seq_incr::type(incr) - ) / typename internal::cleanup_seq_incr::type(incr), - typename internal::cleanup_seq_incr::type(incr))) -{ - typedef typename internal::cleanup_seq_incr::type CleanedIncrType; - return seqN(typename internal::cleanup_index_type::type(f), - ( typename internal::cleanup_index_type::type(l) - -typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr)) / CleanedIncrType(incr), - CleanedIncrType(incr)); -} - -#else // EIGEN_HAS_CXX11 - -template -typename internal::enable_if::value || symbolic::is_symbolic::value), - ArithmeticSequence::type,Index> >::type -seq(FirstType f, LastType l) -{ - return seqN(typename internal::cleanup_index_type::type(f), - Index((typename internal::cleanup_index_type::type(l)-typename internal::cleanup_index_type::type(f)+fix<1>()))); -} - -template -typename internal::enable_if::value, - ArithmeticSequence,symbolic::ValueExpr<> >, - symbolic::ValueExpr > > > >::type -seq(const symbolic::BaseExpr &f, LastType l) -{ - return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+fix<1>())); -} - -template -typename internal::enable_if::value, - ArithmeticSequence::type, - symbolic::AddExpr >, - symbolic::ValueExpr > > > >::type -seq(FirstType f, const symbolic::BaseExpr &l) -{ - return seqN(typename internal::cleanup_index_type::type(f),(l.derived()-typename internal::cleanup_index_type::type(f)+fix<1>())); -} - -template -ArithmeticSequence >,symbolic::ValueExpr > > > -seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l) -{ - return seqN(f.derived(),(l.derived()-f.derived()+fix<1>())); -} - - -template -typename internal::enable_if::value || symbolic::is_symbolic::value), - ArithmeticSequence::type,Index,typename internal::cleanup_seq_incr::type> >::type -seq(FirstType f, LastType l, IncrType incr) -{ - typedef typename internal::cleanup_seq_incr::type CleanedIncrType; - return seqN(typename internal::cleanup_index_type::type(f), - Index((typename internal::cleanup_index_type::type(l)-typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr)), incr); -} - -template -typename internal::enable_if::value, - ArithmeticSequence, - symbolic::ValueExpr<> >, - symbolic::ValueExpr::type> >, - symbolic::ValueExpr::type> >, - typename internal::cleanup_seq_incr::type> >::type -seq(const symbolic::BaseExpr &f, LastType l, IncrType incr) -{ - typedef typename internal::cleanup_seq_incr::type CleanedIncrType; - return seqN(f.derived(),(typename internal::cleanup_index_type::type(l)-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); -} - -template -typename internal::enable_if::value, - ArithmeticSequence::type, - symbolic::QuotientExpr >, - symbolic::ValueExpr::type> >, - symbolic::ValueExpr::type> >, - typename internal::cleanup_seq_incr::type> >::type -seq(FirstType f, const symbolic::BaseExpr &l, IncrType incr) -{ - typedef typename internal::cleanup_seq_incr::type CleanedIncrType; - return seqN(typename internal::cleanup_index_type::type(f), - (l.derived()-typename internal::cleanup_index_type::type(f)+CleanedIncrType(incr))/CleanedIncrType(incr), incr); -} - -template -ArithmeticSequence >, - symbolic::ValueExpr::type> >, - symbolic::ValueExpr::type> >, - typename internal::cleanup_seq_incr::type> -seq(const symbolic::BaseExpr &f, const symbolic::BaseExpr &l, IncrType incr) -{ - typedef typename internal::cleanup_seq_incr::type CleanedIncrType; - return seqN(f.derived(),(l.derived()-f.derived()+CleanedIncrType(incr))/CleanedIncrType(incr), incr); -} -#endif // EIGEN_HAS_CXX11 - -#endif // EIGEN_PARSED_BY_DOXYGEN - - -#if EIGEN_HAS_CXX11 || defined(EIGEN_PARSED_BY_DOXYGEN) -/** \cpp11 - * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr. - * - * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode - * - * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */ -template -auto lastN(SizeType size, IncrType incr) --> decltype(seqN(Eigen::last-(size-fix<1>())*incr, size, incr)) -{ - return seqN(Eigen::last-(size-fix<1>())*incr, size, incr); -} - -/** \cpp11 - * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment. - * - * It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode - * - * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */ -template -auto lastN(SizeType size) --> decltype(seqN(Eigen::last+fix<1>()-size, size)) -{ - return seqN(Eigen::last+fix<1>()-size, size); -} -#endif - -namespace internal { - -// Convert a symbolic span into a usable one (i.e., remove last/end "keywords") -template -struct make_size_type { - typedef typename internal::conditional::value, Index, T>::type type; -}; - -template -struct IndexedViewCompatibleType, XprSize> { - typedef ArithmeticSequence::type,IncrType> type; -}; - -template -ArithmeticSequence::type,IncrType> -makeIndexedViewCompatible(const ArithmeticSequence& ids, Index size,SpecializedType) { - return ArithmeticSequence::type,IncrType>( - eval_expr_given_size(ids.firstObject(),size),eval_expr_given_size(ids.sizeObject(),size),ids.incrObject()); -} - -template -struct get_compile_time_incr > { - enum { value = get_fixed_value::value }; -}; - -} // end namespace internal - -/** \namespace Eigen::indexing - * \ingroup Core_Module - * - * The sole purpose of this namespace is to be able to import all functions - * and symbols that are expected to be used within operator() for indexing - * and slicing. If you already imported the whole Eigen namespace: - * \code using namespace Eigen; \endcode - * then you are already all set. Otherwise, if you don't want/cannot import - * the whole Eigen namespace, the following line: - * \code using namespace Eigen::indexing; \endcode - * is equivalent to: - * \code - using Eigen::all; - using Eigen::seq; - using Eigen::seqN; - using Eigen::lastN; // c++11 only - using Eigen::last; - using Eigen::lastp1; - using Eigen::fix; - \endcode - */ -namespace indexing { - using Eigen::all; - using Eigen::seq; - using Eigen::seqN; - #if EIGEN_HAS_CXX11 - using Eigen::lastN; - #endif - using Eigen::last; - using Eigen::lastp1; - using Eigen::fix; -} - -} // end namespace Eigen - -#endif // EIGEN_ARITHMETIC_SEQUENCE_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Array.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Array.h index 64fd02ddf..16770fc7b 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Array.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Array.h @@ -162,45 +162,6 @@ class Array } #endif - #if EIGEN_HAS_CXX11 - /** \copydoc PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) - * - * Example: \include Array_variadic_ctor_cxx11.cpp - * Output: \verbinclude Array_variadic_ctor_cxx11.out - * - * \sa Array(const std::initializer_list>&) - * \sa Array(const Scalar&), Array(const Scalar&,const Scalar&) - */ - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) - : Base(a0, a1, a2, a3, args...) {} - - /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 - * - * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: - * - * Example: \include Array_initializer_list_23_cxx11.cpp - * Output: \verbinclude Array_initializer_list_23_cxx11.out - * - * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. - * - * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed. - * Therefore Array{{1,2,3,4,5}} is legal and the more verbose syntax - * Array{{1},{2},{3},{4},{5}} can be avoided: - * - * Example: \include Array_initializer_list_vector_cxx11.cpp - * Output: \verbinclude Array_initializer_list_vector_cxx11.out - * - * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes, - * and implicit transposition is allowed for compile-time 1D arrays only. - * - * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) - */ - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Array(const std::initializer_list>& list) : Base(list) {} - #endif // end EIGEN_HAS_CXX11 - #ifndef EIGEN_PARSED_BY_DOXYGEN template EIGEN_DEVICE_FUNC @@ -217,7 +178,6 @@ class Array Base::_check_template_params(); this->template _init2(val0, val1); } - #else /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC explicit Array(const Scalar *data); @@ -229,8 +189,7 @@ class Array */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Array(Index dim); - /** constructs an initialized 1x1 Array with the given coefficient - * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */ + /** constructs an initialized 1x1 Array with the given coefficient */ Array(const Scalar& value); /** constructs an uninitialized array with \a rows rows and \a cols columns. * @@ -238,14 +197,11 @@ class Array * it is redundant to pass these parameters, so one should use the default constructor * Array() instead. */ Array(Index rows, Index cols); - /** constructs an initialized 2D vector with given coefficients - * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */ + /** constructs an initialized 2D vector with given coefficients */ Array(const Scalar& val0, const Scalar& val1); - #endif // end EIGEN_PARSED_BY_DOXYGEN + #endif - /** constructs an initialized 3D vector with given coefficients - * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) - */ + /** constructs an initialized 3D vector with given coefficients */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) { @@ -255,9 +211,7 @@ class Array m_storage.data()[1] = val1; m_storage.data()[2] = val2; } - /** constructs an initialized 4D vector with given coefficients - * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) - */ + /** constructs an initialized 4D vector with given coefficients */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3) { @@ -304,7 +258,7 @@ class Array /** \defgroup arraytypedefs Global array typedefs * \ingroup Core_Module * - * %Eigen defines several typedef shortcuts for most common 1D and 2D array types. + * Eigen defines several typedef shortcuts for most common 1D and 2D array types. * * The general patterns are the following: * @@ -317,12 +271,6 @@ class Array * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is * a fixed-size 1D array of 4 complex floats. * - * With \cpp11, template alias are also defined for common sizes. - * They follow the same pattern as above except that the scalar type suffix is replaced by a - * template parameter, i.e.: - * - `ArrayRowsCols` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size. - * - `ArraySize` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays. - * * \sa class Array */ @@ -355,43 +303,9 @@ EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES #undef EIGEN_MAKE_ARRAY_TYPEDEFS -#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS -#if EIGEN_HAS_CXX11 +#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE -#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix) \ -/** \ingroup arraytypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using Array##SizeSuffix##SizeSuffix = Array; \ -/** \ingroup arraytypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using Array##SizeSuffix = Array; - -#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size) \ -/** \ingroup arraytypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using Array##Size##X = Array; \ -/** \ingroup arraytypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using Array##X##Size = Array; - -EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2) -EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3) -EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4) -EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X) -EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2) -EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3) -EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4) - -#undef EIGEN_MAKE_ARRAY_TYPEDEFS -#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS - -#endif // EIGEN_HAS_CXX11 - #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \ using Eigen::Matrix##SizeSuffix##TypeSuffix; \ using Eigen::Vector##SizeSuffix##TypeSuffix; \ diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayBase.h index ea3dd1c3b..33f644e21 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayBase.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayBase.h @@ -69,7 +69,6 @@ template class ArrayBase using Base::coeff; using Base::coeffRef; using Base::lazyAssign; - using Base::operator-; using Base::operator=; using Base::operator+=; using Base::operator-=; @@ -89,6 +88,7 @@ template class ArrayBase #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase #define EIGEN_DOC_UNARY_ADDONS(X,Y) +# include "../plugins/CommonCwiseUnaryOps.h" # include "../plugins/MatrixCwiseUnaryOps.h" # include "../plugins/ArrayCwiseUnaryOps.h" # include "../plugins/CommonCwiseBinaryOps.h" diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayWrapper.h b/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayWrapper.h index 757b31825..688aadd62 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayWrapper.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/ArrayWrapper.h @@ -90,8 +90,8 @@ class ArrayWrapper : public ArrayBase > EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const { dst = m_expression; } - EIGEN_DEVICE_FUNC const typename internal::remove_all::type& + EIGEN_DEVICE_FUNC nestedExpression() const { return m_expression; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Assign.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Assign.h index 655412efd..53806ba33 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Assign.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Assign.h @@ -16,7 +16,7 @@ namespace Eigen { template template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase +EIGEN_STRONG_INLINE Derived& DenseBase ::lazyAssign(const DenseBase& other) { enum{ diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/AssignEvaluator.h b/uppsrc/plugin/Eigen/Eigen/src/Core/AssignEvaluator.h index 229e25854..dbe435d86 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/AssignEvaluator.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/AssignEvaluator.h @@ -24,7 +24,7 @@ namespace internal { // copy_using_evaluator_traits is based on assign_traits -template +template struct copy_using_evaluator_traits { typedef typename DstEvaluator::XprType Dst; @@ -51,15 +51,13 @@ private: InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime) : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime) : int(Dst::MaxRowsAtCompileTime), - RestrictedInnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(InnerSize,MaxPacketSize), - RestrictedLinearSize = EIGEN_SIZE_MIN_PREFER_FIXED(Dst::SizeAtCompileTime,MaxPacketSize), OuterStride = int(outer_stride_at_compile_time::ret), MaxSizeAtCompileTime = Dst::SizeAtCompileTime }; // TODO distinguish between linear traversal and inner-traversals - typedef typename find_best_packet::type LinearPacketType; - typedef typename find_best_packet::type InnerPacketType; + typedef typename find_best_packet::type LinearPacketType; + typedef typename find_best_packet::type InnerPacketType; enum { LinearPacketSize = unpacket_traits::size, @@ -99,7 +97,7 @@ private: public: enum { - Traversal = (int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize)) ? int(LinearVectorizedTraversal) + Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal) : int(MayInnerVectorize) ? int(InnerVectorizedTraversal) : int(MayLinearVectorize) ? int(LinearVectorizedTraversal) : int(MaySliceVectorize) ? int(SliceVectorizedTraversal) @@ -174,8 +172,6 @@ public: EIGEN_DEBUG_VAR(MaySliceVectorize) std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost) - EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost) - EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime) EIGEN_DEBUG_VAR(UnrollingLimit) EIGEN_DEBUG_VAR(MayUnrollCompletely) EIGEN_DEBUG_VAR(MayUnrollInner) @@ -534,7 +530,7 @@ struct dense_assignment_loop const Scalar *dst_ptr = kernel.dstDataPtr(); if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0) { - // the pointer is not aligned-on scalar, so alignment is not possible + // the pointer is not aligend-on scalar, so alignment is not possible return dense_assignment_loop::run(kernel); } const Index packetAlignedMask = packetSize - 1; @@ -611,8 +607,7 @@ public: typedef typename AssignmentTraits::PacketType PacketType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) + EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr) : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr) { #ifdef EIGEN_DEBUG_ASSIGN @@ -702,27 +697,6 @@ protected: DstXprType& m_dstExpr; }; -// Special kernel used when computing small products whose operands have dynamic dimensions. It ensures that the -// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used -// when computing the product. - -template -class restricted_packet_dense_assignment_kernel : public generic_dense_assignment_kernel -{ -protected: - typedef generic_dense_assignment_kernel Base; - public: - typedef typename Base::Scalar Scalar; - typedef typename Base::DstXprType DstXprType; - typedef copy_using_evaluator_traits AssignmentTraits; - typedef typename AssignmentTraits::PacketType PacketType; - - EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) - : Base(dst, src, func, dstExpr) - { - } - }; - /*************************************************************************** * Part 5 : Entry point for dense rectangular assignment ***************************************************************************/ @@ -782,7 +756,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType // AssignmentKind must define a Kind typedef. template struct AssignmentKind; -// Assignment kind defined in this file: +// Assignement kind defined in this file: struct Dense2Dense {}; struct EigenBase2EigenBase {}; @@ -861,27 +835,6 @@ void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func) Assignment::run(actualDst, src, func); } - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src, const Func& func) -{ - typedef evaluator DstEvaluatorType; - typedef evaluator SrcEvaluatorType; - typedef restricted_packet_dense_assignment_kernel Kernel; - - EIGEN_STATIC_ASSERT_LVALUE(Dst) - EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar); - - SrcEvaluatorType srcEvaluator(src); - resize_if_allowed(dst, src, func); - - DstEvaluatorType dstEvaluator(dst); - Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived()); - - dense_assignment_loop::run(kernel); -} - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment_no_alias(Dst& dst, const Src& src) @@ -946,7 +899,7 @@ struct Assignment src.evalTo(dst); } - // NOTE The following two functions are templated to avoid their instantiation if not needed + // NOTE The following two functions are templated to avoid their instanciation if not needed // This is needed because some expressions supports evalTo only and/or have 'void' as scalar type. template EIGEN_DEVICE_FUNC diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Assign_MKL.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Assign_MKL.h index c6140d185..6866095bf 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Assign_MKL.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Assign_MKL.h @@ -68,16 +68,16 @@ class vml_assign_traits #define EIGEN_PP_EXPAND(ARG) ARG #if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1) -#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA +#define EIGEN_VMLMODE_EXPAND_LA , VML_HA #else -#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA +#define EIGEN_VMLMODE_EXPAND_LA , VML_LA #endif -#define EIGEN_VMLMODE_EXPAND_x_ +#define EIGEN_VMLMODE_EXPAND__ -#define EIGEN_VMLMODE_PREFIX_xLA vm -#define EIGEN_VMLMODE_PREFIX_x_ v -#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x,VMLMODE) +#define EIGEN_VMLMODE_PREFIX_LA vm +#define EIGEN_VMLMODE_PREFIX__ v +#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_,VMLMODE) #define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE) \ template< typename DstXprType, typename SrcXprNested> \ @@ -89,7 +89,7 @@ class vml_assign_traits eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ if(vml_assign_traits::Traversal==LinearTraversal) { \ VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \ - (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \ + (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \ } else { \ const Index outerSize = dst.outerSize(); \ for(Index outer = 0; outer < outerSize; ++outer) { \ @@ -97,7 +97,7 @@ class vml_assign_traits &(src.nestedExpression().coeffRef(0, outer)); \ EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \ VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, \ - (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \ + (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \ } \ } \ } \ @@ -152,7 +152,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) if(vml_assign_traits::Traversal==LinearTraversal) \ { \ VMLOP( dst.size(), (const VMLTYPE*)src.lhs().data(), exponent, \ - (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE) ); \ + (VMLTYPE*)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE) ); \ } else { \ const Index outerSize = dst.outerSize(); \ for(Index outer = 0; outer < outerSize; ++outer) { \ @@ -160,7 +160,7 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) &(src.lhs().coeffRef(0, outer)); \ EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer)); \ VMLOP( dst.innerSize(), (const VMLTYPE*)src_ptr, exponent, \ - (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE)); \ + (VMLTYPE*)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_##VMLMODE)); \ } \ } \ } \ diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Block.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Block.h index 6e938ea58..11de45c2e 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Block.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Block.h @@ -114,8 +114,8 @@ template class /** Column or Row constructor */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Block(XprType& xpr, Index i) : Impl(xpr,i) + EIGEN_DEVICE_FUNC + inline Block(XprType& xpr, Index i) : Impl(xpr,i) { eigen_assert( (i>=0) && ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i class /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Block(XprType& xpr, Index startRow, Index startCol) + EIGEN_DEVICE_FUNC + inline Block(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) { EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) @@ -135,8 +135,8 @@ template class /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Block(XprType& xpr, + EIGEN_DEVICE_FUNC + inline Block(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Impl(xpr, startRow, startCol, blockRows, blockCols) @@ -159,10 +159,10 @@ class BlockImpl public: typedef Impl Base; EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {} + EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {} + EIGEN_DEVICE_FUNC inline BlockImpl(XprType& xpr, Index startRow, Index startCol) : Impl(xpr, startRow, startCol) {} EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) + inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Impl(xpr, startRow, startCol, blockRows, blockCols) {} }; @@ -294,22 +294,22 @@ template::type& nestedExpression() const { return m_xpr; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC XprType& nestedExpression() { return m_xpr; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC StorageIndex startRow() const { return m_startRow.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC StorageIndex startCol() const { return m_startCol.value(); @@ -342,8 +342,8 @@ class BlockImpl_dense /** Column or Row constructor */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - BlockImpl_dense(XprType& xpr, Index i) + EIGEN_DEVICE_FUNC + inline BlockImpl_dense(XprType& xpr, Index i) : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), BlockRows==1 ? 1 : xpr.rows(), @@ -357,8 +357,8 @@ class BlockImpl_dense /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) + EIGEN_DEVICE_FUNC + inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { @@ -367,8 +367,8 @@ class BlockImpl_dense /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - BlockImpl_dense(XprType& xpr, + EIGEN_DEVICE_FUNC + inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), @@ -377,18 +377,18 @@ class BlockImpl_dense init(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC const typename internal::remove_all::type& nestedExpression() const { return m_xpr; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC XprType& nestedExpression() { return m_xpr; } /** \sa MapBase::innerStride() */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index innerStride() const + EIGEN_DEVICE_FUNC + inline Index innerStride() const { return internal::traits::HasSameStorageOrderAsXprType ? m_xpr.innerStride() @@ -396,19 +396,19 @@ class BlockImpl_dense } /** \sa MapBase::outerStride() */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index outerStride() const + EIGEN_DEVICE_FUNC + inline Index outerStride() const { return m_outerStride; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC StorageIndex startRow() const { return m_startRow.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC StorageIndex startCol() const { return m_startCol.value(); @@ -422,8 +422,8 @@ class BlockImpl_dense #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal used by allowAligned() */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols) + EIGEN_DEVICE_FUNC + inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols) : Base(data, blockRows, blockCols), m_xpr(xpr) { init(); @@ -431,7 +431,7 @@ class BlockImpl_dense #endif protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC void init() { m_outerStride = internal::traits::HasSameStorageOrderAsXprType diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/BooleanRedux.h b/uppsrc/plugin/Eigen/Eigen/src/Core/BooleanRedux.h index e32c4ac5b..8409d8749 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/BooleanRedux.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/BooleanRedux.h @@ -14,56 +14,58 @@ namespace Eigen { namespace internal { -template +template struct all_unroller { + typedef typename Derived::ExpressionTraits Traits; enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + col = (UnrollCount-1) / Traits::RowsAtCompileTime, + row = (UnrollCount-1) % Traits::RowsAtCompileTime }; - EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) + static inline bool run(const Derived &mat) { - return all_unroller::run(mat) && mat.coeff(row, col); + return all_unroller::run(mat) && mat.coeff(row, col); } }; -template -struct all_unroller +template +struct all_unroller { - EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; } + static inline bool run(const Derived &/*mat*/) { return true; } }; -template -struct all_unroller +template +struct all_unroller { - EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } + static inline bool run(const Derived &) { return false; } }; -template +template struct any_unroller { + typedef typename Derived::ExpressionTraits Traits; enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + col = (UnrollCount-1) / Traits::RowsAtCompileTime, + row = (UnrollCount-1) % Traits::RowsAtCompileTime }; - EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) + static inline bool run(const Derived &mat) { - return any_unroller::run(mat) || mat.coeff(row, col); + return any_unroller::run(mat) || mat.coeff(row, col); } }; -template -struct any_unroller +template +struct any_unroller { - EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; } + static inline bool run(const Derived & /*mat*/) { return false; } }; -template -struct any_unroller +template +struct any_unroller { - EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } + static inline bool run(const Derived &) { return false; } }; } // end namespace internal @@ -76,7 +78,7 @@ struct any_unroller * \sa any(), Cwise::operator<() */ template -EIGEN_DEVICE_FUNC inline bool DenseBase::all() const +inline bool DenseBase::all() const { typedef internal::evaluator Evaluator; enum { @@ -85,7 +87,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::all() const }; Evaluator evaluator(derived()); if(unroll) - return internal::all_unroller::RowsAtCompileTime>::run(evaluator); + return internal::all_unroller::run(evaluator); else { for(Index j = 0; j < cols(); ++j) @@ -100,7 +102,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::all() const * \sa all() */ template -EIGEN_DEVICE_FUNC inline bool DenseBase::any() const +inline bool DenseBase::any() const { typedef internal::evaluator Evaluator; enum { @@ -109,7 +111,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::any() const }; Evaluator evaluator(derived()); if(unroll) - return internal::any_unroller::RowsAtCompileTime>::run(evaluator); + return internal::any_unroller::run(evaluator); else { for(Index j = 0; j < cols(); ++j) @@ -124,7 +126,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::any() const * \sa all(), any() */ template -EIGEN_DEVICE_FUNC inline Eigen::Index DenseBase::count() const +inline Eigen::Index DenseBase::count() const { return derived().template cast().template cast().sum(); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/CommaInitializer.h b/uppsrc/plugin/Eigen/Eigen/src/Core/CommaInitializer.h index c0e29c75c..d218e9814 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/CommaInitializer.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/CommaInitializer.h @@ -33,8 +33,6 @@ struct CommaInitializer inline CommaInitializer(XprType& xpr, const Scalar& s) : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1) { - eigen_assert(m_xpr.rows() > 0 && m_xpr.cols() > 0 - && "Cannot comma-initialize a 0x0 matrix (operator<<)"); m_xpr.coeffRef(0,0) = s; } @@ -43,8 +41,6 @@ struct CommaInitializer inline CommaInitializer(XprType& xpr, const DenseBase& other) : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows()) { - eigen_assert(m_xpr.rows() >= other.rows() && m_xpr.cols() >= other.cols() - && "Cannot comma-initialize a 0x0 matrix (operator<<)"); m_xpr.block(0, 0, other.rows(), other.cols()) = other; } @@ -107,7 +103,7 @@ struct CommaInitializer EIGEN_EXCEPTION_SPEC(Eigen::eigen_assert_exception) #endif { - finished(); + finished(); } /** \returns the built matrix once all its coefficients have been set. @@ -145,7 +141,7 @@ struct CommaInitializer * \sa CommaInitializer::finished(), class CommaInitializer */ template -EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator<< (const Scalar& s) +inline CommaInitializer DenseBase::operator<< (const Scalar& s) { return CommaInitializer(*static_cast(this), s); } @@ -153,7 +149,7 @@ EIGEN_DEVICE_FUNC inline CommaInitializer DenseBase::operator< /** \sa operator<<(const Scalar&) */ template template -EIGEN_DEVICE_FUNC inline CommaInitializer +inline CommaInitializer DenseBase::operator<<(const DenseBase& other) { return CommaInitializer(*static_cast(this), other); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/CoreEvaluators.h b/uppsrc/plugin/Eigen/Eigen/src/Core/CoreEvaluators.h index a77c0fa81..910889efa 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/CoreEvaluators.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/CoreEvaluators.h @@ -90,8 +90,7 @@ template struct evaluator : public unary_evaluator { typedef unary_evaluator Base; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit evaluator(const T& xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : Base(xpr) {} }; @@ -100,14 +99,14 @@ template struct evaluator : evaluator { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC explicit evaluator(const T& xpr) : evaluator(xpr) {} }; // ---------- base class for all evaluators ---------- template -struct evaluator_base +struct evaluator_base : public noncopyable { // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle outer,inner indices. typedef traits ExpressionTraits; @@ -115,14 +114,6 @@ struct evaluator_base enum { Alignment = 0 }; - // noncopyable: - // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization) - // and make complex evaluator much larger than then should do. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE evaluator_base() {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~evaluator_base() {} -private: - EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&); - EIGEN_DEVICE_FUNC const evaluator_base& operator=(const evaluator_base&); }; // -------------------- Matrix and Array -------------------- @@ -132,33 +123,6 @@ private: // Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense, // so no need for more sophisticated dispatching. -// this helper permits to completely eliminate m_outerStride if it is known at compiletime. -template class plainobjectbase_evaluator_data { -public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr) - { -#ifndef EIGEN_INTERNAL_DEBUGGING - EIGEN_UNUSED_VARIABLE(outerStride); -#endif - eigen_internal_assert(outerStride==OuterStride); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index outerStride() const { return OuterStride; } - const Scalar *data; -}; - -template class plainobjectbase_evaluator_data { -public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride) : data(ptr), m_outerStride(outerStride) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index outerStride() const { return m_outerStride; } - const Scalar *data; -protected: - Index m_outerStride; -}; - template struct evaluator > : evaluator_base @@ -177,23 +141,18 @@ struct evaluator > Flags = traits::EvaluatorFlags, Alignment = traits::Alignment }; - enum { - // We do not need to know the outer stride for vectors - OuterStrideAtCompileTime = IsVectorAtCompileTime ? 0 - : int(IsRowMajor) ? ColsAtCompileTime - : RowsAtCompileTime - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - evaluator() - : m_d(0,OuterStrideAtCompileTime) + + EIGEN_DEVICE_FUNC evaluator() + : m_data(0), + m_outerStride(IsVectorAtCompileTime ? 0 + : int(IsRowMajor) ? ColsAtCompileTime + : RowsAtCompileTime) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit evaluator(const PlainObjectType& m) - : m_d(m.data(),IsVectorAtCompileTime ? 0 : m.outerStride()) + + EIGEN_DEVICE_FUNC explicit evaluator(const PlainObjectType& m) + : m_data(m.data()), m_outerStride(IsVectorAtCompileTime ? 0 : m.outerStride()) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -202,30 +161,30 @@ struct evaluator > CoeffReturnType coeff(Index row, Index col) const { if (IsRowMajor) - return m_d.data[row * m_d.outerStride() + col]; + return m_data[row * m_outerStride.value() + col]; else - return m_d.data[row + col * m_d.outerStride()]; + return m_data[row + col * m_outerStride.value()]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_d.data[index]; + return m_data[index]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { if (IsRowMajor) - return const_cast(m_d.data)[row * m_d.outerStride() + col]; + return const_cast(m_data)[row * m_outerStride.value() + col]; else - return const_cast(m_d.data)[row + col * m_d.outerStride()]; + return const_cast(m_data)[row + col * m_outerStride.value()]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return const_cast(m_d.data)[index]; + return const_cast(m_data)[index]; } template @@ -233,16 +192,16 @@ struct evaluator > PacketType packet(Index row, Index col) const { if (IsRowMajor) - return ploadt(m_d.data + row * m_d.outerStride() + col); + return ploadt(m_data + row * m_outerStride.value() + col); else - return ploadt(m_d.data + row + col * m_d.outerStride()); + return ploadt(m_data + row + col * m_outerStride.value()); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return ploadt(m_d.data + index); + return ploadt(m_data + index); } template @@ -251,22 +210,26 @@ struct evaluator > { if (IsRowMajor) return pstoret - (const_cast(m_d.data) + row * m_d.outerStride() + col, x); + (const_cast(m_data) + row * m_outerStride.value() + col, x); else return pstoret - (const_cast(m_d.data) + row + col * m_d.outerStride(), x); + (const_cast(m_data) + row + col * m_outerStride.value(), x); } template EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - return pstoret(const_cast(m_d.data) + index, x); + return pstoret(const_cast(m_data) + index, x); } protected: + const Scalar *m_data; - plainobjectbase_evaluator_data m_d; + // We do not need to know the outer stride for vectors + variable_if_dynamic m_outerStride; }; template @@ -275,11 +238,9 @@ struct evaluator > { typedef Matrix XprType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - evaluator() {} + EIGEN_DEVICE_FUNC evaluator() {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit evaluator(const XprType& m) + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) : evaluator >(m) { } }; @@ -290,11 +251,9 @@ struct evaluator > { typedef Array XprType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - evaluator() {} + EIGEN_DEVICE_FUNC evaluator() {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit evaluator(const XprType& m) + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& m) : evaluator >(m) { } }; @@ -313,8 +272,7 @@ struct unary_evaluator, IndexBased> Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {} typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -569,7 +527,9 @@ struct unary_evaluator, IndexBased > }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit unary_evaluator(const XprType& op) : m_d(op) + explicit unary_evaluator(const XprType& op) + : m_functor(op.functor()), + m_argImpl(op.nestedExpression()) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -580,43 +540,32 @@ struct unary_evaluator, IndexBased > EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_d.func()(m_d.argImpl.coeff(row, col)); + return m_functor(m_argImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_d.func()(m_d.argImpl.coeff(index)); + return m_functor(m_argImpl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_d.func().packetOp(m_d.argImpl.template packet(row, col)); + return m_functor.packetOp(m_argImpl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_d.func().packetOp(m_d.argImpl.template packet(index)); + return m_functor.packetOp(m_argImpl.template packet(index)); } protected: - - // this helper permits to completely eliminate the functor if it is empty - class Data : private UnaryOp - { - public: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const UnaryOp& func() const { return static_cast(*this); } - evaluator argImpl; - }; - - Data m_d; + const UnaryOp m_functor; + evaluator m_argImpl; }; // -------------------- CwiseTernaryOp -------------------- @@ -660,7 +609,11 @@ struct ternary_evaluator, IndexBased evaluator::Alignment) }; - EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr) + EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) + : m_functor(xpr.functor()), + m_arg1Impl(xpr.arg1()), + m_arg2Impl(xpr.arg2()), + m_arg3Impl(xpr.arg3()) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -671,47 +624,38 @@ struct ternary_evaluator, IndexBased EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_d.func()(m_d.arg1Impl.coeff(row, col), m_d.arg2Impl.coeff(row, col), m_d.arg3Impl.coeff(row, col)); + return m_functor(m_arg1Impl.coeff(row, col), m_arg2Impl.coeff(row, col), m_arg3Impl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_d.func()(m_d.arg1Impl.coeff(index), m_d.arg2Impl.coeff(index), m_d.arg3Impl.coeff(index)); + return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_d.func().packetOp(m_d.arg1Impl.template packet(row, col), - m_d.arg2Impl.template packet(row, col), - m_d.arg3Impl.template packet(row, col)); + return m_functor.packetOp(m_arg1Impl.template packet(row, col), + m_arg2Impl.template packet(row, col), + m_arg3Impl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_d.func().packetOp(m_d.arg1Impl.template packet(index), - m_d.arg2Impl.template packet(index), - m_d.arg3Impl.template packet(index)); + return m_functor.packetOp(m_arg1Impl.template packet(index), + m_arg2Impl.template packet(index), + m_arg3Impl.template packet(index)); } protected: - // this helper permits to completely eliminate the functor if it is empty - struct Data : private TernaryOp - { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : TernaryOp(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TernaryOp& func() const { return static_cast(*this); } - evaluator arg1Impl; - evaluator arg2Impl; - evaluator arg3Impl; - }; - - Data m_d; + const TernaryOp m_functor; + evaluator m_arg1Impl; + evaluator m_arg2Impl; + evaluator m_arg3Impl; }; // -------------------- CwiseBinaryOp -------------------- @@ -724,8 +668,7 @@ struct evaluator > typedef CwiseBinaryOp XprType; typedef binary_evaluator > Base; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit evaluator(const XprType& xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} }; template @@ -753,8 +696,10 @@ struct binary_evaluator, IndexBased, IndexBase Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment,evaluator::Alignment) }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit binary_evaluator(const XprType& xpr) : m_d(xpr) + EIGEN_DEVICE_FUNC explicit binary_evaluator(const XprType& xpr) + : m_functor(xpr.functor()), + m_lhsImpl(xpr.lhs()), + m_rhsImpl(xpr.rhs()) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -765,45 +710,35 @@ struct binary_evaluator, IndexBased, IndexBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_d.func()(m_d.lhsImpl.coeff(row, col), m_d.rhsImpl.coeff(row, col)); + return m_functor(m_lhsImpl.coeff(row, col), m_rhsImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_d.func()(m_d.lhsImpl.coeff(index), m_d.rhsImpl.coeff(index)); + return m_functor(m_lhsImpl.coeff(index), m_rhsImpl.coeff(index)); } template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { - return m_d.func().packetOp(m_d.lhsImpl.template packet(row, col), - m_d.rhsImpl.template packet(row, col)); + return m_functor.packetOp(m_lhsImpl.template packet(row, col), + m_rhsImpl.template packet(row, col)); } template EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return m_d.func().packetOp(m_d.lhsImpl.template packet(index), - m_d.rhsImpl.template packet(index)); + return m_functor.packetOp(m_lhsImpl.template packet(index), + m_rhsImpl.template packet(index)); } protected: - - // this helper permits to completely eliminate the functor if it is empty - struct Data : private BinaryOp - { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : BinaryOp(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const BinaryOp& func() const { return static_cast(*this); } - evaluator lhsImpl; - evaluator rhsImpl; - }; - - Data m_d; + const BinaryOp m_functor; + evaluator m_lhsImpl; + evaluator m_rhsImpl; }; // -------------------- CwiseUnaryView -------------------- @@ -822,7 +757,9 @@ struct unary_evaluator, IndexBased> Alignment = 0 // FIXME it is not very clear why alignment is necessarily lost... }; - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : m_d(op) + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) + : m_unaryOp(op.functor()), + m_argImpl(op.nestedExpression()) { EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits::Cost); EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); @@ -834,40 +771,30 @@ struct unary_evaluator, IndexBased> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { - return m_d.func()(m_d.argImpl.coeff(row, col)); + return m_unaryOp(m_argImpl.coeff(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return m_d.func()(m_d.argImpl.coeff(index)); + return m_unaryOp(m_argImpl.coeff(index)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { - return m_d.func()(m_d.argImpl.coeffRef(row, col)); + return m_unaryOp(m_argImpl.coeffRef(row, col)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return m_d.func()(m_d.argImpl.coeffRef(index)); + return m_unaryOp(m_argImpl.coeffRef(index)); } protected: - - // this helper permits to completely eliminate the functor if it is empty - struct Data : private UnaryOp - { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const UnaryOp& func() const { return static_cast(*this); } - evaluator argImpl; - }; - - Data m_d; + const UnaryOp m_unaryOp; + evaluator m_argImpl; }; // -------------------- Map -------------------- @@ -891,8 +818,7 @@ struct mapbase_evaluator : evaluator_base CoeffReadCost = NumTraits::ReadCost }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit mapbase_evaluator(const XprType& map) + EIGEN_DEVICE_FUNC explicit mapbase_evaluator(const XprType& map) : m_data(const_cast(map.data())), m_innerStride(map.innerStride()), m_outerStride(map.outerStride()) @@ -956,10 +882,10 @@ struct mapbase_evaluator : evaluator_base internal::pstoret(m_data + index * m_innerStride.value(), x); } protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); } + EIGEN_DEVICE_FUNC + inline Index rowStride() const { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); } + EIGEN_DEVICE_FUNC + inline Index colStride() const { return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value(); } PointerType m_data; const internal::variable_if_dynamic m_innerStride; @@ -1012,8 +938,7 @@ struct evaluator > Alignment = evaluator >::Alignment }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit evaluator(const XprType& ref) + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& ref) : mapbase_evaluator(ref) { } }; @@ -1068,8 +993,7 @@ struct evaluator > Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0) }; typedef block_evaluator block_evaluator_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit evaluator(const XprType& block) : block_evaluator_type(block) + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& block) : block_evaluator_type(block) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -1082,8 +1006,7 @@ struct block_evaluator XprType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit block_evaluator(const XprType& block) + EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) : unary_evaluator(block) {} }; @@ -1094,12 +1017,11 @@ struct unary_evaluator, IndexBa { typedef Block XprType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit unary_evaluator(const XprType& block) + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block) : m_argImpl(block.nestedExpression()), m_startRow(block.startRow()), m_startCol(block.startCol()), - m_linear_offset(ForwardLinearAccess?(ArgType::IsRowMajor ? block.startRow()*block.nestedExpression().cols() + block.startCol() : block.startCol()*block.nestedExpression().rows() + block.startRow()):0) + m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0) { } typedef typename XprType::Scalar Scalar; @@ -1107,7 +1029,7 @@ struct unary_evaluator, IndexBa enum { RowsAtCompileTime = XprType::RowsAtCompileTime, - ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor)==int(ArgType::IsRowMajor)) && bool(evaluator::Flags&LinearAccessBit) + ForwardLinearAccess = InnerPanel && bool(evaluator::Flags&LinearAccessBit) }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1118,8 +1040,11 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const - { - return linear_coeff_impl(index, bool_constant()); + { + if (ForwardLinearAccess) + return m_argImpl.coeff(m_linear_offset.value() + index); + else + return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1130,8 +1055,11 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) - { - return linear_coeffRef_impl(index, bool_constant()); + { + if (ForwardLinearAccess) + return m_argImpl.coeffRef(m_linear_offset.value() + index); + else + return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } template @@ -1172,32 +1100,10 @@ struct unary_evaluator, IndexBa } protected: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const - { - return m_argImpl.coeff(m_linear_offset.value() + index); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - CoeffReturnType linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const - { - return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Scalar& linear_coeffRef_impl(Index index, internal::true_type /* ForwardLinearAccess */) - { - return m_argImpl.coeffRef(m_linear_offset.value() + index); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Scalar& linear_coeffRef_impl(Index index, internal::false_type /* not ForwardLinearAccess */) - { - return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); - } - evaluator m_argImpl; const variable_if_dynamic m_startRow; const variable_if_dynamic m_startCol; - const variable_if_dynamic m_linear_offset; + const variable_if_dynamic m_linear_offset; }; // TODO: This evaluator does not actually use the child evaluator; @@ -1211,8 +1117,7 @@ struct block_evaluator XprType; typedef typename XprType::Scalar Scalar; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit block_evaluator(const XprType& block) + EIGEN_DEVICE_FUNC explicit block_evaluator(const XprType& block) : mapbase_evaluator(block) { // TODO: for the 3.3 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime @@ -1240,8 +1145,7 @@ struct evaluator > Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, evaluator::Alignment) }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit evaluator(const XprType& select) + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& select) : m_conditionImpl(select.conditionMatrix()), m_thenImpl(select.thenMatrix()), m_elseImpl(select.elseMatrix()) @@ -1298,8 +1202,7 @@ struct unary_evaluator > Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit unary_evaluator(const XprType& replicate) + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& replicate) : m_arg(replicate.nestedExpression()), m_argImpl(m_arg), m_rows(replicate.nestedExpression().rows()), @@ -1363,6 +1266,64 @@ protected: const variable_if_dynamic m_cols; }; + +// -------------------- PartialReduxExpr -------------------- + +template< typename ArgType, typename MemberOp, int Direction> +struct evaluator > + : evaluator_base > +{ + typedef PartialReduxExpr XprType; + typedef typename internal::nested_eval::type ArgTypeNested; + typedef typename internal::remove_all::type ArgTypeNestedCleaned; + typedef typename ArgType::Scalar InputScalar; + typedef typename XprType::Scalar Scalar; + enum { + TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) + }; + typedef typename MemberOp::template Cost CostOpType; + enum { + CoeffReadCost = TraversalSize==Dynamic ? HugeCost + : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), + + Flags = (traits::Flags&RowMajorBit) | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) | LinearAccessBit, + + Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized + }; + + EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) + : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) + { + EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : int(CostOpType::value)); + EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); + } + + typedef typename XprType::CoeffReturnType CoeffReturnType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index i, Index j) const + { + if (Direction==Vertical) + return m_functor(m_arg.col(j)); + else + return m_functor(m_arg.row(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Scalar coeff(Index index) const + { + if (Direction==Vertical) + return m_functor(m_arg.col(index)); + else + return m_functor(m_arg.row(index)); + } + +protected: + typename internal::add_const_on_value_type::type m_arg; + const MemberOp m_functor; +}; + + // -------------------- MatrixWrapper and ArrayWrapper -------------------- // // evaluator_wrapper_base is a common base class for the @@ -1379,8 +1340,7 @@ struct evaluator_wrapper_base Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} + EIGEN_DEVICE_FUNC explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {} typedef typename ArgType::Scalar Scalar; typedef typename ArgType::CoeffReturnType CoeffReturnType; @@ -1447,8 +1407,7 @@ struct unary_evaluator > { typedef MatrixWrapper XprType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit unary_evaluator(const XprType& wrapper) + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) : evaluator_wrapper_base >(wrapper.nestedExpression()) { } }; @@ -1459,8 +1418,7 @@ struct unary_evaluator > { typedef ArrayWrapper XprType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit unary_evaluator(const XprType& wrapper) + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& wrapper) : evaluator_wrapper_base >(wrapper.nestedExpression()) { } }; @@ -1502,8 +1460,7 @@ struct unary_evaluator > Alignment = 0 // FIXME in some rare cases, Alignment could be preserved, like a Vector4f. }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit unary_evaluator(const XprType& reverse) + EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& reverse) : m_argImpl(reverse.nestedExpression()), m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1), m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1) @@ -1610,8 +1567,7 @@ struct evaluator > Alignment = 0 }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit evaluator(const XprType& diagonal) + EIGEN_DEVICE_FUNC explicit evaluator(const XprType& diagonal) : m_argImpl(diagonal.nestedExpression()), m_index(diagonal.index()) { } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/CoreIterators.h b/uppsrc/plugin/Eigen/Eigen/src/Core/CoreIterators.h index b96719681..4eb42b93a 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/CoreIterators.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/CoreIterators.h @@ -48,11 +48,6 @@ public: * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView */ EIGEN_STRONG_INLINE InnerIterator& operator++() { m_iter.operator++(); return *this; } - EIGEN_STRONG_INLINE InnerIterator& operator+=(Index i) { m_iter.operator+=(i); return *this; } - EIGEN_STRONG_INLINE InnerIterator operator+(Index i) - { InnerIterator result(*this); result+=i; return result; } - - /// \returns the column or row index of the current coefficient. EIGEN_STRONG_INLINE Index index() const { return m_iter.index(); } /// \returns the row index of the current coefficient. diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseBinaryOp.h b/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseBinaryOp.h index 8b8de8382..a36765e39 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseBinaryOp.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseBinaryOp.h @@ -100,14 +100,8 @@ class CwiseBinaryOp : typedef typename internal::remove_reference::type _LhsNested; typedef typename internal::remove_reference::type _RhsNested; -#if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11 - //Required for Visual Studio or the Copy constructor will probably not get inlined! - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - CwiseBinaryOp(const CwiseBinaryOp&) = default; -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp()) : m_lhs(aLhs), m_rhs(aRhs), m_functor(func) { EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar); @@ -116,16 +110,16 @@ class CwiseBinaryOp : eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols()); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index rows() const { + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index rows() const { // return the fixed size type if available to enable compile time optimizations if (internal::traits::type>::RowsAtCompileTime==Dynamic) return m_rhs.rows(); else return m_lhs.rows(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index cols() const { + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Index cols() const { // return the fixed size type if available to enable compile time optimizations if (internal::traits::type>::ColsAtCompileTime==Dynamic) return m_rhs.cols(); @@ -134,13 +128,13 @@ class CwiseBinaryOp : } /** \returns the left hand side nested expression */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC const _LhsNested& lhs() const { return m_lhs; } /** \returns the right hand side nested expression */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC const _RhsNested& rhs() const { return m_rhs; } /** \returns the functor representing the binary operation */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC const BinaryOp& functor() const { return m_functor; } protected: @@ -164,7 +158,7 @@ public: */ template template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & +EIGEN_STRONG_INLINE Derived & MatrixBase::operator-=(const MatrixBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -177,7 +171,7 @@ MatrixBase::operator-=(const MatrixBase &other) */ template template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & +EIGEN_STRONG_INLINE Derived & MatrixBase::operator+=(const MatrixBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -187,3 +181,4 @@ MatrixBase::operator+=(const MatrixBase& other) } // end namespace Eigen #endif // EIGEN_CWISE_BINARY_OP_H + diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseNullaryOp.h b/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseNullaryOp.h index ddac9df78..ddd607e38 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseNullaryOp.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/CwiseNullaryOp.h @@ -105,12 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const CwiseNullaryOp::PlainObject> -#else -const CwiseNullaryOp -#endif +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) { return CwiseNullaryOp(rows, cols, func); @@ -136,12 +131,7 @@ DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& f */ template template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const CwiseNullaryOp::PlainObject> -#else -const CwiseNullaryOp -#endif +EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -160,12 +150,7 @@ DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) */ template template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const CwiseNullaryOp::PlainObject> -#else -const CwiseNullaryOp -#endif +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(const CustomNullaryOp& func) { return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); @@ -185,7 +170,7 @@ DenseBase::NullaryExpr(const CustomNullaryOp& func) * \sa class CwiseNullaryOp */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index rows, Index cols, const Scalar& value) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_constant_op(value)); @@ -232,32 +217,27 @@ DenseBase::Constant(const Scalar& value) /** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&) * - * \only_for_vectors - * - * Example: \include DenseBase_LinSpaced_seq_deprecated.cpp - * Output: \verbinclude DenseBase_LinSpaced_seq_deprecated.out - * - * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&) + * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&) */ template -EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); } /** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&) * - * \sa LinSpaced(const Scalar&, const Scalar&) + * \sa LinSpaced(Scalar,Scalar) */ template -EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); } /** @@ -288,7 +268,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomA DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); + return DenseBase::NullaryExpr(size, internal::linspaced_op(low,high,size)); } /** @@ -301,7 +281,7 @@ DenseBase::LinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) - return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); + return DenseBase::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op(low,high,Derived::SizeAtCompileTime)); } /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ @@ -403,7 +383,7 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); + return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); } /** @@ -881,42 +861,6 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() { return Derived::Unit(3); } -/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector - * - * \param i index of the unique coefficient to be set to 1 - * - * \only_for_vectors - * - * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index) - */ -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index i) -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); - eigen_assert(i -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setUnit(Index newSize, Index i) -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); - eigen_assert(i::type& - nestedExpression() { return m_matrix; } + nestedExpression() { return m_matrix.const_cast_derived(); } protected: MatrixTypeNested m_matrix; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseBase.h index 59756a494..c55a68230 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseBase.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseBase.h @@ -150,18 +150,13 @@ template class DenseBase * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime */ - IsVectorAtCompileTime = internal::traits::RowsAtCompileTime == 1 - || internal::traits::ColsAtCompileTime == 1, + IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 + || internal::traits::MaxColsAtCompileTime == 1, /**< This is set to true if either the number of rows or the number of * columns is known at compile-time to be equal to 1. Indeed, in that case, * we are dealing with a column-vector (if there is only one column) or with * a row-vector (if there is only one row). */ - NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2, - /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, - * and 2 for matrices. - */ - Flags = internal::traits::Flags, /**< This stores expression \ref flags flags which may or may not be inherited by new expressions * constructed from this one. See the \ref flags "list of flags". @@ -266,9 +261,9 @@ template class DenseBase /** \internal Represents a matrix with all coefficients equal to one another*/ typedef CwiseNullaryOp,PlainObject> ConstantReturnType; /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */ - EIGEN_DEPRECATED typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; + typedef CwiseNullaryOp,PlainObject> SequentialLinSpacedReturnType; /** \internal Represents a vector with linearly spaced coefficients that allows random access. */ - typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; + typedef CwiseNullaryOp,PlainObject> RandomAccessLinSpacedReturnType; /** \internal the return type of MatrixBase::eigenvalues() */ typedef Matrix::Scalar>::Real, internal::traits::ColsAtCompileTime, 1> EigenvaluesReturnType; @@ -302,17 +297,17 @@ template class DenseBase Derived& operator=(const ReturnByValue& func); /** \internal - * Copies \a other into *this without evaluating other. \returns a reference to *this. */ + * Copies \a other into *this without evaluating other. \returns a reference to *this. + * \deprecated */ template - /** \deprecated */ - EIGEN_DEPRECATED EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC Derived& lazyAssign(const DenseBase& other); EIGEN_DEVICE_FUNC CommaInitializer operator<< (const Scalar& s); - template /** \deprecated it now returns \c *this */ + template EIGEN_DEPRECATED const Derived& flagged() const { return derived(); } @@ -337,13 +332,12 @@ template class DenseBase EIGEN_DEVICE_FUNC static const ConstantReturnType Constant(const Scalar& value); - EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType + EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high); - EIGEN_DEPRECATED EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType - LinSpaced(Sequential_t, const Scalar& low, const Scalar& high); - EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Index size, const Scalar& low, const Scalar& high); + EIGEN_DEVICE_FUNC static const SequentialLinSpacedReturnType + LinSpaced(Sequential_t, const Scalar& low, const Scalar& high); EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(const Scalar& low, const Scalar& high); @@ -375,7 +369,7 @@ template class DenseBase template EIGEN_DEVICE_FUNC bool isApprox(const DenseBase& other, const RealScalar& prec = NumTraits::dummy_precision()) const; - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC bool isMuchSmallerThan(const RealScalar& other, const RealScalar& prec = NumTraits::dummy_precision()) const; template EIGEN_DEVICE_FUNC @@ -386,7 +380,7 @@ template class DenseBase EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits::dummy_precision()) const; EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits::dummy_precision()) const; EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits::dummy_precision()) const; - + inline bool hasNaN() const; inline bool allFinite() const; @@ -400,8 +394,8 @@ template class DenseBase * * Notice that in the case of a plain matrix or vector (not an expression) this function just returns * a const reference, in order to avoid a useless copy. - * - * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. + * + * \warning Be carefull with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page \endlink. */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvalReturnType eval() const @@ -416,7 +410,7 @@ template class DenseBase * */ template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC void swap(const DenseBase& other) { EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase,THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY); @@ -428,7 +422,7 @@ template class DenseBase * */ template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC void swap(PlainObjectBase& other) { eigen_assert(rows()==other.rows() && cols()==other.cols()); @@ -499,7 +493,7 @@ template class DenseBase typedef VectorwiseOp ColwiseReturnType; typedef const VectorwiseOp ConstColwiseReturnType; - /** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions + /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations * * Example: \include MatrixBase_rowwise.cpp * Output: \verbinclude MatrixBase_rowwise.out @@ -512,7 +506,7 @@ template class DenseBase } EIGEN_DEVICE_FUNC RowwiseReturnType rowwise(); - /** \returns a VectorwiseOp wrapper of *this broadcasting and partial reductions + /** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations * * Example: \include MatrixBase_colwise.cpp * Output: \verbinclude MatrixBase_colwise.out @@ -573,59 +567,16 @@ template class DenseBase } EIGEN_DEVICE_FUNC void reverseInPlace(); - #ifdef EIGEN_PARSED_BY_DOXYGEN - /** STL-like RandomAccessIterator - * iterator type as returned by the begin() and end() methods. - */ - typedef random_access_iterator_type iterator; - /** This is the const version of iterator (aka read-only) */ - typedef random_access_iterator_type const_iterator; - #else - typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit, - internal::pointer_based_stl_iterator, - internal::generic_randaccess_stl_iterator - >::type iterator_type; - - typedef typename internal::conditional< (Flags&DirectAccessBit)==DirectAccessBit, - internal::pointer_based_stl_iterator, - internal::generic_randaccess_stl_iterator - >::type const_iterator_type; - - // Stl-style iterators are supported only for vectors. - - typedef typename internal::conditional< IsVectorAtCompileTime, - iterator_type, - void - >::type iterator; - - typedef typename internal::conditional< IsVectorAtCompileTime, - const_iterator_type, - void - >::type const_iterator; - #endif - - inline iterator begin(); - inline const_iterator begin() const; - inline const_iterator cbegin() const; - inline iterator end(); - inline const_iterator end() const; - inline const_iterator cend() const; - #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase #define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL #define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND) -#define EIGEN_DOC_UNARY_ADDONS(X,Y) -# include "../plugins/CommonCwiseUnaryOps.h" # include "../plugins/BlockMethods.h" -# include "../plugins/IndexedViewMethods.h" -# include "../plugins/ReshapedMethods.h" # ifdef EIGEN_DENSEBASE_PLUGIN # include EIGEN_DENSEBASE_PLUGIN # endif #undef EIGEN_CURRENT_STORAGE_BASE_CLASS #undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL #undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF -#undef EIGEN_DOC_UNARY_ADDONS // disable the use of evalTo for dense objects with a nice compilation error template diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseCoeffsBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseCoeffsBase.h index 463b471c8..c4af48ab6 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseCoeffsBase.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseCoeffsBase.h @@ -22,8 +22,7 @@ template struct add_const_on_value_type_if_arithmetic /** \brief Base class providing read-only coefficient access to matrices and arrays. * \ingroup Core_Module * \tparam Derived Type of the derived class - * - * \note #ReadOnlyAccessors Constant indicating read-only access + * \tparam #ReadOnlyAccessors Constant indicating read-only access * * This class defines the \c operator() \c const function and friends, which can be used to read specific * entries of a matrix or array. @@ -289,8 +288,7 @@ class DenseCoeffsBase : public EigenBase /** \brief Base class providing read/write coefficient access to matrices and arrays. * \ingroup Core_Module * \tparam Derived Type of the derived class - * - * \note #WriteAccessors Constant indicating read/write access + * \tparam #WriteAccessors Constant indicating read/write access * * This class defines the non-const \c operator() function and friends, which can be used to write specific * entries of a matrix or array. This class inherits DenseCoeffsBase which @@ -468,8 +466,7 @@ class DenseCoeffsBase : public DenseCoeffsBase which defines functions to access entries read-only using @@ -542,8 +539,7 @@ class DenseCoeffsBase : public DenseCoeffsBase which defines functions to access entries read/write using diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseStorage.h b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseStorage.h index a8bb8a624..7d6d4e66d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/DenseStorage.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/DenseStorage.h @@ -61,7 +61,7 @@ struct plain_array #if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT) #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) #elif EIGEN_GNUC_AT_LEAST(4,7) - // GCC 4.7 is too aggressive in its optimizations and remove the alignment test based on the fact the array is declared to be aligned. + // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned. // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900 // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined: template @@ -207,9 +207,7 @@ template class DenseSt EIGEN_UNUSED_VARIABLE(rows); EIGEN_UNUSED_VARIABLE(cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { - numext::swap(m_data, other.m_data); - } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); } EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index,Index,Index) {} @@ -269,11 +267,7 @@ template class DenseStorage class DenseStorage class DenseStorage class DenseStorage(m_data, m_rows*m_cols); } EIGEN_DEVICE_FUNC void swap(DenseStorage& other) - { - numext::swap(m_data,other.m_data); - numext::swap(m_rows,other.m_rows); - numext::swap(m_cols,other.m_cols); - } + { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); } EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} void conservativeResize(Index size, Index rows, Index cols) @@ -475,16 +459,14 @@ template class DenseStorage(m_data, _Rows*m_cols); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { - numext::swap(m_data,other.m_data); - numext::swap(m_cols,other.m_cols); - } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); } EIGEN_DEVICE_FUNC static Index rows(void) {return _Rows;} EIGEN_DEVICE_FUNC Index cols(void) const {return m_cols;} EIGEN_DEVICE_FUNC void conservativeResize(Index size, Index, Index cols) @@ -551,16 +533,14 @@ template class DenseStorage(m_data, _Cols*m_rows); } - EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { - numext::swap(m_data,other.m_data); - numext::swap(m_rows,other.m_rows); - } + EIGEN_DEVICE_FUNC void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); } EIGEN_DEVICE_FUNC Index rows(void) const {return m_rows;} EIGEN_DEVICE_FUNC static Index cols(void) {return _Cols;} void conservativeResize(Index size, Index rows, Index) diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Diagonal.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Diagonal.h index 563135fb2..afcaf3575 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Diagonal.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Diagonal.h @@ -187,7 +187,7 @@ template class Diagonal * * \sa class Diagonal */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalReturnType +inline typename MatrixBase::DiagonalReturnType MatrixBase::diagonal() { return DiagonalReturnType(derived()); @@ -195,7 +195,7 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType +inline typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -213,7 +213,7 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType +inline typename MatrixBase::DiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) { return DiagonalDynamicIndexReturnType(derived(), index); @@ -221,7 +221,7 @@ MatrixBase::diagonal(Index index) /** This is the const version of diagonal(Index). */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) const { return ConstDiagonalDynamicIndexReturnType(derived(), index); @@ -240,7 +240,6 @@ MatrixBase::diagonal(Index index) const * \sa MatrixBase::diagonal(), class Diagonal */ template template -EIGEN_DEVICE_FUNC inline typename MatrixBase::template DiagonalIndexReturnType::Type MatrixBase::diagonal() { @@ -250,7 +249,6 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). */ template template -EIGEN_DEVICE_FUNC inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type MatrixBase::diagonal() const { diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalMatrix.h index 542685c65..ecfdce8ef 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalMatrix.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalMatrix.h @@ -44,7 +44,7 @@ class DiagonalBase : public EigenBase EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); } - + EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); } EIGEN_DEVICE_FUNC @@ -83,30 +83,6 @@ class DiagonalBase : public EigenBase { return DiagonalWrapper(scalar * other.diagonal()); } - - template - EIGEN_DEVICE_FUNC - #ifdef EIGEN_PARSED_BY_DOXYGEN - inline unspecified_expression_type - #else - inline const DiagonalWrapper - #endif - operator+(const DiagonalBase& other) const - { - return (diagonal() + other.diagonal()).asDiagonal(); - } - - template - EIGEN_DEVICE_FUNC - #ifdef EIGEN_PARSED_BY_DOXYGEN - inline unspecified_expression_type - #else - inline const DiagonalWrapper - #endif - operator-(const DiagonalBase& other) const - { - return (diagonal() - other.diagonal()).asDiagonal(); - } }; #endif @@ -178,30 +154,6 @@ class DiagonalMatrix EIGEN_DEVICE_FUNC inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {} - #if EIGEN_HAS_CXX11 - /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients. \cpp11 - * - * There exists C++98 anologue constructors for fixed-size diagonal matrices having 2 or 3 coefficients. - * - * \warning To construct a diagonal matrix of fixed size, the number of values passed to this - * constructor must match the fixed dimension of \c *this. - * - * \sa DiagonalMatrix(const Scalar&, const Scalar&) - * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&) - */ - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const ArgTypes&... args) - : m_diagonal(a0, a1, a2, args...) {} - - /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer - * lists \cpp11 - */ - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE DiagonalMatrix(const std::initializer_list>& list) - : m_diagonal(list) {} - #endif // EIGEN_HAS_CXX11 - /** Copy constructor. */ template EIGEN_DEVICE_FUNC @@ -321,7 +273,7 @@ class DiagonalWrapper * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal() **/ template -EIGEN_DEVICE_FUNC inline const DiagonalWrapper +inline const DiagonalWrapper MatrixBase::asDiagonal() const { return DiagonalWrapper(derived()); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalProduct.h b/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalProduct.h index 7911d1cd1..d372b938f 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalProduct.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/DiagonalProduct.h @@ -17,7 +17,7 @@ namespace Eigen { */ template template -EIGEN_DEVICE_FUNC inline const Product +inline const Product MatrixBase::operator*(const DiagonalBase &a_diagonal) const { return Product(derived(),a_diagonal.derived()); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Dot.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Dot.h index 11da432b2..1fe7a84a4 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Dot.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Dot.h @@ -93,7 +93,7 @@ MatrixBase::dot(const MatrixBase& other) const * \sa dot(), norm(), lpNorm() */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const +EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::squaredNorm() const { return numext::real((*this).cwiseAbs2().sum()); } @@ -105,7 +105,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::norm() const +EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::norm() const { return numext::sqrt(squaredNorm()); } @@ -120,7 +120,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject +EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; @@ -142,7 +142,7 @@ MatrixBase::normalized() const * \sa norm(), normalized() */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::normalize() +EIGEN_STRONG_INLINE void MatrixBase::normalize() { RealScalar z = squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU @@ -163,7 +163,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::normalize() * \sa stableNorm(), stableNormalize(), normalized() */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject +EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { typedef typename internal::nested_eval::type _Nested; @@ -188,7 +188,7 @@ MatrixBase::stableNormalized() const * \sa stableNorm(), stableNormalized(), normalize() */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase::stableNormalize() +EIGEN_STRONG_INLINE void MatrixBase::stableNormalize() { RealScalar w = cwiseAbs().maxCoeff(); RealScalar z = (derived()/w).squaredNorm(); @@ -260,9 +260,9 @@ struct lpNorm_selector template template #ifndef EIGEN_PARSED_BY_DOXYGEN -EIGEN_DEVICE_FUNC inline typename NumTraits::Scalar>::Real +inline typename NumTraits::Scalar>::Real #else -EIGEN_DEVICE_FUNC MatrixBase::RealScalar +MatrixBase::RealScalar #endif MatrixBase::lpNorm() const { diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/EigenBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/EigenBase.h index 0c34fb656..b195506a9 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/EigenBase.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/EigenBase.h @@ -32,9 +32,8 @@ template struct EigenBase /** \brief The interface type of indices * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE. + * \deprecated Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead. * \sa StorageIndex, \ref TopicPreprocessorDirectives. - * DEPRECATED: Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead. - * Deprecation is not marked with a doxygen comment because there are too many existing usages to add the deprecation attribute. */ typedef Eigen::Index Index; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Fuzzy.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Fuzzy.h index 43aa49b2b..3e403a09d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Fuzzy.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Fuzzy.h @@ -100,7 +100,7 @@ struct isMuchSmallerThan_scalar_selector */ template template -EIGEN_DEVICE_FUNC bool DenseBase::isApprox( +bool DenseBase::isApprox( const DenseBase& other, const RealScalar& prec ) const @@ -122,7 +122,7 @@ EIGEN_DEVICE_FUNC bool DenseBase::isApprox( * \sa isApprox(), isMuchSmallerThan(const DenseBase&, RealScalar) const */ template -EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( +bool DenseBase::isMuchSmallerThan( const typename NumTraits::Real& other, const RealScalar& prec ) const @@ -142,7 +142,7 @@ EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( */ template template -EIGEN_DEVICE_FUNC bool DenseBase::isMuchSmallerThan( +bool DenseBase::isMuchSmallerThan( const DenseBase& other, const RealScalar& prec ) const diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/GeneralProduct.h b/uppsrc/plugin/Eigen/Eigen/src/Core/GeneralProduct.h index bf7ef54b5..6f0cc80e9 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/GeneralProduct.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/GeneralProduct.h @@ -18,16 +18,6 @@ enum { Small = 3 }; -// Define the threshold value to fallback from the generic matrix-matrix product -// implementation (heavy) to the lightweight coeff-based product one. -// See generic_product_impl -// in products/GeneralMatrixMatrix.h for more details. -// TODO This threshold should also be used in the compile-time selector below. -#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD -// This default value has been obtained on a Haswell architecture. -#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20 -#endif - namespace internal { template struct product_type_selector; @@ -35,7 +25,7 @@ template struct product_type_selector; template struct product_size_category { enum { - #ifndef EIGEN_GPU_COMPILE_PHASE + #ifndef EIGEN_CUDA_ARCH is_large = MaxSize == Dynamic || Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), @@ -163,13 +153,13 @@ template struct gemv_static_vect template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } + EIGEN_STRONG_INLINE Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; } }; template struct gemv_static_vector_if { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Scalar* data() { return 0; } + EIGEN_STRONG_INLINE Scalar* data() { return 0; } }; template @@ -239,7 +229,7 @@ template<> struct gemv_dense_selector // on, the other hand it is good for the cache to pack the vector anyways... EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime==1), ComplexByReal = (NumTraits::IsComplex) && (!NumTraits::IsComplex), - MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime!=0) + MightCannotUseDest = (!EvalToDestAtCompileTime) || ComplexByReal }; typedef const_blas_data_mapper LhsMapper; @@ -326,7 +316,7 @@ template<> struct gemv_dense_selector enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 // on, the other hand it is good for the cache to pack the vector anyways... - DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime==0 + DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 }; gemv_static_vector_if static_rhs; @@ -396,8 +386,7 @@ template<> struct gemv_dense_selector */ template template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const Product +inline const Product MatrixBase::operator*(const MatrixBase &other) const { // A note regarding the function declaration: In MSVC, this function will sometimes @@ -439,7 +428,6 @@ MatrixBase::operator*(const MatrixBase &other) const */ template template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Product MatrixBase::lazyProduct(const MatrixBase &other) const { diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/GenericPacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/GenericPacketMath.h index 449793372..e59443779 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/GenericPacketMath.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/GenericPacketMath.h @@ -44,27 +44,23 @@ struct default_packet_traits enum { HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasAbsDiff = 0, - HasMin = 1, - HasMax = 1, - HasConj = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, HasSetLinear = 1, - HasBlend = 0, - HasInsert = 0, + HasBlend = 0, HasDiv = 0, HasSqrt = 0, HasRsqrt = 0, HasExp = 0, - HasExpm1 = 0, HasLog = 0, HasLog1p = 0, HasLog10 = 0, @@ -85,19 +81,14 @@ struct default_packet_traits HasPolygamma = 0, HasErf = 0, HasErfc = 0, - HasNdtri = 0, - HasBessel = 0, HasIGamma = 0, - HasIGammaDerA = 0, - HasGammaSampleDerAlpha = 0, HasIGammac = 0, HasBetaInc = 0, HasRound = 0, - HasRint = 0, HasFloor = 0, HasCeil = 0, - HasCast = 0, + HasSign = 0 }; }; @@ -136,22 +127,6 @@ template struct type_casting_traits { }; }; -/** \internal Wrapper to ensure that multiple packet types can map to the same - same underlying vector type. */ -template -struct eigen_packet_wrapper -{ - EIGEN_ALWAYS_INLINE operator T&() { return m_val; } - EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } - EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} - EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} - EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { - m_val = v; - return *this; - } - - T m_val; -}; /** \internal \returns static_cast(a) (coeff-wise) */ template @@ -171,21 +146,15 @@ pcast(const SrcPacket& a, const SrcPacket& /*b*/, const SrcPacket& /*c*/, const return static_cast(a); } -/** \internal \returns reinterpret_cast(a) */ -template -EIGEN_DEVICE_FUNC inline Target -preinterpret(const Packet& a); /* { return reinterpret_cast(a); } */ - /** \internal \returns a + b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -padd(const Packet& a, const Packet& b) { return a+b; } -// Avoid compiler warning for boolean algebra. -template<> EIGEN_DEVICE_FUNC inline bool -padd(const bool& a, const bool& b) { return a || b; } +padd(const Packet& a, + const Packet& b) { return a+b; } /** \internal \returns a - b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -psub(const Packet& a, const Packet& b) { return a-b; } +psub(const Packet& a, + const Packet& b) { return a-b; } /** \internal \returns -a (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet @@ -198,86 +167,32 @@ pconj(const Packet& a) { return numext::conj(a); } /** \internal \returns a * b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pmul(const Packet& a, const Packet& b) { return a*b; } -// Avoid compiler warning for boolean algebra. -template<> EIGEN_DEVICE_FUNC inline bool -pmul(const bool& a, const bool& b) { return a && b; } +pmul(const Packet& a, + const Packet& b) { return a*b; } /** \internal \returns a / b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pdiv(const Packet& a, const Packet& b) { return a/b; } +pdiv(const Packet& a, + const Packet& b) { return a/b; } /** \internal \returns the min of \a a and \a b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pmin(const Packet& a, const Packet& b) { return numext::mini(a, b); } +pmin(const Packet& a, + const Packet& b) { return numext::mini(a, b); } /** \internal \returns the max of \a a and \a b (coeff-wise) */ template EIGEN_DEVICE_FUNC inline Packet -pmax(const Packet& a, const Packet& b) { return numext::maxi(a, b); } +pmax(const Packet& a, + const Packet& b) { return numext::maxi(a, b); } /** \internal \returns the absolute value of \a a */ template EIGEN_DEVICE_FUNC inline Packet pabs(const Packet& a) { using std::abs; return abs(a); } -template<> EIGEN_DEVICE_FUNC inline unsigned int -pabs(const unsigned int& a) { return a; } -template<> EIGEN_DEVICE_FUNC inline unsigned long -pabs(const unsigned long& a) { return a; } -template<> EIGEN_DEVICE_FUNC inline unsigned long long -pabs(const unsigned long long& a) { return a; } /** \internal \returns the phase angle of \a a */ template EIGEN_DEVICE_FUNC inline Packet parg(const Packet& a) { using numext::arg; return arg(a); } - -/** \internal \returns \a a logically shifted by N bits to the right */ -template EIGEN_DEVICE_FUNC inline int -parithmetic_shift_right(const int& a) { return a >> N; } -template EIGEN_DEVICE_FUNC inline long int -parithmetic_shift_right(const long int& a) { return a >> N; } - -/** \internal \returns \a a arithmetically shifted by N bits to the right */ -template EIGEN_DEVICE_FUNC inline int -plogical_shift_right(const int& a) { return static_cast(static_cast(a) >> N); } -template EIGEN_DEVICE_FUNC inline long int -plogical_shift_right(const long int& a) { return static_cast(static_cast(a) >> N); } - -/** \internal \returns \a a shifted by N bits to the left */ -template EIGEN_DEVICE_FUNC inline int -plogical_shift_left(const int& a) { return a << N; } -template EIGEN_DEVICE_FUNC inline long int -plogical_shift_left(const long int& a) { return a << N; } - -/** \internal \returns the significant and exponent of the underlying floating point numbers - * See https://en.cppreference.com/w/cpp/numeric/math/frexp - */ -template -EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) { - int exp; - EIGEN_USING_STD_MATH(frexp); - Packet result = frexp(a, &exp); - exponent = static_cast(exp); - return result; -} - -/** \internal \returns a * 2^exponent - * See https://en.cppreference.com/w/cpp/numeric/math/ldexp - */ -template EIGEN_DEVICE_FUNC inline Packet -pldexp(const Packet &a, const Packet &exponent) { - EIGEN_USING_STD_MATH(ldexp); - return ldexp(a, static_cast(exponent)); -} - -// Notice: The following ops accept and operator on bitwise masks. -// The value of each field in a masks is Scalar(0) or ~Scalar(0). -// For boolean packet like Packet16b, this is different from the -// representation of true and false, which are 1 and 0. -// As an example -// ptrue() = 0xffffffffffffffffffffffffffffffff -// while -// pset1(true) = 0x01010101010101010101010101010101 - /** \internal \returns the bitwise and of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet pand(const Packet& a, const Packet& b) { return a & b; } @@ -290,76 +205,9 @@ por(const Packet& a, const Packet& b) { return a | b; } template EIGEN_DEVICE_FUNC inline Packet pxor(const Packet& a, const Packet& b) { return a ^ b; } -/** \internal \returns the bitwise and of \a a and not \a b */ +/** \internal \returns the bitwise andnot of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return a & (~b); } - -/** \internal \returns ones */ -template EIGEN_DEVICE_FUNC inline Packet -ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} - -/** \internal \returns zeros */ -template EIGEN_DEVICE_FUNC inline Packet -pzero(const Packet& a) { return pxor(a,a); } - -template<> EIGEN_DEVICE_FUNC inline float pzero(const float& a) { - EIGEN_UNUSED_VARIABLE(a); - return 0.f; -} - -template<> EIGEN_DEVICE_FUNC inline double pzero(const double& a) { - EIGEN_UNUSED_VARIABLE(a); - return 0.; -} - -template -EIGEN_DEVICE_FUNC inline std::complex ptrue(const std::complex& /*a*/) { - RealScalar b; - b = ptrue(b); - return std::complex(b, b); -} - -/** \internal \returns the bitwise not of \a a */ -template EIGEN_DEVICE_FUNC inline Packet -pnot(const Packet& a) { return pxor(ptrue(a), a);} - -/** \internal \returns a <= b as a bit mask */ -template EIGEN_DEVICE_FUNC inline Packet -pcmp_le(const Packet& a, const Packet& b) { return a<=b ? ptrue(a) : pzero(a); } - -/** \internal \returns a < b as a bit mask */ -template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt(const Packet& a, const Packet& b) { return a EIGEN_DEVICE_FUNC inline Packet -pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } - -/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ -template EIGEN_DEVICE_FUNC inline Packet -pcmp_lt_or_nan(const Packet& a, const Packet& b) { return pnot(pcmp_le(b,a)); } - -/** \internal \returns \a or \b for each field in packet according to \mask */ -template EIGEN_DEVICE_FUNC inline Packet -pselect(const Packet& mask, const Packet& a, const Packet& b) { - return por(pand(a,mask),pandnot(b,mask)); -} - -template<> EIGEN_DEVICE_FUNC inline float pselect( - const float& cond, const float& a, const float&b) { - return numext::equal_strict(cond,0.f) ? b : a; -} - -template<> EIGEN_DEVICE_FUNC inline double pselect( - const double& cond, const double& a, const double& b) { - return numext::equal_strict(cond,0.) ? b : a; -} - - - -/** \internal \returns the min of \a a and \a b (coeff-wise) */ -template EIGEN_DEVICE_FUNC inline Packet -pabsdiff(const Packet& a, const Packet& b) { return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b)); } +pandnot(const Packet& a, const Packet& b) { return a & (!b); } /** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */ template EIGEN_DEVICE_FUNC inline Packet @@ -369,22 +217,10 @@ pload(const typename unpacket_traits::type* from) { return *from; } template EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits::type* from) { return *from; } -/** \internal \returns a packet version of \a *from, (un-aligned masked load) - * There is no generic implementation. We only have implementations for specialized - * cases. Generic case should not be called. - */ -template EIGEN_DEVICE_FUNC inline -typename enable_if::masked_load_available, Packet>::type -ploadu(const typename unpacket_traits::type* from, typename unpacket_traits::mask_t umask); - /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */ template EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits::type& a) { return a; } -/** \internal \returns a packet with constant coefficients set from bits */ -template EIGEN_DEVICE_FUNC inline Packet -pset1frombits(BitsType a); - /** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */ template EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits::type *a) { return pset1(*a); } @@ -453,15 +289,6 @@ template EIGEN_DEVICE_FUNC inline void pstore( template EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from) { (*to) = from; } -/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask) - * There is no generic implementation. We only have implementations for specialized - * cases. Generic case should not be called. - */ -template -EIGEN_DEVICE_FUNC inline -typename enable_if::masked_store_available, void>::type -pstoreu(Scalar* to, const Packet& from, typename unpacket_traits::mask_t umask); - template EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/) { return ploadu(from); } @@ -471,9 +298,7 @@ pstoreu(Scalar* to, const Packet& from, typename unpacket_traits::mask_t /** \internal tries to do cache prefetching of \a addr */ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - // do nothing -#elif defined(EIGEN_CUDA_ARCH) +#ifdef __CUDA_ARCH__ #if defined(__LP64__) // 64-bit pointer operand constraint for inlined asm asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr)); @@ -490,52 +315,35 @@ template EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* a template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type pfirst(const Packet& a) { return a; } +/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */ +template EIGEN_DEVICE_FUNC inline Packet +preduxp(const Packet* vecs) { return vecs[0]; } + /** \internal \returns the sum of the elements of \a a*/ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux(const Packet& a) { return a; } -/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4. +/** \internal \returns the sum of the elements of \a a by block of 4 elements. * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7} * For packet-size smaller or equal to 4, this boils down to a noop. */ template EIGEN_DEVICE_FUNC inline typename conditional<(unpacket_traits::size%8)==0,typename unpacket_traits::half,Packet>::type -predux_half_dowto4(const Packet& a) +predux_downto4(const Packet& a) { return a; } -/** \internal \returns the product of the elements of \a a */ +/** \internal \returns the product of the elements of \a a*/ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_mul(const Packet& a) { return a; } -/** \internal \returns the min of the elements of \a a */ +/** \internal \returns the min of the elements of \a a*/ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_min(const Packet& a) { return a; } -/** \internal \returns the max of the elements of \a a */ +/** \internal \returns the max of the elements of \a a*/ template EIGEN_DEVICE_FUNC inline typename unpacket_traits::type predux_max(const Packet& a) { return a; } -/** \internal \returns true if all coeffs of \a a means "true" - * It is supposed to be called on values returned by pcmp_*. - */ -// not needed yet -// template EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a) -// { return bool(a); } - -/** \internal \returns true if any coeffs of \a a means "true" - * It is supposed to be called on values returned by pcmp_*. - */ -template EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a) -{ - // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames. - // It is expected that "true" is either: - // - Scalar(1) - // - bits full of ones (NaN for floats), - // - or first bit equals to 1 (1 for ints, smallest denormal for floats). - // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars. - return bool(predux(a)); -} - /** \internal \returns the reversed elements of \a a*/ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } @@ -543,7 +351,7 @@ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) { - return Packet(numext::imag(a),numext::real(a)); + return Packet(a.imag(),a.real()); } /************************** @@ -552,51 +360,47 @@ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet /** \internal \returns the sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psin(const Packet& a) { EIGEN_USING_STD_MATH(sin); return sin(a); } +Packet psin(const Packet& a) { using std::sin; return sin(a); } /** \internal \returns the cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pcos(const Packet& a) { EIGEN_USING_STD_MATH(cos); return cos(a); } +Packet pcos(const Packet& a) { using std::cos; return cos(a); } /** \internal \returns the tan of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet ptan(const Packet& a) { EIGEN_USING_STD_MATH(tan); return tan(a); } +Packet ptan(const Packet& a) { using std::tan; return tan(a); } /** \internal \returns the arc sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pasin(const Packet& a) { EIGEN_USING_STD_MATH(asin); return asin(a); } +Packet pasin(const Packet& a) { using std::asin; return asin(a); } /** \internal \returns the arc cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pacos(const Packet& a) { EIGEN_USING_STD_MATH(acos); return acos(a); } +Packet pacos(const Packet& a) { using std::acos; return acos(a); } /** \internal \returns the arc tangent of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet patan(const Packet& a) { EIGEN_USING_STD_MATH(atan); return atan(a); } +Packet patan(const Packet& a) { using std::atan; return atan(a); } /** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psinh(const Packet& a) { EIGEN_USING_STD_MATH(sinh); return sinh(a); } +Packet psinh(const Packet& a) { using std::sinh; return sinh(a); } /** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pcosh(const Packet& a) { EIGEN_USING_STD_MATH(cosh); return cosh(a); } +Packet pcosh(const Packet& a) { using std::cosh; return cosh(a); } /** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet ptanh(const Packet& a) { EIGEN_USING_STD_MATH(tanh); return tanh(a); } +Packet ptanh(const Packet& a) { using std::tanh; return tanh(a); } /** \internal \returns the exp of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pexp(const Packet& a) { EIGEN_USING_STD_MATH(exp); return exp(a); } - -/** \internal \returns the expm1 of \a a (coeff-wise) */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet pexpm1(const Packet& a) { return numext::expm1(a); } +Packet pexp(const Packet& a) { using std::exp; return exp(a); } /** \internal \returns the log of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plog(const Packet& a) { EIGEN_USING_STD_MATH(log); return log(a); } +Packet plog(const Packet& a) { using std::log; return log(a); } /** \internal \returns the log1p of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS @@ -604,11 +408,11 @@ Packet plog1p(const Packet& a) { return numext::log1p(a); } /** \internal \returns the log10 of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet plog10(const Packet& a) { EIGEN_USING_STD_MATH(log10); return log10(a); } +Packet plog10(const Packet& a) { using std::log10; return log10(a); } /** \internal \returns the square-root of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet psqrt(const Packet& a) { EIGEN_USING_STD_MATH(sqrt); return sqrt(a); } +Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); } /** \internal \returns the reciprocal square-root of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS @@ -624,11 +428,6 @@ Packet pround(const Packet& a) { using numext::round; return round(a); } template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pfloor(const Packet& a) { using numext::floor; return floor(a); } -/** \internal \returns the rounded value of \a a (coeff-wise) with current - * rounding mode */ -template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -Packet print(const Packet& a) { using numext::rint; return rint(a); } - /** \internal \returns the ceil of \a a (coeff-wise) */ template EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } @@ -637,7 +436,7 @@ Packet pceil(const Packet& a) { using numext::ceil; return ceil(a); } * The following functions might not have to be overwritten for vectorized types ***************************************************************************/ -/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */ +/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */ // NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type) template inline void pstore1(typename unpacket_traits::type* to, const typename unpacket_traits::type& a) @@ -685,12 +484,41 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_t return ploadt(from); } +/** \internal default implementation of palign() allowing partial specialization */ +template +struct palign_impl +{ + // by default data are aligned, so there is nothing to be done :) + static inline void run(PacketType&, const PacketType&) {} +}; + +/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements + * of \a first and \a Offset first elements of \a second. + * + * This function is currently only used to optimize matrix-vector products on unligned matrices. + * It takes 2 packets that represent a contiguous memory array, and returns a packet starting + * at the position \a Offset. For instance, for packets of 4 elements, we have: + * Input: + * - first = {f0,f1,f2,f3} + * - second = {s0,s1,s2,s3} + * Output: + * - if Offset==0 then {f0,f1,f2,f3} + * - if Offset==1 then {f1,f2,f3,s0} + * - if Offset==2 then {f2,f3,s0,s1} + * - if Offset==3 then {f3,s0,s1,s3} + */ +template +inline void palign(PacketType& first, const PacketType& second) +{ + palign_impl::run(first,second); +} + /*************************************************************************** * Fast complex products (GCC generates a function call which is very slow) ***************************************************************************/ // Eigen+CUDA does not support complexes. -#if !defined(EIGEN_GPUCC) +#ifndef __CUDACC__ template<> inline std::complex pmul(const std::complex& a, const std::complex& b) { return std::complex(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); } @@ -727,21 +555,33 @@ pblend(const Selector::size>& ifPacket, const Packet& th return ifPacket.select[0] ? thenPacket : elsePacket; } -/*************************************************************************** - * Some generic implementations to be used by implementors -***************************************************************************/ +/** \internal \returns \a a with the first coefficient replaced by the scalar b */ +template EIGEN_DEVICE_FUNC inline Packet +pinsertfirst(const Packet& a, typename unpacket_traits::type b) +{ + // Default implementation based on pblend. + // It must be specialized for higher performance. + Selector::size> mask; + mask.select[0] = true; + // This for loop should be optimized away by the compiler. + for(Index i=1; i::size; ++i) + mask.select[i] = false; + return pblend(mask, pset1(b), a); +} -/** Default implementation of pfrexp for float. - * It is expected to be called by implementers of template<> pfrexp. - */ -template EIGEN_STRONG_INLINE Packet -pfrexp_float(const Packet& a, Packet& exponent); - -/** Default implementation of pldexp for float. - * It is expected to be called by implementers of template<> pldexp. - */ -template EIGEN_STRONG_INLINE Packet -pldexp_float(Packet a, Packet exponent); +/** \internal \returns \a a with the last coefficient replaced by the scalar b */ +template EIGEN_DEVICE_FUNC inline Packet +pinsertlast(const Packet& a, typename unpacket_traits::type b) +{ + // Default implementation based on pblend. + // It must be specialized for higher performance. + Selector::size> mask; + // This for loop should be optimized away by the compiler. + for(Index i=0; i::size-1; ++i) + mask.select[i] = false; + mask.select[unpacket_traits::size-1] = true; + return pblend(mask, pset1(b), a); +} } // end namespace internal diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/GlobalFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/GlobalFunctions.h index 8d54f92df..769dc255c 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/GlobalFunctions.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/GlobalFunctions.h @@ -66,19 +66,11 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh,scalar_sinh_op,hyperbolic sine,\sa ArrayBase::sinh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh,scalar_cosh_op,hyperbolic cosine,\sa ArrayBase::cosh) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh,scalar_tanh_op,hyperbolic tangent,\sa ArrayBase::tanh) -#if EIGEN_HAS_CXX11_MATH - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh,scalar_asinh_op,inverse hyperbolic sine,\sa ArrayBase::asinh) - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh,scalar_acosh_op,inverse hyperbolic cosine,\sa ArrayBase::acosh) - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh,scalar_atanh_op,inverse hyperbolic tangent,\sa ArrayBase::atanh) -#endif - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic,scalar_logistic_op,logistic function,\sa ArrayBase::logistic) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma,scalar_lgamma_op,natural logarithm of the gamma function,\sa ArrayBase::lgamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma,scalar_digamma_op,derivative of lgamma,\sa ArrayBase::digamma) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf,scalar_erf_op,error function,\sa ArrayBase::erf) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc,scalar_erfc_op,complement error function,\sa ArrayBase::erfc) - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri,scalar_ndtri_op,inverse normal distribution function,\sa ArrayBase::ndtri) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op,exponential,\sa ArrayBase::exp) - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1,scalar_expm1_op,exponential of a value minus 1,\sa ArrayBase::expm1) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op,natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p,scalar_log1p_op,natural logarithm of 1 plus the value,\sa ArrayBase::log1p) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10,scalar_log10_op,base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log) @@ -89,7 +81,6 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt,scalar_rsqrt_op,reciprocal square root,\sa ArrayBase::rsqrt) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square,scalar_square_op,square (power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube,scalar_cube_op,cube (power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube) - EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rint,scalar_rint_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round,scalar_round_op,nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(floor,scalar_floor_op,nearest integer not greater than the giben value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ceil,scalar_ceil_op,nearest integer not less than the giben value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil) @@ -97,7 +88,7 @@ namespace Eigen EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isinf,scalar_isinf_op,infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite,scalar_isfinite_op,finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite) EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign,scalar_sign_op,sign (or 0),\sa ArrayBase::sign) - + /** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent. * * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given expression (\c Derived::Scalar). @@ -111,18 +102,17 @@ namespace Eigen inline const CwiseBinaryOp,Derived,Constant > pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent); #else - template - EIGEN_DEVICE_FUNC inline - EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( - const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,pow)) - pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) - { - typedef typename internal::promote_scalar_arg::type PromotedExponent; - return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,PromotedExponent,pow)(x.derived(), - typename internal::plain_constant_type::type(x.derived().rows(), x.derived().cols(), internal::scalar_constant_op(exponent))); + template + inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,typename Derived::Scalar,ScalarExponent), + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,ScalarExponent,pow) >::type + pow(const Eigen::ArrayBase& x, const ScalarExponent& exponent) { + return x.derived().pow(exponent); + } + + template + inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename Derived::Scalar,pow) + pow(const Eigen::ArrayBase& x, const typename Derived::Scalar& exponent) { + return x.derived().pow(exponent); } #endif @@ -132,21 +122,21 @@ namespace Eigen * * Example: \include Cwise_array_power_array.cpp * Output: \verbinclude Cwise_array_power_array.out - * + * * \sa ArrayBase::pow() * * \relates ArrayBase */ template inline const Eigen::CwiseBinaryOp, const Derived, const ExponentDerived> - pow(const Eigen::ArrayBase& x, const Eigen::ArrayBase& exponents) + pow(const Eigen::ArrayBase& x, const Eigen::ArrayBase& exponents) { return Eigen::CwiseBinaryOp, const Derived, const ExponentDerived>( x.derived(), exponents.derived() ); } - + /** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents. * * This function computes the coefficient-wise power between a scalar and an array of exponents. @@ -155,7 +145,7 @@ namespace Eigen * * Example: \include Cwise_scalar_power_array.cpp * Output: \verbinclude Cwise_scalar_power_array.out - * + * * \sa ArrayBase::pow() * * \relates ArrayBase @@ -165,17 +155,21 @@ namespace Eigen inline const CwiseBinaryOp,Constant,Derived> pow(const Scalar& x,const Eigen::ArrayBase& x); #else - template - EIGEN_DEVICE_FUNC inline - EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE( - const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,pow)) - pow(const Scalar& x, const Eigen::ArrayBase& exponents) { - typedef typename internal::promote_scalar_arg::type PromotedScalar; - return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar,Derived,pow)( - typename internal::plain_constant_type::type(exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op(x)), exponents.derived()); + template + inline typename internal::enable_if< !(internal::is_same::value) && EIGEN_SCALAR_BINARY_SUPPORTED(pow,Scalar,typename Derived::Scalar), + const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow) >::type + pow(const Scalar& x, const Eigen::ArrayBase& exponents) + { + return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar,Derived,pow)( + typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); + } + + template + inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow) + pow(const typename Derived::Scalar& x, const Eigen::ArrayBase& exponents) + { + return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename Derived::Scalar,Derived,pow)( + typename internal::plain_constant_type::type(exponents.rows(), exponents.cols(), x), exponents.derived() ); } #endif diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/IO.h b/uppsrc/plugin/Eigen/Eigen/src/Core/IO.h index e81c31521..da7fd6cce 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/IO.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/IO.h @@ -41,7 +41,6 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& * - \b rowSuffix string printed at the end of each row * - \b matPrefix string printed at the beginning of the matrix * - \b matSuffix string printed at the end of the matrix - * - \b fill character printed to fill the empty space in aligned columns * * Example: \include IOFormat.cpp * Output: \verbinclude IOFormat.out @@ -54,9 +53,9 @@ struct IOFormat IOFormat(int _precision = StreamPrecision, int _flags = 0, const std::string& _coeffSeparator = " ", const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="", - const std::string& _matPrefix="", const std::string& _matSuffix="", const char _fill=' ') + const std::string& _matPrefix="", const std::string& _matSuffix="") : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator), - rowSpacer(""), coeffSeparator(_coeffSeparator), fill(_fill), precision(_precision), flags(_flags) + rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags) { // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline // don't add rowSpacer if columns are not to be aligned @@ -72,7 +71,6 @@ struct IOFormat std::string matPrefix, matSuffix; std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer; std::string coeffSeparator; - char fill; int precision; int flags; }; @@ -130,9 +128,6 @@ struct significant_decimals_impl template std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt) { - using internal::is_same; - using internal::conditional; - if(_m.size() == 0) { s << fmt.matPrefix << fmt.matSuffix; @@ -141,22 +136,6 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& typename Derived::Nested m = _m; typedef typename Derived::Scalar Scalar; - typedef typename - conditional< - is_same::value || - is_same::value || - is_same::value || - is_same::value, - int, - typename conditional< - is_same >::value || - is_same >::value || - is_same >::value || - is_same >::value, - std::complex, - const Scalar& - >::type - >::type PrintType; Index width = 0; @@ -193,31 +172,23 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& { std::stringstream sstr; sstr.copyfmt(s); - sstr << static_cast(m.coeff(i,j)); + sstr << m.coeff(i,j); width = std::max(width, Index(sstr.str().length())); } } - std::streamsize old_width = s.width(); - char old_fill_character = s.fill(); s << fmt.matPrefix; for(Index i = 0; i < m.rows(); ++i) { if (i) s << fmt.rowSpacer; s << fmt.rowPrefix; - if(width) { - s.fill(fmt.fill); - s.width(width); - } - s << static_cast(m.coeff(i, 0)); + if(width) s.width(width); + s << m.coeff(i, 0); for(Index j = 1; j < m.cols(); ++j) { s << fmt.coeffSeparator; - if(width) { - s.fill(fmt.fill); - s.width(width); - } - s << static_cast(m.coeff(i, j)); + if (width) s.width(width); + s << m.coeff(i, j); } s << fmt.rowSuffix; if( i < m.rows() - 1) @@ -225,10 +196,6 @@ std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& } s << fmt.matSuffix; if(explicit_precision) s.precision(old_precision); - if(width) { - s.fill(old_fill_character); - s.width(old_width); - } return s; } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/IndexedView.h b/uppsrc/plugin/Eigen/Eigen/src/Core/IndexedView.h deleted file mode 100644 index 377f8a5cc..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/IndexedView.h +++ /dev/null @@ -1,207 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2017 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_INDEXED_VIEW_H -#define EIGEN_INDEXED_VIEW_H - -namespace Eigen { - -namespace internal { - -template -struct traits > - : traits -{ - enum { - RowsAtCompileTime = int(array_size::value), - ColsAtCompileTime = int(array_size::value), - MaxRowsAtCompileTime = RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime) : Dynamic, - MaxColsAtCompileTime = ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime) : Dynamic, - - XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0, - IsRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1 - : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0 - : XprTypeIsRowMajor, - - RowIncr = int(get_compile_time_incr::value), - ColIncr = int(get_compile_time_incr::value), - InnerIncr = IsRowMajor ? ColIncr : RowIncr, - OuterIncr = IsRowMajor ? RowIncr : ColIncr, - - HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor), - XprInnerStride = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time::ret) : int(outer_stride_at_compile_time::ret), - XprOuterstride = HasSameStorageOrderAsXprType ? int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret), - - InnerSize = XprTypeIsRowMajor ? ColsAtCompileTime : RowsAtCompileTime, - IsBlockAlike = InnerIncr==1 && OuterIncr==1, - IsInnerPannel = HasSameStorageOrderAsXprType && is_same,typename conditional::type>::value, - - InnerStrideAtCompileTime = InnerIncr<0 || InnerIncr==DynamicIndex || XprInnerStride==Dynamic ? Dynamic : XprInnerStride * InnerIncr, - OuterStrideAtCompileTime = OuterIncr<0 || OuterIncr==DynamicIndex || XprOuterstride==Dynamic ? Dynamic : XprOuterstride * OuterIncr, - - ReturnAsScalar = is_same::value && is_same::value, - ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike, - ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock), - - // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag, - // but this is too strict regarding negative strides... - DirectAccessMask = (int(InnerIncr)!=UndefinedIncr && int(OuterIncr)!=UndefinedIncr && InnerIncr>=0 && OuterIncr>=0) ? DirectAccessBit : 0, - FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0, - FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, - Flags = (traits::Flags & (HereditaryBits | DirectAccessMask)) | FlagsLvalueBit | FlagsRowMajorBit - }; - - typedef Block BlockType; -}; - -} - -template -class IndexedViewImpl; - - -/** \class IndexedView - * \ingroup Core_Module - * - * \brief Expression of a non-sequential sub-matrix defined by arbitrary sequences of row and column indices - * - * \tparam XprType the type of the expression in which we are taking the intersections of sub-rows and sub-columns - * \tparam RowIndices the type of the object defining the sequence of row indices - * \tparam ColIndices the type of the object defining the sequence of column indices - * - * This class represents an expression of a sub-matrix (or sub-vector) defined as the intersection - * of sub-sets of rows and columns, that are themself defined by generic sequences of row indices \f$ \{r_0,r_1,..r_{m-1}\} \f$ - * and column indices \f$ \{c_0,c_1,..c_{n-1} \}\f$. Let \f$ A \f$ be the nested matrix, then the resulting matrix \f$ B \f$ has \c m - * rows and \c n columns, and its entries are given by: \f$ B(i,j) = A(r_i,c_j) \f$. - * - * The \c RowIndices and \c ColIndices types must be compatible with the following API: - * \code - * operator[](Index) const; - * Index size() const; - * \endcode - * - * Typical supported types thus include: - * - std::vector - * - std::valarray - * - std::array - * - Plain C arrays: int[N] - * - Eigen::ArrayXi - * - decltype(ArrayXi::LinSpaced(...)) - * - Any view/expressions of the previous types - * - Eigen::ArithmeticSequence - * - Eigen::internal::AllRange (helper for Eigen::all) - * - Eigen::internal::SingleRange (helper for single index) - * - etc. - * - * In typical usages of %Eigen, this class should never be used directly. It is the return type of - * DenseBase::operator()(const RowIndices&, const ColIndices&). - * - * \sa class Block - */ -template -class IndexedView : public IndexedViewImpl::StorageKind> -{ -public: - typedef typename IndexedViewImpl::StorageKind>::Base Base; - EIGEN_GENERIC_PUBLIC_INTERFACE(IndexedView) - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedView) - - typedef typename internal::ref_selector::non_const_type MatrixTypeNested; - typedef typename internal::remove_all::type NestedExpression; - - template - IndexedView(XprType& xpr, const T0& rowIndices, const T1& colIndices) - : m_xpr(xpr), m_rowIndices(rowIndices), m_colIndices(colIndices) - {} - - /** \returns number of rows */ - Index rows() const { return internal::size(m_rowIndices); } - - /** \returns number of columns */ - Index cols() const { return internal::size(m_colIndices); } - - /** \returns the nested expression */ - const typename internal::remove_all::type& - nestedExpression() const { return m_xpr; } - - /** \returns the nested expression */ - typename internal::remove_reference::type& - nestedExpression() { return m_xpr; } - - /** \returns a const reference to the object storing/generating the row indices */ - const RowIndices& rowIndices() const { return m_rowIndices; } - - /** \returns a const reference to the object storing/generating the column indices */ - const ColIndices& colIndices() const { return m_colIndices; } - -protected: - MatrixTypeNested m_xpr; - RowIndices m_rowIndices; - ColIndices m_colIndices; -}; - - -// Generic API dispatcher -template -class IndexedViewImpl - : public internal::generic_xpr_base >::type -{ -public: - typedef typename internal::generic_xpr_base >::type Base; -}; - -namespace internal { - - -template -struct unary_evaluator, IndexBased> - : evaluator_base > -{ - typedef IndexedView XprType; - - enum { - CoeffReadCost = evaluator::CoeffReadCost /* TODO + cost of row/col index */, - - Flags = (evaluator::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)), - - Alignment = 0 - }; - - EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) - { - EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); - } - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - CoeffReturnType coeff(Index row, Index col) const - { - return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Scalar& coeffRef(Index row, Index col) - { - return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); - } - -protected: - - evaluator m_argImpl; - const XprType& m_xpr; - -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_INDEXED_VIEW_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Inverse.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Inverse.h index 7352d8037..b76f0439d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Inverse.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Inverse.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2014-2019 Gael Guennebaud +// Copyright (C) 2014 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -44,6 +44,7 @@ class Inverse : public InverseImpl::S { public: typedef typename XprType::StorageIndex StorageIndex; + typedef typename XprType::PlainObject PlainObject; typedef typename XprType::Scalar Scalar; typedef typename internal::ref_selector::type XprTypeNested; typedef typename internal::remove_all::type XprTypeNestedCleaned; @@ -54,8 +55,8 @@ public: : m_xpr(xpr) {} - EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.cols(); } - EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Map.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Map.h index c437f1a92..548bf9a2d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Map.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Map.h @@ -113,10 +113,10 @@ template class Ma EIGEN_DEVICE_FUNC inline Index outerStride() const { - return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : internal::traits::OuterStrideAtCompileTime != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) + return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer() + : int(internal::traits::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) : IsVectorAtCompileTime ? (this->size() * innerStride()) - : int(Flags)&RowMajorBit ? (this->cols() * innerStride()) + : (int(Flags)&RowMajorBit) ? (this->cols() * innerStride()) : (this->rows() * innerStride()); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctions.h index 96cb24fcb..01736c2a0 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctions.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctions.h @@ -14,6 +14,7 @@ // TODO this should better be moved to NumTraits #define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L + namespace Eigen { // On WINCE, std::abs is defined for int only, so let's defined our own overloads: @@ -96,7 +97,7 @@ struct real_default_impl template struct real_impl : real_default_impl {}; -#if defined(EIGEN_GPU_COMPILE_PHASE) +#ifdef __CUDA_ARCH__ template struct real_impl > { @@ -144,7 +145,7 @@ struct imag_default_impl template struct imag_impl : imag_default_impl {}; -#if defined(EIGEN_GPU_COMPILE_PHASE) +#ifdef __CUDA_ARCH__ template struct imag_impl > { @@ -238,7 +239,7 @@ struct imag_ref_retval ****************************************************************************/ template::IsComplex> -struct conj_default_impl +struct conj_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -248,7 +249,7 @@ struct conj_default_impl }; template -struct conj_default_impl +struct conj_impl { EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) @@ -258,20 +259,6 @@ struct conj_default_impl } }; -template struct conj_impl : conj_default_impl {}; - -#if defined(EIGEN_GPU_COMPILE_PHASE) -template -struct conj_impl > -{ - EIGEN_DEVICE_FUNC - static inline std::complex run(const std::complex& x) - { - return std::complex(x.real(), -x.imag()); - } -}; -#endif - template struct conj_retval { @@ -402,11 +389,10 @@ inline NewType cast(const OldType& x) #if EIGEN_HAS_CXX11_MATH template struct round_impl { - EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) - EIGEN_USING_STD_MATH(round); + using std::round; return round(x); } }; @@ -414,7 +400,6 @@ inline NewType cast(const OldType& x) template struct round_impl { - EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) @@ -431,48 +416,6 @@ struct round_retval typedef Scalar type; }; -/**************************************************************************** -* Implementation of rint * -****************************************************************************/ - -template -struct rint_impl { - EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) - { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), NUMERIC_TYPE_MUST_BE_REAL) -#if EIGEN_HAS_CXX11_MATH - EIGEN_USING_STD_MATH(rint); -#endif - return rint(x); - } -}; - -#if !EIGEN_HAS_CXX11_MATH -template<> -struct rint_impl { - EIGEN_DEVICE_FUNC - static inline double run(const double& x) - { - return ::rint(x); - } -}; -template<> -struct rint_impl { - EIGEN_DEVICE_FUNC - static inline float run(const float& x) - { - return ::rintf(x); - } -}; -#endif - -template -struct rint_retval -{ - typedef Scalar type; -}; - /**************************************************************************** * Implementation of arg * ****************************************************************************/ @@ -480,15 +423,9 @@ struct rint_retval #if EIGEN_HAS_CXX11_MATH template struct arg_impl { - EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - // HIP does not seem to have a native device side implementation for the math routine "arg" - using std::arg; - #else EIGEN_USING_STD_MATH(arg); - #endif return arg(x); } }; @@ -524,86 +461,6 @@ struct arg_retval typedef typename NumTraits::Real type; }; -/**************************************************************************** -* Implementation of expm1 * -****************************************************************************/ - -// This implementation is based on GSL Math's expm1. -namespace std_fallback { - // fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar, - // or that there is no suitable std::expm1 function available. Implementation - // attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php. - template - EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - typedef typename NumTraits::Real RealScalar; - - EIGEN_USING_STD_MATH(exp); - Scalar u = exp(x); - if (numext::equal_strict(u, Scalar(1))) { - return x; - } - Scalar um1 = u - RealScalar(1); - if (numext::equal_strict(um1, Scalar(-1))) { - return RealScalar(-1); - } - - EIGEN_USING_STD_MATH(log); - Scalar logu = log(u); - return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu; - } -} - -template -struct expm1_impl { - EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) - { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - #if EIGEN_HAS_CXX11_MATH - using std::expm1; - #else - using std_fallback::expm1; - #endif - return expm1(x); - } -}; - -// Specialization for complex types that are not supported by std::expm1. -template -struct expm1_impl > { - EIGEN_DEVICE_FUNC static inline std::complex run( - const std::complex& x) { - EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) - RealScalar xr = x.real(); - RealScalar xi = x.imag(); - // expm1(z) = exp(z) - 1 - // = exp(x + i * y) - 1 - // = exp(x) * (cos(y) + i * sin(y)) - 1 - // = exp(x) * cos(y) - 1 + i * exp(x) * sin(y) - // Imag(expm1(z)) = exp(x) * sin(y) - // Real(expm1(z)) = exp(x) * cos(y) - 1 - // = exp(x) * cos(y) - 1. - // = expm1(x) + exp(x) * (cos(y) - 1) - // = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2) - - // TODO better use numext::expm1 and numext::sin (but that would require forward declarations or moving this specialization down). - RealScalar erm1 = expm1_impl::run(xr); - RealScalar er = erm1 + RealScalar(1.); - EIGEN_USING_STD_MATH(sin); - RealScalar sin2 = sin(xi / RealScalar(2.)); - sin2 = sin2 * sin2; - RealScalar s = sin(xi); - RealScalar real_part = erm1 - RealScalar(2.) * er * sin2; - return std::complex(real_part, er * s); - } -}; - -template -struct expm1_retval -{ - typedef Scalar type; -}; - /**************************************************************************** * Implementation of log1p * ****************************************************************************/ @@ -617,36 +474,23 @@ namespace std_fallback { typedef typename NumTraits::Real RealScalar; EIGEN_USING_STD_MATH(log); Scalar x1p = RealScalar(1) + x; - Scalar log_1p = log(x1p); - const bool is_small = numext::equal_strict(x1p, Scalar(1)); - const bool is_inf = numext::equal_strict(x1p, log_1p); - return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1))); + return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); } } template struct log1p_impl { - EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) + static inline Scalar run(const Scalar& x) { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) #if EIGEN_HAS_CXX11_MATH using std::log1p; - #else - using std_fallback::log1p; #endif + using std_fallback::log1p; return log1p(x); } }; -// Specialization for complex types that are not supported by std::log1p. -template -struct log1p_impl > { - EIGEN_DEVICE_FUNC static inline std::complex run( - const std::complex& x) { - EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) - return std_fallback::log1p(x); - } -}; template struct log1p_retval @@ -843,7 +687,7 @@ inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(); } -// Implementation of is* functions +// Implementatin of is* functions // std::is* do not work with fast-math and gcc, std::is* are available on MSVC 2013 and newer, as well as in clang. #if (EIGEN_HAS_CXX11_MATH && !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || (EIGEN_COMP_MSVC>=1800) || (EIGEN_COMP_CLANG) @@ -872,7 +716,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isfinite_impl(const T& x) { - #if defined(EIGEN_GPU_COMPILE_PHASE) + #ifdef __CUDA_ARCH__ return (::isfinite)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isfinite; @@ -887,7 +731,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isinf_impl(const T& x) { - #if defined(EIGEN_GPU_COMPILE_PHASE) + #ifdef __CUDA_ARCH__ return (::isinf)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isinf; @@ -902,7 +746,7 @@ EIGEN_DEVICE_FUNC typename internal::enable_if<(!internal::is_integral::value)&&(!NumTraits::IsComplex),bool>::type isnan_impl(const T& x) { - #if defined(EIGEN_GPU_COMPILE_PHASE) + #ifdef __CUDA_ARCH__ return (::isnan)(x); #elif EIGEN_USE_STD_FPCLASSIFY using std::isnan; @@ -959,6 +803,7 @@ template EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex& x) template EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex& x); template T generic_fast_tanh_float(const T& a_x); + } // end namespace internal /**************************************************************************** @@ -967,7 +812,7 @@ template T generic_fast_tanh_float(const T& a_x); namespace numext { -#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) +#ifndef __CUDA_ARCH__ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) @@ -996,24 +841,6 @@ EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y) { return fminf(x, y); } -template<> -EIGEN_DEVICE_FUNC -EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y) -{ - return fmin(x, y); -} -template<> -EIGEN_DEVICE_FUNC -EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) -{ -#if defined(EIGEN_HIPCC) - // no "fminl" on HIP yet - return (x < y) ? x : y; -#else - return fminl(x, y); -#endif -} - template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) @@ -1026,92 +853,6 @@ EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y) { return fmaxf(x, y); } -template<> -EIGEN_DEVICE_FUNC -EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y) -{ - return fmax(x, y); -} -template<> -EIGEN_DEVICE_FUNC -EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) -{ -#if defined(EIGEN_HIPCC) - // no "fmaxl" on HIP yet - return (x > y) ? x : y; -#else - return fmaxl(x, y); -#endif -} -#endif - -#if defined(SYCL_DEVICE_ONLY) - - -#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long) -#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long) -#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) -#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) -#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \ - SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ - SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) -#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \ - SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ - SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) -#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC,cl::sycl::cl_double) -#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC,cl::sycl::cl_double) -#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \ - SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \ - SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double) - -#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ -template<> \ - EIGEN_DEVICE_FUNC \ - EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \ - return cl::sycl::FUNC(x); \ - } - -#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \ - SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE) - -#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \ - template<> \ - EIGEN_DEVICE_FUNC \ - EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \ - return cl::sycl::FUNC(x, y); \ - } - -#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ - SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE) - -#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \ - SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE) - -SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min) -SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin) -SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max) -SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax) - #endif @@ -1181,36 +922,6 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) EIGEN_DEVICE_FUNC inline bool abs2(bool x) { return x; } -template -EIGEN_DEVICE_FUNC -EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y) -{ - return x > y ? x - y : y - x; -} -template<> -EIGEN_DEVICE_FUNC -EIGEN_ALWAYS_INLINE float absdiff(const float& x, const float& y) -{ - return fabsf(x - y); -} -template<> -EIGEN_DEVICE_FUNC -EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y) -{ - return fabs(x - y); -} -template<> -EIGEN_DEVICE_FUNC -EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) -{ -#if defined(EIGEN_HIPCC) - // no "fabsl" on HIP yet - return (x > y) ? x : y; -#else - return fabsl(x - y); -#endif -} - template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) @@ -1225,10 +936,6 @@ inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y); } -#if defined(SYCL_DEVICE_ONLY) - SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot) -#endif - template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) @@ -1236,11 +943,7 @@ inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float &x) { return ::log1pf(x); } @@ -1255,27 +958,10 @@ inline typename internal::pow_impl::result_type pow(const Scala return internal::pow_impl::run(x, y); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow) -#endif - template EIGEN_DEVICE_FUNC bool (isnan) (const T &x) { return internal::isnan_impl(x); } template EIGEN_DEVICE_FUNC bool (isinf) (const T &x) { return internal::isinf_impl(x); } template EIGEN_DEVICE_FUNC bool (isfinite)(const T &x) { return internal::isfinite_impl(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool) -#endif - -template -EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(rint, Scalar) rint(const Scalar& x) -{ - return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x); -} - template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) @@ -1283,10 +969,6 @@ inline EIGEN_MATHFUNC_RETVAL(round, Scalar) round(const Scalar& x) return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round) -#endif - template EIGEN_DEVICE_FUNC T (floor)(const T& x) @@ -1295,11 +977,7 @@ T (floor)(const T& x) return floor(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float &x) { return ::floorf(x); } @@ -1315,11 +993,7 @@ T (ceil)(const T& x) return ceil(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float &x) { return ::ceilf(x); } @@ -1360,10 +1034,6 @@ T sqrt(const T &x) return sqrt(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt) -#endif - template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T &x) { @@ -1371,12 +1041,7 @@ T log(const T &x) { return log(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log) -#endif - - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float &x) { return ::logf(x); } @@ -1399,12 +1064,12 @@ abs(const T &x) { return x; } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs) -#endif +#if defined(__SYCL_DEVICE_ONLY__) +EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } +EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } +#endif // defined(__SYCL_DEVICE_ONLY__) -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } @@ -1429,51 +1094,12 @@ T exp(const T &x) { return exp(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float &x) { return ::expf(x); } template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double &x) { return ::exp(x); } - -template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -std::complex exp(const std::complex& x) { - float com = ::expf(x.real()); - float res_real = com * ::cosf(x.imag()); - float res_imag = com * ::sinf(x.imag()); - return std::complex(res_real, res_imag); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -std::complex exp(const std::complex& x) { - double com = ::exp(x.real()); - double res_real = com * ::cos(x.imag()); - double res_imag = com * ::sin(x.imag()); - return std::complex(res_real, res_imag); -} -#endif - -template -EIGEN_DEVICE_FUNC -inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x) -{ - return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1) -#endif - -#if defined(EIGEN_GPUCC) -template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -float expm1(const float &x) { return ::expm1f(x); } - -template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -double expm1(const double &x) { return ::expm1(x); } #endif template @@ -1483,11 +1109,7 @@ T cos(const T &x) { return cos(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos,cos) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float &x) { return ::cosf(x); } @@ -1502,11 +1124,7 @@ T sin(const T &x) { return sin(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float &x) { return ::sinf(x); } @@ -1521,11 +1139,7 @@ T tan(const T &x) { return tan(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float &x) { return ::tanf(x); } @@ -1540,21 +1154,7 @@ T acos(const T &x) { return acos(x); } -#if EIGEN_HAS_CXX11_MATH -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T acosh(const T &x) { - EIGEN_USING_STD_MATH(acosh); - return acosh(x); -} -#endif - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float &x) { return ::acosf(x); } @@ -1569,21 +1169,7 @@ T asin(const T &x) { return asin(x); } -#if EIGEN_HAS_CXX11_MATH -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T asinh(const T &x) { - EIGEN_USING_STD_MATH(asinh); - return asinh(x); -} -#endif - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float &x) { return ::asinf(x); } @@ -1598,21 +1184,7 @@ T atan(const T &x) { return atan(x); } -#if EIGEN_HAS_CXX11_MATH -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T atanh(const T &x) { - EIGEN_USING_STD_MATH(atanh); - return atanh(x); -} -#endif - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float &x) { return ::atanf(x); } @@ -1628,11 +1200,7 @@ T cosh(const T &x) { return cosh(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float &x) { return ::coshf(x); } @@ -1647,11 +1215,7 @@ T sinh(const T &x) { return sinh(x); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float &x) { return ::sinhf(x); } @@ -1666,16 +1230,12 @@ T tanh(const T &x) { return tanh(x); } -#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY) +#if (!defined(__CUDACC__)) && EIGEN_FAST_MATH EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::generic_fast_tanh_float(x); } #endif -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float &x) { return ::tanhf(x); } @@ -1690,11 +1250,7 @@ T fmod(const T& a, const T& b) { return fmod(a, b); } -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod) -#endif - -#if defined(EIGEN_GPUCC) +#ifdef __CUDACC__ template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) { @@ -1708,23 +1264,6 @@ double fmod(const double& a, const double& b) { } #endif -#if defined(SYCL_DEVICE_ONLY) -#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY -#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY -#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY -#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY -#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY -#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY -#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY -#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY -#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE -#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC -#undef SYCL_SPECIALIZE_UNARY_FUNC -#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC -#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC -#undef SYCL_SPECIALIZE_BINARY_FUNC -#endif - } // end namespace numext namespace internal { @@ -1853,13 +1392,13 @@ template<> struct random_impl template<> struct scalar_fuzzy_impl { typedef bool RealScalar; - + template EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&) { return !x; } - + EIGEN_DEVICE_FUNC static inline bool isApprox(bool x, bool y, bool) { @@ -1871,10 +1410,10 @@ template<> struct scalar_fuzzy_impl { return (!x) || y; } - + }; - + } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctionsImpl.h b/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctionsImpl.h index 7af58fadb..9c1ceb0eb 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctionsImpl.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/MathFunctionsImpl.h @@ -17,28 +17,24 @@ namespace internal { /** \internal \returns the hyperbolic tan of \a a (coeff-wise) Doesn't do anything fancy, just a 13/6-degree rational interpolant which - is accurate up to a couple of ulps in the (approximate) range [-8, 8], - outside of which tanh(x) = +/-1 in single precision. The input is clamped - to the range [-c, c]. The value c is chosen as the smallest value where - the approximation evaluates to exactly 1. In the reange [-0.0004, 0.0004] - the approxmation tanh(x) ~= x is used for better accuracy as x tends to zero. + is accurate up to a couple of ulp in the range [-9, 9], outside of which + the tanh(x) = +/-1. This implementation works on both scalars and packets. */ template T generic_fast_tanh_float(const T& a_x) { - // Clamp the inputs to the range [-c, c] -#ifdef EIGEN_VECTORIZE_FMA - const T plus_clamp = pset1(7.99881172180175781f); - const T minus_clamp = pset1(-7.99881172180175781f); -#else - const T plus_clamp = pset1(7.90531110763549805f); - const T minus_clamp = pset1(-7.90531110763549805f); -#endif - const T tiny = pset1(0.0004f); - const T x = pmax(pmin(a_x, plus_clamp), minus_clamp); - const T tiny_mask = pcmp_lt(pabs(a_x), tiny); + // Clamp the inputs to the range [-9, 9] since anything outside + // this range is +/-1.0f in single-precision. + const T plus_9 = pset1(9.f); + const T minus_9 = pset1(-9.f); + // NOTE GCC prior to 6.3 might improperly optimize this max/min + // step such that if a_x is nan, x will be either 9 or -9, + // and tanh will return 1 or -1 instead of nan. + // This is supposed to be fixed in gcc6.3, + // see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 + const T x = pmax(minus_9,pmin(plus_9,a_x)); // The monomial coefficients of the numerator polynomial (odd). const T alpha_1 = pset1(4.89352455891786e-03f); const T alpha_3 = pset1(6.37261928875436e-04f); @@ -66,24 +62,24 @@ T generic_fast_tanh_float(const T& a_x) p = pmadd(x2, p, alpha_1); p = pmul(x, p); - // Evaluate the denominator polynomial q. + // Evaluate the denominator polynomial p. T q = pmadd(x2, beta_6, beta_4); q = pmadd(x2, q, beta_2); q = pmadd(x2, q, beta_0); // Divide the numerator by the denominator. - return pselect(tiny_mask, x, pdiv(p, q)); + return pdiv(p, q); } template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +EIGEN_STRONG_INLINE RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) { EIGEN_USING_STD_MATH(sqrt); RealScalar p, qp; p = numext::maxi(x,y); if(p==RealScalar(0)) return RealScalar(0); - qp = numext::mini(y,x) / p; + qp = numext::mini(y,x) / p; return p * sqrt(RealScalar(1) + qp*qp); } @@ -91,8 +87,7 @@ template struct hypot_impl { typedef typename NumTraits::Real RealScalar; - static EIGEN_DEVICE_FUNC - inline RealScalar run(const Scalar& x, const Scalar& y) + static inline RealScalar run(const Scalar& x, const Scalar& y) { EIGEN_USING_STD_MATH(abs); return positive_real_hypot(abs(x), abs(y)); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Matrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Matrix.h index fb7238265..7f4a7af93 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Matrix.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Matrix.h @@ -255,27 +255,27 @@ class Matrix * * \sa resize(Index,Index) */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Matrix() : Base() + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix() : Base() { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } // FIXME is it still needed - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC explicit Matrix(internal::constructor_without_unaligned_array_assert) : Base(internal::constructor_without_unaligned_array_assert()) { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } #if EIGEN_HAS_RVALUE_REFERENCES - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC Matrix(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) : Base(std::move(other)) { Base::_check_template_params(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) { other.swap(*this); @@ -283,65 +283,25 @@ class Matrix } #endif -#if EIGEN_HAS_CXX11 - /** \copydoc PlainObjectBase(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&... args) - * - * Example: \include Matrix_variadic_ctor_cxx11.cpp - * Output: \verbinclude Matrix_variadic_ctor_cxx11.out - * - * \sa Matrix(const std::initializer_list>&) - */ - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) - : Base(a0, a1, a2, a3, args...) {} - - /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row. \cpp11 - * - * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients: - * - * Example: \include Matrix_initializer_list_23_cxx11.cpp - * Output: \verbinclude Matrix_initializer_list_23_cxx11.out - * - * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is triggered. - * - * In the case of a compile-time column vector, implicit transposition from a single row is allowed. - * Therefore VectorXd{{1,2,3,4,5}} is legal and the more verbose syntax - * RowVectorXd{{1},{2},{3},{4},{5}} can be avoided: - * - * Example: \include Matrix_initializer_list_vector_cxx11.cpp - * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out - * - * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes, - * and implicit transposition is allowed for compile-time vectors only. - * - * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) - */ - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE Matrix(const std::initializer_list>& list) : Base(list) {} -#endif // end EIGEN_HAS_CXX11 - -#ifndef EIGEN_PARSED_BY_DOXYGEN + #ifndef EIGEN_PARSED_BY_DOXYGEN // This constructor is for both 1x1 matrices and dynamic vectors template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit Matrix(const T& x) + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE explicit Matrix(const T& x) { Base::_check_template_params(); Base::template _init1(x); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Matrix(const T0& x, const T1& y) + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y) { Base::_check_template_params(); Base::template _init2(x, y); } - - -#else + #else /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */ EIGEN_DEVICE_FUNC explicit Matrix(const Scalar *data); @@ -359,8 +319,7 @@ class Matrix * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives). */ EIGEN_STRONG_INLINE explicit Matrix(Index dim); - /** \brief Constructs an initialized 1x1 matrix with the given coefficient - * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ + /** \brief Constructs an initialized 1x1 matrix with the given coefficient */ Matrix(const Scalar& x); /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns. * @@ -377,14 +336,11 @@ class Matrix EIGEN_DEVICE_FUNC Matrix(Index rows, Index cols); - /** \brief Constructs an initialized 2D vector with given coefficients - * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) */ + /** \brief Constructs an initialized 2D vector with given coefficients */ Matrix(const Scalar& x, const Scalar& y); - #endif // end EIGEN_PARSED_BY_DOXYGEN + #endif - /** \brief Constructs an initialized 3D vector with given coefficients - * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) - */ + /** \brief Constructs an initialized 3D vector with given coefficients */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) { @@ -394,9 +350,7 @@ class Matrix m_storage.data()[1] = y; m_storage.data()[2] = z; } - /** \brief Constructs an initialized 4D vector with given coefficients - * \sa Matrix(const Scalar&, const Scalar&, const Scalar&, const Scalar&, const ArgTypes&...) - */ + /** \brief Constructs an initialized 4D vector with given coefficients */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) { @@ -451,7 +405,7 @@ class Matrix * * \ingroup Core_Module * - * %Eigen defines several typedef shortcuts for most common matrix and vector types. + * Eigen defines several typedef shortcuts for most common matrix and vector types. * * The general patterns are the following: * @@ -463,15 +417,6 @@ class Matrix * * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is * a fixed-size vector of 4 complex floats. - * - * With \cpp11, template alias are also defined for common sizes. - * They follow the same pattern as above except that the scalar type suffix is replaced by a - * template parameter, i.e.: - * - `MatrixSize` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size. - * - `MatrixXSize` and `MatrixSizeX` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices. - * - `VectorSize` and `RowVectorSize` for column and row vectors. - * - * With \cpp11, you can also use fully generic column and row vector types: `Vector` and `RowVector`. * * \sa class Matrix */ @@ -509,55 +454,6 @@ EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex, cd) #undef EIGEN_MAKE_TYPEDEFS #undef EIGEN_MAKE_FIXED_TYPEDEFS -#if EIGEN_HAS_CXX11 - -#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix) \ -/** \ingroup matrixtypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using Matrix##SizeSuffix = Matrix; \ -/** \ingroup matrixtypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using Vector##SizeSuffix = Matrix; \ -/** \ingroup matrixtypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using RowVector##SizeSuffix = Matrix; - -#define EIGEN_MAKE_FIXED_TYPEDEFS(Size) \ -/** \ingroup matrixtypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using Matrix##Size##X = Matrix; \ -/** \ingroup matrixtypedefs */ \ -/** \brief \cpp11 */ \ -template \ -using Matrix##X##Size = Matrix; - -EIGEN_MAKE_TYPEDEFS(2, 2) -EIGEN_MAKE_TYPEDEFS(3, 3) -EIGEN_MAKE_TYPEDEFS(4, 4) -EIGEN_MAKE_TYPEDEFS(Dynamic, X) -EIGEN_MAKE_FIXED_TYPEDEFS(2) -EIGEN_MAKE_FIXED_TYPEDEFS(3) -EIGEN_MAKE_FIXED_TYPEDEFS(4) - -/** \ingroup matrixtypedefs - * \brief \cpp11 */ -template -using Vector = Matrix; - -/** \ingroup matrixtypedefs - * \brief \cpp11 */ -template -using RowVector = Matrix; - -#undef EIGEN_MAKE_TYPEDEFS -#undef EIGEN_MAKE_FIXED_TYPEDEFS - -#endif // EIGEN_HAS_CXX11 - } // end namespace Eigen #endif // EIGEN_MATRIX_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/MatrixBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/MatrixBase.h index 45c3a596e..f8bcc8c6f 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/MatrixBase.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/MatrixBase.h @@ -76,7 +76,6 @@ template class MatrixBase using Base::coeffRef; using Base::lazyAssign; using Base::eval; - using Base::operator-; using Base::operator+=; using Base::operator-=; using Base::operator*=; @@ -123,6 +122,7 @@ template class MatrixBase #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase #define EIGEN_DOC_UNARY_ADDONS(X,Y) +# include "../plugins/CommonCwiseUnaryOps.h" # include "../plugins/CommonCwiseBinaryOps.h" # include "../plugins/MatrixCwiseUnaryOps.h" # include "../plugins/MatrixCwiseBinaryOps.h" @@ -268,8 +268,6 @@ template class MatrixBase Derived& setIdentity(); EIGEN_DEVICE_FUNC Derived& setIdentity(Index rows, Index cols); - EIGEN_DEVICE_FUNC Derived& setUnit(Index i); - EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i); bool isIdentity(const RealScalar& prec = NumTraits::dummy_precision()) const; bool isDiagonal(const RealScalar& prec = NumTraits::dummy_precision()) const; @@ -298,7 +296,7 @@ template class MatrixBase EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase& other) const { return cwiseNotEqual(other).any(); } - NoAlias EIGEN_DEVICE_FUNC noalias(); + NoAlias noalias(); // TODO forceAlignedAccess is temporarily disabled // Need to find a nicer workaround. @@ -328,7 +326,6 @@ template class MatrixBase inline const PartialPivLU lu() const; - EIGEN_DEVICE_FUNC inline const Inverse inverse() const; template @@ -338,15 +335,12 @@ template class MatrixBase bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; - template inline void computeInverseWithCheck( ResultType& inverse, bool& invertible, const RealScalar& absDeterminantThreshold = NumTraits::dummy_precision() ) const; - - EIGEN_DEVICE_FUNC Scalar determinant() const; /////////// Cholesky module /////////// @@ -418,19 +412,15 @@ template class MatrixBase ////////// Householder module /////////// - EIGEN_DEVICE_FUNC void makeHouseholderInPlace(Scalar& tau, RealScalar& beta); template - EIGEN_DEVICE_FUNC void makeHouseholder(EssentialPart& essential, Scalar& tau, RealScalar& beta) const; template - EIGEN_DEVICE_FUNC void applyHouseholderOnTheLeft(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); template - EIGEN_DEVICE_FUNC void applyHouseholderOnTheRight(const EssentialPart& essential, const Scalar& tau, Scalar* workspace); @@ -438,10 +428,8 @@ template class MatrixBase ///////// Jacobi module ///////// template - EIGEN_DEVICE_FUNC void applyOnTheLeft(Index p, Index q, const JacobiRotation& j); template - EIGEN_DEVICE_FUNC void applyOnTheRight(Index p, Index q, const JacobiRotation& j); ///////// SparseCore module ///////// @@ -468,11 +456,6 @@ template class MatrixBase const MatrixFunctionReturnValue matrixFunction(StemFunction f) const; EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine) -#if EIGEN_HAS_CXX11_MATH - EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine) - EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine) - EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine) -#endif EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine) EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine) EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root) diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/NestByValue.h b/uppsrc/plugin/Eigen/Eigen/src/Core/NestByValue.h index 239bbba63..13adf070e 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/NestByValue.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/NestByValue.h @@ -16,11 +16,7 @@ namespace Eigen { namespace internal { template struct traits > : public traits -{ - enum { - Flags = traits::Flags & ~NestByRefBit - }; -}; +{}; } /** \class NestByValue @@ -47,11 +43,55 @@ template class NestByValue EIGEN_DEVICE_FUNC inline Index rows() const { return m_expression.rows(); } EIGEN_DEVICE_FUNC inline Index cols() const { return m_expression.cols(); } + EIGEN_DEVICE_FUNC inline Index outerStride() const { return m_expression.outerStride(); } + EIGEN_DEVICE_FUNC inline Index innerStride() const { return m_expression.innerStride(); } + + EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const + { + return m_expression.coeff(row, col); + } + + EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col) + { + return m_expression.const_cast_derived().coeffRef(row, col); + } + + EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const + { + return m_expression.coeff(index); + } + + EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) + { + return m_expression.const_cast_derived().coeffRef(index); + } + + template + inline const PacketScalar packet(Index row, Index col) const + { + return m_expression.template packet(row, col); + } + + template + inline void writePacket(Index row, Index col, const PacketScalar& x) + { + m_expression.const_cast_derived().template writePacket(row, col, x); + } + + template + inline const PacketScalar packet(Index index) const + { + return m_expression.template packet(index); + } + + template + inline void writePacket(Index index, const PacketScalar& x) + { + m_expression.const_cast_derived().template writePacket(index, x); + } EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; } - EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; } - protected: const ExpressionType m_expression; }; @@ -59,27 +99,12 @@ template class NestByValue /** \returns an expression of the temporary version of *this. */ template -EIGEN_DEVICE_FUNC inline const NestByValue +inline const NestByValue DenseBase::nestByValue() const { return NestByValue(derived()); } -namespace internal { - -// Evaluator of Solve -> eval into a temporary -template -struct evaluator > - : public evaluator -{ - typedef evaluator Base; - - EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue& xpr) - : Base(xpr.nestedExpression()) - {} -}; -} - } // end namespace Eigen #endif // EIGEN_NESTBYVALUE_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/NoAlias.h b/uppsrc/plugin/Eigen/Eigen/src/Core/NoAlias.h index 570283d90..33908010b 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/NoAlias.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/NoAlias.h @@ -33,7 +33,6 @@ class NoAlias public: typedef typename ExpressionType::Scalar Scalar; - EIGEN_DEVICE_FUNC explicit NoAlias(ExpressionType& expression) : m_expression(expression) {} template @@ -75,10 +74,10 @@ class NoAlias * * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag. * Currently, even though several expressions may alias, only product - * expressions have this flag. Therefore, noalias() is only useful when + * expressions have this flag. Therefore, noalias() is only usefull when * the source expression contains a matrix product. * - * Here are some examples where noalias is useful: + * Here are some examples where noalias is usefull: * \code * D.noalias() = A * B; * D.noalias() += A.transpose() * B; @@ -99,7 +98,7 @@ class NoAlias * \sa class NoAlias */ template -NoAlias EIGEN_DEVICE_FUNC MatrixBase::noalias() +NoAlias MatrixBase::noalias() { return NoAlias(derived()); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/NumTraits.h b/uppsrc/plugin/Eigen/Eigen/src/Core/NumTraits.h index 9ab55534f..daf489878 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/NumTraits.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/NumTraits.h @@ -21,14 +21,12 @@ template< typename T, bool is_integer = NumTraits::IsInteger> struct default_digits10_impl { - EIGEN_DEVICE_FUNC static int run() { return std::numeric_limits::digits10; } }; template struct default_digits10_impl // Floating point { - EIGEN_DEVICE_FUNC static int run() { using std::log10; using std::ceil; @@ -40,38 +38,6 @@ struct default_digits10_impl // Floating point template struct default_digits10_impl // Integer { - EIGEN_DEVICE_FUNC - static int run() { return 0; } -}; - - -// default implementation of digits(), based on numeric_limits if specialized, -// 0 for integer types, and log2(epsilon()) otherwise. -template< typename T, - bool use_numeric_limits = std::numeric_limits::is_specialized, - bool is_integer = NumTraits::IsInteger> -struct default_digits_impl -{ - EIGEN_DEVICE_FUNC - static int run() { return std::numeric_limits::digits; } -}; - -template -struct default_digits_impl // Floating point -{ - EIGEN_DEVICE_FUNC - static int run() { - using std::log; - using std::ceil; - typedef typename NumTraits::Real Real; - return int(ceil(-log(NumTraits::epsilon())/log(static_cast(2)))); - } -}; - -template -struct default_digits_impl // Integer -{ - EIGEN_DEVICE_FUNC static int run() { return 0; } }; @@ -105,7 +71,7 @@ struct default_digits_impl // Integer * and to \c 0 otherwise. * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed * to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers. - * Stay vague here. No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost. + * Stay vague here. No need to do architecture-specific stuff. * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned. * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must * be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise. @@ -152,12 +118,6 @@ template struct GenericNumTraits return internal::default_digits10_impl::run(); } - EIGEN_DEVICE_FUNC - static inline int digits() - { - return internal::default_digits_impl::run(); - } - EIGEN_DEVICE_FUNC static inline Real dummy_precision() { @@ -173,8 +133,7 @@ template struct GenericNumTraits EIGEN_DEVICE_FUNC static inline T lowest() { - return IsInteger ? (numext::numeric_limits::min)() - : static_cast(-(numext::numeric_limits::max)()); + return IsInteger ? (numext::numeric_limits::min)() : (-(numext::numeric_limits::max)()); } EIGEN_DEVICE_FUNC @@ -284,8 +243,6 @@ private: // Empty specialization for void to allow template specialization based on NumTraits::Real with T==void and SFINAE. template<> struct NumTraits {}; -template<> struct NumTraits : GenericNumTraits {}; - } // end namespace Eigen #endif // EIGEN_NUMTRAITS_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/PartialReduxEvaluator.h b/uppsrc/plugin/Eigen/Eigen/src/Core/PartialReduxEvaluator.h deleted file mode 100644 index 0be694259..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/PartialReduxEvaluator.h +++ /dev/null @@ -1,232 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2011-2018 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PARTIALREDUX_H -#define EIGEN_PARTIALREDUX_H - -namespace Eigen { - -namespace internal { - - -/*************************************************************************** -* -* This file provides evaluators for partial reductions. -* There are two modes: -* -* - scalar path: simply calls the respective function on the column or row. -* -> nothing special here, all the tricky part is handled by the return -* types of VectorwiseOp's members. They embed the functor calling the -* respective DenseBase's member function. -* -* - vectorized path: implements a packet-wise reductions followed by -* some (optional) processing of the outcome, e.g., division by n for mean. -* -* For the vectorized path let's observe that the packet-size and outer-unrolling -* are both decided by the assignement logic. So all we have to do is to decide -* on the inner unrolling. -* -* For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h, -* but be need to be careful to specify correct increment. -* -***************************************************************************/ - - -/* logic deciding a strategy for unrolling of vectorized paths */ -template -struct packetwise_redux_traits -{ - enum { - OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime, - Cost = OuterSize == Dynamic ? HugeCost - : OuterSize * Evaluator::CoeffReadCost + (OuterSize-1) * functor_traits::Cost, - Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling - }; - -}; - -/* Value to be returned when size==0 , by default let's return 0 */ -template -EIGEN_DEVICE_FUNC -PacketType packetwise_redux_empty_value(const Func& ) { return pset1(0); } - -/* For products the default is 1 */ -template -EIGEN_DEVICE_FUNC -PacketType packetwise_redux_empty_value(const scalar_product_op& ) { return pset1(1); } - -/* Perform the actual reduction */ -template::Unrolling -> -struct packetwise_redux_impl; - -/* Perform the actual reduction with unrolling */ -template -struct packetwise_redux_impl -{ - typedef redux_novec_unroller Base; - typedef typename Evaluator::Scalar Scalar; - - template - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE - PacketType run(const Evaluator &eval, const Func& func, Index /*size*/) - { - return redux_vec_unroller::OuterSize>::template run(eval,func); - } -}; - -/* Add a specialization of redux_vec_unroller for size==0 at compiletime. - * This specialization is not required for general reductions, which is - * why it is defined here. - */ -template -struct redux_vec_unroller -{ - template - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE PacketType run(const Evaluator &, const Func& f) - { - return packetwise_redux_empty_value(f); - } -}; - -/* Perform the actual reduction for dynamic sizes */ -template -struct packetwise_redux_impl -{ - typedef typename Evaluator::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; - - template - EIGEN_DEVICE_FUNC - static PacketType run(const Evaluator &eval, const Func& func, Index size) - { - if(size==0) - return packetwise_redux_empty_value(func); - - const Index size4 = (size-1)&(~3); - PacketType p = eval.template packetByOuterInner(0,0); - Index i = 1; - // This loop is optimized for instruction pipelining: - // - each iteration generates two independent instructions - // - thanks to branch prediction and out-of-order execution we have independent instructions across loops - for(; i(i+0,0),eval.template packetByOuterInner(i+1,0)), - func.packetOp(eval.template packetByOuterInner(i+2,0),eval.template packetByOuterInner(i+3,0)))); - for(; i(i,0)); - return p; - } -}; - -template< typename ArgType, typename MemberOp, int Direction> -struct evaluator > - : evaluator_base > -{ - typedef PartialReduxExpr XprType; - typedef typename internal::nested_eval::type ArgTypeNested; - typedef typename internal::add_const_on_value_type::type ConstArgTypeNested; - typedef typename internal::remove_all::type ArgTypeNestedCleaned; - typedef typename ArgType::Scalar InputScalar; - typedef typename XprType::Scalar Scalar; - enum { - TraversalSize = Direction==int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime) - }; - typedef typename MemberOp::template Cost CostOpType; - enum { - CoeffReadCost = TraversalSize==Dynamic ? HugeCost - : TraversalSize==0 ? 1 - : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), - - _ArgFlags = evaluator::Flags, - - _Vectorizable = bool(int(_ArgFlags)&PacketAccessBit) - && bool(MemberOp::Vectorizable) - && (Direction==int(Vertical) ? bool(_ArgFlags&RowMajorBit) : (_ArgFlags&RowMajorBit)==0) - && (TraversalSize!=0), - - Flags = (traits::Flags&RowMajorBit) - | (evaluator::Flags&(HereditaryBits&(~RowMajorBit))) - | (_Vectorizable ? PacketAccessBit : 0) - | LinearAccessBit, - - Alignment = 0 // FIXME this will need to be improved once PartialReduxExpr is vectorized - }; - - EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) - : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) - { - EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize==Dynamic ? HugeCost : (TraversalSize==0 ? 1 : int(CostOpType::value))); - EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); - } - - typedef typename XprType::CoeffReturnType CoeffReturnType; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar coeff(Index i, Index j) const - { - return coeff(Direction==Vertical ? j : i); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar coeff(Index index) const - { - return m_functor(m_arg.template subVector(index)); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PacketType packet(Index i, Index j) const - { - return packet(Direction==Vertical ? j : i); - } - - template - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC - PacketType packet(Index idx) const - { - enum { PacketSize = internal::unpacket_traits::size }; - typedef Block PanelType; - - PanelType panel(m_arg, - Direction==Vertical ? 0 : idx, - Direction==Vertical ? idx : 0, - Direction==Vertical ? m_arg.rows() : Index(PacketSize), - Direction==Vertical ? Index(PacketSize) : m_arg.cols()); - - // FIXME - // See bug 1612, currently if PacketSize==1 (i.e. complex with 128bits registers) then the storage-order of panel get reversed - // and methods like packetByOuterInner do not make sense anymore in this context. - // So let's just by pass "vectorization" in this case: - if(PacketSize==1) - return internal::pset1(coeff(idx)); - - typedef typename internal::redux_evaluator PanelEvaluator; - PanelEvaluator panel_eval(panel); - typedef typename MemberOp::BinaryOp BinaryOp; - PacketType p = internal::packetwise_redux_impl::template run(panel_eval,m_functor.binaryFunc(),m_arg.outerSize()); - return p; - } - -protected: - ConstArgTypeNested m_arg; - const MemberOp m_functor; -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_PARTIALREDUX_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/PermutationMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/PermutationMatrix.h index 69401bf41..b1fb455b9 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/PermutationMatrix.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/PermutationMatrix.h @@ -87,14 +87,25 @@ class PermutationBase : public EigenBase return derived(); } + #ifndef EIGEN_PARSED_BY_DOXYGEN + /** This is a special case of the templated operator=. Its purpose is to + * prevent a default operator= from hiding the templated operator=. + */ + Derived& operator=(const PermutationBase& other) + { + indices() = other.indices(); + return derived(); + } + #endif + /** \returns the number of rows */ - inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); } + inline Index rows() const { return Index(indices().size()); } /** \returns the number of columns */ - inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); } + inline Index cols() const { return Index(indices().size()); } /** \returns the size of a side of the respective square matrix, i.e., the number of indices */ - inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); } + inline Index size() const { return Index(indices().size()); } #ifndef EIGEN_PARSED_BY_DOXYGEN template @@ -322,6 +333,12 @@ class PermutationMatrix : public PermutationBase& other) : m_indices(other.indices()) {} + #ifndef EIGEN_PARSED_BY_DOXYGEN + /** Standard copy constructor. Defined only to prevent a default copy constructor + * from hiding the other templated constructor */ + inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {} + #endif + /** Generic constructor from expression of the indices. The indices * array has the meaning that the permutations sends each integer i to indices[i]. * @@ -356,6 +373,17 @@ class PermutationMatrix : public PermutationBase::type typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Scalar Scalar; - + typedef typename internal::packet_traits::type PacketScalar; typedef typename NumTraits::Real RealScalar; typedef Derived DenseType; @@ -358,7 +358,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * remain row-vectors and vectors remain vectors. */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resizeLike(const EigenBase& _other) { const OtherDerived& other = _other.derived(); @@ -383,7 +383,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or * conservativeResize(Index, NoChange_t). * - * Matrices are resized relative to the top-left element. In case values need to be + * Matrices are resized relative to the top-left element. In case values need to be * appended to the matrix they will be uninitialized. */ EIGEN_DEVICE_FUNC @@ -440,7 +440,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or * conservativeResize(Index, NoChange_t). * - * Matrices are resized relative to the top-left element. In case values need to be + * Matrices are resized relative to the top-left element. In case values need to be * appended to the matrix they will copied from \c other. */ template @@ -526,71 +526,6 @@ class PlainObjectBase : public internal::dense_xpr_base::type // EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED } - #if EIGEN_HAS_CXX11 - /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients. \cpp11 - * - * \only_for_vectors - * - * This constructor is for 1D array or vectors with more than 4 coefficients. - * There exists C++98 analogue constructors for fixed-size array/vector having 1, 2, 3, or 4 coefficients. - * - * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this - * constructor must match the the fixed number of rows (resp. columns) of \c *this. - */ - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) - : m_storage() - { - _check_template_params(); - EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4); - m_storage.data()[0] = a0; - m_storage.data()[1] = a1; - m_storage.data()[2] = a2; - m_storage.data()[3] = a3; - int i = 4; - auto x = {(m_storage.data()[i++] = args, 0)...}; - static_cast(x); - } - - /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer - * lists \cpp11 - */ - EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE PlainObjectBase(const std::initializer_list>& list) - : m_storage() - { - _check_template_params(); - - size_t list_size = 0; - if (list.begin() != list.end()) { - list_size = list.begin()->size(); - } - - // This is to allow syntax like VectorXi {{1, 2, 3, 4}} - if (ColsAtCompileTime == 1 && list.size() == 1) { - eigen_assert(list_size == static_cast(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); - resize(list_size, ColsAtCompileTime); - std::copy(list.begin()->begin(), list.begin()->end(), m_storage.data()); - } else { - eigen_assert(list.size() == static_cast(RowsAtCompileTime) || RowsAtCompileTime == Dynamic); - eigen_assert(list_size == static_cast(ColsAtCompileTime) || ColsAtCompileTime == Dynamic); - resize(list.size(), list_size); - - Index row_index = 0; - for (const std::initializer_list& row : list) { - eigen_assert(list_size == row.size()); - Index col_index = 0; - for (const Scalar& e : row) { - coeffRef(row_index, col_index) = e; - ++col_index; - } - ++row_index; - } - } - } - #endif // end EIGEN_HAS_CXX11 - /** \sa PlainObjectBase::operator=(const EigenBase&) */ template EIGEN_DEVICE_FUNC @@ -629,7 +564,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * \copydetails DenseBase::operator=(const EigenBase &other) */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const EigenBase &other) { _resize_to_match(other); @@ -743,7 +678,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * remain row-vectors and vectors remain vectors. */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase& other) { #ifdef EIGEN_NO_AUTOMATIC_RESIZING @@ -770,10 +705,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type * * \internal */ - // aliasing is dealt once in internal::call_assignment + // aliasing is dealt once in internall::call_assignment // so at this stage we have to assume aliasing... and resising has to be done later. template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& _set(const DenseBase& other) { internal::call_assignment(this->derived(), other.derived()); @@ -786,7 +721,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type * \sa operator=(const MatrixBase&), _set() */ template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase& other) { // I don't think we need this resize call since the lazyAssign will anyways resize @@ -809,18 +744,18 @@ class PlainObjectBase : public internal::dense_xpr_base::type FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(rows,cols); } - + template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1, typename internal::enable_if::type* = 0) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2) m_storage.data()[0] = Scalar(val0); m_storage.data()[1] = Scalar(val1); } - + template - EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(const Index& val0, const Index& val1, typename internal::enable_if< (!internal::is_same::value) && (internal::is_same::value) @@ -846,8 +781,8 @@ class PlainObjectBase : public internal::dense_xpr_base::type FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(size); } - - // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitly converted) + + // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type can be implicitely converted) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, typename internal::enable_if::value,T>::type* = 0) @@ -855,7 +790,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1) m_storage.data()[0] = val0; } - + // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar type match the index type) template EIGEN_DEVICE_FUNC @@ -911,7 +846,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type { this->derived() = r; } - + // For fixed-size Array template EIGEN_DEVICE_FUNC @@ -923,7 +858,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type { Base::setConstant(val0); } - + // For fixed-size Array template EIGEN_DEVICE_FUNC @@ -937,34 +872,34 @@ class PlainObjectBase : public internal::dense_xpr_base::type { Base::setConstant(val0); } - + template friend struct internal::matrix_swap_impl; public: - + #ifndef EIGEN_PARSED_BY_DOXYGEN /** \internal * \brief Override DenseBase::swap() since for dynamic-sized matrices * of same type it is enough to swap the data pointers. */ template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC void swap(DenseBase & other) { enum { SwapPointers = internal::is_same::value && Base::SizeAtCompileTime==Dynamic }; internal::matrix_swap_impl::run(this->derived(), other.derived()); } - + /** \internal * \brief const version forwarded to DenseBase::swap */ template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC void swap(DenseBase const & other) { Base::swap(other.derived()); } - - EIGEN_DEVICE_FUNC + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void _check_template_params() { EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor) @@ -988,19 +923,13 @@ namespace internal { template struct conservative_resize_like_impl { - #if EIGEN_HAS_TYPE_TRAITS - static const bool IsRelocatable = std::is_trivially_copyable::value; - #else - static const bool IsRelocatable = !NumTraits::RequireInitialization; - #endif static void run(DenseBase& _this, Index rows, Index cols) { if (_this.rows() == rows && _this.cols() == cols) return; EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) - if ( IsRelocatable - && (( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows - (!Derived::IsRowMajor && _this.rows() == rows) )) // column-major and we change only the number of columns + if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows + (!Derived::IsRowMajor && _this.rows() == rows) ) // column-major and we change only the number of columns { internal::check_rows_cols_for_overflow::run(rows, cols); _this.derived().m_storage.conservativeResize(rows*cols,rows,cols); @@ -1028,9 +957,8 @@ struct conservative_resize_like_impl EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived) EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived) - if ( IsRelocatable && - (( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows - (!Derived::IsRowMajor && _this.rows() == other.rows()) )) // column-major and we change only the number of columns + if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows + (!Derived::IsRowMajor && _this.rows() == other.rows()) ) // column-major and we change only the number of columns { const Index new_rows = other.rows() - _this.rows(); const Index new_cols = other.cols() - _this.cols(); @@ -1058,18 +986,13 @@ template struct conservative_resize_like_impl : conservative_resize_like_impl { - typedef conservative_resize_like_impl Base; - using Base::run; - using Base::IsRelocatable; - + using conservative_resize_like_impl::run; + static void run(DenseBase& _this, Index size) { const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size; const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1; - if(IsRelocatable) - _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); - else - Base::run(_this.derived(), new_rows, new_cols); + _this.derived().m_storage.conservativeResize(size,new_rows,new_cols); } static void run(DenseBase& _this, const DenseBase& other) @@ -1080,10 +1003,7 @@ struct conservative_resize_like_impl const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows(); const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1; - if(IsRelocatable) - _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); - else - Base::run(_this.derived(), new_rows, new_cols); + _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols); if (num_new_elements > 0) _this.tail(num_new_elements) = other.tail(num_new_elements); @@ -1094,7 +1014,7 @@ template struct matrix_swap_impl { EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b) + static inline void run(MatrixTypeA& a, MatrixTypeB& b) { a.base().swap(b); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Product.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Product.h index 13d5662df..676c48027 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Product.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Product.h @@ -90,23 +90,18 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, typedef typename internal::remove_all::type LhsNestedCleaned; typedef typename internal::remove_all::type RhsNestedCleaned; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) + EIGEN_DEVICE_FUNC Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) { eigen_assert(lhs.cols() == rhs.rows() && "invalid matrix product" && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const LhsNestedCleaned& lhs() const { return m_lhs; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const RhsNestedCleaned& rhs() const { return m_rhs; } + EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; } + EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; } protected: @@ -121,7 +116,7 @@ class dense_product_base : public internal::dense_xpr_base >::type {}; -/** Conversion to scalar for inner-products */ +/** Convertion to scalar for inner-products */ template class dense_product_base : public internal::dense_xpr_base >::type @@ -132,7 +127,7 @@ public: using Base::derived; typedef typename Base::Scalar Scalar; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const + EIGEN_STRONG_INLINE operator const Scalar() const { return internal::evaluator(derived()).coeff(0,0); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/ProductEvaluators.h b/uppsrc/plugin/Eigen/Eigen/src/Core/ProductEvaluators.h index 792b1811c..bce1310c9 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/ProductEvaluators.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/ProductEvaluators.h @@ -20,7 +20,7 @@ namespace internal { /** \internal * Evaluator of a product expression. * Since products require special treatments to handle all possible cases, - * we simply defer the evaluation logic to a product_evaluator class + * we simply deffer the evaluation logic to a product_evaluator class * which offers more partial specialization possibilities. * * \sa class product_evaluator @@ -128,7 +128,7 @@ protected: PlainObject m_result; }; -// The following three shortcuts are enabled only if the scalar types match exactly. +// The following three shortcuts are enabled only if the scalar types match excatly. // TODO: we could enable them for different scalar types when the product is not vectorized. // Dense = Product @@ -137,7 +137,7 @@ struct Assignment, internal::assign_op::type> { typedef Product SrcXprType; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -155,7 +155,7 @@ struct Assignment, internal::add_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -170,7 +170,7 @@ struct Assignment, internal::sub_assign_op< typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type> { typedef Product SrcXprType; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &) { eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); @@ -190,7 +190,7 @@ struct Assignment, const CwiseNullaryOp,Plain>, const Product > SrcXprType; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const AssignFunc& func) { call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs())*src.rhs().rhs(), func); @@ -217,7 +217,7 @@ template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/) { call_assignment_no_alias(dst, src.lhs(), Func1()); @@ -246,19 +246,19 @@ template struct generic_product_impl { template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum(); } template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); } template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } }; @@ -269,10 +269,10 @@ struct generic_product_impl // Column major result template -void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) +void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const false_type&) { evaluator rhsEval(rhs); - ei_declare_local_nested_eval(Lhs,lhs,Rhs::SizeAtCompileTime,actual_lhs); + typename nested_eval::type actual_lhs(lhs); // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored // FIXME not very good if rhs is real and lhs complex while alpha is real too const Index cols = dst.cols(); @@ -282,10 +282,10 @@ void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, cons // Row major result template -void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) +void outer_product_selector_run(Dst& dst, const Lhs &lhs, const Rhs &rhs, const Func& func, const true_type&) { evaluator lhsEval(lhs); - ei_declare_local_nested_eval(Rhs,rhs,Lhs::SizeAtCompileTime,actual_rhs); + typename nested_eval::type actual_rhs(rhs); // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored // FIXME not very good if lhs is real and rhs complex while alpha is real too const Index rows = dst.rows(); @@ -300,37 +300,37 @@ struct generic_product_impl typedef typename Product::Scalar Scalar; // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose - struct set { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; - struct add { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; - struct sub { template EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; + struct set { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() = src; } }; + struct add { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } }; + struct sub { template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } }; struct adds { Scalar m_scale; explicit adds(const Scalar& s) : m_scale(s) {} - template void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const { + template void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += m_scale * src; } }; template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major()); } template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major()); } template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major()); } template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major()); } @@ -345,19 +345,19 @@ struct generic_product_impl_base typedef typename Product::Scalar Scalar; template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.setZero(); scaleAndAddTo(dst, lhs, rhs, Scalar(1)); } template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { Derived::scaleAndAddTo(dst,lhs,rhs,alpha); } }; @@ -373,7 +373,7 @@ struct generic_product_impl typedef typename internal::remove_all::type>::type MatrixType; template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { LhsNested actual_lhs(lhs); RhsNested actual_rhs(rhs); @@ -390,7 +390,7 @@ struct generic_product_impl typedef typename Product::Scalar Scalar; template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // Same as: dst.noalias() = lhs.lazyProduct(rhs); // but easier on the compiler side @@ -398,71 +398,48 @@ struct generic_product_impl } template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() += lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op()); } template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { // dst.noalias() -= lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); } - // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h - // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance: - // dst {,+,-}= (s1*A)*(B*s2) - // will be rewritten as: - // dst {,+,-}= (s1*s2) * (A.lazyProduct(B)) - // There are at least four benefits of doing so: - // 1 - huge performance gain for heap-allocated matrix types as it save costly allocations. - // 2 - it is faster than simply by-passing the heap allocation through stack allocation. - // 3 - it makes this fallback consistent with the heavy GEMM routine. - // 4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices. - // (see https://stackoverflow.com/questions/54738495) - // For small fixed sizes matrices, howver, the gains are less obvious, it is sometimes x2 faster, but sometimes x3 slower, - // and the behavior depends also a lot on the compiler... This is why this re-writting strategy is currently - // enabled only when falling back from the main GEMM. - template + // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor: + // dst {,+,-}= s * (A.lazyProduct(B)) + // This is a huge benefit for heap-allocated matrix types as it save one costly allocation. + // For them, this strategy is also faster than simply by-passing the heap allocation through + // stack allocation. + // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower, + // and the behavior depends also a lot on the compiler... so let's be conservative and enable them for dynamic-size only, + // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h + template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func &func) + void eval_dynamic(Dst& dst, const CwiseBinaryOp, + const CwiseNullaryOp, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func) { - enum { - HasScalarFactor = blas_traits::HasScalarFactor || blas_traits::HasScalarFactor, - ConjLhs = blas_traits::NeedToConjugate, - ConjRhs = blas_traits::NeedToConjugate - }; - // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto - // this is important for real*complex_mat - Scalar actualAlpha = blas_traits::extractScalarFactor(lhs) - * blas_traits::extractScalarFactor(rhs); - eval_dynamic_impl(dst, - blas_traits::extract(lhs).template conjugateIf(), - blas_traits::extract(rhs).template conjugateIf(), - func, - actualAlpha, - typename conditional::type()); + call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func); } -protected: - - template + // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above + // overload more specialized. + template static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s /* == 1 */, false_type) + void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func) { - EIGEN_UNUSED_VARIABLE(s); - eigen_internal_assert(s==Scalar(1)); - call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); - } - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs, const Func &func, const Scalar& s, true_type) - { - call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func); + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); } + + +// template +// static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) +// { dst.noalias() += alpha * lhs.lazyProduct(rhs); } }; // This specialization enforces the use of a coefficient-based evaluation strategy @@ -605,8 +582,7 @@ struct product_evaluator, ProductTag, DenseShape, * which is why we don't set the LinearAccessBit. * TODO: this seems possible when the result is a vector */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const CoeffReturnType coeff(Index index) const + EIGEN_DEVICE_FUNC const CoeffReturnType coeff(Index index) const { const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index; const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? index : 0; @@ -614,7 +590,6 @@ struct product_evaluator, ProductTag, DenseShape, } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index row, Index col) const { PacketType res; @@ -626,7 +601,6 @@ struct product_evaluator, ProductTag, DenseShape, } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index index) const { const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime==1) ? 0 : index; @@ -655,8 +629,7 @@ struct product_evaluator, LazyCoeffBasedProduc enum { Flags = Base::Flags | EvalBeforeNestingBit }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit product_evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(BaseProduct(xpr.lhs(),xpr.rhs())) {} }; @@ -794,8 +767,7 @@ struct generic_product_impl typedef typename Product::Scalar Scalar; template - static EIGEN_DEVICE_FUNC - void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { selfadjoint_product_impl::run(dst, lhs.nestedExpression(), rhs, alpha); } @@ -830,21 +802,13 @@ public: MatrixFlags = evaluator::Flags, DiagFlags = evaluator::Flags, - - _StorageOrder = (Derived::MaxRowsAtCompileTime==1 && Derived::MaxColsAtCompileTime!=1) ? RowMajor - : (Derived::MaxColsAtCompileTime==1 && Derived::MaxRowsAtCompileTime!=1) ? ColMajor - : MatrixFlags & RowMajorBit ? RowMajor : ColMajor, - _SameStorageOrder = _StorageOrder == (MatrixFlags & RowMajorBit ? RowMajor : ColMajor), - + _StorageOrder = MatrixFlags & RowMajorBit ? RowMajor : ColMajor, _ScalarAccessOnDiag = !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft) ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)), _SameTypes = is_same::value, // FIXME currently we need same types, but in the future the next rule should be the one //_Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagFlags)&PacketAccessBit))), - _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) - && _SameTypes - && (_SameStorageOrder || (MatrixFlags&LinearAccessBit)==LinearAccessBit) - && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), + _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0), Alignment = evaluator::Alignment, @@ -854,7 +818,7 @@ public: || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight) }; - EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) + diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) : m_diagImpl(diag), m_matImpl(mat) { EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits::MulCost); @@ -905,10 +869,10 @@ struct product_evaluator, ProductTag, DiagonalSha typedef Product XprType; typedef typename XprType::PlainObject PlainObject; - typedef typename Lhs::DiagonalVectorType DiagonalType; - - enum { StorageOrder = Base::_StorageOrder }; + enum { + StorageOrder = int(Rhs::Flags) & RowMajorBit ? RowMajor : ColMajor + }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) @@ -920,7 +884,7 @@ struct product_evaluator, ProductTag, DiagonalSha return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col); } -#ifndef EIGEN_GPUCC +#ifndef __CUDACC__ template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { @@ -952,7 +916,7 @@ struct product_evaluator, ProductTag, DenseShape, typedef Product XprType; typedef typename XprType::PlainObject PlainObject; - enum { StorageOrder = Base::_StorageOrder }; + enum { StorageOrder = int(Lhs::Flags) & RowMajorBit ? RowMajor : ColMajor }; EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) @@ -964,7 +928,7 @@ struct product_evaluator, ProductTag, DenseShape, return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col); } -#ifndef EIGEN_GPUCC +#ifndef __CUDACC__ template EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const { diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Random.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Random.h index 486e9ed52..6faf789c7 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Random.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Random.h @@ -128,7 +128,7 @@ DenseBase::Random() * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index) */ template -EIGEN_DEVICE_FUNC inline Derived& DenseBase::setRandom() +inline Derived& DenseBase::setRandom() { return *this = Random(rows(), cols()); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Redux.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Redux.h index 2eef5abc5..760e9f861 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Redux.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Redux.h @@ -23,29 +23,23 @@ namespace internal { * Part 1 : the logic deciding a strategy for vectorization and unrolling ***************************************************************************/ -template +template struct redux_traits { public: - typedef typename find_best_packet::type PacketType; + typedef typename find_best_packet::type PacketType; enum { PacketSize = unpacket_traits::size, - InnerMaxSize = int(Evaluator::IsRowMajor) - ? Evaluator::MaxColsAtCompileTime - : Evaluator::MaxRowsAtCompileTime, - OuterMaxSize = int(Evaluator::IsRowMajor) - ? Evaluator::MaxRowsAtCompileTime - : Evaluator::MaxColsAtCompileTime, - SliceVectorizedWork = int(InnerMaxSize)==Dynamic ? Dynamic - : int(OuterMaxSize)==Dynamic ? (int(InnerMaxSize)>=int(PacketSize) ? Dynamic : 0) - : (int(InnerMaxSize)/int(PacketSize)) * int(OuterMaxSize) + InnerMaxSize = int(Derived::IsRowMajor) + ? Derived::MaxColsAtCompileTime + : Derived::MaxRowsAtCompileTime }; enum { - MightVectorize = (int(Evaluator::Flags)&ActualPacketAccessBit) + MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit) && (functor_traits::PacketAccess), - MayLinearVectorize = bool(MightVectorize) && (int(Evaluator::Flags)&LinearAccessBit), - MaySliceVectorize = bool(MightVectorize) && (int(SliceVectorizedWork)==Dynamic || int(SliceVectorizedWork)>=3) + MayLinearVectorize = bool(MightVectorize) && (int(Derived::Flags)&LinearAccessBit), + MaySliceVectorize = bool(MightVectorize) && int(InnerMaxSize)>=3*PacketSize }; public: @@ -57,8 +51,8 @@ public: public: enum { - Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost - : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits::Cost, + Cost = Derived::SizeAtCompileTime == Dynamic ? HugeCost + : Derived::SizeAtCompileTime * Derived::CoeffReadCost + (Derived::SizeAtCompileTime-1) * functor_traits::Cost, UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize)) }; @@ -70,20 +64,18 @@ public: #ifdef EIGEN_DEBUG_ASSIGN static void debug() { - std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl; + std::cerr << "Xpr: " << typeid(typename Derived::XprType).name() << std::endl; std::cerr.setf(std::ios::hex, std::ios::basefield); - EIGEN_DEBUG_VAR(Evaluator::Flags) + EIGEN_DEBUG_VAR(Derived::Flags) std::cerr.unsetf(std::ios::hex); EIGEN_DEBUG_VAR(InnerMaxSize) - EIGEN_DEBUG_VAR(OuterMaxSize) - EIGEN_DEBUG_VAR(SliceVectorizedWork) EIGEN_DEBUG_VAR(PacketSize) EIGEN_DEBUG_VAR(MightVectorize) EIGEN_DEBUG_VAR(MayLinearVectorize) EIGEN_DEBUG_VAR(MaySliceVectorize) - std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl; + EIGEN_DEBUG_VAR(Traversal) EIGEN_DEBUG_VAR(UnrollingLimit) - std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl; + EIGEN_DEBUG_VAR(Unrolling) std::cerr << std::endl; } #endif @@ -95,86 +87,88 @@ public: /*** no vectorization ***/ -template +template struct redux_novec_unroller { enum { HalfLength = Length/2 }; - typedef typename Evaluator::Scalar Scalar; + typedef typename Derived::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func& func) + static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) { - return func(redux_novec_unroller::run(eval,func), - redux_novec_unroller::run(eval,func)); + return func(redux_novec_unroller::run(mat,func), + redux_novec_unroller::run(mat,func)); } }; -template -struct redux_novec_unroller +template +struct redux_novec_unroller { enum { - outer = Start / Evaluator::InnerSizeAtCompileTime, - inner = Start % Evaluator::InnerSizeAtCompileTime + outer = Start / Derived::InnerSizeAtCompileTime, + inner = Start % Derived::InnerSizeAtCompileTime }; - typedef typename Evaluator::Scalar Scalar; + typedef typename Derived::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Evaluator &eval, const Func&) + static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&) { - return eval.coeffByOuterInner(outer, inner); + return mat.coeffByOuterInner(outer, inner); } }; // This is actually dead code and will never be called. It is required // to prevent false warnings regarding failed inlining though // for 0 length run() will never be called at all. -template -struct redux_novec_unroller +template +struct redux_novec_unroller { - typedef typename Evaluator::Scalar Scalar; + typedef typename Derived::Scalar Scalar; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); } + static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); } }; /*** vectorization ***/ -template +template struct redux_vec_unroller { - template - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func& func) - { - enum { - PacketSize = unpacket_traits::size, - HalfLength = Length/2 - }; + enum { + PacketSize = redux_traits::PacketSize, + HalfLength = Length/2 + }; + typedef typename Derived::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + + static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func) + { return func.packetOp( - redux_vec_unroller::template run(eval,func), - redux_vec_unroller::template run(eval,func) ); + redux_vec_unroller::run(mat,func), + redux_vec_unroller::run(mat,func) ); } }; -template -struct redux_vec_unroller +template +struct redux_vec_unroller { - template - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE PacketType run(const Evaluator &eval, const Func&) + enum { + index = Start * redux_traits::PacketSize, + outer = index / int(Derived::InnerSizeAtCompileTime), + inner = index % int(Derived::InnerSizeAtCompileTime), + alignment = Derived::Alignment + }; + + typedef typename Derived::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + + static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&) { - enum { - PacketSize = unpacket_traits::size, - index = Start * PacketSize, - outer = index / int(Evaluator::InnerSizeAtCompileTime), - inner = index % int(Evaluator::InnerSizeAtCompileTime), - alignment = Evaluator::Alignment - }; - return eval.template packetByOuterInner(outer, inner); + return mat.template packetByOuterInner(outer, inner); } }; @@ -182,65 +176,53 @@ struct redux_vec_unroller * Part 3 : implementation of all cases ***************************************************************************/ -template::Traversal, - int Unrolling = redux_traits::Unrolling +template::Traversal, + int Unrolling = redux_traits::Unrolling > struct redux_impl; -template -struct redux_impl +template +struct redux_impl { - typedef typename Evaluator::Scalar Scalar; - - template - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE - Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) + typedef typename Derived::Scalar Scalar; + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) { - eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); + eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); Scalar res; - res = eval.coeffByOuterInner(0, 0); - for(Index i = 1; i < xpr.innerSize(); ++i) - res = func(res, eval.coeffByOuterInner(0, i)); - for(Index i = 1; i < xpr.outerSize(); ++i) - for(Index j = 0; j < xpr.innerSize(); ++j) - res = func(res, eval.coeffByOuterInner(i, j)); + res = mat.coeffByOuterInner(0, 0); + for(Index i = 1; i < mat.innerSize(); ++i) + res = func(res, mat.coeffByOuterInner(0, i)); + for(Index i = 1; i < mat.outerSize(); ++i) + for(Index j = 0; j < mat.innerSize(); ++j) + res = func(res, mat.coeffByOuterInner(i, j)); return res; } }; -template -struct redux_impl - : redux_novec_unroller -{ - typedef redux_novec_unroller Base; - typedef typename Evaluator::Scalar Scalar; - template - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE - Scalar run(const Evaluator &eval, const Func& func, const XprType& /*xpr*/) - { - return Base::run(eval,func); - } -}; +template +struct redux_impl + : public redux_novec_unroller +{}; -template -struct redux_impl +template +struct redux_impl { - typedef typename Evaluator::Scalar Scalar; - typedef typename redux_traits::PacketType PacketScalar; + typedef typename Derived::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; - template - static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) + static Scalar run(const Derived &mat, const Func& func) { - const Index size = xpr.size(); + const Index size = mat.size(); - const Index packetSize = redux_traits::PacketSize; + const Index packetSize = redux_traits::PacketSize; const int packetAlignment = unpacket_traits::alignment; enum { - alignment0 = (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), - alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Evaluator::Alignment) + alignment0 = (bool(Derived::Flags & DirectAccessBit) && bool(packet_traits::AlignedOnScalar)) ? int(packetAlignment) : int(Unaligned), + alignment = EIGEN_PLAIN_ENUM_MAX(alignment0, Derived::Alignment) }; - const Index alignedStart = internal::first_default_aligned(xpr); + const Index alignedStart = internal::first_default_aligned(mat.nestedExpression()); const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize); const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize); const Index alignedEnd2 = alignedStart + alignedSize2; @@ -248,34 +230,34 @@ struct redux_impl Scalar res; if(alignedSize) { - PacketScalar packet_res0 = eval.template packet(alignedStart); + PacketScalar packet_res0 = mat.template packet(alignedStart); if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop { - PacketScalar packet_res1 = eval.template packet(alignedStart+packetSize); + PacketScalar packet_res1 = mat.template packet(alignedStart+packetSize); for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize) { - packet_res0 = func.packetOp(packet_res0, eval.template packet(index)); - packet_res1 = func.packetOp(packet_res1, eval.template packet(index+packetSize)); + packet_res0 = func.packetOp(packet_res0, mat.template packet(index)); + packet_res1 = func.packetOp(packet_res1, mat.template packet(index+packetSize)); } packet_res0 = func.packetOp(packet_res0,packet_res1); if(alignedEnd>alignedEnd2) - packet_res0 = func.packetOp(packet_res0, eval.template packet(alignedEnd2)); + packet_res0 = func.packetOp(packet_res0, mat.template packet(alignedEnd2)); } res = func.predux(packet_res0); for(Index index = 0; index < alignedStart; ++index) - res = func(res,eval.coeff(index)); + res = func(res,mat.coeff(index)); for(Index index = alignedEnd; index < size; ++index) - res = func(res,eval.coeff(index)); + res = func(res,mat.coeff(index)); } else // too small to vectorize anything. // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize. { - res = eval.coeff(0); + res = mat.coeff(0); for(Index index = 1; index < size; ++index) - res = func(res,eval.coeff(index)); + res = func(res,mat.coeff(index)); } return res; @@ -283,108 +265,130 @@ struct redux_impl }; // NOTE: for SliceVectorizedTraversal we simply bypass unrolling -template -struct redux_impl +template +struct redux_impl { - typedef typename Evaluator::Scalar Scalar; - typedef typename redux_traits::PacketType PacketType; + typedef typename Derived::Scalar Scalar; + typedef typename redux_traits::PacketType PacketType; - template - EIGEN_DEVICE_FUNC static Scalar run(const Evaluator &eval, const Func& func, const XprType& xpr) + EIGEN_DEVICE_FUNC static Scalar run(const Derived &mat, const Func& func) { - eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); - const Index innerSize = xpr.innerSize(); - const Index outerSize = xpr.outerSize(); + eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); + const Index innerSize = mat.innerSize(); + const Index outerSize = mat.outerSize(); enum { - packetSize = redux_traits::PacketSize + packetSize = redux_traits::PacketSize }; const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize; Scalar res; if(packetedInnerSize) { - PacketType packet_res = eval.template packet(0,0); + PacketType packet_res = mat.template packet(0,0); for(Index j=0; j(j,i)); + packet_res = func.packetOp(packet_res, mat.template packetByOuterInner(j,i)); res = func.predux(packet_res); for(Index j=0; j::run(eval, func, xpr); + res = redux_impl::run(mat, func); } return res; } }; -template -struct redux_impl +template +struct redux_impl { - typedef typename Evaluator::Scalar Scalar; + typedef typename Derived::Scalar Scalar; - typedef typename redux_traits::PacketType PacketType; + typedef typename redux_traits::PacketType PacketScalar; enum { - PacketSize = redux_traits::PacketSize, - Size = Evaluator::SizeAtCompileTime, + PacketSize = redux_traits::PacketSize, + Size = Derived::SizeAtCompileTime, VectorizedSize = (Size / PacketSize) * PacketSize }; - - template - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE - Scalar run(const Evaluator &eval, const Func& func, const XprType &xpr) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func) { - EIGEN_ONLY_USED_FOR_DEBUG(xpr) - eigen_assert(xpr.rows()>0 && xpr.cols()>0 && "you are using an empty matrix"); + eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix"); if (VectorizedSize > 0) { - Scalar res = func.predux(redux_vec_unroller::template run(eval,func)); + Scalar res = func.predux(redux_vec_unroller::run(mat,func)); if (VectorizedSize != Size) - res = func(res,redux_novec_unroller::run(eval,func)); + res = func(res,redux_novec_unroller::run(mat,func)); return res; } else { - return redux_novec_unroller::run(eval,func); + return redux_novec_unroller::run(mat,func); } } }; // evaluator adaptor template -class redux_evaluator : public internal::evaluator<_XprType> +class redux_evaluator { - typedef internal::evaluator<_XprType> Base; public: typedef _XprType XprType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - explicit redux_evaluator(const XprType &xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC explicit redux_evaluator(const XprType &xpr) : m_evaluator(xpr), m_xpr(xpr) {} typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename XprType::PacketScalar PacketScalar; + typedef typename XprType::PacketReturnType PacketReturnType; enum { MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime, MaxColsAtCompileTime = XprType::MaxColsAtCompileTime, // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime from the evaluator - Flags = Base::Flags & ~DirectAccessBit, + Flags = evaluator::Flags & ~DirectAccessBit, IsRowMajor = XprType::IsRowMajor, SizeAtCompileTime = XprType::SizeAtCompileTime, - InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime + InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime, + CoeffReadCost = evaluator::CoeffReadCost, + Alignment = evaluator::Alignment }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC Index rows() const { return m_xpr.rows(); } + EIGEN_DEVICE_FUNC Index cols() const { return m_xpr.cols(); } + EIGEN_DEVICE_FUNC Index size() const { return m_xpr.size(); } + EIGEN_DEVICE_FUNC Index innerSize() const { return m_xpr.innerSize(); } + EIGEN_DEVICE_FUNC Index outerSize() const { return m_xpr.outerSize(); } + + EIGEN_DEVICE_FUNC + CoeffReturnType coeff(Index row, Index col) const + { return m_evaluator.coeff(row, col); } + + EIGEN_DEVICE_FUNC + CoeffReturnType coeff(Index index) const + { return m_evaluator.coeff(index); } + + template + PacketType packet(Index row, Index col) const + { return m_evaluator.template packet(row, col); } + + template + PacketType packet(Index index) const + { return m_evaluator.template packet(index); } + + EIGEN_DEVICE_FUNC CoeffReturnType coeffByOuterInner(Index outer, Index inner) const - { return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + { return m_evaluator.coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const - { return Base::template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + { return m_evaluator.template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + const XprType & nestedExpression() const { return m_xpr; } + +protected: + internal::evaluator m_evaluator; + const XprType &m_xpr; }; } // end namespace internal @@ -399,42 +403,36 @@ public: * The template parameter \a BinaryOp is the type of the functor \a func which must be * an associative operator. Both current C++98 and C++11 functor styles are handled. * - * \warning the matrix must be not empty, otherwise an assertion is triggered. - * * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise() */ template template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); typedef typename internal::redux_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); - - // The initial expression is passed to the reducer as an additional argument instead of - // passing it as a member of redux_evaluator to help - return internal::redux_impl::run(thisEval, func, derived()); + + return internal::redux_impl::run(thisEval, func); } /** \returns the minimum of all coefficients of \c *this. - * \warning the matrix must be not empty, otherwise an assertion is triggered. * \warning the result is undefined if \c *this contains NaN. */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::minCoeff() const { return derived().redux(Eigen::internal::scalar_min_op()); } /** \returns the maximum of all coefficients of \c *this. - * \warning the matrix must be not empty, otherwise an assertion is triggered. * \warning the result is undefined if \c *this contains NaN. */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::maxCoeff() const { return derived().redux(Eigen::internal::scalar_max_op()); @@ -447,7 +445,7 @@ DenseBase::maxCoeff() const * \sa trace(), prod(), mean() */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::sum() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -460,7 +458,7 @@ DenseBase::sum() const * \sa trace(), prod(), sum() */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::mean() const { #ifdef __INTEL_COMPILER @@ -481,7 +479,7 @@ DenseBase::mean() const * \sa sum(), mean(), trace() */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::prod() const { if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0)) @@ -496,7 +494,7 @@ DenseBase::prod() const * \sa diagonal(), sum() */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits::Scalar +EIGEN_STRONG_INLINE typename internal::traits::Scalar MatrixBase::trace() const { return derived().diagonal().sum(); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Ref.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Ref.h index 172c8ffb6..17a1496b8 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Ref.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Ref.h @@ -187,8 +187,6 @@ protected: * void foo(const Ref >& A) { foo_impl(A); } * \endcode * - * See also the following stackoverflow questions for further references: - * - Correct usage of the Eigen::Ref<> class * * \sa PlainObjectBase::Map(), \ref TopicStorageOrders */ diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Replicate.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Replicate.h index 0b2d6d743..9960ef884 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Replicate.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Replicate.h @@ -115,7 +115,7 @@ template class Replicate */ template template -EIGEN_DEVICE_FUNC const Replicate +const Replicate DenseBase::replicate() const { return Replicate(derived()); @@ -130,7 +130,7 @@ DenseBase::replicate() const * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate */ template -EIGEN_DEVICE_FUNC const typename VectorwiseOp::ReplicateReturnType +const typename VectorwiseOp::ReplicateReturnType VectorwiseOp::replicate(Index factor) const { return typename VectorwiseOp::ReplicateReturnType diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Reshaped.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Reshaped.h deleted file mode 100644 index a78fd880f..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Reshaped.h +++ /dev/null @@ -1,453 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2017 Gael Guennebaud -// Copyright (C) 2014 yoco -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_RESHAPED_H -#define EIGEN_RESHAPED_H - -namespace Eigen { -namespace internal { - -/** \class Reshaped - * \ingroup Core_Module - * - * \brief Expression of a fixed-size or dynamic-size reshape - * - * \tparam XprType the type of the expression in which we are taking a reshape - * \tparam Rows the number of rows of the reshape we are taking at compile time (optional) - * \tparam Cols the number of columns of the reshape we are taking at compile time (optional) - * \tparam Order can be ColMajor or RowMajor, default is ColMajor. - * - * This class represents an expression of either a fixed-size or dynamic-size reshape. - * It is the return type of DenseBase::reshaped(NRowsType,NColsType) and - * most of the time this is the only way it is used. - * - * However, in C++98, if you want to directly maniputate reshaped expressions, - * for instance if you want to write a function returning such an expression, you - * will need to use this class. In C++11, it is advised to use the \em auto - * keyword for such use cases. - * - * Here is an example illustrating the dynamic case: - * \include class_Reshaped.cpp - * Output: \verbinclude class_Reshaped.out - * - * Here is an example illustrating the fixed-size case: - * \include class_FixedReshaped.cpp - * Output: \verbinclude class_FixedReshaped.out - * - * \sa DenseBase::reshaped(NRowsType,NColsType) - */ - -template -struct traits > : traits -{ - typedef typename traits::Scalar Scalar; - typedef typename traits::StorageKind StorageKind; - typedef typename traits::XprKind XprKind; - enum{ - MatrixRows = traits::RowsAtCompileTime, - MatrixCols = traits::ColsAtCompileTime, - RowsAtCompileTime = Rows, - ColsAtCompileTime = Cols, - MaxRowsAtCompileTime = Rows, - MaxColsAtCompileTime = Cols, - XpxStorageOrder = ((int(traits::Flags) & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor, - ReshapedStorageOrder = (RowsAtCompileTime == 1 && ColsAtCompileTime != 1) ? RowMajor - : (ColsAtCompileTime == 1 && RowsAtCompileTime != 1) ? ColMajor - : XpxStorageOrder, - HasSameStorageOrderAsXprType = (ReshapedStorageOrder == XpxStorageOrder), - InnerSize = (ReshapedStorageOrder==int(RowMajor)) ? int(ColsAtCompileTime) : int(RowsAtCompileTime), - InnerStrideAtCompileTime = HasSameStorageOrderAsXprType - ? int(inner_stride_at_compile_time::ret) - : Dynamic, - OuterStrideAtCompileTime = Dynamic, - - HasDirectAccess = internal::has_direct_access::ret - && (Order==int(XpxStorageOrder)) - && ((evaluator::Flags&LinearAccessBit)==LinearAccessBit), - - MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits::size) == 0) - && (InnerStrideAtCompileTime == 1) - ? PacketAccessBit : 0, - //MaskAlignedBit = ((OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0, - FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0, - FlagsLvalueBit = is_lvalue::value ? LvalueBit : 0, - FlagsRowMajorBit = (ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0, - FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0, - Flags0 = traits::Flags & ( (HereditaryBits & ~RowMajorBit) | MaskPacketAccessBit), - - Flags = (Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit | FlagsDirectAccessBit) - }; -}; - -template class ReshapedImpl_dense; - -} // end namespace internal - -template class ReshapedImpl; - -template class Reshaped - : public ReshapedImpl::StorageKind> -{ - typedef ReshapedImpl::StorageKind> Impl; - public: - //typedef typename Impl::Base Base; - typedef Impl Base; - EIGEN_GENERIC_PUBLIC_INTERFACE(Reshaped) - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reshaped) - - /** Fixed-size constructor - */ - EIGEN_DEVICE_FUNC - inline Reshaped(XprType& xpr) - : Impl(xpr) - { - EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE) - eigen_assert(Rows * Cols == xpr.rows() * xpr.cols()); - } - - /** Dynamic-size constructor - */ - EIGEN_DEVICE_FUNC - inline Reshaped(XprType& xpr, - Index reshapeRows, Index reshapeCols) - : Impl(xpr, reshapeRows, reshapeCols) - { - eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==reshapeRows) - && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==reshapeCols)); - eigen_assert(reshapeRows * reshapeCols == xpr.rows() * xpr.cols()); - } -}; - -// The generic default implementation for dense reshape simply forward to the internal::ReshapedImpl_dense -// that must be specialized for direct and non-direct access... -template -class ReshapedImpl - : public internal::ReshapedImpl_dense >::HasDirectAccess> -{ - typedef internal::ReshapedImpl_dense >::HasDirectAccess> Impl; - public: - typedef Impl Base; - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl) - EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr) : Impl(xpr) {} - EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols) - : Impl(xpr, reshapeRows, reshapeCols) {} -}; - -namespace internal { - -/** \internal Internal implementation of dense Reshaped in the general case. */ -template -class ReshapedImpl_dense - : public internal::dense_xpr_base >::type -{ - typedef Reshaped ReshapedType; - public: - - typedef typename internal::dense_xpr_base::type Base; - EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType) - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense) - - typedef typename internal::ref_selector::non_const_type MatrixTypeNested; - typedef typename internal::remove_all::type NestedExpression; - - class InnerIterator; - - /** Fixed-size constructor - */ - EIGEN_DEVICE_FUNC - inline ReshapedImpl_dense(XprType& xpr) - : m_xpr(xpr), m_rows(Rows), m_cols(Cols) - {} - - /** Dynamic-size constructor - */ - EIGEN_DEVICE_FUNC - inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols) - : m_xpr(xpr), m_rows(nRows), m_cols(nCols) - {} - - EIGEN_DEVICE_FUNC Index rows() const { return m_rows; } - EIGEN_DEVICE_FUNC Index cols() const { return m_cols; } - - #ifdef EIGEN_PARSED_BY_DOXYGEN - /** \sa MapBase::data() */ - EIGEN_DEVICE_FUNC inline const Scalar* data() const; - EIGEN_DEVICE_FUNC inline Index innerStride() const; - EIGEN_DEVICE_FUNC inline Index outerStride() const; - #endif - - /** \returns the nested expression */ - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& - nestedExpression() const { return m_xpr; } - - /** \returns the nested expression */ - EIGEN_DEVICE_FUNC - typename internal::remove_reference::type& - nestedExpression() { return m_xpr; } - - protected: - - MatrixTypeNested m_xpr; - const internal::variable_if_dynamic m_rows; - const internal::variable_if_dynamic m_cols; -}; - - -/** \internal Internal implementation of dense Reshaped in the direct access case. */ -template -class ReshapedImpl_dense - : public MapBase > -{ - typedef Reshaped ReshapedType; - typedef typename internal::ref_selector::non_const_type XprTypeNested; - public: - - typedef MapBase Base; - EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType) - EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense) - - /** Fixed-size constructor - */ - EIGEN_DEVICE_FUNC - inline ReshapedImpl_dense(XprType& xpr) - : Base(xpr.data()), m_xpr(xpr) - {} - - /** Dynamic-size constructor - */ - EIGEN_DEVICE_FUNC - inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols) - : Base(xpr.data(), nRows, nCols), - m_xpr(xpr) - {} - - EIGEN_DEVICE_FUNC - const typename internal::remove_all::type& nestedExpression() const - { - return m_xpr; - } - - EIGEN_DEVICE_FUNC - XprType& nestedExpression() { return m_xpr; } - - /** \sa MapBase::innerStride() */ - EIGEN_DEVICE_FUNC - inline Index innerStride() const - { - return m_xpr.innerStride(); - } - - /** \sa MapBase::outerStride() */ - EIGEN_DEVICE_FUNC - inline Index outerStride() const - { - return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows(); - } - - protected: - - XprTypeNested m_xpr; -}; - -// Evaluators -template struct reshaped_evaluator; - -template -struct evaluator > - : reshaped_evaluator >::HasDirectAccess> -{ - typedef Reshaped XprType; - typedef typename XprType::Scalar Scalar; - // TODO: should check for smaller packet types - typedef typename packet_traits::type PacketScalar; - - enum { - CoeffReadCost = evaluator::CoeffReadCost, - HasDirectAccess = traits::HasDirectAccess, - -// RowsAtCompileTime = traits::RowsAtCompileTime, -// ColsAtCompileTime = traits::ColsAtCompileTime, -// MaxRowsAtCompileTime = traits::MaxRowsAtCompileTime, -// MaxColsAtCompileTime = traits::MaxColsAtCompileTime, -// -// InnerStrideAtCompileTime = traits::HasSameStorageOrderAsXprType -// ? int(inner_stride_at_compile_time::ret) -// : Dynamic, -// OuterStrideAtCompileTime = Dynamic, - - FlagsLinearAccessBit = (traits::RowsAtCompileTime == 1 || traits::ColsAtCompileTime == 1 || HasDirectAccess) ? LinearAccessBit : 0, - FlagsRowMajorBit = (traits::ReshapedStorageOrder==int(RowMajor)) ? RowMajorBit : 0, - FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0, - Flags0 = evaluator::Flags & (HereditaryBits & ~RowMajorBit), - Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit | FlagsDirectAccessBit, - - PacketAlignment = unpacket_traits::alignment, - Alignment = evaluator::Alignment - }; - typedef reshaped_evaluator reshaped_evaluator_type; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr) - { - EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); - } -}; - -template -struct reshaped_evaluator - : evaluator_base > -{ - typedef Reshaped XprType; - - enum { - CoeffReadCost = evaluator::CoeffReadCost /* TODO + cost of index computations */, - - Flags = (evaluator::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)), - - Alignment = 0 - }; - - EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) - { - EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); - } - - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - - typedef std::pair RowCol; - - inline RowCol index_remap(Index rowId, Index colId) const - { - if(Order==ColMajor) - { - const Index nth_elem_idx = colId * m_xpr.rows() + rowId; - return RowCol(nth_elem_idx % m_xpr.nestedExpression().rows(), - nth_elem_idx / m_xpr.nestedExpression().rows()); - } - else - { - const Index nth_elem_idx = colId + rowId * m_xpr.cols(); - return RowCol(nth_elem_idx / m_xpr.nestedExpression().cols(), - nth_elem_idx % m_xpr.nestedExpression().cols()); - } - } - - EIGEN_DEVICE_FUNC - inline Scalar& coeffRef(Index rowId, Index colId) - { - EIGEN_STATIC_ASSERT_LVALUE(XprType) - const RowCol row_col = index_remap(rowId, colId); - return m_argImpl.coeffRef(row_col.first, row_col.second); - } - - EIGEN_DEVICE_FUNC - inline const Scalar& coeffRef(Index rowId, Index colId) const - { - const RowCol row_col = index_remap(rowId, colId); - return m_argImpl.coeffRef(row_col.first, row_col.second); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const - { - const RowCol row_col = index_remap(rowId, colId); - return m_argImpl.coeff(row_col.first, row_col.second); - } - - EIGEN_DEVICE_FUNC - inline Scalar& coeffRef(Index index) - { - EIGEN_STATIC_ASSERT_LVALUE(XprType) - const RowCol row_col = index_remap(Rows == 1 ? 0 : index, - Rows == 1 ? index : 0); - return m_argImpl.coeffRef(row_col.first, row_col.second); - - } - - EIGEN_DEVICE_FUNC - inline const Scalar& coeffRef(Index index) const - { - const RowCol row_col = index_remap(Rows == 1 ? 0 : index, - Rows == 1 ? index : 0); - return m_argImpl.coeffRef(row_col.first, row_col.second); - } - - EIGEN_DEVICE_FUNC - inline const CoeffReturnType coeff(Index index) const - { - const RowCol row_col = index_remap(Rows == 1 ? 0 : index, - Rows == 1 ? index : 0); - return m_argImpl.coeff(row_col.first, row_col.second); - } -#if 0 - EIGEN_DEVICE_FUNC - template - inline PacketScalar packet(Index rowId, Index colId) const - { - const RowCol row_col = index_remap(rowId, colId); - return m_argImpl.template packet(row_col.first, row_col.second); - - } - - template - EIGEN_DEVICE_FUNC - inline void writePacket(Index rowId, Index colId, const PacketScalar& val) - { - const RowCol row_col = index_remap(rowId, colId); - m_argImpl.const_cast_derived().template writePacket - (row_col.first, row_col.second, val); - } - - template - EIGEN_DEVICE_FUNC - inline PacketScalar packet(Index index) const - { - const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? index : 0); - return m_argImpl.template packet(row_col.first, row_col.second); - } - - template - EIGEN_DEVICE_FUNC - inline void writePacket(Index index, const PacketScalar& val) - { - const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? index : 0); - return m_argImpl.template packet(row_col.first, row_col.second, val); - } -#endif -protected: - - evaluator m_argImpl; - const XprType& m_xpr; - -}; - -template -struct reshaped_evaluator -: mapbase_evaluator, - typename Reshaped::PlainObject> -{ - typedef Reshaped XprType; - typedef typename XprType::Scalar Scalar; - - EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) - : mapbase_evaluator(xpr) - { - // TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta lifetime - eigen_assert(((internal::UIntPtr(xpr.data()) % EIGEN_PLAIN_ENUM_MAX(1,evaluator::Alignment)) == 0) && "data is not aligned"); - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_RESHAPED_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/ReturnByValue.h b/uppsrc/plugin/Eigen/Eigen/src/Core/ReturnByValue.h index 11dc86d07..c44b7673b 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/ReturnByValue.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/ReturnByValue.h @@ -79,7 +79,7 @@ template class ReturnByValue template template -EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const ReturnByValue& other) +Derived& DenseBase::operator=(const ReturnByValue& other) { other.evalTo(derived()); return derived(); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Reverse.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Reverse.h index 853093923..0640cda2a 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Reverse.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Reverse.h @@ -114,7 +114,7 @@ template class Reverse * */ template -EIGEN_DEVICE_FUNC inline typename DenseBase::ReverseReturnType +inline typename DenseBase::ReverseReturnType DenseBase::reverse() { return ReverseReturnType(derived()); @@ -136,7 +136,7 @@ DenseBase::reverse() * * \sa VectorwiseOp::reverseInPlace(), reverse() */ template -EIGEN_DEVICE_FUNC inline void DenseBase::reverseInPlace() +inline void DenseBase::reverseInPlace() { if(cols()>rows()) { @@ -171,10 +171,8 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { - const int HalfAtCompileTime = ExpressionType::RowsAtCompileTime==Dynamic?Dynamic:ExpressionType::RowsAtCompileTime/2; Index half = xpr.rows()/2; - xpr.topRows(fix(half)) - .swap(xpr.bottomRows(fix(half)).colwise().reverse()); + xpr.topRows(half).swap(xpr.bottomRows(half).colwise().reverse()); } }; @@ -184,10 +182,8 @@ struct vectorwise_reverse_inplace_impl template static void run(ExpressionType &xpr) { - const int HalfAtCompileTime = ExpressionType::ColsAtCompileTime==Dynamic?Dynamic:ExpressionType::ColsAtCompileTime/2; Index half = xpr.cols()/2; - xpr.leftCols(fix(half)) - .swap(xpr.rightCols(fix(half)).rowwise().reverse()); + xpr.leftCols(half).swap(xpr.rightCols(half).rowwise().reverse()); } }; @@ -205,9 +201,9 @@ struct vectorwise_reverse_inplace_impl * * \sa DenseBase::reverseInPlace(), reverse() */ template -EIGEN_DEVICE_FUNC void VectorwiseOp::reverseInPlace() +void VectorwiseOp::reverseInPlace() { - internal::vectorwise_reverse_inplace_impl::run(m_matrix); + internal::vectorwise_reverse_inplace_impl::run(_expression().const_cast_derived()); } } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/SelfAdjointView.h b/uppsrc/plugin/Eigen/Eigen/src/Core/SelfAdjointView.h index 2173799d9..b2e51f37a 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/SelfAdjointView.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/SelfAdjointView.h @@ -61,7 +61,6 @@ template class SelfAdjointView typedef typename internal::traits::Scalar Scalar; typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::remove_all::type MatrixConjugateReturnType; - typedef SelfAdjointView::type, UpLo> ConstSelfAdjointView; enum { Mode = internal::traits::Mode, @@ -198,18 +197,6 @@ template class SelfAdjointView inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } - /** \returns an expression of the complex conjugate of \c *this if Cond==true, - * returns \c *this otherwise. - */ - template - EIGEN_DEVICE_FUNC - inline typename internal::conditional::type - conjugateIf() const - { - typedef typename internal::conditional::type ReturnType; - return ReturnType(m_matrix.template conjugateIf()); - } - typedef SelfAdjointView AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC @@ -337,7 +324,7 @@ public: /** This is the const version of MatrixBase::selfadjointView() */ template template -EIGEN_DEVICE_FUNC typename MatrixBase::template ConstSelfAdjointViewReturnType::Type +typename MatrixBase::template ConstSelfAdjointViewReturnType::Type MatrixBase::selfadjointView() const { return typename ConstSelfAdjointViewReturnType::Type(derived()); @@ -354,7 +341,7 @@ MatrixBase::selfadjointView() const */ template template -EIGEN_DEVICE_FUNC typename MatrixBase::template SelfAdjointViewReturnType::Type +typename MatrixBase::template SelfAdjointViewReturnType::Type MatrixBase::selfadjointView() { return typename SelfAdjointViewReturnType::Type(derived()); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Solve.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Solve.h index ec4b4a987..a8daea511 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Solve.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Solve.h @@ -19,7 +19,7 @@ template class S * * \brief Pseudo expression representing a solving operation * - * \tparam Decomposition the type of the matrix or decomposition object + * \tparam Decomposition the type of the matrix or decomposion object * \tparam Rhstype the type of the right-hand side * * This class represents an expression of A.solve(B) @@ -181,7 +181,7 @@ struct Assignment { #ifndef EIGEN_PARSED_BY_DOXYGEN template template -EIGEN_DEVICE_FUNC void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const +void TriangularViewImpl::solveInPlace(const MatrixBase& _other) const { OtherDerived& other = _other.const_cast_derived(); eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) ); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/SolverBase.h b/uppsrc/plugin/Eigen/Eigen/src/Core/SolverBase.h index 501461042..8a4adc229 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/SolverBase.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/SolverBase.h @@ -14,35 +14,8 @@ namespace Eigen { namespace internal { -template -struct solve_assertion { - template - static void run(const Derived& solver, const Rhs& b) { solver.template _check_solve_assertion(b); } -}; -template -struct solve_assertion > -{ - typedef Transpose type; - template - static void run(const type& transpose, const Rhs& b) - { - internal::solve_assertion::type>::template run(transpose.nestedExpression(), b); - } -}; - -template -struct solve_assertion, const Transpose > > -{ - typedef CwiseUnaryOp, const Transpose > type; - - template - static void run(const type& adjoint, const Rhs& b) - { - internal::solve_assertion >::type>::template run(adjoint.nestedExpression(), b); - } -}; } // end namespace internal /** \class SolverBase @@ -62,7 +35,7 @@ struct solve_assertion * * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation errors. * - * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR, class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase + * \sa class PartialPivLU, class FullPivLU */ template class SolverBase : public EigenBase @@ -73,9 +46,6 @@ class SolverBase : public EigenBase typedef typename internal::traits::Scalar Scalar; typedef Scalar CoeffReturnType; - template - friend struct internal::solve_assertion; - enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime, @@ -86,8 +56,7 @@ class SolverBase : public EigenBase MaxSizeAtCompileTime = (internal::size_at_compile_time::MaxRowsAtCompileTime, internal::traits::MaxColsAtCompileTime>::ret), IsVectorAtCompileTime = internal::traits::MaxRowsAtCompileTime == 1 - || internal::traits::MaxColsAtCompileTime == 1, - NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2 + || internal::traits::MaxColsAtCompileTime == 1 }; /** Default constructor */ @@ -105,7 +74,7 @@ class SolverBase : public EigenBase inline const Solve solve(const MatrixBase& b) const { - internal::solve_assertion::type>::template run(derived(), b); + eigen_assert(derived().rows()==b.rows() && "solve(): invalid number of rows of the right hand side matrix b"); return Solve(derived(), b.derived()); } @@ -143,13 +112,6 @@ class SolverBase : public EigenBase } protected: - - template - void _check_solve_assertion(const Rhs& b) const { - EIGEN_ONLY_USED_FOR_DEBUG(b); - eigen_assert(derived().m_isInitialized && "Solver is not initialized."); - eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "SolverBase::solve(): invalid number of rows of the right hand side matrix b"); - } }; namespace internal { diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/StableNorm.h b/uppsrc/plugin/Eigen/Eigen/src/Core/StableNorm.h index 77ea3c261..88c8d9890 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/StableNorm.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/StableNorm.h @@ -50,71 +50,6 @@ inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& sc ssq += (bl*invScale).squaredNorm(); } -template -void stable_norm_impl_inner_step(const VectorType &vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale) -{ - typedef typename VectorType::Scalar Scalar; - const Index blockSize = 4096; - - typedef typename internal::nested_eval::type VectorTypeCopy; - typedef typename internal::remove_all::type VectorTypeCopyClean; - const VectorTypeCopy copy(vec); - - enum { - CanAlign = ( (int(VectorTypeCopyClean::Flags)&DirectAccessBit) - || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough - ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization - }; - typedef typename internal::conditional, internal::evaluator::Alignment>, - typename VectorTypeCopyClean::ConstSegmentReturnType>::type SegmentWrapper; - Index n = vec.size(); - - Index bi = internal::first_default_aligned(copy); - if (bi>0) - internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); - for (; bi -typename VectorType::RealScalar -stable_norm_impl(const VectorType &vec, typename enable_if::type* = 0 ) -{ - using std::sqrt; - using std::abs; - - Index n = vec.size(); - - if(n==1) - return abs(vec.coeff(0)); - - typedef typename VectorType::RealScalar RealScalar; - RealScalar scale(0); - RealScalar invScale(1); - RealScalar ssq(0); // sum of squares - - stable_norm_impl_inner_step(vec, ssq, scale, invScale); - - return scale * sqrt(ssq); -} - -template -typename MatrixType::RealScalar -stable_norm_impl(const MatrixType &mat, typename enable_if::type* = 0 ) -{ - using std::sqrt; - - typedef typename MatrixType::RealScalar RealScalar; - RealScalar scale(0); - RealScalar invScale(1); - RealScalar ssq(0); // sum of squares - - for(Index j=0; j inline typename NumTraits::Scalar>::Real blueNorm_impl(const EigenBase& _vec) @@ -139,7 +74,7 @@ blueNorm_impl(const EigenBase& _vec) // are used. For any specific computer, each of the assignment // statements can be replaced ibeta = std::numeric_limits::radix; // base for floating-point numbers - it = NumTraits::digits(); // number of base-beta digits in mantissa + it = std::numeric_limits::digits; // number of base-beta digits in mantissa iemin = std::numeric_limits::min_exponent; // minimum exponent iemax = std::numeric_limits::max_exponent; // maximum exponent rbig = (std::numeric_limits::max)(); // largest floating-point number @@ -163,16 +98,12 @@ blueNorm_impl(const EigenBase& _vec) RealScalar asml = RealScalar(0); RealScalar amed = RealScalar(0); RealScalar abig = RealScalar(0); - - for(Index j=0; j ab2) abig += numext::abs2(ax*s2m); - else if(ax < b1) asml += numext::abs2(ax*s1m); - else amed += numext::abs2(ax); - } + RealScalar ax = abs(it.value()); + if(ax > ab2) abig += numext::abs2(ax*s2m); + else if(ax < b1) asml += numext::abs2(ax*s1m); + else amed += numext::abs2(ax); } if(amed!=amed) return amed; // we got a NaN @@ -225,7 +156,36 @@ template inline typename NumTraits::Scalar>::Real MatrixBase::stableNorm() const { - return internal::stable_norm_impl(derived()); + using std::sqrt; + using std::abs; + const Index blockSize = 4096; + RealScalar scale(0); + RealScalar invScale(1); + RealScalar ssq(0); // sum of square + + typedef typename internal::nested_eval::type DerivedCopy; + typedef typename internal::remove_all::type DerivedCopyClean; + const DerivedCopy copy(derived()); + + enum { + CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) + || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough + ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization + }; + typedef typename internal::conditional, internal::evaluator::Alignment>, + typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; + Index n = size(); + + if(n==1) + return abs(this->coeff(0)); + + Index bi = internal::first_default_aligned(copy); + if (bi>0) + internal::stable_norm_kernel(copy.head(bi), ssq, scale, invScale); + for (; bi inline typename NumTraits::Scalar>::Real MatrixBase::hypotNorm() const { - if(size()==1) - return numext::abs(coeff(0,0)); - else - return this->cwiseAbs().redux(internal::scalar_hypot_op()); + return this->cwiseAbs().redux(internal::scalar_hypot_op()); } } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/StlIterators.h b/uppsrc/plugin/Eigen/Eigen/src/Core/StlIterators.h deleted file mode 100644 index 0d8bd1aa3..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/StlIterators.h +++ /dev/null @@ -1,331 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2018 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -namespace Eigen { - -namespace internal { - -template -struct indexed_based_stl_iterator_traits; - -template -class indexed_based_stl_iterator_base -{ -protected: - typedef indexed_based_stl_iterator_traits traits; - typedef typename traits::XprType XprType; - typedef indexed_based_stl_iterator_base non_const_iterator; - typedef indexed_based_stl_iterator_base const_iterator; - typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; - // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: - friend class indexed_based_stl_iterator_base; - friend class indexed_based_stl_iterator_base; -public: - typedef Index difference_type; - typedef std::random_access_iterator_tag iterator_category; - - indexed_based_stl_iterator_base() : mp_xpr(0), m_index(0) {} - indexed_based_stl_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {} - - indexed_based_stl_iterator_base(const non_const_iterator& other) - : mp_xpr(other.mp_xpr), m_index(other.m_index) - {} - - indexed_based_stl_iterator_base& operator=(const non_const_iterator& other) - { - mp_xpr = other.mp_xpr; - m_index = other.m_index; - return *this; - } - - Derived& operator++() { ++m_index; return derived(); } - Derived& operator--() { --m_index; return derived(); } - - Derived operator++(int) { Derived prev(derived()); operator++(); return prev;} - Derived operator--(int) { Derived prev(derived()); operator--(); return prev;} - - friend Derived operator+(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret += b; return ret; } - friend Derived operator-(const indexed_based_stl_iterator_base& a, Index b) { Derived ret(a.derived()); ret -= b; return ret; } - friend Derived operator+(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret += a; return ret; } - friend Derived operator-(Index a, const indexed_based_stl_iterator_base& b) { Derived ret(b.derived()); ret -= a; return ret; } - - Derived& operator+=(Index b) { m_index += b; return derived(); } - Derived& operator-=(Index b) { m_index -= b; return derived(); } - - difference_type operator-(const indexed_based_stl_iterator_base& other) const - { - eigen_assert(mp_xpr == other.mp_xpr); - return m_index - other.m_index; - } - - difference_type operator-(const other_iterator& other) const - { - eigen_assert(mp_xpr == other.mp_xpr); - return m_index - other.m_index; - } - - bool operator==(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } - bool operator!=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } - bool operator< (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } - bool operator<=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } - bool operator> (const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } - bool operator>=(const indexed_based_stl_iterator_base& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } - - bool operator==(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index == other.m_index; } - bool operator!=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index != other.m_index; } - bool operator< (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index < other.m_index; } - bool operator<=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index <= other.m_index; } - bool operator> (const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index > other.m_index; } - bool operator>=(const other_iterator& other) const { eigen_assert(mp_xpr == other.mp_xpr); return m_index >= other.m_index; } - -protected: - - Derived& derived() { return static_cast(*this); } - const Derived& derived() const { return static_cast(*this); } - - XprType *mp_xpr; - Index m_index; -}; - -template -class pointer_based_stl_iterator -{ - enum { is_lvalue = internal::is_lvalue::value }; - typedef pointer_based_stl_iterator::type> non_const_iterator; - typedef pointer_based_stl_iterator::type> const_iterator; - typedef typename internal::conditional::value,non_const_iterator,const_iterator>::type other_iterator; - // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class: - friend class pointer_based_stl_iterator::type>; - friend class pointer_based_stl_iterator::type>; -public: - typedef Index difference_type; - typedef typename XprType::Scalar value_type; - typedef std::random_access_iterator_tag iterator_category; - typedef typename internal::conditional::type pointer; - typedef typename internal::conditional::type reference; - - - pointer_based_stl_iterator() : m_ptr(0) {} - pointer_based_stl_iterator(XprType& xpr, Index index) : m_incr(xpr.innerStride()) - { - m_ptr = xpr.data() + index * m_incr.value(); - } - - pointer_based_stl_iterator(const non_const_iterator& other) - : m_ptr(other.m_ptr), m_incr(other.m_incr) - {} - - pointer_based_stl_iterator& operator=(const non_const_iterator& other) - { - m_ptr = other.m_ptr; - m_incr.setValue(other.m_incr); - return *this; - } - - reference operator*() const { return *m_ptr; } - reference operator[](Index i) const { return *(m_ptr+i*m_incr.value()); } - pointer operator->() const { return m_ptr; } - - pointer_based_stl_iterator& operator++() { m_ptr += m_incr.value(); return *this; } - pointer_based_stl_iterator& operator--() { m_ptr -= m_incr.value(); return *this; } - - pointer_based_stl_iterator operator++(int) { pointer_based_stl_iterator prev(*this); operator++(); return prev;} - pointer_based_stl_iterator operator--(int) { pointer_based_stl_iterator prev(*this); operator--(); return prev;} - - friend pointer_based_stl_iterator operator+(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret += b; return ret; } - friend pointer_based_stl_iterator operator-(const pointer_based_stl_iterator& a, Index b) { pointer_based_stl_iterator ret(a); ret -= b; return ret; } - friend pointer_based_stl_iterator operator+(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret += a; return ret; } - friend pointer_based_stl_iterator operator-(Index a, const pointer_based_stl_iterator& b) { pointer_based_stl_iterator ret(b); ret -= a; return ret; } - - pointer_based_stl_iterator& operator+=(Index b) { m_ptr += b*m_incr.value(); return *this; } - pointer_based_stl_iterator& operator-=(Index b) { m_ptr -= b*m_incr.value(); return *this; } - - difference_type operator-(const pointer_based_stl_iterator& other) const { - return (m_ptr - other.m_ptr)/m_incr.value(); - } - - difference_type operator-(const other_iterator& other) const { - return (m_ptr - other.m_ptr)/m_incr.value(); - } - - bool operator==(const pointer_based_stl_iterator& other) const { return m_ptr == other.m_ptr; } - bool operator!=(const pointer_based_stl_iterator& other) const { return m_ptr != other.m_ptr; } - bool operator< (const pointer_based_stl_iterator& other) const { return m_ptr < other.m_ptr; } - bool operator<=(const pointer_based_stl_iterator& other) const { return m_ptr <= other.m_ptr; } - bool operator> (const pointer_based_stl_iterator& other) const { return m_ptr > other.m_ptr; } - bool operator>=(const pointer_based_stl_iterator& other) const { return m_ptr >= other.m_ptr; } - - bool operator==(const other_iterator& other) const { return m_ptr == other.m_ptr; } - bool operator!=(const other_iterator& other) const { return m_ptr != other.m_ptr; } - bool operator< (const other_iterator& other) const { return m_ptr < other.m_ptr; } - bool operator<=(const other_iterator& other) const { return m_ptr <= other.m_ptr; } - bool operator> (const other_iterator& other) const { return m_ptr > other.m_ptr; } - bool operator>=(const other_iterator& other) const { return m_ptr >= other.m_ptr; } - -protected: - - pointer m_ptr; - internal::variable_if_dynamic m_incr; -}; - -template -struct indexed_based_stl_iterator_traits > -{ - typedef _XprType XprType; - typedef generic_randaccess_stl_iterator::type> non_const_iterator; - typedef generic_randaccess_stl_iterator::type> const_iterator; -}; - -template -class generic_randaccess_stl_iterator : public indexed_based_stl_iterator_base > -{ -public: - typedef typename XprType::Scalar value_type; - -protected: - - enum { - has_direct_access = (internal::traits::Flags & DirectAccessBit) ? 1 : 0, - is_lvalue = internal::is_lvalue::value - }; - - typedef indexed_based_stl_iterator_base Base; - using Base::m_index; - using Base::mp_xpr; - - // TODO currently const Transpose/Reshape expressions never returns const references, - // so lets return by value too. - //typedef typename internal::conditional::type read_only_ref_t; - typedef const value_type read_only_ref_t; - -public: - - typedef typename internal::conditional::type pointer; - typedef typename internal::conditional::type reference; - - generic_randaccess_stl_iterator() : Base() {} - generic_randaccess_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {} - generic_randaccess_stl_iterator(const typename Base::non_const_iterator& other) : Base(other) {} - using Base::operator=; - - reference operator*() const { return (*mp_xpr)(m_index); } - reference operator[](Index i) const { return (*mp_xpr)(m_index+i); } - pointer operator->() const { return &((*mp_xpr)(m_index)); } -}; - -template -struct indexed_based_stl_iterator_traits > -{ - typedef _XprType XprType; - typedef subvector_stl_iterator::type, Direction> non_const_iterator; - typedef subvector_stl_iterator::type, Direction> const_iterator; -}; - -template -class subvector_stl_iterator : public indexed_based_stl_iterator_base > -{ -protected: - - enum { is_lvalue = internal::is_lvalue::value }; - - typedef indexed_based_stl_iterator_base Base; - using Base::m_index; - using Base::mp_xpr; - - typedef typename internal::conditional::type SubVectorType; - typedef typename internal::conditional::type ConstSubVectorType; - - -public: - typedef typename internal::conditional::type reference; - typedef typename reference::PlainObject value_type; - -private: - class subvector_stl_iterator_ptr - { - public: - subvector_stl_iterator_ptr(const reference &subvector) : m_subvector(subvector) {} - reference* operator->() { return &m_subvector; } - private: - reference m_subvector; - }; -public: - - typedef subvector_stl_iterator_ptr pointer; - - subvector_stl_iterator() : Base() {} - subvector_stl_iterator(XprType& xpr, Index index) : Base(xpr,index) {} - - reference operator*() const { return (*mp_xpr).template subVector(m_index); } - reference operator[](Index i) const { return (*mp_xpr).template subVector(m_index+i); } - pointer operator->() const { return (*mp_xpr).template subVector(m_index); } -}; - -} // namespace internal - - -/** returns an iterator to the first element of the 1D vector or array - * \only_for_vectors - * \sa end(), cbegin() - */ -template -inline typename DenseBase::iterator DenseBase::begin() -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); - return iterator(derived(), 0); -} - -/** const version of begin() */ -template -inline typename DenseBase::const_iterator DenseBase::begin() const -{ - return cbegin(); -} - -/** returns a read-only const_iterator to the first element of the 1D vector or array - * \only_for_vectors - * \sa cend(), begin() - */ -template -inline typename DenseBase::const_iterator DenseBase::cbegin() const -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); - return const_iterator(derived(), 0); -} - -/** returns an iterator to the element following the last element of the 1D vector or array - * \only_for_vectors - * \sa begin(), cend() - */ -template -inline typename DenseBase::iterator DenseBase::end() -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); - return iterator(derived(), size()); -} - -/** const version of end() */ -template -inline typename DenseBase::const_iterator DenseBase::end() const -{ - return cend(); -} - -/** returns a read-only const_iterator to the element following the last element of the 1D vector or array - * \only_for_vectors - * \sa begin(), cend() - */ -template -inline typename DenseBase::const_iterator DenseBase::cend() const -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived); - return const_iterator(derived(), size()); -} - -} // namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Swap.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Swap.h index 180a4e5ad..d70200918 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Swap.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Swap.h @@ -30,13 +30,12 @@ public: typedef typename Base::DstXprType DstXprType; typedef swap_assign_op Functor; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) + EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorTypeT &dst, const SrcEvaluatorTypeT &src, const Functor &func, DstXprType& dstExpr) : Base(dst, src, func, dstExpr) {} template - EIGEN_STRONG_INLINE void assignPacket(Index row, Index col) + void assignPacket(Index row, Index col) { PacketType tmp = m_src.template packet(row,col); const_cast(m_src).template writePacket(row,col, m_dst.template packet(row,col)); @@ -44,7 +43,7 @@ public: } template - EIGEN_STRONG_INLINE void assignPacket(Index index) + void assignPacket(Index index) { PacketType tmp = m_src.template packet(index); const_cast(m_src).template writePacket(index, m_dst.template packet(index)); @@ -53,7 +52,7 @@ public: // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I mean no CRTP (Gael) template - EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) + void assignPacketByOuterInner(Index outer, Index inner) { Index row = Base::rowIndexByOuterInner(outer, inner); Index col = Base::colIndexByOuterInner(outer, inner); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Transpose.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Transpose.h index 49804b0ab..960dc4510 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Transpose.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Transpose.h @@ -61,27 +61,24 @@ template class Transpose typedef typename internal::remove_all::type NestedExpression; EIGEN_DEVICE_FUNC - explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {} + explicit inline Transpose(MatrixType& matrix) : m_matrix(matrix) {} EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index rows() const { return m_matrix.cols(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index cols() const { return m_matrix.rows(); } + EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.cols(); } + EIGEN_DEVICE_FUNC inline Index cols() const { return m_matrix.rows(); } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC const typename internal::remove_all::type& nestedExpression() const { return m_matrix; } /** \returns the nested expression */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_DEVICE_FUNC typename internal::remove_reference::type& nestedExpression() { return m_matrix; } /** \internal */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index nrows, Index ncols) { m_matrix.resize(ncols,nrows); } @@ -125,10 +122,8 @@ template class TransposeImpl EIGEN_DENSE_PUBLIC_INTERFACE(Transpose) EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index innerStride() const { return derived().nestedExpression().innerStride(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Index outerStride() const { return derived().nestedExpression().outerStride(); } + EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); } + EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); } typedef typename internal::conditional< internal::is_lvalue::value, @@ -136,20 +131,18 @@ template class TransposeImpl const Scalar >::type ScalarWithConstIfNotLvalue; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar* data() const { return derived().nestedExpression().data(); } + EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); } + EIGEN_DEVICE_FUNC inline const Scalar* data() const { return derived().nestedExpression().data(); } // FIXME: shall we keep the const version of coeffRef? - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar& coeffRef(Index rowId, Index colId) const + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index rowId, Index colId) const { return derived().nestedExpression().coeffRef(colId, rowId); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const Scalar& coeffRef(Index index) const + EIGEN_DEVICE_FUNC + inline const Scalar& coeffRef(Index index) const { return derived().nestedExpression().coeffRef(index); } @@ -177,8 +170,7 @@ template class TransposeImpl * * \sa transposeInPlace(), adjoint() */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Transpose +inline Transpose DenseBase::transpose() { return TransposeReturnType(derived()); @@ -190,8 +182,7 @@ DenseBase::transpose() * * \sa transposeInPlace(), adjoint() */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename DenseBase::ConstTransposeReturnType +inline typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); @@ -217,7 +208,7 @@ DenseBase::transpose() const * * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */ template -EIGEN_DEVICE_FUNC inline const typename MatrixBase::AdjointReturnType +inline const typename MatrixBase::AdjointReturnType MatrixBase::adjoint() const { return AdjointReturnType(this->transpose()); @@ -239,10 +230,11 @@ struct inplace_transpose_selector; template struct inplace_transpose_selector { // square matrix static void run(MatrixType& m) { - m.matrix().template triangularView().swap(m.matrix().transpose().template triangularView()); + m.matrix().template triangularView().swap(m.matrix().transpose()); } }; +// TODO: vectorized path is currently limited to LargestPacketSize x LargestPacketSize cases only. template struct inplace_transpose_selector { // PacketSize x PacketSize static void run(MatrixType& m) { @@ -259,66 +251,16 @@ struct inplace_transpose_selector { // PacketSize x Packet } }; - -template -void BlockedInPlaceTranspose(MatrixType& m) { - typedef typename MatrixType::Scalar Scalar; - typedef typename internal::packet_traits::type Packet; - const Index PacketSize = internal::packet_traits::size; - eigen_assert(m.rows() == m.cols()); - int row_start = 0; - for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) { - for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) { - PacketBlock A; - if (row_start == col_start) { - for (Index i=0; i(row_start + i,col_start); - internal::ptranspose(A); - for (Index i=0; i(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), A.packet[i]); - } else { - PacketBlock B; - for (Index i=0; i(row_start + i,col_start); - B.packet[i] = m.template packetByOuterInner(col_start + i, row_start); - } - internal::ptranspose(A); - internal::ptranspose(B); - for (Index i=0; i(m.rowIndexByOuterInner(row_start + i, col_start), m.colIndexByOuterInner(row_start + i,col_start), B.packet[i]); - m.template writePacket(m.rowIndexByOuterInner(col_start + i, row_start), m.colIndexByOuterInner(col_start + i,row_start), A.packet[i]); - } - } - } - } - for (Index row = row_start; row < m.rows(); ++row) { - m.matrix().row(row).head(row).swap( - m.matrix().col(row).head(row).transpose()); - } -} - template -struct inplace_transpose_selector { // non square or dynamic matrix +struct inplace_transpose_selector { // non square matrix static void run(MatrixType& m) { - typedef typename MatrixType::Scalar Scalar; - if (m.rows() == m.cols()) { - const Index PacketSize = internal::packet_traits::size; - if (!NumTraits::IsComplex && m.rows() >= PacketSize) { - if ((m.rows() % PacketSize) == 0) - BlockedInPlaceTranspose::Alignment>(m); - else - BlockedInPlaceTranspose(m); - } - else { - m.matrix().template triangularView().swap(m.matrix().transpose().template triangularView()); - } - } else { + if (m.rows()==m.cols()) + m.matrix().template triangularView().swap(m.matrix().transpose()); + else m = m.transpose().eval(); - } } }; - } // end namespace internal /** This is the "in place" version of transpose(): it replaces \c *this by its own transpose. @@ -341,7 +283,7 @@ struct inplace_transpose_selector { // non squ * * \sa transpose(), adjoint(), adjointInPlace() */ template -EIGEN_DEVICE_FUNC inline void DenseBase::transposeInPlace() +inline void DenseBase::transposeInPlace() { eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic)) && "transposeInPlace() called on a non-square non-resizable matrix"); @@ -372,7 +314,7 @@ EIGEN_DEVICE_FUNC inline void DenseBase::transposeInPlace() * * \sa transpose(), adjoint(), transposeInPlace() */ template -EIGEN_DEVICE_FUNC inline void MatrixBase::adjointInPlace() +inline void MatrixBase::adjointInPlace() { derived() = adjoint().eval(); } @@ -451,8 +393,7 @@ struct checkTransposeAliasing_impl template void check_for_aliasing(const Dst &dst, const Src &src) { - if((!Dst::IsVectorAtCompileTime) && dst.rows()>1 && dst.cols()>1) - internal::checkTransposeAliasing_impl::run(dst, src); + internal::checkTransposeAliasing_impl::run(dst, src); } } // end namespace internal diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Transpositions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Transpositions.h index f6d02f7d8..86da5af59 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Transpositions.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Transpositions.h @@ -33,6 +33,17 @@ class TranspositionsBase indices() = other.indices(); return derived(); } + + #ifndef EIGEN_PARSED_BY_DOXYGEN + /** This is a special case of the templated operator=. Its purpose is to + * prevent a default operator= from hiding the templated operator=. + */ + Derived& operator=(const TranspositionsBase& other) + { + indices() = other.indices(); + return derived(); + } + #endif /** \returns the number of transpositions */ Index size() const { return indices().size(); } @@ -73,7 +84,7 @@ class TranspositionsBase } // FIXME: do we want such methods ? - // might be useful when the target matrix expression is complex, e.g.: + // might be usefull when the target matrix expression is complex, e.g.: // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..); /* template @@ -160,6 +171,12 @@ class Transpositions : public TranspositionsBase& other) : m_indices(other.indices()) {} + #ifndef EIGEN_PARSED_BY_DOXYGEN + /** Standard copy constructor. Defined only to prevent a default copy constructor + * from hiding the other templated constructor */ + inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {} + #endif + /** Generic constructor from expression of the transposition indices. */ template explicit inline Transpositions(const MatrixBase& indices) : m_indices(indices) @@ -172,6 +189,17 @@ class Transpositions : public TranspositionsBase class TriangularBase : public EigenBase inline Index innerStride() const { return derived().innerStride(); } // dummy resize function - EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) { EIGEN_UNUSED_VARIABLE(rows); @@ -198,7 +197,6 @@ template class TriangularView typedef typename internal::traits::MatrixTypeNestedNonRef MatrixTypeNestedNonRef; typedef typename internal::remove_all::type MatrixConjugateReturnType; - typedef TriangularView::type, _Mode> ConstTriangularView; public: @@ -242,18 +240,6 @@ template class TriangularView inline const ConjugateReturnType conjugate() const { return ConjugateReturnType(m_matrix.conjugate()); } - /** \returns an expression of the complex conjugate of \c *this if Cond==true, - * returns \c *this otherwise. - */ - template - EIGEN_DEVICE_FUNC - inline typename internal::conditional::type - conjugateIf() const - { - typedef typename internal::conditional::type ReturnType; - return ReturnType(m_matrix.template conjugateIf()); - } - typedef TriangularView AdjointReturnType; /** \sa MatrixBase::adjoint() const */ EIGEN_DEVICE_FUNC @@ -447,14 +433,14 @@ template class TriangularViewImpl<_Mat TriangularViewType& operator=(const TriangularViewImpl& other) { return *this = other.derived().nestedExpression(); } - template /** \deprecated */ - EIGEN_DEPRECATED EIGEN_DEVICE_FUNC + template + EIGEN_DEVICE_FUNC void lazyAssign(const TriangularBase& other); - template /** \deprecated */ - EIGEN_DEPRECATED EIGEN_DEVICE_FUNC + template + EIGEN_DEVICE_FUNC void lazyAssign(const MatrixBase& other); #endif @@ -482,7 +468,7 @@ template class TriangularViewImpl<_Mat * \a Side==OnTheLeft (the default), or the right-inverse-multiply \a other * inverse(\c *this) if * \a Side==OnTheRight. * - * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft * * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this @@ -500,6 +486,7 @@ template class TriangularViewImpl<_Mat * \sa TriangularView::solveInPlace() */ template + EIGEN_DEVICE_FUNC inline const internal::triangular_solve_retval solve(const MatrixBase& other) const; @@ -508,7 +495,7 @@ template class TriangularViewImpl<_Mat * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. * This function will const_cast it, so constness isn't honored here. * - * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft + * Note that the template parameter \c Side can be ommitted, in which case \c Side==OnTheLeft * * See TriangularView:solve() for the details. */ @@ -534,10 +521,10 @@ template class TriangularViewImpl<_Mat call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op()); } - /** Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */ + /** \deprecated + * Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */ template - /** \deprecated */ - EIGEN_DEPRECATED EIGEN_DEVICE_FUNC + EIGEN_DEVICE_FUNC void swap(MatrixBase const & other) { EIGEN_STATIC_ASSERT_LVALUE(OtherDerived); @@ -569,7 +556,7 @@ template class TriangularViewImpl<_Mat // FIXME should we keep that possibility template template -EIGEN_DEVICE_FUNC inline TriangularView& +inline TriangularView& TriangularViewImpl::operator=(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.derived(), internal::assign_op()); @@ -579,7 +566,7 @@ TriangularViewImpl::operator=(const MatrixBase template -EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const MatrixBase& other) +void TriangularViewImpl::lazyAssign(const MatrixBase& other) { internal::call_assignment_no_alias(derived(), other.template triangularView()); } @@ -588,7 +575,7 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(c template template -EIGEN_DEVICE_FUNC inline TriangularView& +inline TriangularView& TriangularViewImpl::operator=(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); @@ -598,7 +585,7 @@ TriangularViewImpl::operator=(const TriangularBase template -EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(const TriangularBase& other) +void TriangularViewImpl::lazyAssign(const TriangularBase& other) { eigen_assert(Mode == int(OtherDerived::Mode)); internal::call_assignment_no_alias(derived(), other.derived()); @@ -613,7 +600,7 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl::lazyAssign(c * If the matrix is triangular, the opposite part is set to zero. */ template template -EIGEN_DEVICE_FUNC void TriangularBase::evalTo(MatrixBase &other) const +void TriangularBase::evalTo(MatrixBase &other) const { evalToLazy(other.derived()); } @@ -639,7 +626,6 @@ EIGEN_DEVICE_FUNC void TriangularBase::evalTo(MatrixBase */ template template -EIGEN_DEVICE_FUNC typename MatrixBase::template TriangularViewReturnType::Type MatrixBase::triangularView() { @@ -649,7 +635,6 @@ MatrixBase::triangularView() /** This is the const version of MatrixBase::triangularView() */ template template -EIGEN_DEVICE_FUNC typename MatrixBase::template ConstTriangularViewReturnType::Type MatrixBase::triangularView() const { @@ -732,7 +717,6 @@ struct unary_evaluator, IndexBased> { typedef TriangularView XprType; typedef evaluator::type> Base; - EIGEN_DEVICE_FUNC unary_evaluator(const XprType &xpr) : Base(xpr.nestedExpression()) {} }; @@ -948,7 +932,7 @@ struct triangular_assignment_loop * If the matrix is triangular, the opposite part is set to zero. */ template template -EIGEN_DEVICE_FUNC void TriangularBase::evalToLazy(MatrixBase &other) const +void TriangularBase::evalToLazy(MatrixBase &other) const { other.derived().resize(this->rows(), this->cols()); internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/VectorBlock.h b/uppsrc/plugin/Eigen/Eigen/src/Core/VectorBlock.h index 71c5b95ee..d72fbf7e9 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/VectorBlock.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/VectorBlock.h @@ -35,7 +35,7 @@ struct traits > * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment(Index) and * most of the time this is the only way it is used. * - * However, if you want to directly manipulate sub-vector expressions, + * However, if you want to directly maniputate sub-vector expressions, * for instance if you want to write a function returning such an expression, you * will need to use this class. * @@ -71,8 +71,8 @@ template class VectorBlock /** Dynamic-size constructor */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - VectorBlock(VectorType& vector, Index start, Index size) + EIGEN_DEVICE_FUNC + inline VectorBlock(VectorType& vector, Index start, Index size) : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start, IsColVector ? size : 1, IsColVector ? 1 : size) @@ -82,8 +82,8 @@ template class VectorBlock /** Fixed-size constructor */ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - VectorBlock(VectorType& vector, Index start) + EIGEN_DEVICE_FUNC + inline VectorBlock(VectorType& vector, Index start) : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/VectorwiseOp.h b/uppsrc/plugin/Eigen/Eigen/src/Core/VectorwiseOp.h index 865691b32..4fe267e9f 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/VectorwiseOp.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/VectorwiseOp.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2019 Gael Guennebaud +// Copyright (C) 2008-2010 Gael Guennebaud // Copyright (C) 2006-2008 Benoit Jacob // // This Source Code Form is subject to the terms of the Mozilla @@ -81,46 +81,39 @@ class PartialReduxExpr : public internal::dense_xpr_base< PartialReduxExpr struct partial_redux_dummy_func; - -#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,VECTORIZABLE,BINARYOP) \ - template \ - struct member_##MEMBER { \ - EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ - typedef ResultType result_type; \ - typedef BINARYOP BinaryOp; \ - template struct Cost { enum { value = COST }; }; \ - enum { Vectorizable = VECTORIZABLE }; \ - template \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ - ResultType operator()(const XprType& mat) const \ - { return mat.MEMBER(); } \ - BinaryOp binaryFunc() const { return BinaryOp(); } \ +#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \ + template \ + struct member_##MEMBER { \ + EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER) \ + typedef ResultType result_type; \ + template struct Cost \ + { enum { value = COST }; }; \ + template \ + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ + ResultType operator()(const XprType& mat) const \ + { return mat.MEMBER(); } \ } -#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST) \ - EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER,COST,0,partial_redux_dummy_func) - namespace internal { +EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits >::Cost ); +EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits::AddCost + NumTraits::MulCost); +EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits::AddCost); EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits::AddCost); +EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits::MulCost); -EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size-1)*NumTraits::AddCost, 1, internal::scalar_sum_op); -EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size-1)*NumTraits::AddCost, 1, internal::scalar_min_op); -EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size-1)*NumTraits::AddCost, 1, internal::scalar_max_op); -EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size-1)*NumTraits::MulCost, 1, internal::scalar_product_op); - -template +template struct member_lpnorm { typedef ResultType result_type; - enum { Vectorizable = 0 }; - template struct Cost + template struct Cost { enum { value = (Size+5) * NumTraits::MulCost + (Size-1)*NumTraits::AddCost }; }; EIGEN_DEVICE_FUNC member_lpnorm() {} template @@ -128,20 +121,17 @@ struct member_lpnorm { { return mat.template lpNorm

(); } }; -template +template struct member_redux { - typedef BinaryOpT BinaryOp; typedef typename result_of< BinaryOp(const Scalar&,const Scalar&) >::type result_type; - - enum { Vectorizable = functor_traits::PacketAccess }; - template struct Cost { enum { value = (Size-1) * functor_traits::Cost }; }; + template struct Cost + { enum { value = (Size-1) * functor_traits::Cost }; }; EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {} template EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase& mat) const { return mat.redux(m_functor); } - const BinaryOp& binaryFunc() const { return m_functor; } const BinaryOp m_functor; }; } @@ -149,38 +139,18 @@ struct member_redux { /** \class VectorwiseOp * \ingroup Core_Module * - * \brief Pseudo expression providing broadcasting and partial reduction operations + * \brief Pseudo expression providing partial reduction operations * * \tparam ExpressionType the type of the object on which to do partial reductions - * \tparam Direction indicates whether to operate on columns (#Vertical) or rows (#Horizontal) + * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal) * - * This class represents a pseudo expression with broadcasting and partial reduction features. + * This class represents a pseudo expression with partial reduction features. * It is the return type of DenseBase::colwise() and DenseBase::rowwise() - * and most of the time this is the only way it is explicitly used. + * and most of the time this is the only way it is used. * - * To understand the logic of rowwise/colwise expression, let's consider a generic case `A.colwise().foo()` - * where `foo` is any method of `VectorwiseOp`. This expression is equivalent to applying `foo()` to each - * column of `A` and then re-assemble the outputs in a matrix expression: - * \code [A.col(0).foo(), A.col(1).foo(), ..., A.col(A.cols()-1).foo()] \endcode - * * Example: \include MatrixBase_colwise.cpp * Output: \verbinclude MatrixBase_colwise.out * - * The begin() and end() methods are obviously exceptions to the previous rule as they - * return STL-compatible begin/end iterators to the rows or columns of the nested expression. - * Typical use cases include for-range-loop and calls to STL algorithms: - * - * Example: \include MatrixBase_colwise_iterator_cxx11.cpp - * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out - * - * For a partial reduction on an empty input, some rules apply. - * For the sake of clarity, let's consider a vertical reduction: - * - If the number of columns is zero, then a 1x0 row-major vector expression is returned. - * - Otherwise, if the number of rows is zero, then - * - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.) - * - a row vector of ones is returned for a product reduction (e.g., MatrixXd(n,0).colwise().prod()) - * - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op)) - * * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr */ template class VectorwiseOp @@ -193,11 +163,11 @@ template class VectorwiseOp typedef typename internal::ref_selector::non_const_type ExpressionTypeNested; typedef typename internal::remove_all::type ExpressionTypeNestedCleaned; - template class Functor, - typename ReturnScalar=Scalar> struct ReturnType + template class Functor, + typename Scalar_=Scalar> struct ReturnType { typedef PartialReduxExpr, + Functor, Direction > Type; }; @@ -216,7 +186,24 @@ template class VectorwiseOp }; protected: - + + typedef typename internal::conditional::type SubVector; + /** \internal + * \returns the i-th subvector according to the \c Direction */ + EIGEN_DEVICE_FUNC + SubVector subVector(Index i) + { + return SubVector(m_matrix.derived(),i); + } + + /** \internal + * \returns the number of subvectors in the direction \c Direction */ + EIGEN_DEVICE_FUNC + Index subVectors() const + { return isVertical?m_matrix.cols():m_matrix.rows(); } + template struct ExtendedType { typedef Replicate class VectorwiseOp EIGEN_DEVICE_FUNC inline const ExpressionType& _expression() const { return m_matrix; } - #ifdef EIGEN_PARSED_BY_DOXYGEN - /** STL-like RandomAccessIterator - * iterator type over the columns or rows as returned by the begin() and end() methods. - */ - random_access_iterator_type iterator; - /** This is the const version of iterator (aka read-only) */ - random_access_iterator_type const_iterator; - #else - typedef internal::subvector_stl_iterator iterator; - typedef internal::subvector_stl_iterator const_iterator; - #endif - - /** returns an iterator to the first row (rowwise) or column (colwise) of the nested expression. - * \sa end(), cbegin() - */ - iterator begin() { return iterator (m_matrix, 0); } - /** const version of begin() */ - const_iterator begin() const { return const_iterator(m_matrix, 0); } - /** const version of begin() */ - const_iterator cbegin() const { return const_iterator(m_matrix, 0); } - - /** returns an iterator to the row (resp. column) following the last row (resp. column) of the nested expression - * \sa begin(), cend() - */ - iterator end() { return iterator (m_matrix, m_matrix.template subVectors()); } - /** const version of end() */ - const_iterator end() const { return const_iterator(m_matrix, m_matrix.template subVectors()); } - /** const version of end() */ - const_iterator cend() const { return const_iterator(m_matrix, m_matrix.template subVectors()); } - /** \returns a row or column vector expression of \c *this reduxed by \a func * * The template parameter \a BinaryOp is the type of the functor * of the custom redux operator. Note that func must be an associative operator. * - * \warning the size along the reduction direction must be strictly positive, - * otherwise an assertion is triggered. - * * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise() */ template EIGEN_DEVICE_FUNC const typename ReduxReturnType::Type redux(const BinaryOp& func = BinaryOp()) const - { - eigen_assert(redux_length()>0 && "you are using an empty matrix"); - return typename ReduxReturnType::Type(_expression(), internal::member_redux(func)); - } + { return typename ReduxReturnType::Type(_expression(), internal::member_redux(func)); } typedef typename ReturnType::Type MinCoeffReturnType; typedef typename ReturnType::Type MaxCoeffReturnType; - typedef PartialReduxExpr, const ExpressionTypeNestedCleaned>,internal::member_sum,Direction> SquaredNormReturnType; - typedef CwiseUnaryOp, const SquaredNormReturnType> NormReturnType; + typedef typename ReturnType::Type SquaredNormReturnType; + typedef typename ReturnType::Type NormReturnType; typedef typename ReturnType::Type BlueNormReturnType; typedef typename ReturnType::Type StableNormReturnType; typedef typename ReturnType::Type HypotNormReturnType; typedef typename ReturnType::Type SumReturnType; - typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType,Scalar,quotient) MeanReturnType; + typedef typename ReturnType::Type MeanReturnType; typedef typename ReturnType::Type AllReturnType; typedef typename ReturnType::Type AnyReturnType; - typedef PartialReduxExpr, Direction> CountReturnType; + typedef PartialReduxExpr, Direction> CountReturnType; typedef typename ReturnType::Type ProdReturnType; typedef Reverse ConstReverseReturnType; typedef Reverse ReverseReturnType; template struct LpNormReturnType { - typedef PartialReduxExpr,Direction> Type; + typedef PartialReduxExpr,Direction> Type; }; /** \returns a row (or column) vector expression of the smallest coefficient * of each column (or row) of the referenced expression. * - * \warning the size along the reduction direction must be strictly positive, - * otherwise an assertion is triggered. - * * \warning the result is undefined if \c *this contains NaN. * * Example: \include PartialRedux_minCoeff.cpp @@ -354,17 +302,11 @@ template class VectorwiseOp * \sa DenseBase::minCoeff() */ EIGEN_DEVICE_FUNC const MinCoeffReturnType minCoeff() const - { - eigen_assert(redux_length()>0 && "you are using an empty matrix"); - return MinCoeffReturnType(_expression()); - } + { return MinCoeffReturnType(_expression()); } /** \returns a row (or column) vector expression of the largest coefficient * of each column (or row) of the referenced expression. * - * \warning the size along the reduction direction must be strictly positive, - * otherwise an assertion is triggered. - * * \warning the result is undefined if \c *this contains NaN. * * Example: \include PartialRedux_maxCoeff.cpp @@ -373,10 +315,7 @@ template class VectorwiseOp * \sa DenseBase::maxCoeff() */ EIGEN_DEVICE_FUNC const MaxCoeffReturnType maxCoeff() const - { - eigen_assert(redux_length()>0 && "you are using an empty matrix"); - return MaxCoeffReturnType(_expression()); - } + { return MaxCoeffReturnType(_expression()); } /** \returns a row (or column) vector expression of the squared norm * of each column (or row) of the referenced expression. @@ -388,7 +327,7 @@ template class VectorwiseOp * \sa DenseBase::squaredNorm() */ EIGEN_DEVICE_FUNC const SquaredNormReturnType squaredNorm() const - { return SquaredNormReturnType(m_matrix.cwiseAbs2()); } + { return SquaredNormReturnType(_expression()); } /** \returns a row (or column) vector expression of the norm * of each column (or row) of the referenced expression. @@ -400,7 +339,7 @@ template class VectorwiseOp * \sa DenseBase::norm() */ EIGEN_DEVICE_FUNC const NormReturnType norm() const - { return NormReturnType(squaredNorm()); } + { return NormReturnType(_expression()); } /** \returns a row (or column) vector expression of the norm * of each column (or row) of the referenced expression. @@ -465,7 +404,7 @@ template class VectorwiseOp * \sa DenseBase::mean() */ EIGEN_DEVICE_FUNC const MeanReturnType mean() const - { return sum() / Scalar(Direction==Vertical?m_matrix.rows():m_matrix.cols()); } + { return MeanReturnType(_expression()); } /** \returns a row (or column) vector expression representing * whether \b all coefficients of each respective column (or row) are \c true. @@ -561,7 +500,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME - return m_matrix = extendedTo(other.derived()); + return const_cast(m_matrix = extendedTo(other.derived())); } /** Adds the vector \a other to each subvector of \c *this */ @@ -571,7 +510,7 @@ template class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return m_matrix += extendedTo(other.derived()); + return const_cast(m_matrix += extendedTo(other.derived())); } /** Substracts the vector \a other to each subvector of \c *this */ @@ -581,7 +520,7 @@ template class VectorwiseOp { EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) - return m_matrix -= extendedTo(other.derived()); + return const_cast(m_matrix -= extendedTo(other.derived())); } /** Multiples each subvector of \c *this by the vector \a other */ @@ -593,7 +532,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix *= extendedTo(other.derived()); - return m_matrix; + return const_cast(m_matrix); } /** Divides each subvector of \c *this by the vector \a other */ @@ -605,7 +544,7 @@ template class VectorwiseOp EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType) EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived) m_matrix /= extendedTo(other.derived()); - return m_matrix; + return const_cast(m_matrix); } /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */ @@ -670,7 +609,7 @@ template class VectorwiseOp EIGEN_DEVICE_FUNC CwiseBinaryOp, const ExpressionTypeNestedCleaned, - const typename OppositeExtendedType::Type> + const typename OppositeExtendedType::Type>::Type> normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); } @@ -719,15 +658,7 @@ template class VectorwiseOp EIGEN_DEVICE_FUNC const HNormalizedReturnType hnormalized() const; -# ifdef EIGEN_VECTORWISEOP_PLUGIN -# include EIGEN_VECTORWISEOP_PLUGIN -# endif - protected: - Index redux_length() const - { - return Direction==Vertical ? m_matrix.rows() : m_matrix.cols(); - } ExpressionTypeNested m_matrix; }; @@ -739,7 +670,7 @@ template class VectorwiseOp * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -EIGEN_DEVICE_FUNC inline typename DenseBase::ColwiseReturnType +inline typename DenseBase::ColwiseReturnType DenseBase::colwise() { return ColwiseReturnType(derived()); @@ -753,7 +684,7 @@ DenseBase::colwise() * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting */ template -EIGEN_DEVICE_FUNC inline typename DenseBase::RowwiseReturnType +inline typename DenseBase::RowwiseReturnType DenseBase::rowwise() { return RowwiseReturnType(derived()); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/Visitor.h b/uppsrc/plugin/Eigen/Eigen/src/Core/Visitor.h index 67a69c54f..54c1883d9 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/Visitor.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/Visitor.h @@ -40,14 +40,6 @@ struct visitor_impl } }; -// This specialization enables visitors on empty matrices at compile-time -template -struct visitor_impl { - EIGEN_DEVICE_FUNC - static inline void run(const Derived &/*mat*/, Visitor& /*visitor*/) - {} -}; - template struct visitor_impl { @@ -106,8 +98,6 @@ protected: * * \note compared to one or two \em for \em loops, visitors offer automatic * unrolling for small fixed size matrix. - * - * \note if the matrix is empty, then the visitor is left unchanged. * * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux() */ @@ -116,9 +106,6 @@ template EIGEN_DEVICE_FUNC void DenseBase::visit(Visitor& visitor) const { - if(size()==0) - return; - typedef typename internal::visitor_evaluator ThisEvaluator; ThisEvaluator thisEval(derived()); @@ -137,9 +124,6 @@ namespace internal { template struct coeff_visitor { - // default initialization to avoid countless invalid maybe-uninitialized warnings by gcc - EIGEN_DEVICE_FUNC - coeff_visitor() : row(-1), col(-1), res(0) {} typedef typename Derived::Scalar Scalar; Index row, col; Scalar res; @@ -212,9 +196,6 @@ struct functor_traits > { /** \fn DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const * \returns the minimum of all coefficients of *this and puts in *row and *col its location. - * - * \warning the matrix must be not empty, otherwise an assertion is triggered. - * * \warning the result is undefined if \c *this contains NaN. * * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff() @@ -225,8 +206,6 @@ EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const { - eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); - internal::min_coeff_visitor minVisitor; this->visit(minVisitor); *rowId = minVisitor.row; @@ -235,9 +214,6 @@ DenseBase::minCoeff(IndexType* rowId, IndexType* colId) const } /** \returns the minimum of all coefficients of *this and puts in *index its location. - * - * \warning the matrix must be not empty, otherwise an assertion is triggered. - * * \warning the result is undefined if \c *this contains NaN. * * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::minCoeff() @@ -248,8 +224,6 @@ EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::minCoeff(IndexType* index) const { - eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) internal::min_coeff_visitor minVisitor; this->visit(minVisitor); @@ -259,9 +233,6 @@ DenseBase::minCoeff(IndexType* index) const /** \fn DenseBase::maxCoeff(IndexType* rowId, IndexType* colId) const * \returns the maximum of all coefficients of *this and puts in *row and *col its location. - * - * \warning the matrix must be not empty, otherwise an assertion is triggered. - * * \warning the result is undefined if \c *this contains NaN. * * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff() @@ -272,8 +243,6 @@ EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const { - eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); - internal::max_coeff_visitor maxVisitor; this->visit(maxVisitor); *rowPtr = maxVisitor.row; @@ -282,9 +251,6 @@ DenseBase::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const } /** \returns the maximum of all coefficients of *this and puts in *index its location. - * - * \warning the matrix must be not empty, otherwise an assertion is triggered. - * * \warning the result is undefined if \c *this contains NaN. * * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff() @@ -295,8 +261,6 @@ EIGEN_DEVICE_FUNC typename internal::traits::Scalar DenseBase::maxCoeff(IndexType* index) const { - eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) internal::max_coeff_visitor maxVisitor; this->visit(maxVisitor); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/Complex.h index c2d5205f2..7fa61969d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/Complex.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/Complex.h @@ -22,7 +22,6 @@ struct Packet4cf __m256 v; }; -#ifndef EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet4cf type; @@ -42,13 +41,11 @@ template<> struct packet_traits > : default_packet_traits HasAbs2 = 0, HasMin = 0, HasMax = 0, - HasSetLinear = 0, - HasInsert = 1 + HasSetLinear = 0 }; }; -#endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=4, alignment=Aligned32}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet4cf padd(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf psub(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_sub_ps(a.v,b.v)); } @@ -70,18 +67,10 @@ template<> EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, con return Packet4cf(result); } -template <> -EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) { - __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ); - return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1))); -} - -template<> EIGEN_STRONG_INLINE Packet4cf ptrue(const Packet4cf& a) { return Packet4cf(ptrue(Packet8f(a.v))); } -template<> EIGEN_STRONG_INLINE Packet4cf pnot(const Packet4cf& a) { return Packet4cf(pnot(Packet8f(a.v))); } template<> EIGEN_STRONG_INLINE Packet4cf pand (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf por (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pxor (const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(b.v,a.v)); } +template<> EIGEN_STRONG_INLINE Packet4cf pandnot(const Packet4cf& a, const Packet4cf& b) { return Packet4cf(_mm256_andnot_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet4cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(ploadu(&numext::real_ref(*from))); } @@ -151,12 +140,37 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe Packet2cf(_mm256_extractf128_ps(a.v,1)))); } +template<> EIGEN_STRONG_INLINE Packet4cf preduxp(const Packet4cf* vecs) +{ + Packet8f t0 = _mm256_shuffle_ps(vecs[0].v, vecs[0].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t1 = _mm256_shuffle_ps(vecs[1].v, vecs[1].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t0 = _mm256_hadd_ps(t0,t1); + Packet8f t2 = _mm256_shuffle_ps(vecs[2].v, vecs[2].v, _MM_SHUFFLE(3, 1, 2 ,0)); + Packet8f t3 = _mm256_shuffle_ps(vecs[3].v, vecs[3].v, _MM_SHUFFLE(3, 1, 2 ,0)); + t2 = _mm256_hadd_ps(t2,t3); + + t1 = _mm256_permute2f128_ps(t0,t2, 0 + (2<<4)); + t3 = _mm256_permute2f128_ps(t0,t2, 1 + (3<<4)); + + return Packet4cf(_mm256_add_ps(t1,t3)); +} + template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cf& a) { return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1)))); } +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4cf& first, const Packet4cf& second) + { + if (Offset==0) return; + palign_impl::run(first.v, second.v); + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const @@ -214,7 +228,6 @@ struct Packet2cd __m256d v; }; -#ifndef EIGEN_VECTORIZE_AVX512 template<> struct packet_traits > : default_packet_traits { typedef Packet2cd type; @@ -237,9 +250,8 @@ template<> struct packet_traits > : default_packet_traits HasSetLinear = 0 }; }; -#endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned32}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet2cd padd(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd psub(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_sub_pd(a.v,b.v)); } @@ -260,18 +272,10 @@ template<> EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, con return Packet2cd(_mm256_addsub_pd(even, odd)); } -template <> -EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) { - __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ); - return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5))); -} - -template<> EIGEN_STRONG_INLINE Packet2cd ptrue(const Packet2cd& a) { return Packet2cd(ptrue(Packet4d(a.v))); } -template<> EIGEN_STRONG_INLINE Packet2cd pnot(const Packet2cd& a) { return Packet2cd(pnot(Packet4d(a.v))); } template<> EIGEN_STRONG_INLINE Packet2cd pand (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd por (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pxor (const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(b.v,a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cd pandnot(const Packet2cd& a, const Packet2cd& b) { return Packet2cd(_mm256_andnot_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(pload((const double*)from)); } @@ -323,12 +327,30 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack Packet1cd(_mm256_extractf128_pd(a.v,1)))); } +template<> EIGEN_STRONG_INLINE Packet2cd preduxp(const Packet2cd* vecs) +{ + Packet4d t0 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 0 + (2<<4)); + Packet4d t1 = _mm256_permute2f128_pd(vecs[0].v,vecs[1].v, 1 + (3<<4)); + + return Packet2cd(_mm256_add_pd(t0,t1)); +} + template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cd& a) { return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v,0)), Packet1cd(_mm256_extractf128_pd(a.v,1)))); } +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cd& first, const Packet2cd& second) + { + if (Offset==0) return; + palign_impl::run(first.v, second.v); + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const @@ -402,6 +424,26 @@ ptranspose(PacketBlock& kernel) { kernel.packet[0].v = tmp; } +template<> EIGEN_STRONG_INLINE Packet4cf pinsertfirst(const Packet4cf& a, std::complex b) +{ + return Packet4cf(_mm256_blend_ps(a.v,pset1(b).v,1|2)); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pinsertfirst(const Packet2cd& a, std::complex b) +{ + return Packet2cd(_mm256_blend_pd(a.v,pset1(b).v,1|2)); +} + +template<> EIGEN_STRONG_INLINE Packet4cf pinsertlast(const Packet4cf& a, std::complex b) +{ + return Packet4cf(_mm256_blend_ps(a.v,pset1(b).v,(1<<7)|(1<<6))); +} + +template<> EIGEN_STRONG_INLINE Packet2cd pinsertlast(const Packet2cd& a, std::complex b) +{ + return Packet2cd(_mm256_blend_pd(a.v,pset1(b).v,(1<<3)|(1<<2))); +} + } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/MathFunctions.h index c5394430f..6af67ce2d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/MathFunctions.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/MathFunctions.h @@ -10,7 +10,7 @@ #ifndef EIGEN_MATH_FUNCTIONS_AVX_H #define EIGEN_MATH_FUNCTIONS_AVX_H -/* The sin and cos functions of this file are loosely derived from +/* The sin, cos, exp, and log functions of this file are loosely derived from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -18,32 +18,187 @@ namespace Eigen { namespace internal { +inline Packet8i pshiftleft(Packet8i v, int n) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_slli_epi32(v, n); +#else + __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(v, 0), n); + __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(v, 1), n); + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +#endif +} + +inline Packet8f pshiftright(Packet8f v, int n) +{ +#ifdef EIGEN_VECTORIZE_AVX2 + return _mm256_cvtepi32_ps(_mm256_srli_epi32(_mm256_castps_si256(v), n)); +#else + __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 0), n); + __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(_mm256_castps_si256(v), 1), n); + return _mm256_cvtepi32_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1)); +#endif +} + +// Sine function +// Computes sin(x) by wrapping x to the interval [-Pi/4,3*Pi/4] and +// evaluating interpolants in [-Pi/4,Pi/4] or [Pi/4,3*Pi/4]. The interpolants +// are (anti-)symmetric and thus have only odd/even coefficients template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f psin(const Packet8f& _x) { - return psin_float(_x); -} - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f -pcos(const Packet8f& _x) { - return pcos_float(_x); + Packet8f x = _x; + + // Some useful values. + _EIGEN_DECLARE_CONST_Packet8i(one, 1); + _EIGEN_DECLARE_CONST_Packet8f(one, 1.0f); + _EIGEN_DECLARE_CONST_Packet8f(two, 2.0f); + _EIGEN_DECLARE_CONST_Packet8f(one_over_four, 0.25f); + _EIGEN_DECLARE_CONST_Packet8f(one_over_pi, 3.183098861837907e-01f); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_first, -3.140625000000000e+00f); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_second, -9.670257568359375e-04f); + _EIGEN_DECLARE_CONST_Packet8f(neg_pi_third, -6.278329571784980e-07f); + _EIGEN_DECLARE_CONST_Packet8f(four_over_pi, 1.273239544735163e+00f); + + // Map x from [-Pi/4,3*Pi/4] to z in [-1,3] and subtract the shifted period. + Packet8f z = pmul(x, p8f_one_over_pi); + Packet8f shift = _mm256_floor_ps(padd(z, p8f_one_over_four)); + x = pmadd(shift, p8f_neg_pi_first, x); + x = pmadd(shift, p8f_neg_pi_second, x); + x = pmadd(shift, p8f_neg_pi_third, x); + z = pmul(x, p8f_four_over_pi); + + // Make a mask for the entries that need flipping, i.e. wherever the shift + // is odd. + Packet8i shift_ints = _mm256_cvtps_epi32(shift); + Packet8i shift_isodd = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(shift_ints), _mm256_castsi256_ps(p8i_one))); + Packet8i sign_flip_mask = pshiftleft(shift_isodd, 31); + + // Create a mask for which interpolant to use, i.e. if z > 1, then the mask + // is set to ones for that entry. + Packet8f ival_mask = _mm256_cmp_ps(z, p8f_one, _CMP_GT_OQ); + + // Evaluate the polynomial for the interval [1,3] in z. + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_0, 9.999999724233232e-01f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_2, -3.084242535619928e-01f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_4, 1.584991525700324e-02f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_right_6, -3.188805084631342e-04f); + Packet8f z_minus_two = psub(z, p8f_two); + Packet8f z_minus_two2 = pmul(z_minus_two, z_minus_two); + Packet8f right = pmadd(p8f_coeff_right_6, z_minus_two2, p8f_coeff_right_4); + right = pmadd(right, z_minus_two2, p8f_coeff_right_2); + right = pmadd(right, z_minus_two2, p8f_coeff_right_0); + + // Evaluate the polynomial for the interval [-1,1] in z. + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_1, 7.853981525427295e-01f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_3, -8.074536727092352e-02f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_5, 2.489871967827018e-03f); + _EIGEN_DECLARE_CONST_Packet8f(coeff_left_7, -3.587725841214251e-05f); + Packet8f z2 = pmul(z, z); + Packet8f left = pmadd(p8f_coeff_left_7, z2, p8f_coeff_left_5); + left = pmadd(left, z2, p8f_coeff_left_3); + left = pmadd(left, z2, p8f_coeff_left_1); + left = pmul(left, z); + + // Assemble the results, i.e. select the left and right polynomials. + left = _mm256_andnot_ps(ival_mask, left); + right = _mm256_and_ps(ival_mask, right); + Packet8f res = _mm256_or_ps(left, right); + + // Flip the sign on the odd intervals and return the result. + res = _mm256_xor_ps(res, _mm256_castsi256_ps(sign_flip_mask)); + return res; } +// Natural logarithm +// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) +// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can +// be easily approximated by a polynomial centered on m=1 for stability. +// TODO(gonnet): Further reduce the interval allowing for lower-degree +// polynomial interpolants -> ... -> profit! template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f plog(const Packet8f& _x) { - return plog_float(_x); -} + Packet8f x = _x; + _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet8f(126f, 126.0f); -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f plog1p(const Packet8f& _x) { - return generic_plog1p(_x); -} + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inv_mant_mask, ~0x7f800000); -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet8f pexpm1(const Packet8f& _x) { - return generic_expm1(_x); + // The smallest non denormalized float number. + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(min_norm_pos, 0x00800000); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(minus_inf, 0xff800000); + + // Polynomial coefficients. + _EIGEN_DECLARE_CONST_Packet8f(cephes_SQRTHF, 0.707106781186547524f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p0, 7.0376836292E-2f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p1, -1.1514610310E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p2, 1.1676998740E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p3, -1.2420140846E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p4, +1.4249322787E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p5, -1.6668057665E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p6, +2.0000714765E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p7, -2.4999993993E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_p8, +3.3333331174E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q1, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_log_q2, 0.693359375f); + + Packet8f invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_NGE_UQ); // not greater equal is true if x is NaN + Packet8f iszero_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_EQ_OQ); + + // Truncate input values to the minimum positive normal. + x = pmax(x, p8f_min_norm_pos); + + Packet8f emm0 = pshiftright(x,23); + Packet8f e = _mm256_sub_ps(emm0, p8f_126f); + + // Set the exponents to -1, i.e. x are in the range [0.5,1). + x = _mm256_and_ps(x, p8f_inv_mant_mask); + x = _mm256_or_ps(x, p8f_half); + + // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) + // and shift by -1. The values are then centered around 0, which improves + // the stability of the polynomial evaluation. + // if( x < SQRTHF ) { + // e -= 1; + // x = x + x - 1.0; + // } else { x = x - 1.0; } + Packet8f mask = _mm256_cmp_ps(x, p8f_cephes_SQRTHF, _CMP_LT_OQ); + Packet8f tmp = _mm256_and_ps(x, mask); + x = psub(x, p8f_1); + e = psub(e, _mm256_and_ps(p8f_1, mask)); + x = padd(x, tmp); + + Packet8f x2 = pmul(x, x); + Packet8f x3 = pmul(x2, x); + + // Evaluate the polynomial approximant of degree 8 in three parts, probably + // to improve instruction-level parallelism. + Packet8f y, y1, y2; + y = pmadd(p8f_cephes_log_p0, x, p8f_cephes_log_p1); + y1 = pmadd(p8f_cephes_log_p3, x, p8f_cephes_log_p4); + y2 = pmadd(p8f_cephes_log_p6, x, p8f_cephes_log_p7); + y = pmadd(y, x, p8f_cephes_log_p2); + y1 = pmadd(y1, x, p8f_cephes_log_p5); + y2 = pmadd(y2, x, p8f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + // Add the logarithm of the exponent back to the result of the interpolation. + y1 = pmul(e, p8f_cephes_log_q1); + tmp = pmul(x2, p8f_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p8f_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + + // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. + return _mm256_or_ps( + _mm256_andnot_ps(iszero_mask, _mm256_or_ps(x, invalid_mask)), + _mm256_and_ps(iszero_mask, p8f_minus_inf)); } // Exponential function. Works by writing "x = m*log(2) + r" where @@ -52,7 +207,62 @@ Packet8f pexpm1(const Packet8f& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f pexp(const Packet8f& _x) { - return pexp_float(_x); + _EIGEN_DECLARE_CONST_Packet8f(1, 1.0f); + _EIGEN_DECLARE_CONST_Packet8f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet8f(127, 127.0f); + + _EIGEN_DECLARE_CONST_Packet8f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet8f(exp_lo, -88.3762626647949f); + + _EIGEN_DECLARE_CONST_Packet8f(cephes_LOG2EF, 1.44269504088896341f); + + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_p5, 5.0000001201E-1f); + + // Clamp x. + Packet8f x = pmax(pmin(_x, p8f_exp_hi), p8f_exp_lo); + + // Express exp(x) as exp(m*ln(2) + r), start by extracting + // m = floor(x/ln(2) + 0.5). + Packet8f m = _mm256_floor_ps(pmadd(x, p8f_cephes_LOG2EF, p8f_half)); + +// Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is +// subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating +// truncation errors. Note that we don't use the "pmadd" function here to +// ensure that a precision-preserving FMA instruction is used. +#ifdef EIGEN_VECTORIZE_FMA + _EIGEN_DECLARE_CONST_Packet8f(nln2, -0.6931471805599453f); + Packet8f r = _mm256_fmadd_ps(m, p8f_nln2, x); +#else + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet8f(cephes_exp_C2, -2.12194440e-4f); + Packet8f r = psub(x, pmul(m, p8f_cephes_exp_C1)); + r = psub(r, pmul(m, p8f_cephes_exp_C2)); +#endif + + Packet8f r2 = pmul(r, r); + + // TODO(gonnet): Split into odd/even polynomials and try to exploit + // instruction-level parallelism. + Packet8f y = p8f_cephes_exp_p0; + y = pmadd(y, r, p8f_cephes_exp_p1); + y = pmadd(y, r, p8f_cephes_exp_p2); + y = pmadd(y, r, p8f_cephes_exp_p3); + y = pmadd(y, r, p8f_cephes_exp_p4); + y = pmadd(y, r, p8f_cephes_exp_p5); + y = pmadd(y, r2, r); + y = padd(y, p8f_1); + + // Build emm0 = 2^m. + Packet8i emm0 = _mm256_cvttps_epi32(padd(m, p8f_127)); + emm0 = pshiftleft(emm0, 23); + + // Return 2^m * exp(r). + return pmax(pmul(y, _mm256_castsi256_ps(emm0)), _x); } // Hyperbolic Tangent function. @@ -62,11 +272,84 @@ ptanh(const Packet8f& x) { return internal::generic_fast_tanh_float(x); } -// Exponential function for doubles. template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4d -pexp(const Packet4d& x) { - return pexp_double(x); +pexp(const Packet4d& _x) { + Packet4d x = _x; + + _EIGEN_DECLARE_CONST_Packet4d(1, 1.0); + _EIGEN_DECLARE_CONST_Packet4d(2, 2.0); + _EIGEN_DECLARE_CONST_Packet4d(half, 0.5); + + _EIGEN_DECLARE_CONST_Packet4d(exp_hi, 709.437); + _EIGEN_DECLARE_CONST_Packet4d(exp_lo, -709.436139303); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_LOG2EF, 1.4426950408889634073599); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p0, 1.26177193074810590878e-4); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p1, 3.02994407707441961300e-2); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_p2, 9.99999999999999999910e-1); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q0, 3.00198505138664455042e-6); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q1, 2.52448340349684104192e-3); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q2, 2.27265548208155028766e-1); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_q3, 2.00000000000000000009e0); + + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C1, 0.693145751953125); + _EIGEN_DECLARE_CONST_Packet4d(cephes_exp_C2, 1.42860682030941723212e-6); + _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); + + Packet4d tmp, fx; + + // clamp x + x = pmax(pmin(x, p4d_exp_hi), p4d_exp_lo); + // Express exp(x) as exp(g + n*log(2)). + fx = pmadd(p4d_cephes_LOG2EF, x, p4d_half); + + // Get the integer modulus of log(2), i.e. the "n" described above. + fx = _mm256_floor_pd(fx); + + // Get the remainder modulo log(2), i.e. the "g" described above. Subtract + // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last + // digits right. + tmp = pmul(fx, p4d_cephes_exp_C1); + Packet4d z = pmul(fx, p4d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet4d x2 = pmul(x, x); + + // Evaluate the numerator polynomial of the rational interpolant. + Packet4d px = p4d_cephes_exp_p0; + px = pmadd(px, x2, p4d_cephes_exp_p1); + px = pmadd(px, x2, p4d_cephes_exp_p2); + px = pmul(px, x); + + // Evaluate the denominator polynomial of the rational interpolant. + Packet4d qx = p4d_cephes_exp_q0; + qx = pmadd(qx, x2, p4d_cephes_exp_q1); + qx = pmadd(qx, x2, p4d_cephes_exp_q2); + qx = pmadd(qx, x2, p4d_cephes_exp_q3); + + // I don't really get this bit, copied from the SSE2 routines, so... + // TODO(gonnet): Figure out what is going on here, perhaps find a better + // rational interpolant? + x = _mm256_div_pd(px, psub(qx, px)); + x = pmadd(p4d_2, x, p4d_1); + + // Build e=2^n by constructing the exponents in a 128-bit vector and + // shifting them to where they belong in double-precision values. + __m128i emm0 = _mm256_cvtpd_epi32(fx); + emm0 = _mm_add_epi32(emm0, p4i_1023); + emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i lo = _mm_slli_epi64(emm0, 52); + __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); + __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); + e = _mm256_insertf128_si256(e, hi, 1); + + // Construct the result 2^n * exp(g) = e * x. The max is used to catch + // non-finite values in the input. + return pmax(pmul(x, _mm256_castsi256_pd(e)), _x); } // Functions for sqrt. @@ -109,6 +392,7 @@ Packet4d psqrt(const Packet4d& x) { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8f prsqrt(const Packet8f& _x) { _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet8f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet8f(minus_half, -0.5f); _EIGEN_DECLARE_CONST_Packet8f_FROM_INT(flt_min, 0x00800000); @@ -117,25 +401,20 @@ Packet8f prsqrt(const Packet8f& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). - Packet8f lt_min_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); - Packet8f inf_mask = _mm256_cmp_ps(_x, p8f_inf, _CMP_EQ_OQ); - Packet8f not_normal_finite_mask = _mm256_or_ps(lt_min_mask, inf_mask); + Packet8f le_zero_mask = _mm256_cmp_ps(_x, p8f_flt_min, _CMP_LT_OQ); + Packet8f x = _mm256_andnot_ps(le_zero_mask, _mm256_rsqrt_ps(_x)); - // Compute an approximate result using the rsqrt intrinsic. - Packet8f y_approx = _mm256_rsqrt_ps(_x); + // Fill in NaNs and Infs for the negative/zero entries. + Packet8f neg_mask = _mm256_cmp_ps(_x, _mm256_setzero_ps(), _CMP_LT_OQ); + Packet8f zero_mask = _mm256_andnot_ps(neg_mask, le_zero_mask); + Packet8f infs_and_nans = _mm256_or_ps(_mm256_and_ps(neg_mask, p8f_nan), + _mm256_and_ps(zero_mask, p8f_inf)); - // Do a single step of Newton-Raphson iteration to improve the approximation. - // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). - // It is essential to evaluate the inner term like this because forming - // y_n^2 may over- or underflow. - Packet8f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p8f_one_point_five)); + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p8f_one_point_five)); - // Select the result of the Newton-Raphson step for positive normal arguments. - // For other arguments, choose the output of the intrinsic. This will - // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if - // x is zero or a positive denormalized float (equivalent to flushing positive - // denormalized inputs to zero). - return pselect(not_normal_finite_mask, y_approx, y_newton); + // Insert NaNs and Infs in all the right places. + return _mm256_or_ps(x, infs_and_nans); } #else diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/PacketMath.h index 35a329e3f..923a124b2 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/PacketMath.h @@ -18,11 +18,11 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif @@ -31,12 +31,10 @@ namespace internal { typedef __m256 Packet8f; typedef __m256i Packet8i; typedef __m256d Packet4d; -typedef eigen_packet_wrapper<__m128i, 2> Packet8h; template<> struct is_arithmetic<__m256> { enum { value = true }; }; template<> struct is_arithmetic<__m256i> { enum { value = true }; }; template<> struct is_arithmetic<__m256d> { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; #define _EIGEN_DECLARE_CONST_Packet8f(NAME,X) \ const Packet8f p8f_##NAME = pset1(X) @@ -60,28 +58,21 @@ template<> struct packet_traits : default_packet_traits enum { Vectorizable = 1, AlignedOnScalar = 1, - size = 8, + size=8, HasHalfPacket = 1, - HasInsert = 1, - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasLog1p = 1, - HasExpm1 = 1, - HasExp = 1, - HasNdtri = 1, - HasBessel = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = 0, + HasLog = 1, + HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH, + HasTanh = EIGEN_FAST_MATH, HasBlend = 1, HasRound = 1, HasFloor = 1, - HasCeil = 1, - HasRint = 1 + HasCeil = 1 }; }; template<> struct packet_traits : default_packet_traits @@ -93,7 +84,6 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size=4, HasHalfPacket = 1, - HasInsert = 1, HasDiv = 1, HasExp = 1, @@ -105,36 +95,6 @@ template<> struct packet_traits : default_packet_traits HasCeil = 1 }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet8h type; - // There is no half-size packet for Packet8h. - typedef Packet8h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0, - HasInsert = 1 - }; -}; #endif template<> struct scalar_div_cost { enum { value = 14 }; }; @@ -153,30 +113,14 @@ template<> struct packet_traits : default_packet_traits }; */ -template<> struct unpacket_traits { - typedef float type; - typedef Packet4f half; - typedef Packet8i integer_packet; - typedef uint8_t mask_t; - enum {size=8, alignment=Aligned32, vectorizable=true, masked_load_available=true, masked_store_available=true}; -}; -template<> struct unpacket_traits { - typedef double type; - typedef Packet2d half; - enum {size=4, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; -}; -template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32, vectorizable=false, masked_load_available=false, masked_store_available=false}; }; +template<> struct unpacket_traits { typedef float type; typedef Packet4f half; enum {size=8, alignment=Aligned32}; }; +template<> struct unpacket_traits { typedef double type; typedef Packet2d half; enum {size=4, alignment=Aligned32}; }; +template<> struct unpacket_traits { typedef int type; typedef Packet4i half; enum {size=8, alignment=Aligned32}; }; template<> EIGEN_STRONG_INLINE Packet8f pset1(const float& from) { return _mm256_set1_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pset1(const double& from) { return _mm256_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i pset1(const int& from) { return _mm256_set1_epi32(from); } -template<> EIGEN_STRONG_INLINE Packet8f pset1frombits(unsigned int from) { return _mm256_castsi256_ps(pset1(from)); } - -template<> EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) { return _mm256_setzero_ps(); } -template<> EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) { return _mm256_setzero_pd(); } -template<> EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) { return _mm256_setzero_si256(); } - template<> EIGEN_STRONG_INLINE Packet8f pload1(const float* from) { return _mm256_broadcast_ss(from); } template<> EIGEN_STRONG_INLINE Packet4d pload1(const double* from) { return _mm256_broadcast_sd(from); } @@ -185,15 +129,6 @@ template<> EIGEN_STRONG_INLINE Packet4d plset(const double& a) { retur template<> EIGEN_STRONG_INLINE Packet8f padd(const Packet8f& a, const Packet8f& b) { return _mm256_add_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d padd(const Packet4d& a, const Packet4d& b) { return _mm256_add_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet8i padd(const Packet8i& a, const Packet8i& b) { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_add_epi32(a,b); -#else - __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); - __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); - return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); -#endif -} template<> EIGEN_STRONG_INLINE Packet8f psub(const Packet8f& a, const Packet8f& b) { return _mm256_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d psub(const Packet4d& a, const Packet4d& b) { return _mm256_sub_pd(a,b); } @@ -222,7 +157,7 @@ template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co return pset1(0); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { #if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, @@ -249,77 +184,14 @@ template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& } #endif -template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 - // There appears to be a bug in GCC, by which the optimizer may flip - // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to - // resort to inline ASM here. This is supposed to be fixed in gcc6.3, - // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 - Packet8f res; - asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); - return res; -#else - // Arguments are swapped to match NaN propagation behavior of std::min. - return _mm256_min_ps(b,a); -#endif -} -template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 - // See pmin above - Packet4d res; - asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); - return res; -#else - // Arguments are swapped to match NaN propagation behavior of std::min. - return _mm256_min_pd(b,a); -#endif -} -template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 - // See pmin above - Packet8f res; - asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); - return res; -#else - // Arguments are swapped to match NaN propagation behavior of std::max. - return _mm256_max_ps(b,a); -#endif -} -template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 - // See pmin above - Packet4d res; - asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); - return res; -#else - // Arguments are swapped to match NaN propagation behavior of std::max. - return _mm256_max_pd(b,a); -#endif -} +template<> EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) { return _mm256_min_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pmin(const Packet4d& a, const Packet4d& b) { return _mm256_min_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LE_OQ); } -template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_LT_OQ); } -template<> EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a, b, _CMP_NGE_UQ); } -template<> EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { return _mm256_cmp_ps(a,b,_CMP_EQ_OQ); } +template<> EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) { return _mm256_max_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pmax(const Packet4d& a, const Packet4d& b) { return _mm256_max_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LE_OQ); } -template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_LT_OQ); } -template<> EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a, b, _CMP_NGE_UQ); } -template<> EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) { return _mm256_cmp_pd(a,b,_CMP_EQ_OQ); } - - -template<> EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_cmpeq_epi32(a,b); -#else - __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0)); - __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1)); - return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet8f print(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } -template<> EIGEN_STRONG_INLINE Packet4d print(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) { return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) { return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION); } template<> EIGEN_STRONG_INLINE Packet8f pceil(const Packet8f& a) { return _mm256_ceil_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { return _mm256_ceil_pd(a); } @@ -327,124 +199,17 @@ template<> EIGEN_STRONG_INLINE Packet4d pceil(const Packet4d& a) { ret template<> EIGEN_STRONG_INLINE Packet8f pfloor(const Packet8f& a) { return _mm256_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet4d pfloor(const Packet4d& a) { return _mm256_floor_pd(a); } - -template<> EIGEN_STRONG_INLINE Packet8i ptrue(const Packet8i& a) { -#ifdef EIGEN_VECTORIZE_AVX2 - // vpcmpeqd has lower latency than the more general vcmpps - return _mm256_cmpeq_epi32(a,a); -#else - const __m256 b = _mm256_castsi256_ps(a); - return _mm256_castps_si256(_mm256_cmp_ps(b,b,_CMP_TRUE_UQ)); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) { -#ifdef EIGEN_VECTORIZE_AVX2 - // vpcmpeqd has lower latency than the more general vcmpps - const __m256i b = _mm256_castps_si256(a); - return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b,b)); -#else - return _mm256_cmp_ps(a,a,_CMP_TRUE_UQ); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet4d ptrue(const Packet4d& a) { -#ifdef EIGEN_VECTORIZE_AVX2 - // vpcmpeqq has lower latency than the more general vcmppd - const __m256i b = _mm256_castpd_si256(a); - return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b,b)); -#else - return _mm256_cmp_pd(a,a,_CMP_TRUE_UQ); -#endif -} - template<> EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) { return _mm256_and_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pand(const Packet4d& a, const Packet4d& b) { return _mm256_and_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet8i pand(const Packet8i& a, const Packet8i& b) { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_and_si256(a,b); -#else - return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); -#endif -} template<> EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) { return _mm256_or_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d por(const Packet4d& a, const Packet4d& b) { return _mm256_or_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet8i por(const Packet8i& a, const Packet8i& b) { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_or_si256(a,b); -#else - return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); -#endif -} template<> EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) { return _mm256_xor_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet4d pxor(const Packet4d& a, const Packet4d& b) { return _mm256_xor_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet8i pxor(const Packet8i& a, const Packet8i& b) { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_xor_si256(a,b); -#else - return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a),_mm256_castsi256_ps(b))); -#endif -} -template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(b,a); } -template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(b,a); } -template<> EIGEN_STRONG_INLINE Packet8i pandnot(const Packet8i& a, const Packet8i& b) { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_andnot_si256(b,a); -#else - return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b),_mm256_castsi256_ps(a))); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet8f pround(const Packet8f& a) -{ - const Packet8f mask = pset1frombits(0x80000000u); - const Packet8f prev0dot5 = pset1frombits(0x3EFFFFFFu); - return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); -} -template<> EIGEN_STRONG_INLINE Packet4d pround(const Packet4d& a) -{ - const Packet4d mask = _mm256_castsi256_pd(_mm256_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull, 0x8000000000000000ull, 0x8000000000000000ull)); - const Packet4d prev0dot5 = _mm256_castsi256_pd(_mm256_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull)); - return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); -} - -template<> EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) -{ return _mm256_blendv_ps(b,a,mask); } -template<> EIGEN_STRONG_INLINE Packet4d pselect(const Packet4d& mask, const Packet4d& a, const Packet4d& b) -{ return _mm256_blendv_pd(b,a,mask); } - -template EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_srai_epi32(a, N); -#else - __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N); - __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N); - return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); -#endif -} - -template EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_srli_epi32(a, N); -#else - __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N); - __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N); - return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); -#endif -} - -template EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) { -#ifdef EIGEN_VECTORIZE_AVX2 - return _mm256_slli_epi32(a, N); -#else - __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N); - __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N); - return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); -#endif -} +template<> EIGEN_STRONG_INLINE Packet8f pandnot(const Packet8f& a, const Packet8f& b) { return _mm256_andnot_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet4d pandnot(const Packet4d& a, const Packet4d& b) { return _mm256_andnot_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet8f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet4d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from); } @@ -454,14 +219,6 @@ template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from) { EI template<> EIGEN_STRONG_INLINE Packet4d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from); } template<> EIGEN_STRONG_INLINE Packet8i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast(from)); } -template<> EIGEN_STRONG_INLINE Packet8f ploadu(const float* from, uint8_t umask) { - Packet8i mask = _mm256_set1_epi8(static_cast(umask)); - const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); - mask = por(mask, bit_mask); - mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); - EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask); -} - // Loads 4 floats from memory a returns the packet {a0, a0 a1, a1, a2, a2, a3, a3} template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) { @@ -469,7 +226,7 @@ template<> EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) // Packet8f tmp = _mm256_castps128_ps256(_mm_loadu_ps(from)); // tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1); // return _mm256_unpacklo_ps(tmp,tmp); - + // _mm256_insertf128_ps is very slow on Haswell, thus: Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from); // mimic an "inplace" permutation of the lower 128bits using a blend @@ -499,14 +256,6 @@ template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from, uint8_t umask) { - Packet8i mask = _mm256_set1_epi8(static_cast(umask)); - const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); - mask = por(mask, bit_mask); - mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); - EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from); -} - // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available // NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride), 4); template<> EIGEN_DEVICE_FUNC inline Packet8f pgather(const float* from, Index stride) @@ -605,26 +354,47 @@ template<> EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) return _mm256_and_pd(a,mask); } -template<> EIGEN_STRONG_INLINE Packet8f pfrexp(const Packet8f& a, Packet8f& exponent) { - return pfrexp_float(a,exponent); -} +// preduxp should be ok +// FIXME: why is this ok? why isn't the simply implementation working as expected? +template<> EIGEN_STRONG_INLINE Packet8f preduxp(const Packet8f* vecs) +{ + __m256 hsum1 = _mm256_hadd_ps(vecs[0], vecs[1]); + __m256 hsum2 = _mm256_hadd_ps(vecs[2], vecs[3]); + __m256 hsum3 = _mm256_hadd_ps(vecs[4], vecs[5]); + __m256 hsum4 = _mm256_hadd_ps(vecs[6], vecs[7]); -template<> EIGEN_STRONG_INLINE Packet8f pldexp(const Packet8f& a, const Packet8f& exponent) { - return pldexp_float(a,exponent); -} + __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); + __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); + __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); + __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); -template<> EIGEN_STRONG_INLINE Packet4d pldexp(const Packet4d& a, const Packet4d& exponent) { - // Build e=2^n by constructing the exponents in a 128-bit vector and - // shifting them to where they belong in double-precision values. - Packet4i cst_1023 = pset1(1023); - __m128i emm0 = _mm256_cvtpd_epi32(exponent); - emm0 = _mm_add_epi32(emm0, cst_1023); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(3, 1, 2, 0)); - __m128i lo = _mm_slli_epi64(emm0, 52); - __m128i hi = _mm_slli_epi64(_mm_srli_epi64(emm0, 32), 52); - __m256i e = _mm256_insertf128_si256(_mm256_setzero_si256(), lo, 0); - e = _mm256_insertf128_si256(e, hi, 1); - return pmul(a,_mm256_castsi256_pd(e)); + __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + __m256 sum1 = _mm256_add_ps(perm1, hsum5); + __m256 sum2 = _mm256_add_ps(perm2, hsum6); + __m256 sum3 = _mm256_add_ps(perm3, hsum7); + __m256 sum4 = _mm256_add_ps(perm4, hsum8); + + __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); + return final; +} +template<> EIGEN_STRONG_INLINE Packet4d preduxp(const Packet4d* vecs) +{ + Packet4d tmp0, tmp1; + + tmp0 = _mm256_hadd_pd(vecs[0], vecs[1]); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs[2], vecs[3]); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + return _mm256_blend_pd(tmp0, tmp1, 0xC); } template<> EIGEN_STRONG_INLINE float predux(const Packet8f& a) @@ -636,7 +406,7 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet4d& a) return predux(Packet2d(_mm_add_pd(_mm256_castpd256_pd128(a),_mm256_extractf128_pd(a,1)))); } -template<> EIGEN_STRONG_INLINE Packet4f predux_half_dowto4(const Packet8f& a) +template<> EIGEN_STRONG_INLINE Packet4f predux_downto4(const Packet8f& a) { return _mm_add_ps(_mm256_castps256_ps128(a),_mm256_extractf128_ps(a,1)); } @@ -680,16 +450,93 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) return pfirst(_mm256_max_pd(tmp, _mm256_shuffle_pd(tmp, tmp, 1))); } -// not needed yet -// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet8f& x) -// { -// return _mm256_movemask_ps(x)==0xFF; -// } -template<> EIGEN_STRONG_INLINE bool predux_any(const Packet8f& x) +template +struct palign_impl { - return _mm256_movemask_ps(x)!=0; -} + static EIGEN_STRONG_INLINE void run(Packet8f& first, const Packet8f& second) + { + if (Offset==1) + { + first = _mm256_blend_ps(first, second, 1); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_blend_ps(tmp1, tmp2, 0x88); + } + else if (Offset==2) + { + first = _mm256_blend_ps(first, second, 3); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_blend_ps(tmp1, tmp2, 0xcc); + } + else if (Offset==3) + { + first = _mm256_blend_ps(first, second, 7); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_blend_ps(tmp1, tmp2, 0xee); + } + else if (Offset==4) + { + first = _mm256_blend_ps(first, second, 15); + Packet8f tmp1 = _mm256_permute_ps (first, _MM_SHUFFLE(3,2,1,0)); + Packet8f tmp2 = _mm256_permute2f128_ps (tmp1, tmp1, 1); + first = _mm256_permute_ps(tmp2, _MM_SHUFFLE(3,2,1,0)); + } + else if (Offset==5) + { + first = _mm256_blend_ps(first, second, 31); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(0,3,2,1)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0x88); + } + else if (Offset==6) + { + first = _mm256_blend_ps(first, second, 63); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(1,0,3,2)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xcc); + } + else if (Offset==7) + { + first = _mm256_blend_ps(first, second, 127); + first = _mm256_permute2f128_ps(first, first, 1); + Packet8f tmp = _mm256_permute_ps (first, _MM_SHUFFLE(2,1,0,3)); + first = _mm256_permute2f128_ps(tmp, tmp, 1); + first = _mm256_blend_ps(tmp, first, 0xee); + } + } +}; + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4d& first, const Packet4d& second) + { + if (Offset==1) + { + first = _mm256_blend_pd(first, second, 1); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 0xA); + } + else if (Offset==2) + { + first = _mm256_blend_pd(first, second, 3); + first = _mm256_permute2f128_pd(first, first, 1); + } + else if (Offset==3) + { + first = _mm256_blend_pd(first, second, 7); + __m256d tmp = _mm256_permute_pd(first, 5); + first = _mm256_permute2f128_pd(tmp, tmp, 1); + first = _mm256_blend_pd(tmp, first, 5); + } + } +}; EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { @@ -763,274 +610,24 @@ template<> EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, cons return _mm256_blendv_pd(thenPacket, elsePacket, false_mask); } -// Packet math for Eigen::half -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet8h half; }; - -template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { - return _mm_set1_epi16(from.x); -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_extract_epi16(from, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { - return _mm_load_si128(reinterpret_cast(from)); -} - -template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { - return _mm_loadu_si128(reinterpret_cast(from)); -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { - _mm_store_si128(reinterpret_cast<__m128i*>(to), from); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { - _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); -} - -template<> EIGEN_STRONG_INLINE Packet8h -ploaddup(const Eigen::half* from) { - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - return _mm_set_epi16(d, d, c, c, b, b, a, a); -} - -template<> EIGEN_STRONG_INLINE Packet8h -ploadquad(const Eigen::half* from) { - unsigned short a = from[0].x; - unsigned short b = from[1].x; - return _mm_set_epi16(b, b, b, b, a, a, a, a); -} - -EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm256_cvtph_ps(a); -#else - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - - return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); -#else - EIGEN_ALIGN32 float aux[8]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); - Eigen::half h3(aux[3]); - Eigen::half h4(aux[4]); - Eigen::half h5(aux[5]); - Eigen::half h6(aux[6]); - Eigen::half h7(aux[7]); - - return _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) { - return _mm_cmpeq_epi32(a, a); -} - -template<> EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a,const Packet8h& b) { - // in some cases Packet4i is a wrapper around __m128i, so we either need to - // cast to Packet4i to directly call the intrinsics as below: - return _mm_or_si128(a,b); -} -template<> EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a,const Packet8h& b) { - return _mm_xor_si128(a,b); -} -template<> EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a,const Packet8h& b) { - return _mm_and_si128(a,b); -} -template<> EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a,const Packet8h& b) { - return _mm_andnot_si128(b,a); -} - -template<> EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) { - return _mm_blendv_epi8(b, a, mask); -} - -template<> EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a,const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pcmp_eq(af, bf); - // Pack the 32-bit flags into 16-bits flags. - return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0), - _mm256_extractf128_si256(_mm256_castps_si256(rf), 1)); -} - -template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) { - Packet8h sign_mask = _mm_set1_epi16(static_cast(0x8000)); - return _mm_xor_si128(a, sign_mask); -} - -template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h psub(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = psub(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pmul(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pdiv(const Packet8h& a, const Packet8h& b) { - Packet8f af = half2float(a); - Packet8f bf = half2float(b); - Packet8f rf = pdiv(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) +template<> EIGEN_STRONG_INLINE Packet8f pinsertfirst(const Packet8f& a, float b) { - return _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return _mm256_blend_ps(a,pset1(b),1); } -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) +template<> EIGEN_STRONG_INLINE Packet4d pinsertfirst(const Packet4d& a, double b) { - EIGEN_ALIGN32 Eigen::half aux[8]; - pstore(aux, from); - to[stride*0] = aux[0]; - to[stride*1] = aux[1]; - to[stride*2] = aux[2]; - to[stride*3] = aux[3]; - to[stride*4] = aux[4]; - to[stride*5] = aux[5]; - to[stride*6] = aux[6]; - to[stride*7] = aux[7]; + return _mm256_blend_pd(a,pset1(b),1); } -template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_max(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_min(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { - Packet8f af = half2float(a); - float reduced = predux_mul(af); - return Eigen::half(reduced); -} - -template<> EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) +template<> EIGEN_STRONG_INLINE Packet8f pinsertlast(const Packet8f& a, float b) { - __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); - return _mm_shuffle_epi8(a,m); + return _mm256_blend_ps(a,pset1(b),(1<<7)); } -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m128i a = kernel.packet[0]; - __m128i b = kernel.packet[1]; - __m128i c = kernel.packet[2]; - __m128i d = kernel.packet[3]; - __m128i e = kernel.packet[4]; - __m128i f = kernel.packet[5]; - __m128i g = kernel.packet[6]; - __m128i h = kernel.packet[7]; - - __m128i a03b03 = _mm_unpacklo_epi16(a, b); - __m128i c03d03 = _mm_unpacklo_epi16(c, d); - __m128i e03f03 = _mm_unpacklo_epi16(e, f); - __m128i g03h03 = _mm_unpacklo_epi16(g, h); - __m128i a47b47 = _mm_unpackhi_epi16(a, b); - __m128i c47d47 = _mm_unpackhi_epi16(c, d); - __m128i e47f47 = _mm_unpackhi_epi16(e, f); - __m128i g47h47 = _mm_unpackhi_epi16(g, h); - - __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); - __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); - __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); - __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); - __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); - __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); - __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); - __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); - - __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); - __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); - __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); - __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); - __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); - __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); - __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); - __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); - - kernel.packet[0] = a0b0c0d0e0f0g0h0; - kernel.packet[1] = a1b1c1d1e1f1g1h1; - kernel.packet[2] = a2b2c2d2e2f2g2h2; - kernel.packet[3] = a3b3c3d3e3f3g3h3; - kernel.packet[4] = a4b4c4d4e4f4g4h4; - kernel.packet[5] = a5b5c5d5e5f5g5h5; - kernel.packet[6] = a6b6c6d6e6f6g6h6; - kernel.packet[7] = a7b7c7d7e7f7g7h7; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN32 Eigen::half in[4][8]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - - EIGEN_ALIGN32 Eigen::half out[4][8]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+4] = in[j][2*i+1]; - } - } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); +template<> EIGEN_STRONG_INLINE Packet4d pinsertlast(const Packet4d& a, double b) +{ + return _mm256_blend_pd(a,pset1(b),(1<<3)); } } // end namespace internal diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/TypeCasting.h index 181043588..83bfdc604 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/TypeCasting.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX/TypeCasting.h @@ -37,51 +37,13 @@ struct type_casting_traits { template<> EIGEN_STRONG_INLINE Packet8i pcast(const Packet8f& a) { - return _mm256_cvttps_epi32(a); + return _mm256_cvtps_epi32(a); } template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8i& a) { return _mm256_cvtepi32_ps(a); } -template<> EIGEN_STRONG_INLINE Packet8i preinterpret(const Packet8f& a) { - return _mm256_castps_si256(a); -} - -template<> EIGEN_STRONG_INLINE Packet8f preinterpret(const Packet8i& a) { - return _mm256_castsi256_ps(a); -} - -#ifndef EIGEN_VECTORIZE_AVX512 - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { - return half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -#endif // EIGEN_VECTORIZE_AVX512 - -template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { - return float2half(a); -} - } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/Complex.h deleted file mode 100644 index dc2ae0a35..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/Complex.h +++ /dev/null @@ -1,447 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2018 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_COMPLEX_AVX512_H -#define EIGEN_COMPLEX_AVX512_H - -namespace Eigen { - -namespace internal { - -//---------- float ---------- -struct Packet8cf -{ - EIGEN_STRONG_INLINE Packet8cf() {} - EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {} - __m512 v; -}; - -template<> struct packet_traits > : default_packet_traits -{ - typedef Packet8cf type; - typedef Packet4cf half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 1, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasSetLinear = 0, - HasInsert = 1 - }; -}; - -template<> struct unpacket_traits { - typedef std::complex type; - enum { - size = 8, - alignment=unpacket_traits::alignment, - vectorizable=true, - masked_load_available=false, - masked_store_available=false - }; - typedef Packet4cf half; -}; - -template<> EIGEN_STRONG_INLINE Packet8cf ptrue(const Packet8cf& a) { return Packet8cf(ptrue(Packet16f(a.v))); } -template<> EIGEN_STRONG_INLINE Packet8cf pnot(const Packet8cf& a) { return Packet8cf(pnot(Packet16f(a.v))); } -template<> EIGEN_STRONG_INLINE Packet8cf padd(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_add_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf psub(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(_mm512_sub_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) -{ - return Packet8cf(pnegate(a.v)); -} -template<> EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) -{ - const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32( - 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000, - 0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet8cf(pxor(a.v,mask)); -} - -template<> EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) -{ - __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2,3,0,1))); - return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2)); -} - -template<> EIGEN_STRONG_INLINE Packet8cf pand (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pand(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf por (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(por(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf pxor (const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pxor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet8cf pandnot(const Packet8cf& a, const Packet8cf& b) { return Packet8cf(pandnot(a.v,b.v)); } - -template <> -EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) { - __m512 eq = pcmp_eq(a.v, b.v); - return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1))); -} - -template<> EIGEN_STRONG_INLINE Packet8cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload(&numext::real_ref(*from))); } -template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu(&numext::real_ref(*from))); } - - -template<> EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) -{ - return Packet8cf(_mm512_castpd_ps(pload1((const double*)(const void*)&from))); -} - -template<> EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) -{ - return Packet8cf( _mm512_castpd_ps( ploaddup((const double*)(const void*)from )) ); -} -template<> EIGEN_STRONG_INLINE Packet8cf ploadquad(const std::complex* from) -{ - return Packet8cf( _mm512_castpd_ps( ploadquad((const double*)(const void*)from )) ); -} - -template<> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet8cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet8cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); } - -template<> EIGEN_DEVICE_FUNC inline Packet8cf pgather, Packet8cf>(const std::complex* from, Index stride) -{ - return Packet8cf(_mm512_castpd_ps(pgather((const double*)(const void*)from, stride))); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet8cf>(std::complex* to, const Packet8cf& from, Index stride) -{ - pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride); -} - -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet8cf& a) -{ - return pfirst(Packet2cf(_mm512_castps512_ps128(a.v))); -} - -template<> EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) { - return Packet8cf(_mm512_castsi512_ps( - _mm512_permutexvar_epi64( _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), - _mm512_castps_si512(a.v)))); -} - -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet8cf& a) -{ - return predux(padd(Packet4cf(extract256<0>(a.v)), - Packet4cf(extract256<1>(a.v)))); -} - -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet8cf& a) -{ - return predux_mul(pmul(Packet4cf(extract256<0>(a.v)), - Packet4cf(extract256<1>(a.v)))); -} - -template <> -EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4(const Packet8cf& a) { - __m256 lane0 = extract256<0>(a.v); - __m256 lane1 = extract256<1>(a.v); - __m256 res = _mm256_add_ps(lane0, lane1); - return Packet4cf(res); -} - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f) - -template<> EIGEN_STRONG_INLINE Packet8cf pdiv(const Packet8cf& a, const Packet8cf& b) -{ - Packet8cf num = pmul(a, pconj(b)); - __m512 tmp = _mm512_mul_ps(b.v, b.v); - __m512 tmp2 = _mm512_shuffle_ps(tmp,tmp,0xB1); - __m512 denom = _mm512_add_ps(tmp, tmp2); - return Packet8cf(_mm512_div_ps(num.v, denom)); -} - -template<> EIGEN_STRONG_INLINE Packet8cf pcplxflip(const Packet8cf& x) -{ - return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0 ,1))); -} - -//---------- double ---------- -struct Packet4cd -{ - EIGEN_STRONG_INLINE Packet4cd() {} - EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {} - __m512d v; -}; - -template<> struct packet_traits > : default_packet_traits -{ - typedef Packet4cd type; - typedef Packet2cd half; - enum { - Vectorizable = 1, - AlignedOnScalar = 0, - size = 4, - HasHalfPacket = 1, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasSetLinear = 0 - }; -}; - -template<> struct unpacket_traits { - typedef std::complex type; - enum { - size = 4, - alignment = unpacket_traits::alignment, - vectorizable=true, - masked_load_available=false, - masked_store_available=false - }; - typedef Packet2cd half; -}; - -template<> EIGEN_STRONG_INLINE Packet4cd padd(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_add_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd psub(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(_mm512_sub_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) { return Packet4cd(pnegate(a.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) -{ - const __m512d mask = _mm512_castsi512_pd( - _mm512_set_epi32(0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0, - 0x80000000,0x0,0x0,0x0,0x80000000,0x0,0x0,0x0)); - return Packet4cd(pxor(a.v,mask)); -} - -template<> EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) -{ - __m512d tmp1 = _mm512_shuffle_pd(a.v,a.v,0x0); - __m512d tmp2 = _mm512_shuffle_pd(a.v,a.v,0xFF); - __m512d tmp3 = _mm512_shuffle_pd(b.v,b.v,0x55); - __m512d odd = _mm512_mul_pd(tmp2, tmp3); - return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd)); -} - -template<> EIGEN_STRONG_INLINE Packet4cd ptrue(const Packet4cd& a) { return Packet4cd(ptrue(Packet8d(a.v))); } -template<> EIGEN_STRONG_INLINE Packet4cd pnot(const Packet4cd& a) { return Packet4cd(pnot(Packet8d(a.v))); } -template<> EIGEN_STRONG_INLINE Packet4cd pand (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pand(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd por (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(por(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd pxor (const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pxor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet4cd pandnot(const Packet4cd& a, const Packet4cd& b) { return Packet4cd(pandnot(a.v,b.v)); } - -template <> -EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) { - __m512d eq = pcmp_eq(a.v, b.v); - return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55))); -} - -template<> EIGEN_STRONG_INLINE Packet4cd pload (const std::complex* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu((const double*)from)); } - -template<> EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) -{ - #ifdef EIGEN_VECTORIZE_AVX512DQ - return Packet4cd(_mm512_broadcast_f64x2(pset1(from).v)); - #else - return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1(from).v)))); - #endif -} - -template<> EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { - return Packet4cd(_mm512_insertf64x4( - _mm512_castpd256_pd512(ploaddup(from).v), ploaddup(from+1).v, 1)); -} - -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet4cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet4cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } - -template<> EIGEN_DEVICE_FUNC inline Packet4cd pgather, Packet4cd>(const std::complex* from, Index stride) -{ - return Packet4cd(_mm512_insertf64x4(_mm512_castpd256_pd512( - _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu(from+0*stride).v), ploadu(from+1*stride).v,1)), - _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu(from+2*stride).v), ploadu(from+3*stride).v,1), 1)); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet4cd>(std::complex* to, const Packet4cd& from, Index stride) -{ - __m512i fromi = _mm512_castpd_si512(from.v); - double* tod = (double*)(void*)to; - _mm_storeu_pd(tod+0*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,0)) ); - _mm_storeu_pd(tod+2*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,1)) ); - _mm_storeu_pd(tod+4*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,2)) ); - _mm_storeu_pd(tod+6*stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi,3)) ); -} - -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet4cd& a) -{ - __m128d low = extract128<0>(a.v); - EIGEN_ALIGN16 double res[2]; - _mm_store_pd(res, low); - return std::complex(res[0],res[1]); -} - -template<> EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) { - return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, EIGEN_SSE_SHUFFLE_MASK(3,2,1,0))); -} - -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet4cd& a) -{ - return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v,0)), - Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); -} - -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet4cd& a) -{ - return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v,0)), - Packet2cd(_mm512_extractf64x4_pd(a.v,1)))); -} - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cd pmadd(const Packet4cd& x, const Packet4cd& y, const Packet4cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cd pmul(const Packet4cd& a, const Packet4cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd,Packet8d) - -template<> EIGEN_STRONG_INLINE Packet4cd pdiv(const Packet4cd& a, const Packet4cd& b) -{ - Packet4cd num = pmul(a, pconj(b)); - __m512d tmp = _mm512_mul_pd(b.v, b.v); - __m512d denom = padd(_mm512_permute_pd(tmp,0x55), tmp); - return Packet4cd(_mm512_div_pd(num.v, denom)); -} - -template<> EIGEN_STRONG_INLINE Packet4cd pcplxflip(const Packet4cd& x) -{ - return Packet4cd(_mm512_permute_pd(x.v,0x55)); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - PacketBlock pb; - - pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); - pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); - pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); - pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); - ptranspose(pb); - kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); - kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); - kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); - kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - PacketBlock pb; - - pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v); - pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v); - pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v); - pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v); - pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v); - pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v); - pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v); - pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v); - ptranspose(pb); - kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]); - kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]); - kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]); - kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]); - kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]); - kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]); - kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]); - kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - __m512d T0 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [a0 a1 b0 b1] - __m512d T1 = _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [a2 a3 b2 b3] - __m512d T2 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(0,1,0,1)); // [c0 c1 d0 d1] - __m512d T3 = _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, EIGEN_SSE_SHUFFLE_MASK(2,3,2,3)); // [c2 c3 d2 d3] - - kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a3 b3 c3 d3] - kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a2 b2 c2 d2] - kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(1,3,1,3))); // [a1 b1 c1 d1] - kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, EIGEN_SSE_SHUFFLE_MASK(0,2,0,2))); // [a0 b0 c0 d0] -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_COMPLEX_AVX512_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h index 67043d01b..b259c1e1f 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -15,13 +15,13 @@ namespace Eigen { namespace internal { // Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics. -#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 +#if EIGEN_GNUC_AT_LEAST(5, 3) #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ const Packet16f p16f_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(NAME, X) \ - const Packet16f p16f_##NAME = preinterpret(pset1(X)) + const Packet16f p16f_##NAME = (__m512)pset1(X) #define _EIGEN_DECLARE_CONST_Packet8d(NAME, X) \ const Packet8d p8d_##NAME = pset1(X) @@ -29,6 +29,7 @@ namespace internal { #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X)) + // Natural logarithm // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can @@ -72,7 +73,7 @@ plog(const Packet16f& _x) { x = pmax(x, p16f_min_norm_pos); // Extract the shifted exponents. - Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((preinterpret(x)), 23)); + Packet16f emm0 = _mm512_cvtepi32_ps(_mm512_srli_epi32((__m512i)x, 23)); Packet16f e = _mm512_sub_ps(emm0, p16f_126f); // Set the exponents to -1, i.e. x are in the range [0.5,1). @@ -128,6 +129,7 @@ plog(const Packet16f& _x) { p16f_nan), p16f_minus_inf); } + #endif // Exponential function. Works by writing "x = m*log(2) + r" where @@ -253,7 +255,6 @@ pexp(const Packet8d& _x) { return pmax(pmul(x, e), _x); }*/ - // Functions for sqrt. // The EIGEN_FAST_MATH version uses the _mm_rsqrt_ps approximation and one step // of Newton's method, at a cost of 1-2 bits of precision as opposed to the @@ -309,136 +310,78 @@ EIGEN_STRONG_INLINE Packet8d psqrt(const Packet8d& x) { } #endif -// prsqrt for float. -#if defined(EIGEN_VECTORIZE_AVX512ER) - -template <> -EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { - return _mm512_rsqrt28_ps(x); -} - -#elif EIGEN_FAST_MATH - +// Functions for rsqrt. +// Almost identical to the sqrt routine, just leave out the last multiplication +// and fill in NaN/Inf where needed. Note that this function only exists as an +// iterative version for doubles since there is no instruction for diretly +// computing the reciprocal square root in AVX-512. +#ifdef EIGEN_FAST_MATH template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f prsqrt(const Packet16f& _x) { _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); Packet16f neg_half = pmul(_x, p16f_minus_half); - // Identity infinite, negative and denormal arguments. - __mmask16 inf_mask = _mm512_cmp_ps_mask(_x, p16f_inf, _CMP_EQ_OQ); - __mmask16 not_pos_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LE_OQ); - __mmask16 not_finite_pos_mask = not_pos_mask | inf_mask; - - // Compute an approximate result using the rsqrt intrinsic, forcing +inf - // for denormals for consistency with AVX and SSE implementations. - Packet16f y_approx = _mm512_rsqrt14_ps(_x); + // select only the inverse sqrt of positive normal inputs (denormals are + // flushed to zero and cause infs as well). + __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); + Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps()); - // Do a single step of Newton-Raphson iteration to improve the approximation. - // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). - // It is essential to evaluate the inner term like this because forming - // y_n^2 may over- or underflow. - Packet16f y_newton = pmul(y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p16f_one_point_five)); + // Fill in NaNs and Infs for the negative/zero entries. + __mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); + Packet16f infs_and_nans = _mm512_mask_blend_ps( + neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan); - // Select the result of the Newton-Raphson step for positive finite arguments. - // For other arguments, choose the output of the intrinsic. This will - // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. - return _mm512_mask_blend_ps(not_finite_pos_mask, y_newton, y_approx); - } + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); -#else - -template <> -EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { - _EIGEN_DECLARE_CONST_Packet16f(one, 1.0f); - return _mm512_div_ps(p16f_one, _mm512_sqrt_ps(x)); + // Insert NaNs and Infs in all the right places. + return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans); } -#endif - -// prsqrt for double. -#if EIGEN_FAST_MATH template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d prsqrt(const Packet8d& _x) { + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(nan, 0x7ff1000000000000LL); _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(inf, 0x7ff0000000000000LL); + _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); Packet8d neg_half = pmul(_x, p8d_minus_half); - // Identity infinite, negative and denormal arguments. - __mmask8 inf_mask = _mm512_cmp_pd_mask(_x, p8d_inf, _CMP_EQ_OQ); - __mmask8 not_pos_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LE_OQ); - __mmask8 not_finite_pos_mask = not_pos_mask | inf_mask; + // select only the inverse sqrt of positive normal inputs (denormals are + // flushed to zero and cause infs as well). + __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); + Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd()); - // Compute an approximate result using the rsqrt intrinsic, forcing +inf - // for denormals for consistency with AVX and SSE implementations. -#if defined(EIGEN_VECTORIZE_AVX512ER) - Packet8d y_approx = _mm512_rsqrt28_pd(_x); -#else - Packet8d y_approx = _mm512_rsqrt14_pd(_x); -#endif - // Do one or two steps of Newton-Raphson's to improve the approximation, depending on the - // starting accuracy (either 2^-14 or 2^-28, depending on whether AVX512ER is available). - // The Newton-Raphson algorithm has quadratic convergence and roughly doubles the number - // of correct digits for each step. - // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). - // It is essential to evaluate the inner term like this because forming - // y_n^2 may over- or underflow. - Packet8d y_newton = pmul(y_approx, pmadd(neg_half, pmul(y_approx, y_approx), p8d_one_point_five)); -#if !defined(EIGEN_VECTORIZE_AVX512ER) - y_newton = pmul(y_newton, pmadd(y_newton, pmul(neg_half, y_newton), p8d_one_point_five)); -#endif - // Select the result of the Newton-Raphson step for positive finite arguments. - // For other arguments, choose the output of the intrinsic. This will - // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(0) = +inf. - return _mm512_mask_blend_pd(not_finite_pos_mask, y_newton, y_approx); + // Fill in NaNs and Infs for the negative/zero entries. + __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); + Packet8d infs_and_nans = _mm512_mask_blend_pd( + neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan); + + // Do a first step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + + // Do a second step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + + // Insert NaNs and Infs in all the right places. + return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans); } -#else +#elif defined(EIGEN_VECTORIZE_AVX512ER) template <> -EIGEN_STRONG_INLINE Packet8d prsqrt(const Packet8d& x) { - _EIGEN_DECLARE_CONST_Packet8d(one, 1.0f); - return _mm512_div_pd(p8d_one, _mm512_sqrt_pd(x)); +EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { + return _mm512_rsqrt28_ps(x); } #endif - -#if defined(EIGEN_VECTORIZE_AVX512DQ) -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet16f plog1p(const Packet16f& _x) { - return generic_plog1p(_x); -} - -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet16f pexpm1(const Packet16f& _x) { - return generic_expm1(_x); -} #endif -#endif - - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f -psin(const Packet16f& _x) { - return psin_float(_x); -} - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f -pcos(const Packet16f& _x) { - return pcos_float(_x); -} - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f -ptanh(const Packet16f& _x) { - return internal::generic_fast_tanh_float(_x); -} - } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/PacketMath.h index 10a1d4adb..000b7762f 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -31,7 +31,6 @@ namespace internal { typedef __m512 Packet16f; typedef __m512i Packet16i; typedef __m512d Packet8d; -typedef eigen_packet_wrapper<__m256i, 1> Packet16h; template <> struct is_arithmetic<__m512> { @@ -46,38 +45,6 @@ struct is_arithmetic<__m512d> { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet16h type; - // There is no half-size packet for Packet16h. - typedef Packet16h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 16, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0, - HasInsert = 1 - }; -}; - template<> struct packet_traits : default_packet_traits { typedef Packet16f type; @@ -88,22 +55,13 @@ template<> struct packet_traits : default_packet_traits size = 16, HasHalfPacket = 1, HasBlend = 0, - HasInsert = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, - HasLog1p = 1, - HasExpm1 = 1, - HasNdtri = 1, - HasBessel = 1, #endif HasExp = 1, HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH, #endif HasDiv = 1 }; @@ -117,7 +75,6 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, - HasInsert = 1, #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, @@ -143,27 +100,19 @@ struct unpacket_traits { typedef float type; typedef Packet8f half; typedef Packet16i integer_packet; - typedef uint16_t mask_t; - enum { size = 16, alignment=Aligned64, vectorizable=true, masked_load_available=true, masked_store_available=true }; + enum { size = 16, alignment=Aligned64 }; }; template <> struct unpacket_traits { typedef double type; typedef Packet4d half; - enum { size = 8, alignment=Aligned64, vectorizable=true, masked_load_available=false, masked_store_available=false }; + enum { size = 8, alignment=Aligned64 }; }; template <> struct unpacket_traits { typedef int type; typedef Packet8i half; - enum { size = 16, alignment=Aligned64, vectorizable=false, masked_load_available=false, masked_store_available=false }; -}; - -template<> -struct unpacket_traits { - typedef Eigen::half type; - typedef Packet16h half; - enum {size=16, alignment=Aligned32, vectorizable=true, masked_load_available=false, masked_store_available=false}; + enum { size = 16, alignment=Aligned64 }; }; template <> @@ -179,11 +128,6 @@ EIGEN_STRONG_INLINE Packet16i pset1(const int& from) { return _mm512_set1_epi32(from); } -template <> -EIGEN_STRONG_INLINE Packet16f pset1frombits(unsigned int from) { - return _mm512_castsi512_ps(_mm512_set1_epi32(from)); -} - template <> EIGEN_STRONG_INLINE Packet16f pload1(const float* from) { return _mm512_broadcastss_ps(_mm_load_ps1(from)); @@ -300,24 +244,6 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, } #endif -template <> -EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask, - const Packet16f& a, - const Packet16f& b) { - __mmask16 mask16 = _mm512_cmp_epi32_mask( - _mm512_castps_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ); - return _mm512_mask_blend_ps(mask16, a, b); -} - -template <> -EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask, - const Packet8d& a, - const Packet8d& b) { - __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask), - _mm512_setzero_epi32(), _MM_CMPINT_EQ); - return _mm512_mask_blend_pd(mask8, a, b); -} - template <> EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) { @@ -365,74 +291,23 @@ EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { } #endif -template <> -EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) { - __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ); - return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); -} -template<> EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) { - __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ); - return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); -} - -template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) { - __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ); - return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); -} - -template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ); - return _mm512_castsi512_ps( - _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); -} - -template<> EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) { - __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _CMP_EQ_OQ); - return _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu); -} - - -template <> -EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) { - __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ); - return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); -} -template <> -EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) { - __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ); - return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); -} -template <> -EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) { - __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ); - return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); -} -template <> -EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) { - __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGT_UQ); - return _mm512_castsi512_pd( - _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); -} - -template <> -EIGEN_STRONG_INLINE Packet16i ptrue(const Packet16i& /*a*/) { - return _mm512_set1_epi32(0xffffffffu); -} - -template <> -EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) { - return _mm512_castsi512_ps(ptrue(_mm512_castps_si512(a))); -} - -template <> -EIGEN_STRONG_INLINE Packet8d ptrue(const Packet8d& a) { - return _mm512_castsi512_pd(ptrue(_mm512_castpd_si512(a))); +// Helper function for bit packing snippet of low precision comparison. +// It packs the flags from 32x16 to 16x16. +EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) { + // Split data into small pieces and handle with AVX instructions + // to guarantee internal order of vector. + // Operation: + // dst[15:0] := Saturate16(rf[31:0]) + // dst[31:16] := Saturate16(rf[63:32]) + // ... + // dst[255:240] := Saturate16(rf[255:224]) + __m256i lo = _mm256_castps_si256(extract256<0>(rf)); + __m256i hi = _mm256_castps_si256(extract256<1>(rf)); + __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), + _mm256_extractf128_si256(lo, 1)); + __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), + _mm256_extractf128_si256(hi, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); } template <> @@ -576,12 +451,6 @@ EIGEN_STRONG_INLINE Packet16i ploadu(const int* from) { reinterpret_cast(from)); } -template <> -EIGEN_STRONG_INLINE Packet16f ploadu(const float* from, uint16_t umask) { - __mmask16 mask = static_cast<__mmask16>(umask); - EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from); -} - // Loads 8 floats from memory a returns the packet // {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7} template <> @@ -666,11 +535,6 @@ EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet16i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_si512( reinterpret_cast<__m512i*>(to), from); } -template <> -EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet16f& from, uint16_t umask) { - __mmask16 mask = static_cast<__mmask16>(umask); - EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from); -} template <> EIGEN_DEVICE_FUNC inline Packet16f pgather(const float* from, @@ -822,26 +686,27 @@ EIGEN_STRONG_INLINE double predux(const Packet8d& a) { } template <> -EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) { +EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ - __m256 lane0 = _mm512_extractf32x8_ps(a, 0); - __m256 lane1 = _mm512_extractf32x8_ps(a, 1); - return _mm256_add_ps(lane0, lane1); + Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); + Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); + return padd(lane0, lane1); #else - __m128 lane0 = _mm512_extractf32x4_ps(a, 0); - __m128 lane1 = _mm512_extractf32x4_ps(a, 1); - __m128 lane2 = _mm512_extractf32x4_ps(a, 2); - __m128 lane3 = _mm512_extractf32x4_ps(a, 3); - __m128 sum0 = _mm_add_ps(lane0, lane2); - __m128 sum1 = _mm_add_ps(lane1, lane3); + Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); + Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); + Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); + Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); + Packet4f sum0 = padd(lane0, lane2); + Packet4f sum1 = padd(lane1, lane3); return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1); #endif } template <> -EIGEN_STRONG_INLINE Packet4d predux_half_dowto4(const Packet8d& a) { - __m256d lane0 = _mm512_extractf64x4_pd(a, 0); - __m256d lane1 = _mm512_extractf64x4_pd(a, 1); - return _mm256_add_pd(lane0, lane1); +EIGEN_STRONG_INLINE Packet4d predux_downto4(const Packet8d& a) { + Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + Packet4d res = padd(lane0, lane1); + return res; } template <> @@ -912,13 +777,196 @@ EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); } -template<> EIGEN_STRONG_INLINE bool predux_any(const Packet16f& x) +template<> EIGEN_STRONG_INLINE Packet16f preduxp(const Packet16f* vecs) { - Packet16i xi = _mm512_castps_si512(x); - __mmask16 tmp = _mm512_test_epi32_mask(xi,xi); - return !_mm512_kortestz(tmp,tmp); + EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0); + EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1); + EIGEN_EXTRACT_8f_FROM_16f(vecs[2], vecs2); + EIGEN_EXTRACT_8f_FROM_16f(vecs[3], vecs3); + EIGEN_EXTRACT_8f_FROM_16f(vecs[4], vecs4); + EIGEN_EXTRACT_8f_FROM_16f(vecs[5], vecs5); + EIGEN_EXTRACT_8f_FROM_16f(vecs[6], vecs6); + EIGEN_EXTRACT_8f_FROM_16f(vecs[7], vecs7); + EIGEN_EXTRACT_8f_FROM_16f(vecs[8], vecs8); + EIGEN_EXTRACT_8f_FROM_16f(vecs[9], vecs9); + EIGEN_EXTRACT_8f_FROM_16f(vecs[10], vecs10); + EIGEN_EXTRACT_8f_FROM_16f(vecs[11], vecs11); + EIGEN_EXTRACT_8f_FROM_16f(vecs[12], vecs12); + EIGEN_EXTRACT_8f_FROM_16f(vecs[13], vecs13); + EIGEN_EXTRACT_8f_FROM_16f(vecs[14], vecs14); + EIGEN_EXTRACT_8f_FROM_16f(vecs[15], vecs15); + + __m256 hsum1 = _mm256_hadd_ps(vecs0_0, vecs1_0); + __m256 hsum2 = _mm256_hadd_ps(vecs2_0, vecs3_0); + __m256 hsum3 = _mm256_hadd_ps(vecs4_0, vecs5_0); + __m256 hsum4 = _mm256_hadd_ps(vecs6_0, vecs7_0); + + __m256 hsum5 = _mm256_hadd_ps(hsum1, hsum1); + __m256 hsum6 = _mm256_hadd_ps(hsum2, hsum2); + __m256 hsum7 = _mm256_hadd_ps(hsum3, hsum3); + __m256 hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + __m256 perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + __m256 perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + __m256 perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + __m256 perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + __m256 sum1 = _mm256_add_ps(perm1, hsum5); + __m256 sum2 = _mm256_add_ps(perm2, hsum6); + __m256 sum3 = _mm256_add_ps(perm3, hsum7); + __m256 sum4 = _mm256_add_ps(perm4, hsum8); + + __m256 blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + __m256 blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + __m256 final = _mm256_blend_ps(blend1, blend2, 0xf0); + + hsum1 = _mm256_hadd_ps(vecs0_1, vecs1_1); + hsum2 = _mm256_hadd_ps(vecs2_1, vecs3_1); + hsum3 = _mm256_hadd_ps(vecs4_1, vecs5_1); + hsum4 = _mm256_hadd_ps(vecs6_1, vecs7_1); + + hsum5 = _mm256_hadd_ps(hsum1, hsum1); + hsum6 = _mm256_hadd_ps(hsum2, hsum2); + hsum7 = _mm256_hadd_ps(hsum3, hsum3); + hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + sum1 = _mm256_add_ps(perm1, hsum5); + sum2 = _mm256_add_ps(perm2, hsum6); + sum3 = _mm256_add_ps(perm3, hsum7); + sum4 = _mm256_add_ps(perm4, hsum8); + + blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + final = padd(final, _mm256_blend_ps(blend1, blend2, 0xf0)); + + hsum1 = _mm256_hadd_ps(vecs8_0, vecs9_0); + hsum2 = _mm256_hadd_ps(vecs10_0, vecs11_0); + hsum3 = _mm256_hadd_ps(vecs12_0, vecs13_0); + hsum4 = _mm256_hadd_ps(vecs14_0, vecs15_0); + + hsum5 = _mm256_hadd_ps(hsum1, hsum1); + hsum6 = _mm256_hadd_ps(hsum2, hsum2); + hsum7 = _mm256_hadd_ps(hsum3, hsum3); + hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + sum1 = _mm256_add_ps(perm1, hsum5); + sum2 = _mm256_add_ps(perm2, hsum6); + sum3 = _mm256_add_ps(perm3, hsum7); + sum4 = _mm256_add_ps(perm4, hsum8); + + blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + __m256 final_1 = _mm256_blend_ps(blend1, blend2, 0xf0); + + hsum1 = _mm256_hadd_ps(vecs8_1, vecs9_1); + hsum2 = _mm256_hadd_ps(vecs10_1, vecs11_1); + hsum3 = _mm256_hadd_ps(vecs12_1, vecs13_1); + hsum4 = _mm256_hadd_ps(vecs14_1, vecs15_1); + + hsum5 = _mm256_hadd_ps(hsum1, hsum1); + hsum6 = _mm256_hadd_ps(hsum2, hsum2); + hsum7 = _mm256_hadd_ps(hsum3, hsum3); + hsum8 = _mm256_hadd_ps(hsum4, hsum4); + + perm1 = _mm256_permute2f128_ps(hsum5, hsum5, 0x23); + perm2 = _mm256_permute2f128_ps(hsum6, hsum6, 0x23); + perm3 = _mm256_permute2f128_ps(hsum7, hsum7, 0x23); + perm4 = _mm256_permute2f128_ps(hsum8, hsum8, 0x23); + + sum1 = _mm256_add_ps(perm1, hsum5); + sum2 = _mm256_add_ps(perm2, hsum6); + sum3 = _mm256_add_ps(perm3, hsum7); + sum4 = _mm256_add_ps(perm4, hsum8); + + blend1 = _mm256_blend_ps(sum1, sum2, 0xcc); + blend2 = _mm256_blend_ps(sum3, sum4, 0xcc); + + final_1 = padd(final_1, _mm256_blend_ps(blend1, blend2, 0xf0)); + + __m512 final_output; + + EIGEN_INSERT_8f_INTO_16f(final_output, final, final_1); + return final_output; } +template<> EIGEN_STRONG_INLINE Packet8d preduxp(const Packet8d* vecs) +{ + Packet4d vecs0_0 = _mm512_extractf64x4_pd(vecs[0], 0); + Packet4d vecs0_1 = _mm512_extractf64x4_pd(vecs[0], 1); + + Packet4d vecs1_0 = _mm512_extractf64x4_pd(vecs[1], 0); + Packet4d vecs1_1 = _mm512_extractf64x4_pd(vecs[1], 1); + + Packet4d vecs2_0 = _mm512_extractf64x4_pd(vecs[2], 0); + Packet4d vecs2_1 = _mm512_extractf64x4_pd(vecs[2], 1); + + Packet4d vecs3_0 = _mm512_extractf64x4_pd(vecs[3], 0); + Packet4d vecs3_1 = _mm512_extractf64x4_pd(vecs[3], 1); + + Packet4d vecs4_0 = _mm512_extractf64x4_pd(vecs[4], 0); + Packet4d vecs4_1 = _mm512_extractf64x4_pd(vecs[4], 1); + + Packet4d vecs5_0 = _mm512_extractf64x4_pd(vecs[5], 0); + Packet4d vecs5_1 = _mm512_extractf64x4_pd(vecs[5], 1); + + Packet4d vecs6_0 = _mm512_extractf64x4_pd(vecs[6], 0); + Packet4d vecs6_1 = _mm512_extractf64x4_pd(vecs[6], 1); + + Packet4d vecs7_0 = _mm512_extractf64x4_pd(vecs[7], 0); + Packet4d vecs7_1 = _mm512_extractf64x4_pd(vecs[7], 1); + + Packet4d tmp0, tmp1; + + tmp0 = _mm256_hadd_pd(vecs0_0, vecs1_0); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs2_0, vecs3_0); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + __m256d final_0 = _mm256_blend_pd(tmp0, tmp1, 0xC); + + tmp0 = _mm256_hadd_pd(vecs0_1, vecs1_1); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs2_1, vecs3_1); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + final_0 = padd(final_0, _mm256_blend_pd(tmp0, tmp1, 0xC)); + + tmp0 = _mm256_hadd_pd(vecs4_0, vecs5_0); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs6_0, vecs7_0); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + __m256d final_1 = _mm256_blend_pd(tmp0, tmp1, 0xC); + + tmp0 = _mm256_hadd_pd(vecs4_1, vecs5_1); + tmp0 = _mm256_add_pd(tmp0, _mm256_permute2f128_pd(tmp0, tmp0, 1)); + + tmp1 = _mm256_hadd_pd(vecs6_1, vecs7_1); + tmp1 = _mm256_add_pd(tmp1, _mm256_permute2f128_pd(tmp1, tmp1, 1)); + + final_1 = padd(final_1, _mm256_blend_pd(tmp0, tmp1, 0xC)); + + __m512d final_output = _mm512_insertf64x4(final_output, final_0, 0); + + return _mm512_insertf64x4(final_output, final_1, 1); +} + #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ @@ -1202,418 +1250,52 @@ template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packe return _mm512_cvtepi32_ps(a); } -template<> EIGEN_STRONG_INLINE Packet16i preinterpret(const Packet16f& a) { - return _mm512_castps_si512(a); -} +template +struct palign_impl { + static EIGEN_STRONG_INLINE void run(Packet16f& first, + const Packet16f& second) { + if (Offset != 0) { + __m512i first_idx = _mm512_set_epi32( + Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11, + Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6, + Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset); -template<> EIGEN_STRONG_INLINE Packet16f preinterpret(const Packet16i& a) { - return _mm512_castsi512_ps(a); -} + __m512i second_idx = + _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4, + Offset - 5, Offset - 6, Offset - 7, Offset - 8, + Offset - 9, Offset - 10, Offset - 11, Offset - 12, + Offset - 13, Offset - 14, Offset - 15, Offset - 16); + unsigned short mask = 0xFFFF; + mask <<= (16 - Offset); -// Packet math for Eigen::half -template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { - return _mm256_set1_epi16(from.x); -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from, 0))); -} - -template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { - return _mm256_load_si256(reinterpret_cast(from)); -} - -template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { - return _mm256_loadu_si256(reinterpret_cast(from)); -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { - // (void*) -> workaround clang warning: - // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 - _mm256_store_si256((__m256i*)(void*)to, from); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { - // (void*) -> workaround clang warning: - // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32 - _mm256_storeu_si256((__m256i*)(void*)to, from); -} - -template<> EIGEN_STRONG_INLINE Packet16h -ploaddup(const Eigen::half* from) { - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - unsigned short e = from[4].x; - unsigned short f = from[5].x; - unsigned short g = from[6].x; - unsigned short h = from[7].x; - return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a); -} - -template<> EIGEN_STRONG_INLINE Packet16h -ploadquad(const Eigen::half* from) { - unsigned short a = from[0].x; - unsigned short b = from[1].x; - unsigned short c = from[2].x; - unsigned short d = from[3].x; - return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); -} - -EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm512_cvtph_ps(a); -#else - EIGEN_ALIGN64 half aux[16]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - float f8(aux[8]); - float f9(aux[9]); - float fa(aux[10]); - float fb(aux[11]); - float fc(aux[12]); - float fd(aux[13]); - float fe(aux[14]); - float ff(aux[15]); - - return _mm512_set_ps( - ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); -#endif -} - -EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { -#ifdef EIGEN_HAS_FP16_C - return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); -#else - EIGEN_ALIGN64 float aux[16]; - pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); - half h6(aux[6]); - half h7(aux[7]); - half h8(aux[8]); - half h9(aux[9]); - half ha(aux[10]); - half hb(aux[11]); - half hc(aux[12]); - half hd(aux[13]); - half he(aux[14]); - half hf(aux[15]); - - return _mm256_set_epi16( - hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, - h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); -#endif -} - -template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { - return ptrue(Packet8i(a)); -} - -template<> EIGEN_STRONG_INLINE Packet16h pnot(const Packet16h& a) { - return _mm256_xor_si256(a, ptrue(a)); -} - - -template<> EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a,const Packet16h& b) { - // in some cases Packet8i is a wrapper around __m256i, so we need to - // cast to Packet8i to call the correct overload. - return por(Packet8i(a),Packet8i(b)); -} -template<> EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a,const Packet16h& b) { - return pxor(Packet8i(a),Packet8i(b)); -} -template<> EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a,const Packet16h& b) { - return pand(Packet8i(a),Packet8i(b)); -} -template<> EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a,const Packet16h& b) { - return pandnot(Packet8i(a),Packet8i(b)); -} - -template<> EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) { - return _mm256_blendv_epi8(b, a, mask); -} - -template<> EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a,const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pcmp_eq(af, bf); - // Pack the 32-bit flags into 16-bits flags. - __m256i lo = _mm256_castps_si256(extract256<0>(rf)); - __m256i hi = _mm256_castps_si256(extract256<1>(rf)); - __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), - _mm256_extractf128_si256(lo, 1)); - __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), - _mm256_extractf128_si256(hi, 1)); - return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); -} - -template<> EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) { - Packet16h sign_mask = _mm256_set1_epi16(static_cast(0x8000)); - return _mm256_xor_si256(a, sign_mask); -} - -template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = padd(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h psub(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = psub(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pmul(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE Packet16h pdiv(const Packet16h& a, const Packet16h& b) { - Packet16f af = half2float(a); - Packet16f bf = half2float(b); - Packet16f rf = pdiv(af, bf); - return float2half(rf); -} - -template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { - Packet16f from_float = half2float(from); - return half(predux(from_float)); -} - -template<> EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) { - Packet16f from_float = half2float(from); - return half(predux_mul(from_float)); -} - -template<> EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) -{ - __m128i m = _mm_setr_epi8(14,15,12,13,10,11,8,9,6,7,4,5,2,3,0,1); - return _mm256_insertf128_si256( - _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a,1),m)), - _mm_shuffle_epi8(_mm256_extractf128_si256(a,0),m), 1); -} - -template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) -{ - return _mm256_set_epi16( - from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, - from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, - from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, - from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); -} - -template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) -{ - EIGEN_ALIGN64 half aux[16]; - pstore(aux, from); - to[stride*0].x = aux[0].x; - to[stride*1].x = aux[1].x; - to[stride*2].x = aux[2].x; - to[stride*3].x = aux[3].x; - to[stride*4].x = aux[4].x; - to[stride*5].x = aux[5].x; - to[stride*6].x = aux[6].x; - to[stride*7].x = aux[7].x; - to[stride*8].x = aux[8].x; - to[stride*9].x = aux[9].x; - to[stride*10].x = aux[10].x; - to[stride*11].x = aux[11].x; - to[stride*12].x = aux[12].x; - to[stride*13].x = aux[13].x; - to[stride*14].x = aux[14].x; - to[stride*15].x = aux[15].x; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m256i a = kernel.packet[0]; - __m256i b = kernel.packet[1]; - __m256i c = kernel.packet[2]; - __m256i d = kernel.packet[3]; - __m256i e = kernel.packet[4]; - __m256i f = kernel.packet[5]; - __m256i g = kernel.packet[6]; - __m256i h = kernel.packet[7]; - __m256i i = kernel.packet[8]; - __m256i j = kernel.packet[9]; - __m256i k = kernel.packet[10]; - __m256i l = kernel.packet[11]; - __m256i m = kernel.packet[12]; - __m256i n = kernel.packet[13]; - __m256i o = kernel.packet[14]; - __m256i p = kernel.packet[15]; - - __m256i ab_07 = _mm256_unpacklo_epi16(a, b); - __m256i cd_07 = _mm256_unpacklo_epi16(c, d); - __m256i ef_07 = _mm256_unpacklo_epi16(e, f); - __m256i gh_07 = _mm256_unpacklo_epi16(g, h); - __m256i ij_07 = _mm256_unpacklo_epi16(i, j); - __m256i kl_07 = _mm256_unpacklo_epi16(k, l); - __m256i mn_07 = _mm256_unpacklo_epi16(m, n); - __m256i op_07 = _mm256_unpacklo_epi16(o, p); - - __m256i ab_8f = _mm256_unpackhi_epi16(a, b); - __m256i cd_8f = _mm256_unpackhi_epi16(c, d); - __m256i ef_8f = _mm256_unpackhi_epi16(e, f); - __m256i gh_8f = _mm256_unpackhi_epi16(g, h); - __m256i ij_8f = _mm256_unpackhi_epi16(i, j); - __m256i kl_8f = _mm256_unpackhi_epi16(k, l); - __m256i mn_8f = _mm256_unpackhi_epi16(m, n); - __m256i op_8f = _mm256_unpackhi_epi16(o, p); - - __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); - __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); - __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); - __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); - __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); - __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); - __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); - __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); - - __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); - __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); - __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); - __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); - __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); - __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); - __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); - __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); - - __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); - __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); - __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); - __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); - __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); - __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); - __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); - __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); - __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); - __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); - __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); - __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); - __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); - __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); - __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); - __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); - - // NOTE: no unpacklo/hi instr in this case, so using permute instr. - __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); - __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); - __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); - __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); - __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); - __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); - __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); - __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); - __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); - __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); - __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); - __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); - __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); - __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); - __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); - __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); - - kernel.packet[0] = a_p_0; - kernel.packet[1] = a_p_1; - kernel.packet[2] = a_p_2; - kernel.packet[3] = a_p_3; - kernel.packet[4] = a_p_4; - kernel.packet[5] = a_p_5; - kernel.packet[6] = a_p_6; - kernel.packet[7] = a_p_7; - kernel.packet[8] = a_p_8; - kernel.packet[9] = a_p_9; - kernel.packet[10] = a_p_a; - kernel.packet[11] = a_p_b; - kernel.packet[12] = a_p_c; - kernel.packet[13] = a_p_d; - kernel.packet[14] = a_p_e; - kernel.packet[15] = a_p_f; -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[8][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); - pstore(in[4], kernel.packet[4]); - pstore(in[5], kernel.packet[5]); - pstore(in[6], kernel.packet[6]); - pstore(in[7], kernel.packet[7]); - - EIGEN_ALIGN64 half out[8][16]; - - for (int i = 0; i < 8; ++i) { - for (int j = 0; j < 8; ++j) { - out[i][j] = in[j][2*i]; - } - for (int j = 0; j < 8; ++j) { - out[i][j+8] = in[j][2*i+1]; + first = _mm512_permutexvar_ps(first_idx, first); + Packet16f tmp = _mm512_permutexvar_ps(second_idx, second); + first = _mm512_mask_blend_ps(mask, first, tmp); } } +}; +template +struct palign_impl { + static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) { + if (Offset != 0) { + __m512i first_idx = _mm512_set_epi32( + 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0, + Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset); - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); - kernel.packet[4] = pload(out[4]); - kernel.packet[5] = pload(out[5]); - kernel.packet[6] = pload(out[6]); - kernel.packet[7] = pload(out[7]); -} + __m512i second_idx = _mm512_set_epi32( + 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0, + Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8); -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - EIGEN_ALIGN64 half in[4][16]; - pstore(in[0], kernel.packet[0]); - pstore(in[1], kernel.packet[1]); - pstore(in[2], kernel.packet[2]); - pstore(in[3], kernel.packet[3]); + unsigned char mask = 0xFF; + mask <<= (8 - Offset); - EIGEN_ALIGN64 half out[4][16]; - - for (int i = 0; i < 4; ++i) { - for (int j = 0; j < 4; ++j) { - out[i][j] = in[j][4*i]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+4] = in[j][4*i+1]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+8] = in[j][4*i+2]; - } - for (int j = 0; j < 4; ++j) { - out[i][j+12] = in[j][4*i+3]; + first = _mm512_permutexvar_pd(first_idx, first); + Packet8d tmp = _mm512_permutexvar_pd(second_idx, second); + first = _mm512_mask_blend_pd(mask, first, tmp); } } - - kernel.packet[0] = pload(out[0]); - kernel.packet[1] = pload(out[1]); - kernel.packet[2] = pload(out[2]); - kernel.packet[3] = pload(out[3]); -} +}; } // end namespace internal diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h deleted file mode 100644 index a82176941..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AVX512/TypeCasting.h +++ /dev/null @@ -1,47 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2019 Rasmus Munk Larsen -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_TYPE_CASTING_AVX512_H -#define EIGEN_TYPE_CASTING_AVX512_H - -namespace Eigen { - -namespace internal { - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { - return half2float(a); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { - return float2half(a); -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TYPE_CASTING_AVX512_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/Complex.h index 69d2ceca8..3e665730c 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/Complex.h @@ -60,7 +60,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { @@ -82,14 +82,14 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - EIGEN_ALIGN16 std::complex af[2]; + std::complex EIGEN_ALIGN16 af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - EIGEN_ALIGN16 std::complex af[2]; + std::complex EIGEN_ALIGN16 af[2]; pstore >((std::complex *) af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -128,7 +128,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::co template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - EIGEN_ALIGN16 std::complex res[2]; + std::complex EIGEN_ALIGN16 res[2]; pstore((float *)&res, a.v); return res[0]; @@ -149,6 +149,22 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe return pfirst(Packet2cf(b)); } +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + Packet4f b1, b2; +#ifdef _BIG_ENDIAN + b1 = vec_sld(vecs[0].v, vecs[1].v, 8); + b2 = vec_sld(vecs[1].v, vecs[0].v, 8); +#else + b1 = vec_sld(vecs[1].v, vecs[0].v, 8); + b2 = vec_sld(vecs[0].v, vecs[1].v, 8); +#endif + b2 = vec_sld(b2, b2, 8); + b2 = padd(b1, b2); + + return Packet2cf(b2); +} + template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { Packet4f b; @@ -159,6 +175,22 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(prod); } +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset==1) + { +#ifdef _BIG_ENDIAN + first.v = vec_sld(first.v, second.v, 8); +#else + first.v = vec_sld(second.v, first.v, 8); +#endif + } + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const @@ -214,11 +246,6 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) kernel.packet[0].v = tmp; } -template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) { - Packet4f eq = reinterpret_cast(vec_cmpeq(a.v,b.v)); - return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV))); -} - #ifdef __VSX__ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { Packet2cf result; @@ -259,7 +286,7 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { return Packet1cd(pload((const double*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { return Packet1cd(ploadu((const double*)from)); } @@ -271,14 +298,14 @@ template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) { - EIGEN_ALIGN16 std::complex af[2]; + std::complex EIGEN_ALIGN16 af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) { - EIGEN_ALIGN16 std::complex af[2]; + std::complex EIGEN_ALIGN16 af[2]; pstore >(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -318,7 +345,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - EIGEN_ALIGN16 std::complex res[2]; + std::complex EIGEN_ALIGN16 res[2]; pstore >(res, a); return res[0]; @@ -327,9 +354,20 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pac template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } +template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const @@ -384,18 +422,6 @@ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); kernel.packet[0].v = tmp; } - -template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) { - // Compare real and imaginary parts of a and b to get the mask vector: - // [re(a)==re(b), im(a)==im(b)] - Packet2d eq = reinterpret_cast(vec_cmpeq(a.v,b.v)); - // Swap real/imag elements in the mask in to get: - // [im(a)==im(b), re(a)==re(b)] - Packet2d eq_swapped = reinterpret_cast(vec_sld(reinterpret_cast(eq), reinterpret_cast(eq), 8)); - // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped - return Packet1cd(vec_and(eq, eq_swapped)); -} - #endif // __VSX__ } // end namespace internal diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 3a7a32936..c5e4bede7 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -9,6 +9,10 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + #ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H #define EIGEN_MATH_FUNCTIONS_ALTIVEC_H @@ -16,28 +20,180 @@ namespace Eigen { namespace internal { +static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); +static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); +static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); +static _EIGEN_DECLARE_CONST_Packet4i(23, 23); + +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); + +/* the smallest non denormalized float number */ +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f +static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); + +/* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 +*/ +static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + +static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); +static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); +static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + +#ifdef __VSX__ +static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); +static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); +static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + +static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); +static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); +static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + +#ifdef __POWER8_VECTOR__ +static Packet2l p2l_1023 = { 1023, 1023 }; +static Packet2ul p2ul_52 = { 52, 52 }; +#endif + +#endif + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& _x) { - return plog_float(_x); + Packet4f x = _x; + + Packet4i emm0; + + /* isvalid_mask is 0 if x < 0 or x is NaN. */ + Packet4ui isvalid_mask = reinterpret_cast(vec_cmpge(x, p4f_ZERO)); + Packet4ui iszero_mask = reinterpret_cast(vec_cmpeq(x, p4f_ZERO)); + + x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ + emm0 = vec_sr(reinterpret_cast(x), + reinterpret_cast(p4i_23)); + + /* keep only the fractional part */ + x = pand(x, p4f_inv_mant_mask); + x = por(x, p4f_half); + + emm0 = psub(emm0, p4i_0x7f); + Packet4f e = padd(vec_ctf(emm0, 0), p4f_1); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + Packet4f mask = reinterpret_cast(vec_cmplt(x, p4f_cephes_SQRTHF)); + Packet4f tmp = pand(x, mask); + x = psub(x, p4f_1); + e = psub(e, pand(p4f_1, mask)); + x = padd(x, tmp); + + Packet4f x2 = pmul(x,x); + Packet4f x3 = pmul(x2,x); + + Packet4f y, y1, y2; + y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); + y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); + y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); + y = pmadd(y , x, p4f_cephes_log_p2); + y1 = pmadd(y1, x, p4f_cephes_log_p5); + y2 = pmadd(y2, x, p4f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + y1 = pmul(e, p4f_cephes_log_q1); + tmp = pmul(x2, p4f_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p4f_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + // negative arg will be NAN, 0 will be -INF + x = vec_sel(x, p4f_minus_inf, iszero_mask); + x = vec_sel(p4f_minus_nan, x, isvalid_mask); + return x; } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& _x) { - return pexp_float(_x); -} + Packet4f x = _x; -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f psin(const Packet4f& _x) -{ - return psin_float(_x); -} + Packet4f tmp, fx; + Packet4i emm0; -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pcos(const Packet4f& _x) -{ - return pcos_float(_x); + // clamp x + x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); + + // express exp(x) as exp(g + n*log(2)) + fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); + + fx = pfloor(fx); + + tmp = pmul(fx, p4f_cephes_exp_C1); + Packet4f z = pmul(fx, p4f_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + z = pmul(x,x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // build 2^n + emm0 = vec_cts(fx, 0); + emm0 = vec_add(emm0, p4i_0x7f); + emm0 = vec_sl(emm0, reinterpret_cast(p4i_23)); + + // Altivec's max & min operators just drop silent NaNs. Check NaNs in + // inputs and return them unmodified. + Packet4ui isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(y, reinterpret_cast(emm0)), _x), + isnumber_mask); } #ifndef EIGEN_COMP_CLANG @@ -69,19 +225,95 @@ Packet2d psqrt(const Packet2d& x) return vec_sqrt(x); } +// VSX support varies between different compilers and even different +// versions of the same compiler. For gcc version >= 4.9.3, we can use +// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use +// a slow version that works with older compilers. +// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles +// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 +static inline Packet2l ConvertToPacket2l(const Packet2d& x) { +#if EIGEN_GNUC_AT_LEAST(5, 4) || \ + (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) + return vec_cts(x, 0); // TODO: check clang version. +#else + double tmp[2]; + memcpy(tmp, &x, sizeof(tmp)); + Packet2l l = { static_cast(tmp[0]), + static_cast(tmp[1]) }; + return l; +#endif +} + template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d pexp(const Packet2d& _x) { - return pexp_double(_x); -} + Packet2d x = _x; + + Packet2d tmp, fx; + Packet2l emm0; + + // clamp x + x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = pmadd(x, p2d_cephes_LOG2EF, p2d_half); + + fx = pfloor(fx); + + tmp = pmul(fx, p2d_cephes_exp_C1); + Packet2d z = pmul(fx, p2d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet2d x2 = pmul(x,x); + + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul (px, x); + + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + + x = pdiv(px,psub(qx,px)); + x = pmadd(p2d_2,x,p2d_1); + + // build 2^n + emm0 = ConvertToPacket2l(fx); + +#ifdef __POWER8_VECTOR__ + emm0 = vec_add(emm0, p2l_1023); + emm0 = vec_sl(emm0, p2ul_52); +#else + // Code is a bit complex for POWER7. There is actually a + // vec_xxsldi intrinsic but it is not supported by some gcc versions. + // So we shift (52-32) bits and do a word swap with zeros. + _EIGEN_DECLARE_CONST_Packet4i(1023, 1023); + _EIGEN_DECLARE_CONST_Packet4i(20, 20); // 52 - 32 + + Packet4i emm04i = reinterpret_cast(emm0); + emm04i = vec_add(emm04i, p4i_1023); + emm04i = vec_sl(emm04i, reinterpret_cast(p4i_20)); + static const Packet16uc perm = { + 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, + 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; +#ifdef _BIG_ENDIAN + emm0 = reinterpret_cast(vec_perm(p4i_ZERO, emm04i, perm)); +#else + emm0 = reinterpret_cast(vec_perm(emm04i, p4i_ZERO, perm)); #endif -// Hyperbolic Tangent function. -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f -ptanh(const Packet4f& x) { - return internal::generic_fast_tanh_float(x); +#endif + + // Altivec's max & min operators just drop silent NaNs. Check NaNs in + // inputs and return them unmodified. + Packet2ul isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); + return vec_sel(_x, pmax(pmul(x, reinterpret_cast(emm0)), _x), + isnumber_mask); } +#endif } // end namespace internal diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h index 83b75b974..08a27d153 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -31,33 +31,22 @@ namespace internal { #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif -typedef __vector float Packet4f; -typedef __vector int Packet4i; -typedef __vector unsigned int Packet4ui; -typedef __vector __bool int Packet4bi; -typedef __vector short int Packet8s; -typedef __vector unsigned short int Packet8us; -typedef __vector int8_t Packet16c; -typedef __vector uint8_t Packet16uc; +typedef __vector float Packet4f; +typedef __vector int Packet4i; +typedef __vector unsigned int Packet4ui; +typedef __vector __bool int Packet4bi; +typedef __vector short int Packet8i; +typedef __vector unsigned char Packet16uc; // We don't want to write the same code all the time, but we need to reuse the constants // and it doesn't really work to declare them global, so we define macros instead #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = {X, X, X, X} + Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ Packet4i p4i_##NAME = vec_splat_s32(X) -#define _EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME,X) \ - Packet4ui p4ui_##NAME = {X, X, X, X} - -#define _EIGEN_DECLARE_CONST_FAST_Packet8us(NAME,X) \ - Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X} - -#define _EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME,X) \ - Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X} - #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ Packet4f p4f_##NAME = pset1(X) @@ -76,39 +65,32 @@ typedef __vector uint8_t Packet16uc; #define DST_CHAN 1 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) + // These constants are endian-agnostic static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} -static _EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u); -static _EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu); -static _EIGEN_DECLARE_CONST_FAST_Packet8us(ONE,1); //{ 1, 1, 1, 1, 1, 1, 1, 1} -static _EIGEN_DECLARE_CONST_FAST_Packet16uc(ONE,1); static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} #ifndef __VSX__ static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} #endif -static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; -static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; -static Packet8s p8s_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 }; -static Packet8us p8us_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7 }; -static Packet16c p16c_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; -static Packet16uc p16uc_COUNTDOWN = { 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15}; +static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; +static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; -static Packet16uc p16uc_REVERSE16 = { 14,15, 12,13, 10,11, 8,9, 6,7, 4,5, 2,3, 0,1 }; -static Packet16uc p16uc_REVERSE8 = { 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 }; - static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; -static Packet16uc p16uc_DUPLICATE16_HI = { 0,1,0,1, 2,3,2,3, 4,5,4,5, 6,7,6,7 }; -static Packet16uc p16uc_DUPLICATE8_HI = { 0,0, 1,1, 2,2, 3,3, 4,4, 5,5, 6,6, 7,7 }; -static Packet16uc p16uc_QUADRUPLICATE16_HI = { 0,1,0,1,0,1,0,1, 2,3,2,3,2,3,2,3 }; +// Mask alignment +#ifdef __PPC64__ +#define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 +#else +#define _EIGEN_MASK_ALIGNMENT 0xfffffff0 +#endif + +#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) // Handle endianness properly while loading constants // Define global static constants: @@ -147,27 +129,27 @@ static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_L #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); #endif -template <> -struct packet_traits : default_packet_traits { +template<> struct packet_traits : default_packet_traits +{ typedef Packet4f type; typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size = 4, + size=4, HasHalfPacket = 1, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasMin = 1, - HasMax = 1, - HasAbs = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasExp = 1, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, #ifdef __VSX__ HasSqrt = 1, #if !EIGEN_COMP_CLANG @@ -178,8 +160,6 @@ struct packet_traits : default_packet_traits { #else HasSqrt = 0, HasRsqrt = 0, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH, #endif HasRound = 1, HasFloor = 1, @@ -188,8 +168,8 @@ struct packet_traits : default_packet_traits { HasBlend = 1 }; }; -template <> -struct packet_traits : default_packet_traits { +template<> struct packet_traits : default_packet_traits +{ typedef Packet4i type; typedef Packet4i half; enum { @@ -198,25 +178,6 @@ struct packet_traits : default_packet_traits { size = 4, HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasDiv = 0, - HasBlend = 1 - }; -}; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet8s type; - typedef Packet8s half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 0, - HasAdd = 1, HasSub = 1, HasMul = 1, @@ -225,120 +186,19 @@ struct packet_traits : default_packet_traits { }; }; -template <> -struct packet_traits : default_packet_traits { - typedef Packet8us type; - typedef Packet8us half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 0, - HasBlend = 1 - }; -}; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet16c type; - typedef Packet16c half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 16, - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 0, - HasBlend = 1 - }; -}; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet16uc type; - typedef Packet16uc half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 16, - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 0, - HasBlend = 1 - }; -}; - -template<> struct unpacket_traits -{ - typedef float type; - typedef Packet4f half; - typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; -}; -template<> struct unpacket_traits -{ - typedef int type; - typedef Packet4i half; - enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; -}; -template<> struct unpacket_traits -{ - typedef short int type; - typedef Packet8s half; - enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; -}; -template<> struct unpacket_traits -{ - typedef unsigned short int type; - typedef Packet8us half; - enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; -}; - -template<> struct unpacket_traits -{ - typedef int8_t type; - typedef Packet16c half; - enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; -}; -template<> struct unpacket_traits -{ - typedef uint8_t type; - typedef Packet16uc half; - enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; -}; - -inline std::ostream & operator <<(std::ostream & s, const Packet16c & v) -{ - union { - Packet16c v; - int8_t n[16]; - } vt; - vt.v = v; - for (int i=0; i< 16; i++) - s << vt.n[i] << ", "; - return s; -} +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) { union { Packet16uc v; - uint8_t n[16]; + unsigned char n[16]; } vt; vt.v = v; for (int i=0; i< 16; i++) - s << vt.n[i] << ", "; + s << (int)vt.n[i] << ", "; return s; } @@ -378,12 +238,9 @@ inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) // Need to define them first or we get specialization after instantiation errors template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { - // some versions of GCC throw "unused-but-set-parameter". - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD #ifdef __VSX__ - return vec_xl(0, from); + return vec_vsx_ld(0, from); #else return vec_ld(0, from); #endif @@ -391,61 +248,19 @@ template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { - // some versions of GCC throw "unused-but-set-parameter". - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD #ifdef __VSX__ - return vec_xl(0, from); + return vec_vsx_ld(0, from); #else return vec_ld(0, from); #endif } -template<> EIGEN_STRONG_INLINE Packet8s pload(const short int* from) -{ - // some versions of GCC throw "unused-but-set-parameter". - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(from); - EIGEN_DEBUG_ALIGNED_LOAD - return vec_ld(0, from); -} - -template<> EIGEN_STRONG_INLINE Packet8us pload(const unsigned short int* from) -{ - // some versions of GCC throw "unused-but-set-parameter". - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(from); - EIGEN_DEBUG_ALIGNED_LOAD - return vec_ld(0, from); -} - -template<> EIGEN_STRONG_INLINE Packet16c pload(const int8_t* from) -{ - // some versions of GCC throw "unused-but-set-parameter". - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(from); - EIGEN_DEBUG_ALIGNED_LOAD - return vec_ld(0, from); -} - -template<> EIGEN_STRONG_INLINE Packet16uc pload(const uint8_t* from) -{ - // some versions of GCC throw "unused-but-set-parameter". - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(from); - EIGEN_DEBUG_ALIGNED_LOAD - return vec_ld(0, from); -} - template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { - // some versions of GCC throw "unused-but-set-parameter" (float *to). - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(to); EIGEN_DEBUG_ALIGNED_STORE #ifdef __VSX__ - vec_xst(from, 0, to); + vec_vsx_st(from, 0, to); #else vec_st(from, 0, to); #endif @@ -453,52 +268,14 @@ template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& f template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { - // some versions of GCC throw "unused-but-set-parameter" (float *to). - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(to); EIGEN_DEBUG_ALIGNED_STORE #ifdef __VSX__ - vec_xst(from, 0, to); + vec_vsx_st(from, 0, to); #else vec_st(from, 0, to); #endif } -template<> EIGEN_STRONG_INLINE void pstore(short int* to, const Packet8s& from) -{ - // some versions of GCC throw "unused-but-set-parameter" (float *to). - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(to); - EIGEN_DEBUG_ALIGNED_STORE - vec_st(from, 0, to); -} - -template<> EIGEN_STRONG_INLINE void pstore(unsigned short int* to, const Packet8us& from) -{ - // some versions of GCC throw "unused-but-set-parameter" (float *to). - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(to); - EIGEN_DEBUG_ALIGNED_STORE - vec_st(from, 0, to); -} -template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet16c& from) -{ - // some versions of GCC throw "unused-but-set-parameter" (float *to). - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(to); - EIGEN_DEBUG_ALIGNED_STORE - vec_st(from, 0, to); -} - -template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet16uc& from) -{ - // some versions of GCC throw "unused-but-set-parameter" (float *to). - // ignoring these warnings for now. - EIGEN_UNUSED_VARIABLE(to); - EIGEN_DEBUG_ALIGNED_STORE - vec_st(from, 0, to); -} - template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { Packet4f v = {from, from, from, from}; return v; @@ -508,31 +285,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { Packet4i v = {from, from, from, from}; return v; } - -template<> EIGEN_STRONG_INLINE Packet8s pset1(const short int& from) { - Packet8s v = {from, from, from, from, from, from, from, from}; - return v; -} - -template<> EIGEN_STRONG_INLINE Packet8us pset1(const unsigned short int& from) { - Packet8us v = {from, from, from, from, from, from, from, from}; - return v; -} - -template<> EIGEN_STRONG_INLINE Packet16c pset1(const int8_t& from) { - Packet16c v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from}; - return v; -} - -template<> EIGEN_STRONG_INLINE Packet16uc pset1(const uint8_t& from) { - Packet16uc v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from}; - return v; -} - -template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { - return reinterpret_cast(pset1(from)); -} - template<> EIGEN_STRONG_INLINE void pbroadcast4(const float *a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) @@ -556,7 +308,7 @@ pbroadcast4(const int *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - EIGEN_ALIGN16 float af[4]; + float EIGEN_ALIGN16 af[4]; af[0] = from[0*stride]; af[1] = from[1*stride]; af[2] = from[2*stride]; @@ -565,88 +317,16 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa } template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { - EIGEN_ALIGN16 int ai[4]; + int EIGEN_ALIGN16 ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; ai[3] = from[3*stride]; return pload(ai); } -template<> EIGEN_DEVICE_FUNC inline Packet8s pgather(const short int* from, Index stride) -{ - EIGEN_ALIGN16 short int ai[8]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - ai[4] = from[4*stride]; - ai[5] = from[5*stride]; - ai[6] = from[6*stride]; - ai[7] = from[7*stride]; - return pload(ai); -} - -template<> EIGEN_DEVICE_FUNC inline Packet8us pgather(const unsigned short int* from, Index stride) -{ - EIGEN_ALIGN16 unsigned short int ai[8]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - ai[4] = from[4*stride]; - ai[5] = from[5*stride]; - ai[6] = from[6*stride]; - ai[7] = from[7*stride]; - return pload(ai); -} - -template<> EIGEN_DEVICE_FUNC inline Packet16c pgather(const int8_t* from, Index stride) -{ - EIGEN_ALIGN16 int8_t ai[16]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - ai[4] = from[4*stride]; - ai[5] = from[5*stride]; - ai[6] = from[6*stride]; - ai[7] = from[7*stride]; - ai[8] = from[8*stride]; - ai[9] = from[9*stride]; - ai[10] = from[10*stride]; - ai[11] = from[11*stride]; - ai[12] = from[12*stride]; - ai[13] = from[13*stride]; - ai[14] = from[14*stride]; - ai[15] = from[15*stride]; - return pload(ai); -} - -template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather(const uint8_t* from, Index stride) -{ - EIGEN_ALIGN16 uint8_t ai[16]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - ai[4] = from[4*stride]; - ai[5] = from[5*stride]; - ai[6] = from[6*stride]; - ai[7] = from[7*stride]; - ai[8] = from[8*stride]; - ai[9] = from[9*stride]; - ai[10] = from[10*stride]; - ai[11] = from[11*stride]; - ai[12] = from[12*stride]; - ai[13] = from[13*stride]; - ai[14] = from[14*stride]; - ai[15] = from[15*stride]; - return pload(ai); -} - template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - EIGEN_ALIGN16 float af[4]; + float EIGEN_ALIGN16 af[4]; pstore(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -655,7 +335,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, co } template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) { - EIGEN_ALIGN16 int ai[4]; + int EIGEN_ALIGN16 ai[4]; pstore((int *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -663,52 +343,14 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[3*stride] = ai[3]; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(short int* to, const Packet8s& from, Index stride) -{ - EIGEN_ALIGN16 short int ai[8]; - pstore((short int *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; - to[4*stride] = ai[4]; - to[5*stride] = ai[5]; - to[6*stride] = ai[6]; - to[7*stride] = ai[7]; -} +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } +template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return pset1(a) + p4i_COUNTDOWN; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(unsigned short int* to, const Packet8us& from, Index stride) -{ - EIGEN_ALIGN16 unsigned short int ai[8]; - pstore((unsigned short int *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; - to[4*stride] = ai[4]; - to[5*stride] = ai[5]; - to[6*stride] = ai[6]; - to[7*stride] = ai[7]; -} +template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return a + b; } +template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return a + b; } -template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return pset1(a) + p4f_COUNTDOWN; } -template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return pset1(a) + p4i_COUNTDOWN; } -template<> EIGEN_STRONG_INLINE Packet8s plset(const short int& a) { return pset1(a) + p8s_COUNTDOWN; } -template<> EIGEN_STRONG_INLINE Packet8us plset(const unsigned short int& a) { return pset1(a) + p8us_COUNTDOWN; } -template<> EIGEN_STRONG_INLINE Packet16c plset(const int8_t& a) { return pset1(a) + p16c_COUNTDOWN; } -template<> EIGEN_STRONG_INLINE Packet16uc plset(const uint8_t& a) { return pset1(a) + p16uc_COUNTDOWN; } - -template<> EIGEN_STRONG_INLINE Packet4f padd (const Packet4f& a, const Packet4f& b) { return a + b; } -template<> EIGEN_STRONG_INLINE Packet4i padd (const Packet4i& a, const Packet4i& b) { return a + b; } -template<> EIGEN_STRONG_INLINE Packet8s padd (const Packet8s& a, const Packet8s& b) { return a + b; } -template<> EIGEN_STRONG_INLINE Packet8us padd (const Packet8us& a, const Packet8us& b) { return a + b; } -template<> EIGEN_STRONG_INLINE Packet16c padd (const Packet16c& a, const Packet16c& b) { return a + b; } -template<> EIGEN_STRONG_INLINE Packet16uc padd(const Packet16uc& a, const Packet16uc& b) { return a + b; } - -template<> EIGEN_STRONG_INLINE Packet4f psub (const Packet4f& a, const Packet4f& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet4i psub (const Packet4i& a, const Packet4i& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet16c psub (const Packet16c& a, const Packet16c& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return a - b; } +template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return a - b; } template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } @@ -716,10 +358,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f pmul (const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } -template<> EIGEN_STRONG_INLINE Packet4i pmul (const Packet4i& a, const Packet4i& b) { return a * b; } -template<> EIGEN_STRONG_INLINE Packet16c pmul (const Packet16c& a, const Packet16c& b) { return vec_mul(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pmul(const Packet16uc& a, const Packet16uc& b) { return vec_mul(a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } +template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return a * b; } template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { @@ -751,7 +391,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { #ifdef __VSX__ - // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN Packet4f ret; __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); return ret; @@ -760,15 +399,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet8s pmin(const Packet8s& a, const Packet8s& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet8us pmin(const Packet8us& a, const Packet8us& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet16c pmin(const Packet16c& a, const Packet16c& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet16uc pmin(const Packet16uc& a, const Packet16uc& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { #ifdef __VSX__ - // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN Packet4f ret; __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); return ret; @@ -777,19 +411,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet8s pmax(const Packet8s& a, const Packet8s& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet8us pmax(const Packet8us& a, const Packet8us& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, const Packet16c& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); } - -template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmple(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmplt(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmpeq(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { - Packet4f c = reinterpret_cast(vec_cmpge(a,b)); - return vec_nor(c,c); -} -template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmpeq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } @@ -803,19 +424,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { - return vec_sel(b, a, reinterpret_cast(mask)); -} -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { - Packet4f t = vec_add(reinterpret_cast(vec_or(vec_and(reinterpret_cast(a), p4ui_SIGN), p4ui_PREV0DOT5)), a); - Packet4f res; - - __asm__("vrfiz %0, %1\n\t" - : "=v" (res) - : "v" (t)); - - return res; -} +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return vec_round(a); } template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return vec_floor(a); } @@ -842,82 +451,17 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) mask = vec_lvsl(0, from); // create the permute mask return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data } -template<> EIGEN_STRONG_INLINE Packet8s ploadu(const short int* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return static_cast(vec_perm(MSQ, LSQ, mask)); // align the data -} -template<> EIGEN_STRONG_INLINE Packet8us ploadu(const unsigned short int* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return static_cast(vec_perm(MSQ, LSQ, mask)); // align the data -} -template<> EIGEN_STRONG_INLINE Packet16c ploadu(const char* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, from); // most significant quadword - LSQ = vec_ld(15, from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return static_cast(vec_perm(MSQ, LSQ, mask)); // align the data -} - -template<> EIGEN_STRONG_INLINE Packet16uc ploadu(const unsigned char* from) -{ - EIGEN_DEBUG_ALIGNED_LOAD - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - Packet16uc MSQ, LSQ; - Packet16uc mask; - MSQ = vec_ld(0, from); // most significant quadword - LSQ = vec_ld(15, from); // least significant quadword - mask = vec_lvsl(0, from); // create the permute mask - return static_cast(vec_perm(MSQ, LSQ, mask)); // align the data -} #else -// We also need to redefine little endian loading of Packet4i/Packet4f using VSX +// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD - return vec_xl(0, from); + return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); } template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD - return vec_xl(0, from); -} -template<> EIGEN_STRONG_INLINE Packet8s ploadu(const short int* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - return vec_vsx_ld(0, from); -} -template<> EIGEN_STRONG_INLINE Packet8us ploadu(const unsigned short int* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - return vec_vsx_ld(0, from); -} -template<> EIGEN_STRONG_INLINE Packet16c ploadu(const int8_t* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - return vec_vsx_ld(0, from); -} -template<> EIGEN_STRONG_INLINE Packet16uc ploadu(const uint8_t* from) -{ - EIGEN_DEBUG_UNALIGNED_LOAD - return vec_vsx_ld(0, from); + return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); } #endif @@ -928,7 +472,6 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) else p = ploadu(from); return vec_perm(p, p, p16uc_DUPLICATE32_HI); } - template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) { Packet4i p; @@ -937,54 +480,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vec_perm(p, p, p16uc_DUPLICATE32_HI); } -template<> EIGEN_STRONG_INLINE Packet8s ploaddup(const short int* from) -{ - Packet8s p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE16_HI); -} - -template<> EIGEN_STRONG_INLINE Packet8us ploaddup(const unsigned short int* from) -{ - Packet8us p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE16_HI); -} - -template<> EIGEN_STRONG_INLINE Packet8s ploadquad(const short int* from) -{ - Packet8s p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI); -} - -template<> EIGEN_STRONG_INLINE Packet8us ploadquad(const unsigned short int* from) -{ - Packet8us p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI); -} - -template<> EIGEN_STRONG_INLINE Packet16c ploaddup(const int8_t* from) -{ - Packet16c p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE8_HI); -} - -template<> EIGEN_STRONG_INLINE Packet16uc ploaddup(const uint8_t* from) -{ - Packet16uc p; - if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); - return vec_perm(p, p, p16uc_DUPLICATE8_HI); -} - #ifdef _BIG_ENDIAN template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { @@ -1022,151 +517,25 @@ template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& f vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part } -template<> EIGEN_STRONG_INLINE void pstoreu(short int* to, const Packet8s& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges = vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part -} -template<> EIGEN_STRONG_INLINE void pstoreu(unsigned short int* to, const Packet8us& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword - LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges = vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first - vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part -} - -template<> EIGEN_STRONG_INLINE void pstoreu(char* to, const Packet16c& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, to); // most significant quadword - LSQ = vec_ld(15,to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, to ); // Store the LSQ part first - vec_st( MSQ, 0, to ); // Store the MSQ part -} -template<> EIGEN_STRONG_INLINE void pstoreu(unsigned char* to, const Packet16uc& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html - // Warning: not thread safe! - Packet16uc MSQ, LSQ, edges; - Packet16uc edgeAlign, align; - - MSQ = vec_ld(0, to); // most significant quadword - LSQ = vec_ld(15,to); // least significant quadword - edgeAlign = vec_lvsl(0, to); // permute map to extract edges - edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges - align = vec_lvsr( 0, to ); // permute map to misalign data - MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) - LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) - vec_st( LSQ, 15, to ); // Store the LSQ part first - vec_st( MSQ, 0, to ); // Store the MSQ part -} #else -// We also need to redefine little endian loading of Packet4i/Packet4f using VSX +// We also need ot redefine little endian loading of Packet4i/Packet4f using VSX template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { - EIGEN_DEBUG_UNALIGNED_STORE - vec_xst(from, 0, to); + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { - EIGEN_DEBUG_UNALIGNED_STORE - vec_xst(from, 0, to); -} -template<> EIGEN_STRONG_INLINE void pstoreu(short int* to, const Packet8s& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - /*GCC provides a commonly used synonym for vec_xst called vec_vsx_st. - * Although these have the same behavior, - * only vec_xst is guaranteed to be portable across compliant compilers - * vec_xst should be preferred. */ - vec_xst(from, 0, to); -} -template<> EIGEN_STRONG_INLINE void pstoreu(unsigned short int* to, const Packet8us& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - /*GCC provides a commonly used synonym for vec_xst called vec_vsx_st. - * Although these have the same behavior, - * only vec_xst is guaranteed to be portable across compliant compilers - * vec_xst should be preferred. */ - vec_xst(from, 0, to); -} -template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet16c& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - vec_vsx_st(from, 0, to); -} -template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet16uc& from) -{ - EIGEN_DEBUG_UNALIGNED_STORE - vec_vsx_st(from, 0, to); + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); } #endif template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_PPC_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x; vec_ste(a, 0, &x); return x; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { EIGEN_ALIGN16 int x; vec_ste(a, 0, &x); return x; } - -template<> EIGEN_STRONG_INLINE short int pfirst(const Packet8s& a) { - EIGEN_ALIGN16 short int x; - vec_ste(a, 0, &x); - return x; -} - -template<> EIGEN_STRONG_INLINE unsigned short int pfirst(const Packet8us& a) { - EIGEN_ALIGN16 unsigned short int x; - vec_ste(a, 0, &x); - return x; -} - -template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet16c& a) -{ - EIGEN_ALIGN16 int8_t x; - vec_ste(a, 0, &x); - return x; -} -template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet16uc& a) -{ - EIGEN_ALIGN16 uint8_t x; - vec_ste(a, 0, &x); - return x; -} +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -1174,46 +543,10 @@ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); -} -template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) -{ - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE16)); -} -template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) -{ - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE16)); -} -template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) -{ - return vec_perm(a, a, p16uc_REVERSE8); -} -template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) -{ - return vec_perm(a, a, p16uc_REVERSE8); -} + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); } template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; } - -template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) -{ return vec_sra(a,reinterpret_cast(pset1(N))); } -template EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) -{ return vec_sr(a,reinterpret_cast(pset1(N))); } -template EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) -{ return vec_sl(a,reinterpret_cast(pset1(N))); } - -template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { - return pfrexp_float(a,exponent); -} - -template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { - return pldexp_float(a,exponent); -} template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { @@ -1225,6 +558,34 @@ template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) return pfirst(sum); } +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + Packet4f v[4], sum[4]; + + // It's easier and faster to transpose then add as columns + // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation + // Do the transpose, first set of moves + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + // Get the resulting vectors + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); + + // Now do the summation: + // Lines 0+1 + sum[0] = sum[0] + sum[1]; + // Lines 2+3 + sum[1] = sum[2] + sum[3]; + // Add the results + sum[0] = sum[0] + sum[1]; + + return sum[0]; +} + template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i sum; @@ -1237,85 +598,34 @@ template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) return pfirst(sum); } -template<> EIGEN_STRONG_INLINE short int predux(const Packet8s& a) +template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { - union{ - Packet8s v; - short int n[8]; - } vt; - vt.v = a; + Packet4i v[4], sum[4]; - EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; - EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; - Packet4i first_half = pload(first_loader); - Packet4i second_half = pload(second_loader); + // It's easier and faster to transpose then add as columns + // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation + // Do the transpose, first set of moves + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + // Get the resulting vectors + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); - return static_cast(predux(first_half) + predux(second_half)); + // Now do the summation: + // Lines 0+1 + sum[0] = sum[0] + sum[1]; + // Lines 2+3 + sum[1] = sum[2] + sum[3]; + // Add the results + sum[0] = sum[0] + sum[1]; + + return sum[0]; } -template<> EIGEN_STRONG_INLINE unsigned short int predux(const Packet8us& a) -{ - union{ - Packet8us v; - unsigned short int n[8]; - } vt; - vt.v = a; - - //There is no predux for Packet4ui. So we are intentionally using int - EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; - EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; - Packet4i first_half = pload(first_loader); - Packet4i second_half = pload(second_loader); - - return static_cast(predux(first_half) + predux(second_half)); -} - -template<> EIGEN_STRONG_INLINE int8_t predux(const Packet16c& a) -{ - union{ - Packet16c v; - int8_t n[16]; - } vt; - vt.v = a; - - EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; - EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; - EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] }; - EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] }; - - Packet4i first_quarter = pload(first_loader); - Packet4i second_quarter = pload(second_loader); - Packet4i third_quarter = pload(third_loader); - Packet4i fourth_quarter = pload(fourth_loader); - - return static_cast(predux(first_quarter) + predux(second_quarter) - + predux(third_quarter) + predux(fourth_quarter)); -} - -template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet16uc& a) -{ - union{ - Packet16uc v; - uint8_t n[16]; - } vt; - vt.v = a; - - EIGEN_ALIGN16 int first_loader[4] = { vt.n[0], vt.n[1], vt.n[2], vt.n[3] }; - EIGEN_ALIGN16 int second_loader[4] = { vt.n[4], vt.n[5], vt.n[6], vt.n[7] }; - EIGEN_ALIGN16 int third_loader[4] = { vt.n[8], vt.n[9], vt.n[10], vt.n[11] }; - EIGEN_ALIGN16 int fourth_loader[4] = { vt.n[12], vt.n[13], vt.n[14], vt.n[15] }; - - Packet4i first_quarter = pload(first_loader); - Packet4i second_quarter = pload(second_loader); - Packet4i third_quarter = pload(third_loader); - Packet4i fourth_quarter = pload(fourth_loader); - - - return static_cast(predux(first_quarter) + predux(second_quarter) - + predux(third_quarter) + predux(fourth_quarter)); -} - - // Other reduction functions: // mul template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) @@ -1332,52 +642,6 @@ template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) return aux[0] * aux[1] * aux[2] * aux[3]; } -template<> EIGEN_STRONG_INLINE short int predux_mul(const Packet8s& a) -{ - Packet8s pair, quad, octo; - - pair = vec_mul(a, vec_sld(a, a, 8)); - quad = vec_mul(pair, vec_sld(pair, pair, 4)); - octo = vec_mul(quad, vec_sld(quad, quad, 2)); - - return pfirst(octo); -} - -template<> EIGEN_STRONG_INLINE unsigned short int predux_mul(const Packet8us& a) -{ - Packet8us pair, quad, octo; - - pair = vec_mul(a, vec_sld(a, a, 8)); - quad = vec_mul(pair, vec_sld(pair, pair, 4)); - octo = vec_mul(quad, vec_sld(quad, quad, 2)); - - return pfirst(octo); -} - -template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet16c& a) -{ - Packet16c pair, quad, octo, result; - - pair = vec_mul(a, vec_sld(a, a, 8)); - quad = vec_mul(pair, vec_sld(pair, pair, 4)); - octo = vec_mul(quad, vec_sld(quad, quad, 2)); - result = vec_mul(octo, vec_sld(octo, octo, 1)); - - return pfirst(result); -} - -template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet16uc& a) -{ - Packet16uc pair, quad, octo, result; - - pair = vec_mul(a, vec_sld(a, a, 8)); - quad = vec_mul(pair, vec_sld(pair, pair, 4)); - octo = vec_mul(quad, vec_sld(quad, quad, 2)); - result = vec_mul(octo, vec_sld(octo, octo, 1)); - - return pfirst(result); -} - // min template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { @@ -1395,59 +659,6 @@ template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) return pfirst(res); } -template<> EIGEN_STRONG_INLINE short int predux_min(const Packet8s& a) -{ - Packet8s pair, quad, octo; - - //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) } - pair = vec_min(a, vec_sld(a, a, 8)); - - //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) } - quad = vec_min(pair, vec_sld(pair, pair, 4)); - - //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) } - octo = vec_min(quad, vec_sld(quad, quad, 2)); - return pfirst(octo); -} - -template<> EIGEN_STRONG_INLINE unsigned short int predux_min(const Packet8us& a) -{ - Packet8us pair, quad, octo; - - //pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) } - pair = vec_min(a, vec_sld(a, a, 8)); - - //quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) } - quad = vec_min(pair, vec_sld(pair, pair, 4)); - - //octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) } - octo = vec_min(quad, vec_sld(quad, quad, 2)); - return pfirst(octo); -} - -template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet16c& a) -{ - Packet16c pair, quad, octo, result; - - pair = vec_min(a, vec_sld(a, a, 8)); - quad = vec_min(pair, vec_sld(pair, pair, 4)); - octo = vec_min(quad, vec_sld(quad, quad, 2)); - result = vec_min(octo, vec_sld(octo, octo, 1)); - - return pfirst(result); -} - -template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet16uc& a) -{ - Packet16uc pair, quad, octo, result; - - pair = vec_min(a, vec_sld(a, a, 8)); - quad = vec_min(pair, vec_sld(pair, pair, 4)); - octo = vec_min(quad, vec_sld(quad, quad, 2)); - result = vec_min(octo, vec_sld(octo, octo, 1)); - - return pfirst(result); -} // max template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { @@ -1465,64 +676,59 @@ template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) return pfirst(res); } -template<> EIGEN_STRONG_INLINE short int predux_max(const Packet8s& a) +template +struct palign_impl { - Packet8s pair, quad, octo; - - //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) } - pair = vec_max(a, vec_sld(a, a, 8)); + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { +#ifdef _BIG_ENDIAN + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } +#else + switch (Offset % 4) { + case 1: + first = vec_sld(second, first, 12); break; + case 2: + first = vec_sld(second, first, 8); break; + case 3: + first = vec_sld(second, first, 4); break; + } +#endif + } +}; - //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) } - quad = vec_max(pair, vec_sld(pair, pair, 4)); - - //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) } - octo = vec_max(quad, vec_sld(quad, quad, 2)); - return pfirst(octo); -} - -template<> EIGEN_STRONG_INLINE unsigned short int predux_max(const Packet8us& a) +template +struct palign_impl { - Packet8us pair, quad, octo; - - //pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) } - pair = vec_max(a, vec_sld(a, a, 8)); - - //quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) } - quad = vec_max(pair, vec_sld(pair, pair, 4)); - - //octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) } - octo = vec_max(quad, vec_sld(quad, quad, 2)); - return pfirst(octo); -} - -template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet16c& a) -{ - Packet16c pair, quad, octo, result; - - pair = vec_max(a, vec_sld(a, a, 8)); - quad = vec_max(pair, vec_sld(pair, pair, 4)); - octo = vec_max(quad, vec_sld(quad, quad, 2)); - result = vec_max(octo, vec_sld(octo, octo, 1)); - - return pfirst(result); -} - -template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet16uc& a) -{ - Packet16uc pair, quad, octo, result; - - pair = vec_max(a, vec_sld(a, a, 8)); - quad = vec_max(pair, vec_sld(pair, pair, 4)); - octo = vec_max(quad, vec_sld(quad, quad, 2)); - result = vec_max(octo, vec_sld(octo, octo, 1)); - - return pfirst(result); -} - -template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) -{ - return vec_any_ne(x, pzero(x)); -} + static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) + { +#ifdef _BIG_ENDIAN + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } +#else + switch (Offset % 4) { + case 1: + first = vec_sld(second, first, 12); break; + case 2: + first = vec_sld(second, first, 8); break; + case 3: + first = vec_sld(second, first, 4); break; + } +#endif + } +}; EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { @@ -1550,267 +756,6 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = vec_mergel(t1, t3); } -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet8s t0, t1, t2, t3; - t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); - t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); - t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); - t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); - kernel.packet[0] = vec_mergeh(t0, t2); - kernel.packet[1] = vec_mergel(t0, t2); - kernel.packet[2] = vec_mergeh(t1, t3); - kernel.packet[3] = vec_mergel(t1, t3); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet8us t0, t1, t2, t3; - t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); - t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); - t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); - t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); - kernel.packet[0] = vec_mergeh(t0, t2); - kernel.packet[1] = vec_mergel(t0, t2); - kernel.packet[2] = vec_mergeh(t1, t3); - kernel.packet[3] = vec_mergel(t1, t3); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet16c t0, t1, t2, t3; - t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); - t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); - t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); - t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); - kernel.packet[0] = vec_mergeh(t0, t2); - kernel.packet[1] = vec_mergel(t0, t2); - kernel.packet[2] = vec_mergeh(t1, t3); - kernel.packet[3] = vec_mergel(t1, t3); -} - - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet16uc t0, t1, t2, t3; - t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); - t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); - t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); - t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); - kernel.packet[0] = vec_mergeh(t0, t2); - kernel.packet[1] = vec_mergel(t0, t2); - kernel.packet[2] = vec_mergeh(t1, t3); - kernel.packet[3] = vec_mergel(t1, t3); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet8s v[8], sum[8]; - - v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]); - v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]); - v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]); - v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]); - v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]); - v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]); - v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]); - v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]); - sum[0] = vec_mergeh(v[0], v[4]); - sum[1] = vec_mergel(v[0], v[4]); - sum[2] = vec_mergeh(v[1], v[5]); - sum[3] = vec_mergel(v[1], v[5]); - sum[4] = vec_mergeh(v[2], v[6]); - sum[5] = vec_mergel(v[2], v[6]); - sum[6] = vec_mergeh(v[3], v[7]); - sum[7] = vec_mergel(v[3], v[7]); - - kernel.packet[0] = vec_mergeh(sum[0], sum[4]); - kernel.packet[1] = vec_mergel(sum[0], sum[4]); - kernel.packet[2] = vec_mergeh(sum[1], sum[5]); - kernel.packet[3] = vec_mergel(sum[1], sum[5]); - kernel.packet[4] = vec_mergeh(sum[2], sum[6]); - kernel.packet[5] = vec_mergel(sum[2], sum[6]); - kernel.packet[6] = vec_mergeh(sum[3], sum[7]); - kernel.packet[7] = vec_mergel(sum[3], sum[7]); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet8us v[8], sum[8]; - - v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]); - v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]); - v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]); - v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]); - v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]); - v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]); - v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]); - v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]); - sum[0] = vec_mergeh(v[0], v[4]); - sum[1] = vec_mergel(v[0], v[4]); - sum[2] = vec_mergeh(v[1], v[5]); - sum[3] = vec_mergel(v[1], v[5]); - sum[4] = vec_mergeh(v[2], v[6]); - sum[5] = vec_mergel(v[2], v[6]); - sum[6] = vec_mergeh(v[3], v[7]); - sum[7] = vec_mergel(v[3], v[7]); - - kernel.packet[0] = vec_mergeh(sum[0], sum[4]); - kernel.packet[1] = vec_mergel(sum[0], sum[4]); - kernel.packet[2] = vec_mergeh(sum[1], sum[5]); - kernel.packet[3] = vec_mergel(sum[1], sum[5]); - kernel.packet[4] = vec_mergeh(sum[2], sum[6]); - kernel.packet[5] = vec_mergel(sum[2], sum[6]); - kernel.packet[6] = vec_mergeh(sum[3], sum[7]); - kernel.packet[7] = vec_mergel(sum[3], sum[7]); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet16c step1[16], step2[16], step3[16]; - - step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]); - step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]); - step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]); - step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]); - step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]); - step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]); - step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]); - step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]); - step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]); - step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]); - step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]); - step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]); - step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]); - step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]); - step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]); - step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]); - - step2[0] = vec_mergeh(step1[0], step1[8]); - step2[1] = vec_mergel(step1[0], step1[8]); - step2[2] = vec_mergeh(step1[1], step1[9]); - step2[3] = vec_mergel(step1[1], step1[9]); - step2[4] = vec_mergeh(step1[2], step1[10]); - step2[5] = vec_mergel(step1[2], step1[10]); - step2[6] = vec_mergeh(step1[3], step1[11]); - step2[7] = vec_mergel(step1[3], step1[11]); - step2[8] = vec_mergeh(step1[4], step1[12]); - step2[9] = vec_mergel(step1[4], step1[12]); - step2[10] = vec_mergeh(step1[5], step1[13]); - step2[11] = vec_mergel(step1[5], step1[13]); - step2[12] = vec_mergeh(step1[6], step1[14]); - step2[13] = vec_mergel(step1[6], step1[14]); - step2[14] = vec_mergeh(step1[7], step1[15]); - step2[15] = vec_mergel(step1[7], step1[15]); - - step3[0] = vec_mergeh(step2[0], step2[8]); - step3[1] = vec_mergel(step2[0], step2[8]); - step3[2] = vec_mergeh(step2[1], step2[9]); - step3[3] = vec_mergel(step2[1], step2[9]); - step3[4] = vec_mergeh(step2[2], step2[10]); - step3[5] = vec_mergel(step2[2], step2[10]); - step3[6] = vec_mergeh(step2[3], step2[11]); - step3[7] = vec_mergel(step2[3], step2[11]); - step3[8] = vec_mergeh(step2[4], step2[12]); - step3[9] = vec_mergel(step2[4], step2[12]); - step3[10] = vec_mergeh(step2[5], step2[13]); - step3[11] = vec_mergel(step2[5], step2[13]); - step3[12] = vec_mergeh(step2[6], step2[14]); - step3[13] = vec_mergel(step2[6], step2[14]); - step3[14] = vec_mergeh(step2[7], step2[15]); - step3[15] = vec_mergel(step2[7], step2[15]); - - kernel.packet[0] = vec_mergeh(step3[0], step3[8]); - kernel.packet[1] = vec_mergel(step3[0], step3[8]); - kernel.packet[2] = vec_mergeh(step3[1], step3[9]); - kernel.packet[3] = vec_mergel(step3[1], step3[9]); - kernel.packet[4] = vec_mergeh(step3[2], step3[10]); - kernel.packet[5] = vec_mergel(step3[2], step3[10]); - kernel.packet[6] = vec_mergeh(step3[3], step3[11]); - kernel.packet[7] = vec_mergel(step3[3], step3[11]); - kernel.packet[8] = vec_mergeh(step3[4], step3[12]); - kernel.packet[9] = vec_mergel(step3[4], step3[12]); - kernel.packet[10] = vec_mergeh(step3[5], step3[13]); - kernel.packet[11] = vec_mergel(step3[5], step3[13]); - kernel.packet[12] = vec_mergeh(step3[6], step3[14]); - kernel.packet[13] = vec_mergel(step3[6], step3[14]); - kernel.packet[14] = vec_mergeh(step3[7], step3[15]); - kernel.packet[15] = vec_mergel(step3[7], step3[15]); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet16uc step1[16], step2[16], step3[16]; - - step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]); - step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]); - step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]); - step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]); - step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]); - step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]); - step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]); - step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]); - step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]); - step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]); - step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]); - step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]); - step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]); - step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]); - step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]); - step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]); - - step2[0] = vec_mergeh(step1[0], step1[8]); - step2[1] = vec_mergel(step1[0], step1[8]); - step2[2] = vec_mergeh(step1[1], step1[9]); - step2[3] = vec_mergel(step1[1], step1[9]); - step2[4] = vec_mergeh(step1[2], step1[10]); - step2[5] = vec_mergel(step1[2], step1[10]); - step2[6] = vec_mergeh(step1[3], step1[11]); - step2[7] = vec_mergel(step1[3], step1[11]); - step2[8] = vec_mergeh(step1[4], step1[12]); - step2[9] = vec_mergel(step1[4], step1[12]); - step2[10] = vec_mergeh(step1[5], step1[13]); - step2[11] = vec_mergel(step1[5], step1[13]); - step2[12] = vec_mergeh(step1[6], step1[14]); - step2[13] = vec_mergel(step1[6], step1[14]); - step2[14] = vec_mergeh(step1[7], step1[15]); - step2[15] = vec_mergel(step1[7], step1[15]); - - step3[0] = vec_mergeh(step2[0], step2[8]); - step3[1] = vec_mergel(step2[0], step2[8]); - step3[2] = vec_mergeh(step2[1], step2[9]); - step3[3] = vec_mergel(step2[1], step2[9]); - step3[4] = vec_mergeh(step2[2], step2[10]); - step3[5] = vec_mergel(step2[2], step2[10]); - step3[6] = vec_mergeh(step2[3], step2[11]); - step3[7] = vec_mergel(step2[3], step2[11]); - step3[8] = vec_mergeh(step2[4], step2[12]); - step3[9] = vec_mergel(step2[4], step2[12]); - step3[10] = vec_mergeh(step2[5], step2[13]); - step3[11] = vec_mergel(step2[5], step2[13]); - step3[12] = vec_mergeh(step2[6], step2[14]); - step3[13] = vec_mergel(step2[6], step2[14]); - step3[14] = vec_mergeh(step2[7], step2[15]); - step3[15] = vec_mergel(step2[7], step2[15]); - - kernel.packet[0] = vec_mergeh(step3[0], step3[8]); - kernel.packet[1] = vec_mergel(step3[0], step3[8]); - kernel.packet[2] = vec_mergeh(step3[1], step3[9]); - kernel.packet[3] = vec_mergel(step3[1], step3[9]); - kernel.packet[4] = vec_mergeh(step3[2], step3[10]); - kernel.packet[5] = vec_mergel(step3[2], step3[10]); - kernel.packet[6] = vec_mergeh(step3[3], step3[11]); - kernel.packet[7] = vec_mergel(step3[3], step3[11]); - kernel.packet[8] = vec_mergeh(step3[4], step3[12]); - kernel.packet[9] = vec_mergel(step3[4], step3[12]); - kernel.packet[10] = vec_mergeh(step3[5], step3[13]); - kernel.packet[11] = vec_mergel(step3[5], step3[13]); - kernel.packet[12] = vec_mergeh(step3[6], step3[14]); - kernel.packet[13] = vec_mergel(step3[6], step3[14]); - kernel.packet[14] = vec_mergeh(step3[7], step3[15]); - kernel.packet[15] = vec_mergel(step3[7], step3[15]); -} - template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; Packet4ui mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p4i_ONE))); @@ -1823,77 +768,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons return vec_sel(elsePacket, thenPacket, mask); } -template<> EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket, const Packet8s& elsePacket) { - Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], - ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; - Packet8us mask = reinterpret_cast(vec_cmpeq(select, p8us_ONE)); - Packet8s result = vec_sel(elsePacket, thenPacket, mask); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket, const Packet8us& elsePacket) { - Packet8us select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], - ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7] }; - Packet8us mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p8us_ONE)); - return vec_sel(elsePacket, thenPacket, mask); -} - -template<> EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket, const Packet16c& elsePacket) { - Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], - ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7], - ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], - ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; - - Packet16uc mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p16uc_ONE)); - return vec_sel(elsePacket, thenPacket, mask); -} - -template<> EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket, const Packet16uc& elsePacket) { - Packet16uc select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3], - ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7], - ifPacket.select[8], ifPacket.select[9], ifPacket.select[10], ifPacket.select[11], - ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15] }; - - Packet16uc mask = reinterpret_cast(vec_cmpeq(reinterpret_cast(select), p16uc_ONE)); - return vec_sel(elsePacket, thenPacket, mask); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - - -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { - return vec_cts(a,0); -} - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { - return vec_ctf(a,0); -} - -template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { - return reinterpret_cast(a); -} - -template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { - return reinterpret_cast(a); -} - - //---------- double ---------- #ifdef __VSX__ @@ -1961,7 +835,7 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) { @@ -1989,13 +863,21 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD - return vec_xl(0, const_cast(from)); // cast needed by Clang +#ifdef __VSX__ + return vec_vsx_ld(0, from); +#else + return vec_ld(0, from); +#endif } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE - vec_xst(from, 0, to); +#ifdef __VSX__ + vec_vsx_st(from, 0, to); +#else + vec_st(from, 0, to); +#endif } template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { @@ -2017,14 +899,14 @@ pbroadcast4(const double *a, template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { - EIGEN_ALIGN16 double af[2]; + double EIGEN_ALIGN16 af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - EIGEN_ALIGN16 double af[2]; + double EIGEN_ALIGN16 af[2]; pstore(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -2048,7 +930,6 @@ template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { - // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN Packet2d ret; __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); return ret; @@ -2056,20 +937,11 @@ template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { - // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN Packet2d ret; __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); return ret; } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmple(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmplt(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return reinterpret_cast(vec_cmpeq(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { - Packet2d c = reinterpret_cast(vec_cmpge(a,b)); - return vec_nor(c,c); -} - template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } @@ -2084,8 +956,8 @@ template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { re template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return vec_xl(0, from); + EIGEN_DEBUG_ALIGNED_LOAD + return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); } template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) @@ -2098,13 +970,13 @@ template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { - EIGEN_DEBUG_UNALIGNED_STORE - vec_xst(from, 0, to); + EIGEN_DEBUG_ALIGNED_STORE + vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_PPC_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { @@ -2112,59 +984,6 @@ template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } -// VSX support varies between different compilers and even different -// versions of the same compiler. For gcc version >= 4.9.3, we can use -// vec_cts to efficiently convert Packet2d to Packet2l. Otherwise, use -// a slow version that works with older compilers. -// Update: apparently vec_cts/vec_ctf intrinsics for 64-bit doubles -// are buggy, https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70963 -static inline Packet2l ConvertToPacket2l(const Packet2d& x) { -#if EIGEN_GNUC_AT_LEAST(5, 4) || \ - (EIGEN_GNUC_AT(6, 1) && __GNUC_PATCHLEVEL__ >= 1) - return vec_cts(x, 0); // TODO: check clang version. -#else - double tmp[2]; - memcpy(tmp, &x, sizeof(tmp)); - Packet2l l = { static_cast(tmp[0]), - static_cast(tmp[1]) }; - return l; -#endif -} - -template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { - - // build 2^n - Packet2l emm0 = ConvertToPacket2l(exponent); - -#ifdef __POWER8_VECTOR__ - const Packet2l p2l_1023 = { 1023, 1023 }; - const Packet2ul p2ul_52 = { 52, 52 }; - emm0 = vec_add(emm0, p2l_1023); - emm0 = vec_sl(emm0, p2ul_52); -#else - // Code is a bit complex for POWER7. There is actually a - // vec_xxsldi intrinsic but it is not supported by some gcc versions. - // So we shift (52-32) bits and do a word swap with zeros. - const Packet4i p4i_1023 = pset1(1023); - const Packet4i p4i_20 = pset1(20); // 52 - 32 - - Packet4i emm04i = reinterpret_cast(emm0); - emm04i = vec_add(emm04i, p4i_1023); - emm04i = vec_sl(emm04i, reinterpret_cast(p4i_20)); - static const Packet16uc perm = { - 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03, - 0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b }; -#ifdef _BIG_ENDIAN - emm0 = reinterpret_cast(vec_perm(p4i_ZERO, emm04i, perm)); -#else - emm0 = reinterpret_cast(vec_perm(emm04i, p4i_ZERO, perm)); -#endif - -#endif - - return pmul(a, reinterpret_cast(emm0)); -} - template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { Packet2d b, sum; @@ -2173,6 +992,20 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) return pfirst(sum); } +template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) +{ + Packet2d v[2], sum; + v[0] = vecs[0] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8)); + v[1] = vecs[1] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8)); + +#ifdef _BIG_ENDIAN + sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); +#else + sum = reinterpret_cast(vec_sld(reinterpret_cast(v[1]), reinterpret_cast(v[0]), 8)); +#endif + + return sum; +} // Other reduction functions: // mul template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) @@ -2192,6 +1025,20 @@ template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); } +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset == 1) +#ifdef _BIG_ENDIAN + first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); +#else + first = reinterpret_cast(vec_sld(reinterpret_cast(second), reinterpret_cast(first), 8)); +#endif + } +}; + EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { Packet2d t0, t1; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Complex.h index 57d1201f4..9c2536509 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Complex.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Complex.h @@ -16,7 +16,7 @@ namespace Eigen { namespace internal { -#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) +#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) // Many std::complex methods such as operator+, operator-, operator* and // operator/ are not constexpr. Due to this, clang does not treat them as device @@ -55,7 +55,7 @@ template struct scalar_difference_op, std::complex struct scalar_product_op, const std::complex > : binary_op_base, const std::complex > { enum { - Vectorizable = packet_traits >::HasMul + Vectorizable = packet_traits>::HasMul }; typedef typename std::complex result_type; @@ -76,7 +76,7 @@ template struct scalar_product_op, std::complex > // Quotient template struct scalar_quotient_op, const std::complex > : binary_op_base, const std::complex > { enum { - Vectorizable = packet_traits >::HasDiv + Vectorizable = packet_traits>::HasDiv }; typedef typename std::complex result_type; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Half.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Half.h similarity index 79% rename from uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Half.h rename to uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Half.h index cfd0bdc06..59717b4fe 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Half.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/Half.h @@ -26,15 +26,15 @@ // Standard 16-bit float type, mostly useful for GPUs. Defines a new -// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with +// type Eigen::half (inheriting from CUDA's __half struct) with // operator overloads such that it behaves basically as an arithmetic // type. It will be quite slow on CPUs (so it is recommended to stay -// in fp32 for CPUs, except for simple parameter conversions, I/O +// in float32_bits for CPUs, except for simple parameter conversions, I/O // to disk and the likes), but fast on GPUs. -#ifndef EIGEN_HALF_H -#define EIGEN_HALF_H +#ifndef EIGEN_HALF_CUDA_H +#define EIGEN_HALF_CUDA_H #if __cplusplus > 199711L #define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() @@ -50,25 +50,16 @@ struct half; namespace half_impl { -#if !defined(EIGEN_HAS_GPU_FP16) +#if !defined(EIGEN_HAS_CUDA_FP16) // Make our own __half_raw definition that is similar to CUDA's. struct __half_raw { EIGEN_DEVICE_FUNC __half_raw() : x(0) {} explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} unsigned short x; }; -#elif defined(EIGEN_HAS_HIP_FP16) - // Nothing to do here - // HIP fp16 header file has a definition for __half_raw -#elif defined(EIGEN_HAS_CUDA_FP16) - #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 +#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 // In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw - typedef __half __half_raw; - #endif // defined(EIGEN_HAS_CUDA_FP16) - -#elif defined(SYCL_DEVICE_ONLY) -typedef cl::sycl::half __half_raw; - +typedef __half __half_raw; #endif EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); @@ -77,16 +68,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} - -#if defined(EIGEN_HAS_GPU_FP16) - #if defined(EIGEN_HAS_HIP_FP16) - EIGEN_DEVICE_FUNC half_base(const __half& h) { x = __half_as_ushort(h); } - #elif defined(EIGEN_HAS_CUDA_FP16) - #if (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000) +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} - #endif - #endif #endif }; @@ -94,38 +79,18 @@ struct half_base : public __half_raw { // Class definition. struct half : public half_impl::half_base { - - // Writing this out as separate #if-else blocks to make the code easier to follow - // The same applies to most #if-else blocks in this file -#if !defined(EIGEN_HAS_GPU_FP16) - typedef half_impl::__half_raw __half_raw; -#elif defined(EIGEN_HAS_HIP_FP16) - // Nothing to do here - // HIP fp16 header file has a definition for __half_raw -#elif defined(EIGEN_HAS_CUDA_FP16) - // Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so - // (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP! So keeping this within - // #if defined(EIGEN_HAS_CUDA_FP16) is needed - #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000 + #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) typedef half_impl::__half_raw __half_raw; #endif -#endif EIGEN_DEVICE_FUNC half() {} EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} - -#if defined(EIGEN_HAS_GPU_FP16) - #if defined(EIGEN_HAS_HIP_FP16) + EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} - #elif defined(EIGEN_HAS_CUDA_FP16) - #if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} - #endif - #endif #endif - explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} template @@ -174,6 +139,11 @@ struct half : public half_impl::half_base { EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { return static_cast(half_impl::half_to_float(*this)); } + + EIGEN_DEVICE_FUNC half& operator=(const half& other) { + x = other.x; + return *this; + } }; } // end namespace Eigen @@ -232,24 +202,15 @@ namespace Eigen { namespace half_impl { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ - EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE)) -#define EIGEN_HAS_NATIVE_FP16 -#endif +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 // Intrinsics for native fp16 support. Note that on current hardware, -// these are no faster than fp32 arithmetic (you need to use the half2 +// these are no faster than float32_bits arithmetic (you need to use the half2 // versions to get the ALU speed increased), but you do save the // conversion steps back and forth. -#if defined(EIGEN_HAS_NATIVE_FP16) EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { -#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 - return __hadd(::__half(a), ::__half(b)); -#else return __hadd(a, b); -#endif } EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { return __hmul(a, b); @@ -258,13 +219,9 @@ EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { return __hsub(a, b); } EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { -#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000 - return __hdiv(a, b); -#else float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); -#endif } EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { return __hneg(a); @@ -304,26 +261,10 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { return __hge(a, b); } -#endif +#else // Emulate support for half floats -// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, -// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation -// of the functions, while the latter can only deal with one of them. -#if !defined(EIGEN_HAS_NATIVE_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) // Emulate support for half floats - -#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC) -// We need to provide emulated *host-side* FP16 operators for clang. -#pragma push_macro("EIGEN_DEVICE_FUNC") -#undef EIGEN_DEVICE_FUNC -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_FP16) -#define EIGEN_DEVICE_FUNC __host__ -#else // both host and device need emulated ops. -#define EIGEN_DEVICE_FUNC __host__ __device__ -#endif -#endif - -// Definitions for CPUs and older HIP+CUDA, mostly working through conversion -// to/from fp32. +// Definitions for CPUs and older CUDA, mostly working through conversion +// to/from float32_bits. EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return half(float(a) + float(b)); @@ -377,9 +318,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator >= (const half& a, const hal return float(a) >= float(b); } -#if defined(__clang__) && defined(__CUDA__) -#pragma pop_macro("EIGEN_DEVICE_FUNC") -#endif #endif // Emulate support for half floats // Division by an index. Do it in full float precision to avoid accuracy @@ -405,8 +343,7 @@ union float32_bits { }; EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 __half tmp_ff = __float2half(ff); return *(__half_raw*)&tmp_ff; @@ -462,8 +399,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) @@ -497,8 +433,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { return (a.x & 0x7fff) == 0x7c00; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hisnan(a); #else return (a.x & 0x7fff) > 0x7c00; @@ -514,19 +449,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 return half(hexp(a)); #else return half(::expf(float(a))); #endif } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { - return half(numext::expm1(float(a))); -} EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return half(::hlog(a)); #else return half(::logf(float(a))); @@ -539,8 +469,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 return half(hsqrt(a)); #else return half(::sqrtf(float(a))); @@ -562,16 +491,14 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 return half(hfloor(a)); #else return half(::floorf(float(a))); #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 return half(hceil(a)); #else return half(::ceilf(float(a))); @@ -579,8 +506,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hlt(b, a) ? b : a; #else const float f1 = static_cast(a); @@ -589,8 +515,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hlt(a, b) ? b : a; #else const float f1 = static_cast(a); @@ -599,12 +524,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { #endif } -#ifndef EIGEN_NO_IO EIGEN_ALWAYS_INLINE std::ostream& operator << (std::ostream& os, const half& v) { os << static_cast(v); return os; } -#endif } // end namespace half_impl @@ -670,8 +593,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) +#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return Eigen::half(::hlog(a)); #else return Eigen::half(::logf(float(a))); @@ -705,12 +627,9 @@ struct hash { // Add the missing shfl_xor intrinsic -#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - defined(EIGEN_HIPCC) - +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { - #if (EIGEN_CUDA_SDK_VER < 90000) || \ - defined(EIGEN_HAS_HIP_FP16) + #if EIGEN_CUDACC_VER < 90000 return static_cast(__shfl_xor(static_cast(var), laneMask, width)); #else return static_cast(__shfl_xor_sync(0xFFFFFFFF, static_cast(var), laneMask, width)); @@ -719,8 +638,7 @@ __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneM #endif // ldg() has an overload for __half_raw, but we also need one for Eigen::half. -#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350) || \ - defined(EIGEN_HIPCC) +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( __ldg(reinterpret_cast(ptr))); @@ -728,7 +646,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) #endif -#if defined(EIGEN_GPU_COMPILE_PHASE) +#if defined(EIGEN_CUDA_ARCH) namespace Eigen { namespace numext { @@ -754,4 +672,4 @@ bool (isfinite)(const Eigen::half& h) { } // namespace numext #endif -#endif // EIGEN_HALF_H +#endif // EIGEN_HALF_CUDA_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h similarity index 82% rename from uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/MathFunctions.h rename to uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h index d2b3a2568..0348b41db 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/MathFunctions.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/MathFunctions.h @@ -7,8 +7,8 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#ifndef EIGEN_MATH_FUNCTIONS_GPU_H -#define EIGEN_MATH_FUNCTIONS_GPU_H +#ifndef EIGEN_MATH_FUNCTIONS_CUDA_H +#define EIGEN_MATH_FUNCTIONS_CUDA_H namespace Eigen { @@ -17,7 +17,7 @@ namespace internal { // Make sure this is only available when targeting a GPU: we don't want to // introduce conflicts between these packet_traits definitions and the ones // we'll use on the host side (SSE, AVX, ...) -#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) +#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog(const float4& a) { @@ -56,18 +56,6 @@ double2 pexp(const double2& a) return make_double2(exp(a.x), exp(a.y)); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float4 pexpm1(const float4& a) -{ - return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w)); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double2 pexpm1(const double2& a) -{ - return make_double2(expm1(a.x), expm1(a.y)); -} - template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psqrt(const float4& a) { @@ -100,4 +88,4 @@ double2 prsqrt(const double2& a) } // end namespace Eigen -#endif // EIGEN_MATH_FUNCTIONS_GPU_H +#endif // EIGEN_MATH_FUNCTIONS_CUDA_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMath.h new file mode 100644 index 000000000..4dda63188 --- /dev/null +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -0,0 +1,333 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_CUDA_H +#define EIGEN_PACKET_MATH_CUDA_H + +namespace Eigen { + +namespace internal { + +// Make sure this is only available when targeting a GPU: we don't want to +// introduce conflicts between these packet_traits definitions and the ones +// we'll use on the host side (SSE, AVX, ...) +#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef float4 type; + typedef float4 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=4, + HasHalfPacket = 0, + + HasDiv = 1, + HasSin = 0, + HasCos = 0, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasIGamma = 1, + HasIGammac = 1, + HasBetaInc = 1, + + HasBlend = 0, + }; +}; + +template<> struct packet_traits : default_packet_traits +{ + typedef double2 type; + typedef double2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + + HasDiv = 1, + HasLog = 1, + HasExp = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasLGamma = 1, + HasDiGamma = 1, + HasZeta = 1, + HasPolygamma = 1, + HasErf = 1, + HasErfc = 1, + HasIGamma = 1, + HasIGammac = 1, + HasBetaInc = 1, + + HasBlend = 0, + }; +}; + + +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef float4 half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef double2 half; }; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { + return make_float4(from, from, from, from); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { + return make_double2(from, from); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { + return make_float4(a, a+1, a+2, a+3); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { + return make_double2(a, a+1); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { + return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { + return make_double2(a.x+b.x, a.y+b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { + return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { + return make_double2(a.x-b.x, a.y-b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { + return make_double2(-a.x, -a.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { + return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { + return make_double2(a.x*b.x, a.y*b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { + return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { + return make_double2(a.x/b.x, a.y/b.y); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { + return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { + return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { + return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { + return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload(const float* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { + return *reinterpret_cast(from); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { + return make_float4(from[0], from[1], from[2], from[3]); +} +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { + return make_double2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { + return make_float4(from[0], from[0], from[1], from[1]); +} +template<> EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { + return make_double2(from[0], from[0]); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { + *reinterpret_cast(to) = from; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { + to[0] = from.x; + to[1] = from.y; + to[2] = from.z; + to[3] = from.w; +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { + to[0] = from.x; + to[1] = from.y; +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return __ldg((const float4*)from); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return __ldg((const double2*)from); +#else + return make_double2(from[0], from[1]); +#endif +} + +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); +#else + return make_float4(from[0], from[1], from[2], from[3]); +#endif +} +template<> +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 + return make_double2(__ldg(from+0), __ldg(from+1)); +#else + return make_double2(from[0], from[1]); +#endif +} + +template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) { + return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, Index stride) { + return make_double2(from[0*stride], from[1*stride]); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, Index stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; + to[stride*2] = from.z; + to[stride*3] = from.w; +} +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, Index stride) { + to[stride*0] = from.x; + to[stride*1] = from.y; +} + +template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { + return a.x; +} +template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { + return a.x; +} + +template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { + return a.x + a.y + a.z + a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { + return a.x + a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { + return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { + return fmax(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { + return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { + return fmin(a.x, a.y); +} + +template<> EIGEN_DEVICE_FUNC inline float predux_mul(const float4& a) { + return a.x * a.y * a.z * a.w; +} +template<> EIGEN_DEVICE_FUNC inline double predux_mul(const double2& a) { + return a.x * a.y; +} + +template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { + return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); +} +template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { + return make_double2(fabs(a.x), fabs(a.y)); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + float tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; + + tmp = kernel.packet[0].z; + kernel.packet[0].z = kernel.packet[2].x; + kernel.packet[2].x = tmp; + + tmp = kernel.packet[0].w; + kernel.packet[0].w = kernel.packet[3].x; + kernel.packet[3].x = tmp; + + tmp = kernel.packet[1].z; + kernel.packet[1].z = kernel.packet[2].y; + kernel.packet[2].y = tmp; + + tmp = kernel.packet[1].w; + kernel.packet[1].w = kernel.packet[3].y; + kernel.packet[3].y = tmp; + + tmp = kernel.packet[2].w; + kernel.packet[2].w = kernel.packet[3].z; + kernel.packet[3].z = tmp; +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + double tmp = kernel.packet[0].y; + kernel.packet[0].y = kernel.packet[1].x; + kernel.packet[1].x = tmp; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + + +#endif // EIGEN_PACKET_MATH_CUDA_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h new file mode 100644 index 000000000..f749c573f --- /dev/null +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -0,0 +1,1124 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H +#define EIGEN_PACKET_MATH_HALF_CUDA_H + + +namespace Eigen { +namespace internal { + +// Most of the following operations require arch >= 3.0 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + +template<> struct is_arithmetic { enum { value = true }; }; + +template<> struct packet_traits : default_packet_traits +{ + typedef half2 type; + typedef half2 half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size=2, + HasHalfPacket = 0, + HasAdd = 1, + HasMul = 1, + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasLog = 1, + HasLog1p = 1 + }; +}; + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16}; typedef half2 half; }; + +template<> __device__ EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { + return __half2half2(from); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { + return *reinterpret_cast(from); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { + return __halves2half2(from[0], from[1]); +} + +template<> EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { + return __halves2half2(from[0], from[0]); +} + +template<> __device__ EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) { + *reinterpret_cast(to) = from; +} + +template<> __device__ EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { + to[0] = __low2half(from); + to[1] = __high2half(from); +} + +template<> + __device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { +#if __CUDA_ARCH__ >= 350 + return __ldg((const half2*)from); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif +} + +template<> +__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro(const Eigen::half* from) { +#if __CUDA_ARCH__ >= 350 + return __halves2half2(__ldg(from+0), __ldg(from+1)); +#else + return __halves2half2(*(from+0), *(from+1)); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { + return __halves2half2(from[0*stride], from[1*stride]); +} + +template<> __device__ EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) { + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { + return __low2half(a); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { + half2 result; + unsigned temp = *(reinterpret_cast(&(a))); + *(reinterpret_cast(&(result))) = temp & 0x7FFF7FFF; + return result; +} + + +__device__ EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { +#if __CUDA_ARCH__ >= 530 + return __halves2half2(a, __hadd(a, __float2half(1.0f))); +#else + float f = __half2float(a) + 1.0f; + return __halves2half2(a, __float2half(f)); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 + return __hadd2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 + b1; + float r2 = a2 + b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 + return __hsub2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 - b1; + float r2 = a2 - b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { +#if __CUDA_ARCH__ >= 530 + return __hneg2(a); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return __floats2half2_rn(-a1, -a2); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } + +template<> __device__ EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) { +#if __CUDA_ARCH__ >= 530 + return __hmul2(a, b); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 * b1; + float r2 = a2 * b2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) { +#if __CUDA_ARCH__ >= 530 + return __hfma2(a, b, c); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float c1 = __low2float(c); + float c2 = __high2float(c); + float r1 = a1 * b1 + c1; + float r2 = a2 * b2 + c2; + return __floats2half2_rn(r1, r2); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + float r1 = a1 / b1; + float r2 = a2 / b2; + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float b1 = __low2float(b); + float b2 = __high2float(b); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { +#if __CUDA_ARCH__ >= 530 + return __hadd(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(__float2half_rn(a1 + a2)); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { +#if __CUDA_ARCH__ >= 530 + __half first = __low2half(a); + __half second = __high2half(a); + return __hgt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 > a2 ? __low2half(a) : __high2half(a); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { +#if __CUDA_ARCH__ >= 530 + __half first = __low2half(a); + __half second = __high2half(a); + return __hlt(first, second) ? first : second; +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return a1 < a2 ? __low2half(a) : __high2half(a); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { +#if __CUDA_ARCH__ >= 530 + return __hmul(__low2half(a), __high2half(a)); +#else + float a1 = __low2float(a); + float a2 = __high2float(a); + return Eigen::half(__float2half_rn(a1 * a2)); +#endif +} + +template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = log1pf(a1); + float r2 = log1pf(a2); + return __floats2half2_rn(r1, r2); +} + +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 + +template<> __device__ EIGEN_STRONG_INLINE +half2 plog(const half2& a) { + return h2log(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 pexp(const half2& a) { + return h2exp(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 psqrt(const half2& a) { + return h2sqrt(a); +} + +template<> __device__ EIGEN_STRONG_INLINE +half2 prsqrt(const half2& a) { + return h2rsqrt(a); +} + +#else + +template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = logf(a1); + float r2 = logf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 pexp(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = expf(a1); + float r2 = expf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = sqrtf(a1); + float r2 = sqrtf(a2); + return __floats2half2_rn(r1, r2); +} + +template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { + float a1 = __low2float(a); + float a2 = __high2float(a); + float r1 = rsqrtf(a1); + float r2 = rsqrtf(a2); + return __floats2half2_rn(r1, r2); +} + +#endif + +#elif defined EIGEN_VECTORIZE_AVX512 + +typedef struct { + __m256i x; +} Packet16h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet16h type; + // There is no half-size packet for Packet16h. + typedef Packet16h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 16, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=16, alignment=Aligned32}; typedef Packet16h half; }; + +template<> EIGEN_STRONG_INLINE Packet16h pset1(const Eigen::half& from) { + Packet16h result; + result.x = _mm256_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet16h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm256_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet16h pload(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_load_si256(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet16h ploadu(const Eigen::half* from) { + Packet16h result; + result.x = _mm256_loadu_si256(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet16h& from) { + _mm256_store_si256((__m256i*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet16h& from) { + _mm256_storeu_si256((__m256i*)to, from.x); +} + +template<> EIGEN_STRONG_INLINE Packet16h +ploadquad(const Eigen::half* from) { + Packet16h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + unsigned short c = from[2].x; + unsigned short d = from[3].x; + result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm512_cvtph_ps(a.x); +#else + EIGEN_ALIGN64 half aux[16]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + float f8(aux[8]); + float f9(aux[9]); + float fa(aux[10]); + float fb(aux[11]); + float fc(aux[12]); + float fd(aux[13]); + float fe(aux[14]); + float ff(aux[15]); + + return _mm512_set_ps( + ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet16h result; + result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN64 float aux[16]; + pstore(aux, a); + half h0(aux[0]); + half h1(aux[1]); + half h2(aux[2]); + half h3(aux[3]); + half h4(aux[4]); + half h5(aux[5]); + half h6(aux[6]); + half h7(aux[7]); + half h8(aux[8]); + half h9(aux[9]); + half ha(aux[10]); + half hb(aux[11]); + half hc(aux[12]); + half hd(aux[13]); + half he(aux[14]); + half hf(aux[15]); + + Packet16h result; + result.x = _mm256_set_epi16( + hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, + h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet16h padd(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet16h pmul(const Packet16h& a, const Packet16h& b) { + Packet16f af = half2float(a); + Packet16f bf = half2float(b); + Packet16f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE half predux(const Packet16h& from) { + Packet16f from_float = half2float(from); + return half(predux(from_float)); +} + +template<> EIGEN_STRONG_INLINE Packet16h pgather(const Eigen::half* from, Index stride) +{ + Packet16h result; + result.x = _mm256_set_epi16( + from[15*stride].x, from[14*stride].x, from[13*stride].x, from[12*stride].x, + from[11*stride].x, from[10*stride].x, from[9*stride].x, from[8*stride].x, + from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, + from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(half* to, const Packet16h& from, Index stride) +{ + EIGEN_ALIGN64 half aux[16]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; + to[stride*8].x = aux[8].x; + to[stride*9].x = aux[9].x; + to[stride*10].x = aux[10].x; + to[stride*11].x = aux[11].x; + to[stride*12].x = aux[12].x; + to[stride*13].x = aux[13].x; + to[stride*14].x = aux[14].x; + to[stride*15].x = aux[15].x; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m256i a = kernel.packet[0].x; + __m256i b = kernel.packet[1].x; + __m256i c = kernel.packet[2].x; + __m256i d = kernel.packet[3].x; + __m256i e = kernel.packet[4].x; + __m256i f = kernel.packet[5].x; + __m256i g = kernel.packet[6].x; + __m256i h = kernel.packet[7].x; + __m256i i = kernel.packet[8].x; + __m256i j = kernel.packet[9].x; + __m256i k = kernel.packet[10].x; + __m256i l = kernel.packet[11].x; + __m256i m = kernel.packet[12].x; + __m256i n = kernel.packet[13].x; + __m256i o = kernel.packet[14].x; + __m256i p = kernel.packet[15].x; + + __m256i ab_07 = _mm256_unpacklo_epi16(a, b); + __m256i cd_07 = _mm256_unpacklo_epi16(c, d); + __m256i ef_07 = _mm256_unpacklo_epi16(e, f); + __m256i gh_07 = _mm256_unpacklo_epi16(g, h); + __m256i ij_07 = _mm256_unpacklo_epi16(i, j); + __m256i kl_07 = _mm256_unpacklo_epi16(k, l); + __m256i mn_07 = _mm256_unpacklo_epi16(m, n); + __m256i op_07 = _mm256_unpacklo_epi16(o, p); + + __m256i ab_8f = _mm256_unpackhi_epi16(a, b); + __m256i cd_8f = _mm256_unpackhi_epi16(c, d); + __m256i ef_8f = _mm256_unpackhi_epi16(e, f); + __m256i gh_8f = _mm256_unpackhi_epi16(g, h); + __m256i ij_8f = _mm256_unpackhi_epi16(i, j); + __m256i kl_8f = _mm256_unpackhi_epi16(k, l); + __m256i mn_8f = _mm256_unpackhi_epi16(m, n); + __m256i op_8f = _mm256_unpackhi_epi16(o, p); + + __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07); + __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07); + __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07); + __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07); + __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07); + __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07); + __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07); + __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07); + + __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f); + __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f); + __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f); + __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f); + __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f); + __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f); + __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f); + __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f); + + __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03); + __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03); + __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03); + __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03); + __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47); + __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47); + __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47); + __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47); + __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b); + __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b); + __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b); + __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b); + __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf); + __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf); + __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf); + __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf); + + // NOTE: no unpacklo/hi instr in this case, so using permute instr. + __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20); + __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31); + __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20); + __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31); + __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20); + __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31); + __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20); + __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31); + __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20); + __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31); + __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20); + __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31); + __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20); + __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31); + __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20); + __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31); + + kernel.packet[0].x = a_p_0; + kernel.packet[1].x = a_p_1; + kernel.packet[2].x = a_p_2; + kernel.packet[3].x = a_p_3; + kernel.packet[4].x = a_p_4; + kernel.packet[5].x = a_p_5; + kernel.packet[6].x = a_p_6; + kernel.packet[7].x = a_p_7; + kernel.packet[8].x = a_p_8; + kernel.packet[9].x = a_p_9; + kernel.packet[10].x = a_p_a; + kernel.packet[11].x = a_p_b; + kernel.packet[12].x = a_p_c; + kernel.packet[13].x = a_p_d; + kernel.packet[14].x = a_p_e; + kernel.packet[15].x = a_p_f; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[8][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + pstore(in[4], kernel.packet[4]); + pstore(in[5], kernel.packet[5]); + pstore(in[6], kernel.packet[6]); + pstore(in[7], kernel.packet[7]); + + EIGEN_ALIGN64 half out[8][16]; + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 8; ++j) { + out[i][j+8] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); + kernel.packet[4] = pload(out[4]); + kernel.packet[5] = pload(out[5]); + kernel.packet[6] = pload(out[6]); + kernel.packet[7] = pload(out[7]); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN64 half in[4][16]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN64 half out[4][16]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][4*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][4*i+1]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+8] = in[j][4*i+2]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+12] = in[j][4*i+3]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + + +#elif defined EIGEN_VECTORIZE_AVX + +typedef struct { + __m128i x; +} Packet8h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet8h type; + // There is no half-size packet for Packet8h. + typedef Packet8h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 8, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16}; typedef Packet8h half; }; + +template<> EIGEN_STRONG_INLINE Packet8h pset1(const Eigen::half& from) { + Packet8h result; + result.x = _mm_set1_epi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet8h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_extract_epi16(from.x, 0))); +} + +template<> EIGEN_STRONG_INLINE Packet8h pload(const Eigen::half* from) { + Packet8h result; + result.x = _mm_load_si128(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet8h ploadu(const Eigen::half* from) { + Packet8h result; + result.x = _mm_loadu_si128(reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8h& from) { + _mm_store_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet8h& from) { + _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x); +} + +template<> EIGEN_STRONG_INLINE Packet8h +ploadquad(const Eigen::half* from) { + Packet8h result; + unsigned short a = from[0].x; + unsigned short b = from[1].x; + result.x = _mm_set_epi16(b, b, b, b, a, a, a, a); + return result; +} + +EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { +#ifdef EIGEN_HAS_FP16_C + return _mm256_cvtph_ps(a.x); +#else + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, a); + float f0(aux[0]); + float f1(aux[1]); + float f2(aux[2]); + float f3(aux[3]); + float f4(aux[4]); + float f5(aux[5]); + float f6(aux[6]); + float f7(aux[7]); + + return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0); +#endif +} + +EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { +#ifdef EIGEN_HAS_FP16_C + Packet8h result; + result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return result; +#else + EIGEN_ALIGN32 float aux[8]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + Eigen::half h4(aux[4]); + Eigen::half h5(aux[5]); + Eigen::half h6(aux[6]); + Eigen::half h7(aux[7]); + + Packet8h result; + result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); + return result; +#endif +} + +template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet8h padd(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = padd(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pmul(const Packet8h& a, const Packet8h& b) { + Packet8f af = half2float(a); + Packet8f bf = half2float(b); + Packet8f rf = pmul(af, bf); + return float2half(rf); +} + +template<> EIGEN_STRONG_INLINE Packet8h pgather(const Eigen::half* from, Index stride) +{ + Packet8h result; + result.x = _mm_set_epi16(from[7*stride].x, from[6*stride].x, from[5*stride].x, from[4*stride].x, from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet8h& from, Index stride) +{ + EIGEN_ALIGN32 Eigen::half aux[8]; + pstore(aux, from); + to[stride*0].x = aux[0].x; + to[stride*1].x = aux[1].x; + to[stride*2].x = aux[2].x; + to[stride*3].x = aux[3].x; + to[stride*4].x = aux[4].x; + to[stride*5].x = aux[5].x; + to[stride*6].x = aux[6].x; + to[stride*7].x = aux[7].x; +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_max(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_min(af); + return Eigen::half(reduced); +} + +template<> EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet8h& a) { + Packet8f af = half2float(a); + float reduced = predux_mul(af); + return Eigen::half(reduced); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m128i a = kernel.packet[0].x; + __m128i b = kernel.packet[1].x; + __m128i c = kernel.packet[2].x; + __m128i d = kernel.packet[3].x; + __m128i e = kernel.packet[4].x; + __m128i f = kernel.packet[5].x; + __m128i g = kernel.packet[6].x; + __m128i h = kernel.packet[7].x; + + __m128i a03b03 = _mm_unpacklo_epi16(a, b); + __m128i c03d03 = _mm_unpacklo_epi16(c, d); + __m128i e03f03 = _mm_unpacklo_epi16(e, f); + __m128i g03h03 = _mm_unpacklo_epi16(g, h); + __m128i a47b47 = _mm_unpackhi_epi16(a, b); + __m128i c47d47 = _mm_unpackhi_epi16(c, d); + __m128i e47f47 = _mm_unpackhi_epi16(e, f); + __m128i g47h47 = _mm_unpackhi_epi16(g, h); + + __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03); + __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03); + __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03); + __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03); + __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47); + __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47); + __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47); + __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47); + + __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01); + __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01); + __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23); + __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23); + __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45); + __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45); + __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67); + __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67); + + kernel.packet[0].x = a0b0c0d0e0f0g0h0; + kernel.packet[1].x = a1b1c1d1e1f1g1h1; + kernel.packet[2].x = a2b2c2d2e2f2g2h2; + kernel.packet[3].x = a3b3c3d3e3f3g3h3; + kernel.packet[4].x = a4b4c4d4e4f4g4h4; + kernel.packet[5].x = a5b5c5d5e5f5g5h5; + kernel.packet[6].x = a6b6c6d6e6f6g6h6; + kernel.packet[7].x = a7b7c7d7e7f7g7h7; +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + EIGEN_ALIGN32 Eigen::half in[4][8]; + pstore(in[0], kernel.packet[0]); + pstore(in[1], kernel.packet[1]); + pstore(in[2], kernel.packet[2]); + pstore(in[3], kernel.packet[3]); + + EIGEN_ALIGN32 Eigen::half out[4][8]; + + for (int i = 0; i < 4; ++i) { + for (int j = 0; j < 4; ++j) { + out[i][j] = in[j][2*i]; + } + for (int j = 0; j < 4; ++j) { + out[i][j+4] = in[j][2*i+1]; + } + } + + kernel.packet[0] = pload(out[0]); + kernel.packet[1] = pload(out[1]); + kernel.packet[2] = pload(out[2]); + kernel.packet[3] = pload(out[3]); +} + + +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#elif 0 + +typedef struct { + __m64 x; +} Packet4h; + + +template<> struct is_arithmetic { enum { value = true }; }; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4h type; + // There is no half-size packet for Packet4h. + typedef Packet4h half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = 4, + HasHalfPacket = 0, + HasAdd = 0, + HasSub = 0, + HasMul = 0, + HasNegate = 0, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, + HasConj = 0, + HasSetLinear = 0, + HasDiv = 0, + HasSqrt = 0, + HasRsqrt = 0, + HasExp = 0, + HasLog = 0, + HasBlend = 0 + }; +}; + + +template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16}; typedef Packet4h half; }; + +template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { + Packet4h result; + result.x = _mm_set1_pi16(from.x); + return result; +} + +template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { + return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); +} + +template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha + hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha + hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + __int64_t b64 = _mm_cvtm64_si64(b.x); + + Eigen::half h[4]; + + Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); + Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); + h[0] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); + h[1] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); + h[2] = ha * hb; + ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); + hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); + h[3] = ha * hb; + Packet4h result; + result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { + Packet4h result; + result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); + return result; +} + +template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { + __int64_t r = _mm_cvtm64_si64(from.x); + *(reinterpret_cast<__int64_t*>(to)) = r; +} + +template<> EIGEN_STRONG_INLINE Packet4h +ploadquad(const Eigen::half* from) { + return pset1(*from); +} + +template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) +{ + Packet4h result; + result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); + return result; +} + +template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) +{ + __int64_t a = _mm_cvtm64_si64(from.x); + to[stride*0].x = static_cast(a); + to[stride*1].x = static_cast(a >> 16); + to[stride*2].x = static_cast(a >> 32); + to[stride*3].x = static_cast(a >> 48); +} + +EIGEN_STRONG_INLINE void +ptranspose(PacketBlock& kernel) { + __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); + __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); + __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); + + kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); + kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); + kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); + kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); +} + +#endif + +} +} + +#endif // EIGEN_PACKET_MATH_HALF_CUDA_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h new file mode 100644 index 000000000..aa5fbce8e --- /dev/null +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/CUDA/TypeCasting.h @@ -0,0 +1,212 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_TYPE_CASTING_CUDA_H +#define EIGEN_TYPE_CASTING_CUDA_H + +namespace Eigen { + +namespace internal { + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { + #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __float2half(a); + #else + return Eigen::half(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef Eigen::half result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { + #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __float2half(static_cast(a)); + #else + return Eigen::half(static_cast(a)); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + +template<> +struct scalar_cast_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) + typedef float result_type; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { + #if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + return __half2float(a); + #else + return static_cast(a); + #endif + } +}; + +template<> +struct functor_traits > +{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; + + + +#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 2, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { + float2 r1 = __half22float2(a); + float2 r2 = __half22float2(b); + return make_float4(r1.x, r1.y, r2.x, r2.y); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 2 + }; +}; + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(const float4& a) { + // Simply discard the second half of the input + return __floats2half2_rn(a.x, a.y); +} + +#elif defined EIGEN_VECTORIZE_AVX512 +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet16h pcast(const Packet16f& a) { + return float2half(a); +} + +#elif defined EIGEN_VECTORIZE_AVX + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8f pcast(const Packet8h& a) { + return half2float(a); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet8h pcast(const Packet8f& a) { + return float2half(a); +} + +// Disable the following code since it's broken on too many platforms / compilers. +//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) +#elif 0 + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { + __int64_t a64 = _mm_cvtm64_si64(a.x); + Eigen::half h = raw_uint16_to_half(static_cast(a64)); + float f1 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 16)); + float f2 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 32)); + float f3 = static_cast(h); + h = raw_uint16_to_half(static_cast(a64 >> 48)); + float f4 = static_cast(h); + return _mm_set_ps(f4, f3, f2, f1); +} + +template <> +struct type_casting_traits { + enum { + VectorizedCast = 1, + SrcCoeffRatio = 1, + TgtCoeffRatio = 1 + }; +}; + +template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { + EIGEN_ALIGN16 float aux[4]; + pstore(aux, a); + Eigen::half h0(aux[0]); + Eigen::half h1(aux[1]); + Eigen::half h2(aux[2]); + Eigen::half h3(aux[3]); + + Packet4h result; + result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); + return result; +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_TYPE_CASTING_CUDA_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h deleted file mode 100644 index 4d9b3b44c..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ /dev/null @@ -1,655 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2007 Julien Pommier -// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) -// Copyright (C) 2009-2019 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/* The exp and log functions of this file initially come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - -#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H -#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H - -namespace Eigen { -namespace internal { - -template EIGEN_STRONG_INLINE Packet -pfrexp_float(const Packet& a, Packet& exponent) { - typedef typename unpacket_traits::integer_packet PacketI; - const Packet cst_126f = pset1(126.0f); - const Packet cst_half = pset1(0.5f); - const Packet cst_inv_mant_mask = pset1frombits(~0x7f800000u); - exponent = psub(pcast(plogical_shift_right<23>(preinterpret(a))), cst_126f); - return por(pand(a, cst_inv_mant_mask), cst_half); -} - -template EIGEN_STRONG_INLINE Packet -pldexp_float(Packet a, Packet exponent) -{ - typedef typename unpacket_traits::integer_packet PacketI; - const Packet cst_127 = pset1(127.f); - // return a * 2^exponent - PacketI ei = pcast(padd(exponent, cst_127)); - return pmul(a, preinterpret(plogical_shift_left<23>(ei))); -} - -// Natural logarithm -// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) -// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can -// be easily approximated by a polynomial centered on m=1 for stability. -// TODO(gonnet): Further reduce the interval allowing for lower-degree -// polynomial interpolants -> ... -> profit! -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet plog_float(const Packet _x) -{ - Packet x = _x; - - const Packet cst_1 = pset1(1.0f); - const Packet cst_half = pset1(0.5f); - // The smallest non denormalized float number. - const Packet cst_min_norm_pos = pset1frombits( 0x00800000u); - const Packet cst_minus_inf = pset1frombits( 0xff800000u); - const Packet cst_pos_inf = pset1frombits( 0x7f800000u); - - // Polynomial coefficients. - const Packet cst_cephes_SQRTHF = pset1(0.707106781186547524f); - const Packet cst_cephes_log_p0 = pset1(7.0376836292E-2f); - const Packet cst_cephes_log_p1 = pset1(-1.1514610310E-1f); - const Packet cst_cephes_log_p2 = pset1(1.1676998740E-1f); - const Packet cst_cephes_log_p3 = pset1(-1.2420140846E-1f); - const Packet cst_cephes_log_p4 = pset1(+1.4249322787E-1f); - const Packet cst_cephes_log_p5 = pset1(-1.6668057665E-1f); - const Packet cst_cephes_log_p6 = pset1(+2.0000714765E-1f); - const Packet cst_cephes_log_p7 = pset1(-2.4999993993E-1f); - const Packet cst_cephes_log_p8 = pset1(+3.3333331174E-1f); - const Packet cst_cephes_log_q1 = pset1(-2.12194440e-4f); - const Packet cst_cephes_log_q2 = pset1(0.693359375f); - - // Truncate input values to the minimum positive normal. - x = pmax(x, cst_min_norm_pos); - - Packet e; - // extract significant in the range [0.5,1) and exponent - x = pfrexp(x,e); - - // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2)) - // and shift by -1. The values are then centered around 0, which improves - // the stability of the polynomial evaluation. - // if( x < SQRTHF ) { - // e -= 1; - // x = x + x - 1.0; - // } else { x = x - 1.0; } - Packet mask = pcmp_lt(x, cst_cephes_SQRTHF); - Packet tmp = pand(x, mask); - x = psub(x, cst_1); - e = psub(e, pand(cst_1, mask)); - x = padd(x, tmp); - - Packet x2 = pmul(x, x); - Packet x3 = pmul(x2, x); - - // Evaluate the polynomial approximant of degree 8 in three parts, probably - // to improve instruction-level parallelism. - Packet y, y1, y2; - y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1); - y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4); - y2 = pmadd(cst_cephes_log_p6, x, cst_cephes_log_p7); - y = pmadd(y, x, cst_cephes_log_p2); - y1 = pmadd(y1, x, cst_cephes_log_p5); - y2 = pmadd(y2, x, cst_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - // Add the logarithm of the exponent back to the result of the interpolation. - y1 = pmul(e, cst_cephes_log_q1); - tmp = pmul(x2, cst_half); - y = padd(y, y1); - x = psub(x, tmp); - y2 = pmul(e, cst_cephes_log_q2); - x = padd(x, y); - x = padd(x, y2); - - Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x)); - Packet iszero_mask = pcmp_eq(_x,pzero(_x)); - Packet pos_inf_mask = pcmp_eq(_x,cst_pos_inf); - // Filter out invalid inputs, i.e.: - // - negative arg will be NAN - // - 0 will be -INF - // - +INF will be +INF - return pselect(iszero_mask, cst_minus_inf, - por(pselect(pos_inf_mask,cst_pos_inf,x), invalid_mask)); -} - -/** \internal \returns log(1 + x) computed using W. Kahan's formula. - See: http://www.plunk.org/~hatch/rightway.php - */ -template -Packet generic_plog1p(const Packet& x) -{ - typedef typename unpacket_traits::type ScalarType; - const Packet one = pset1(ScalarType(1)); - Packet xp1 = padd(x, one); - Packet small_mask = pcmp_eq(xp1, one); - Packet log1 = plog(xp1); - Packet inf_mask = pcmp_eq(xp1, log1); - Packet log_large = pmul(x, pdiv(log1, psub(xp1, one))); - return pselect(por(small_mask, inf_mask), x, log_large); -} - -/** \internal \returns exp(x)-1 computed using W. Kahan's formula. - See: http://www.plunk.org/~hatch/rightway.php - */ -template -Packet generic_expm1(const Packet& x) -{ - typedef typename unpacket_traits::type ScalarType; - const Packet one = pset1(ScalarType(1)); - const Packet neg_one = pset1(ScalarType(-1)); - Packet u = pexp(x); - Packet one_mask = pcmp_eq(u, one); - Packet u_minus_one = psub(u, one); - Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one); - Packet logu = plog(u); - // The following comparison is to catch the case where - // exp(x) = +inf. It is written in this way to avoid having - // to form the constant +inf, which depends on the packet - // type. - Packet pos_inf_mask = pcmp_eq(logu, u); - Packet expm1 = pmul(u_minus_one, pdiv(x, logu)); - expm1 = pselect(pos_inf_mask, u, expm1); - return pselect(one_mask, - x, - pselect(neg_one_mask, - neg_one, - expm1)); -} - - -// Exponential function. Works by writing "x = m*log(2) + r" where -// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then -// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1). -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet pexp_float(const Packet _x) -{ - const Packet cst_1 = pset1(1.0f); - const Packet cst_half = pset1(0.5f); - const Packet cst_exp_hi = pset1( 88.3762626647950f); - const Packet cst_exp_lo = pset1(-88.3762626647949f); - - const Packet cst_cephes_LOG2EF = pset1(1.44269504088896341f); - const Packet cst_cephes_exp_p0 = pset1(1.9875691500E-4f); - const Packet cst_cephes_exp_p1 = pset1(1.3981999507E-3f); - const Packet cst_cephes_exp_p2 = pset1(8.3334519073E-3f); - const Packet cst_cephes_exp_p3 = pset1(4.1665795894E-2f); - const Packet cst_cephes_exp_p4 = pset1(1.6666665459E-1f); - const Packet cst_cephes_exp_p5 = pset1(5.0000001201E-1f); - - // Clamp x. - Packet x = pmax(pmin(_x, cst_exp_hi), cst_exp_lo); - - // Express exp(x) as exp(m*ln(2) + r), start by extracting - // m = floor(x/ln(2) + 0.5). - Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half)); - - // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is - // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating - // truncation errors. - Packet r; -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD - const Packet cst_nln2 = pset1(-0.6931471805599453f); - r = pmadd(m, cst_nln2, x); -#else - const Packet cst_cephes_exp_C1 = pset1(0.693359375f); - const Packet cst_cephes_exp_C2 = pset1(-2.12194440e-4f); - r = psub(x, pmul(m, cst_cephes_exp_C1)); - r = psub(r, pmul(m, cst_cephes_exp_C2)); -#endif - - Packet r2 = pmul(r, r); - - // TODO(gonnet): Split into odd/even polynomials and try to exploit - // instruction-level parallelism. - Packet y = cst_cephes_exp_p0; - y = pmadd(y, r, cst_cephes_exp_p1); - y = pmadd(y, r, cst_cephes_exp_p2); - y = pmadd(y, r, cst_cephes_exp_p3); - y = pmadd(y, r, cst_cephes_exp_p4); - y = pmadd(y, r, cst_cephes_exp_p5); - y = pmadd(y, r2, r); - y = padd(y, cst_1); - - // Return 2^m * exp(r). - return pmax(pldexp(y,m), _x); -} - -// make it the default path for scalar float -template<> -EIGEN_DEVICE_FUNC inline float pexp(const float& a) { return pexp_float(a); } - -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet pexp_double(const Packet _x) -{ - Packet x = _x; - - const Packet cst_1 = pset1(1.0); - const Packet cst_2 = pset1(2.0); - const Packet cst_half = pset1(0.5); - - const Packet cst_exp_hi = pset1(709.437); - const Packet cst_exp_lo = pset1(-709.436139303); - - const Packet cst_cephes_LOG2EF = pset1(1.4426950408889634073599); - const Packet cst_cephes_exp_p0 = pset1(1.26177193074810590878e-4); - const Packet cst_cephes_exp_p1 = pset1(3.02994407707441961300e-2); - const Packet cst_cephes_exp_p2 = pset1(9.99999999999999999910e-1); - const Packet cst_cephes_exp_q0 = pset1(3.00198505138664455042e-6); - const Packet cst_cephes_exp_q1 = pset1(2.52448340349684104192e-3); - const Packet cst_cephes_exp_q2 = pset1(2.27265548208155028766e-1); - const Packet cst_cephes_exp_q3 = pset1(2.00000000000000000009e0); - const Packet cst_cephes_exp_C1 = pset1(0.693145751953125); - const Packet cst_cephes_exp_C2 = pset1(1.42860682030941723212e-6); - - Packet tmp, fx; - - // clamp x - x = pmax(pmin(x, cst_exp_hi), cst_exp_lo); - // Express exp(x) as exp(g + n*log(2)). - fx = pmadd(cst_cephes_LOG2EF, x, cst_half); - - // Get the integer modulus of log(2), i.e. the "n" described above. - fx = pfloor(fx); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last - // digits right. - tmp = pmul(fx, cst_cephes_exp_C1); - Packet z = pmul(fx, cst_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - Packet x2 = pmul(x, x); - - // Evaluate the numerator polynomial of the rational interpolant. - Packet px = cst_cephes_exp_p0; - px = pmadd(px, x2, cst_cephes_exp_p1); - px = pmadd(px, x2, cst_cephes_exp_p2); - px = pmul(px, x); - - // Evaluate the denominator polynomial of the rational interpolant. - Packet qx = cst_cephes_exp_q0; - qx = pmadd(qx, x2, cst_cephes_exp_q1); - qx = pmadd(qx, x2, cst_cephes_exp_q2); - qx = pmadd(qx, x2, cst_cephes_exp_q3); - - // I don't really get this bit, copied from the SSE2 routines, so... - // TODO(gonnet): Figure out what is going on here, perhaps find a better - // rational interpolant? - x = pdiv(px, psub(qx, px)); - x = pmadd(cst_2, x, cst_1); - - // Construct the result 2^n * exp(g) = e * x. The max is used to catch - // non-finite values in the input. - return pmax(pldexp(x,fx), _x); -} - -// make it the default path for scalar double -template<> -EIGEN_DEVICE_FUNC inline double pexp(const double& a) { return pexp_double(a); } - -// The following code is inspired by the following stack-overflow answer: -// https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751 -// It has been largely optimized: -// - By-pass calls to frexp. -// - Aligned loads of required 96 bits of 2/pi. This is accomplished by -// (1) balancing the mantissa and exponent to the required bits of 2/pi are -// aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi. -// - Avoid a branch in rounding and extraction of the remaining fractional part. -// Overall, I measured a speed up higher than x2 on x86-64. -inline float trig_reduce_huge (float xf, int *quadrant) -{ - using Eigen::numext::int32_t; - using Eigen::numext::uint32_t; - using Eigen::numext::int64_t; - using Eigen::numext::uint64_t; - - const double pio2_62 = 3.4061215800865545e-19; // pi/2 * 2^-62 - const uint64_t zero_dot_five = uint64_t(1) << 61; // 0.5 in 2.62-bit fixed-point foramt - - // 192 bits of 2/pi for Payne-Hanek reduction - // Bits are introduced by packet of 8 to enable aligned reads. - static const uint32_t two_over_pi [] = - { - 0x00000028, 0x000028be, 0x0028be60, 0x28be60db, - 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, - 0x91054a7f, 0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, - 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, - 0x4d377036, 0x377036d8, 0x7036d8a5, 0x36d8a566, - 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, - 0x10e41000, 0xe4100000 - }; - - uint32_t xi = numext::as_uint(xf); - // Below, -118 = -126 + 8. - // -126 is to get the exponent, - // +8 is to enable alignment of 2/pi's bits on 8 bits. - // This is possible because the fractional part of x as only 24 meaningful bits. - uint32_t e = (xi >> 23) - 118; - // Extract the mantissa and shift it to align it wrt the exponent - xi = ((xi & 0x007fffffu)| 0x00800000u) << (e & 0x7); - - uint32_t i = e >> 3; - uint32_t twoopi_1 = two_over_pi[i-1]; - uint32_t twoopi_2 = two_over_pi[i+3]; - uint32_t twoopi_3 = two_over_pi[i+7]; - - // Compute x * 2/pi in 2.62-bit fixed-point format. - uint64_t p; - p = uint64_t(xi) * twoopi_3; - p = uint64_t(xi) * twoopi_2 + (p >> 32); - p = (uint64_t(xi * twoopi_1) << 32) + p; - - // Round to nearest: add 0.5 and extract integral part. - uint64_t q = (p + zero_dot_five) >> 62; - *quadrant = int(q); - // Now it remains to compute "r = x - q*pi/2" with high accuracy, - // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as: - // r = (p-q)*pi/2, - // where the product can be be carried out with sufficient accuracy using double precision. - p -= q<<62; - return float(double(int64_t(p)) * pio2_62); -} - -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -#if EIGEN_GNUC_AT_LEAST(4,4) && EIGEN_COMP_GNUC_STRICT -__attribute__((optimize("-fno-unsafe-math-optimizations"))) -#endif -Packet psincos_float(const Packet& _x) -{ -// Workaround -ffast-math aggressive optimizations -// See bug 1674 -#if EIGEN_COMP_CLANG && defined(EIGEN_VECTORIZE_SSE) -#define EIGEN_SINCOS_DONT_OPT(X) __asm__ ("" : "+x" (X)); -#else -#define EIGEN_SINCOS_DONT_OPT(X) -#endif - - typedef typename unpacket_traits::integer_packet PacketI; - - const Packet cst_2oPI = pset1(0.636619746685028076171875f); // 2/PI - const Packet cst_rounding_magic = pset1(12582912); // 2^23 for rounding - const PacketI csti_1 = pset1(1); - const Packet cst_sign_mask = pset1frombits(0x80000000u); - - Packet x = pabs(_x); - - // Scale x by 2/Pi to find x's octant. - Packet y = pmul(x, cst_2oPI); - - // Rounding trick: - Packet y_round = padd(y, cst_rounding_magic); - EIGEN_SINCOS_DONT_OPT(y_round) - PacketI y_int = preinterpret(y_round); // last 23 digits represent integer (if abs(x)<2^24) - y = psub(y_round, cst_rounding_magic); // nearest integer to x*4/pi - - // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4 - // using "Extended precision modular arithmetic" - #if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) - // This version requires true FMA for high accuracy - // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08): - const float huge_th = ComputeSine ? 117435.992f : 71476.0625f; - x = pmadd(y, pset1(-1.57079601287841796875f), x); - x = pmadd(y, pset1(-3.1391647326017846353352069854736328125e-07f), x); - x = pmadd(y, pset1(-5.390302529957764765544681040410068817436695098876953125e-15f), x); - #else - // Without true FMA, the previous set of coefficients maintain 1ULP accuracy - // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7. - // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs. - - // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively. - // and 2 ULP up to: - const float huge_th = ComputeSine ? 25966.f : 18838.f; - x = pmadd(y, pset1(-1.5703125), x); // = 0xbfc90000 - EIGEN_SINCOS_DONT_OPT(x) - x = pmadd(y, pset1(-0.000483989715576171875), x); // = 0xb9fdc000 - EIGEN_SINCOS_DONT_OPT(x) - x = pmadd(y, pset1(1.62865035235881805419921875e-07), x); // = 0x342ee000 - x = pmadd(y, pset1(5.5644315544167710640977020375430583953857421875e-11), x); // = 0x2e74b9ee - - // For the record, the following set of coefficients maintain 2ULP up - // to a slightly larger range: - // const float huge_th = ComputeSine ? 51981.f : 39086.125f; - // but it slightly fails to maintain 1ULP for two values of sin below pi. - // x = pmadd(y, pset1(-3.140625/2.), x); - // x = pmadd(y, pset1(-0.00048351287841796875), x); - // x = pmadd(y, pset1(-3.13855707645416259765625e-07), x); - // x = pmadd(y, pset1(-6.0771006282767103812147979624569416046142578125e-11), x); - - // For the record, with only 3 iterations it is possible to maintain - // 1 ULP up to 3PI (maybe more) and 2ULP up to 255. - // The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee - #endif - - if(predux_any(pcmp_le(pset1(huge_th),pabs(_x)))) - { - const int PacketSize = unpacket_traits::size; - EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize]; - EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize]; - EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) int y_int2[PacketSize]; - pstoreu(vals, pabs(_x)); - pstoreu(x_cpy, x); - pstoreu(y_int2, y_int); - for(int k=0; k=huge_th && (numext::isfinite)(val)) - x_cpy[k] = trig_reduce_huge(val,&y_int2[k]); - } - x = ploadu(x_cpy); - y_int = ploadu(y_int2); - } - - // Compute the sign to apply to the polynomial. - // sin: sign = second_bit(y_int) xor signbit(_x) - // cos: sign = second_bit(y_int+1) - Packet sign_bit = ComputeSine ? pxor(_x, preinterpret(plogical_shift_left<30>(y_int))) - : preinterpret(plogical_shift_left<30>(padd(y_int,csti_1))); - sign_bit = pand(sign_bit, cst_sign_mask); // clear all but left most bit - - // Get the polynomial selection mask from the second bit of y_int - // We'll calculate both (sin and cos) polynomials and then select from the two. - Packet poly_mask = preinterpret(pcmp_eq(pand(y_int, csti_1), pzero(y_int))); - - Packet x2 = pmul(x,x); - - // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4) - Packet y1 = pset1(2.4372266125283204019069671630859375e-05f); - y1 = pmadd(y1, x2, pset1(-0.00138865201734006404876708984375f )); - y1 = pmadd(y1, x2, pset1(0.041666619479656219482421875f )); - y1 = pmadd(y1, x2, pset1(-0.5f)); - y1 = pmadd(y1, x2, pset1(1.f)); - - // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4) - // octave/matlab code to compute those coefficients: - // x = (0:0.0001:pi/4)'; - // A = [x.^3 x.^5 x.^7]; - // w = ((1.-(x/(pi/4)).^2).^5)*2000+1; # weights trading relative accuracy - // c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1 - // printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1)) - // - Packet y2 = pset1(-0.0001959234114083702898469196984621021329076029360294342041015625f); - y2 = pmadd(y2, x2, pset1( 0.0083326873655616851693794799871284340042620897293090820312500000f)); - y2 = pmadd(y2, x2, pset1(-0.1666666203982298255503735617821803316473960876464843750000000000f)); - y2 = pmul(y2, x2); - y2 = pmadd(y2, x, x); - - // Select the correct result from the two polynomials. - y = ComputeSine ? pselect(poly_mask,y2,y1) - : pselect(poly_mask,y1,y2); - - // Update the sign and filter huge inputs - return pxor(y, sign_bit); - -#undef EIGEN_SINCOS_DONT_OPT -} - -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet psin_float(const Packet& x) -{ - return psincos_float(x); -} - -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet pcos_float(const Packet& x) -{ - return psincos_float(x); -} - -/* polevl (modified for Eigen) - * - * Evaluate polynomial - * - * - * - * SYNOPSIS: - * - * int N; - * Scalar x, y, coef[N+1]; - * - * y = polevl( x, coef); - * - * - * - * DESCRIPTION: - * - * Evaluates polynomial of degree N: - * - * 2 N - * y = C + C x + C x +...+ C x - * 0 1 2 N - * - * Coefficients are stored in reverse order: - * - * coef[0] = C , ..., coef[N] = C . - * N 0 - * - * The function p1evl() assumes that coef[N] = 1.0 and is - * omitted from the array. Its calling arguments are - * otherwise the same as polevl(). - * - * - * The Eigen implementation is templatized. For best speed, store - * coef as a const array (constexpr), e.g. - * - * const double coef[] = {1.0, 2.0, 3.0, ...}; - * - */ -template -struct ppolevl { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits::type coeff[]) { - EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); - return pmadd(ppolevl::run(x, coeff), x, pset1(coeff[N])); - } -}; - -template -struct ppolevl { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const typename unpacket_traits::type coeff[]) { - EIGEN_UNUSED_VARIABLE(x); - return pset1(coeff[0]); - } -}; - -/* chbevl (modified for Eigen) - * - * Evaluate Chebyshev series - * - * - * - * SYNOPSIS: - * - * int N; - * Scalar x, y, coef[N], chebevl(); - * - * y = chbevl( x, coef, N ); - * - * - * - * DESCRIPTION: - * - * Evaluates the series - * - * N-1 - * - ' - * y = > coef[i] T (x/2) - * - i - * i=0 - * - * of Chebyshev polynomials Ti at argument x/2. - * - * Coefficients are stored in reverse order, i.e. the zero - * order term is last in the array. Note N is the number of - * coefficients, not the order. - * - * If coefficients are for the interval a to b, x must - * have been transformed to x -> 2(2x - b - a)/(b-a) before - * entering the routine. This maps x from (a, b) to (-1, 1), - * over which the Chebyshev polynomials are defined. - * - * If the coefficients are for the inverted interval, in - * which (a, b) is mapped to (1/b, 1/a), the transformation - * required is x -> 2(2ab/x - b - a)/(b-a). If b is infinity, - * this becomes x -> 4a/x - 1. - * - * - * - * SPEED: - * - * Taking advantage of the recurrence properties of the - * Chebyshev polynomials, the routine requires one more - * addition per loop than evaluating a nested polynomial of - * the same degree. - * - */ - -template -struct pchebevl { - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Packet run(Packet x, const typename unpacket_traits::type coef[]) { - typedef typename unpacket_traits::type Scalar; - Packet b0 = pset1(coef[0]); - Packet b1 = pset1(static_cast(0.f)); - Packet b2; - - for (int i = 1; i < N; i++) { - b2 = b1; - b1 = b0; - b0 = psub(pmadd(x, b1, pset1(coef[i])), b2); - } - - return pmul(pset1(static_cast(0.5f)), psub(b0, b2)); - } -}; - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h deleted file mode 100644 index 68153cae3..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ /dev/null @@ -1,69 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2019 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H -#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H - -namespace Eigen { -namespace internal { - -// Forward declarations of the generic math functions -// implemented in GenericPacketMathFunctions.h -// This is needed to workaround a circular dependency. - -template EIGEN_STRONG_INLINE Packet -pfrexp_float(const Packet& a, Packet& exponent); - -template EIGEN_STRONG_INLINE Packet -pldexp_float(Packet a, Packet exponent); - -/** \internal \returns log(x) for single precision float */ -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet plog_float(const Packet _x); - -/** \internal \returns log(1 + x) */ -template -Packet generic_plog1p(const Packet& x); - -/** \internal \returns exp(x)-1 */ -template -Packet generic_expm1(const Packet& x); - -/** \internal \returns exp(x) for single precision float */ -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet pexp_float(const Packet _x); - -/** \internal \returns exp(x) for double precision real numbers */ -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet pexp_double(const Packet _x); - -/** \internal \returns sin(x) for single precision float */ -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet psin_float(const Packet& x); - -/** \internal \returns cos(x) for single precision float */ -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet pcos_float(const Packet& x); - -template struct ppolevl; - -} // end namespace internal -} // end namespace Eigen - -#endif // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Settings.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Settings.h index a5c3ada4c..097373c84 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Settings.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/Settings.h @@ -21,7 +21,7 @@ * it does not correspond to the number of iterations or the number of instructions */ #ifndef EIGEN_UNROLLING_LIMIT -#define EIGEN_UNROLLING_LIMIT 110 +#define EIGEN_UNROLLING_LIMIT 100 #endif /** Defines the threshold between a "small" and a "large" matrix. diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/TypeCasting.h deleted file mode 100644 index b6df98468..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/Default/TypeCasting.h +++ /dev/null @@ -1,77 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// Copyright (C) 2019 Rasmus Munk Larsen -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_GENERIC_TYPE_CASTING_H -#define EIGEN_GENERIC_TYPE_CASTING_H - -namespace Eigen { - -namespace internal { - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const float& a) const { - #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - return __float2half(a); - #else - return Eigen::half(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef Eigen::half result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator() (const int& a) const { - #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - return __float2half(static_cast(a)); - #else - return Eigen::half(static_cast(a)); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - - -template<> -struct scalar_cast_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op) - typedef float result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator() (const Eigen::half& a) const { - #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - return __half2float(a); - #else - return static_cast(a); - #endif - } -}; - -template<> -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = false }; }; - -} -} - -#endif // EIGEN_GENERIC_TYPE_CASTING_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/PacketMath.h deleted file mode 100644 index dd4e77d3a..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/PacketMath.h +++ /dev/null @@ -1,1786 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_GPU_H -#define EIGEN_PACKET_MATH_GPU_H - -namespace Eigen { - -namespace internal { - -// Make sure this is only available when targeting a GPU: we don't want to -// introduce conflicts between these packet_traits definitions and the ones -// we'll use on the host side (SSE, AVX, ...) -#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; - -template<> struct packet_traits : default_packet_traits -{ - typedef float4 type; - typedef float4 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=4, - HasHalfPacket = 0, - - HasDiv = 1, - HasSin = 0, - HasCos = 0, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasLGamma = 1, - HasDiGamma = 1, - HasZeta = 1, - HasPolygamma = 1, - HasErf = 1, - HasErfc = 1, - HasNdtri = 1, - HasBessel = 1, - HasIGamma = 1, - HasIGammaDerA = 1, - HasGammaSampleDerAlpha = 1, - HasIGammac = 1, - HasBetaInc = 1, - - HasBlend = 0, - HasFloor = 1, - }; -}; - -template<> struct packet_traits : default_packet_traits -{ - typedef double2 type; - typedef double2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=2, - HasHalfPacket = 0, - - HasDiv = 1, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasLGamma = 1, - HasDiGamma = 1, - HasZeta = 1, - HasPolygamma = 1, - HasErf = 1, - HasErfc = 1, - HasNdtri = 1, - HasBessel = 1, - HasIGamma = 1, - HasIGammaDerA = 1, - HasGammaSampleDerAlpha = 1, - HasIGammac = 1, - HasBetaInc = 1, - - HasBlend = 0, - HasFloor = 1, - }; -}; - - -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; }; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1(const float& from) { - return make_float4(from, from, from, from); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const double& from) { - return make_double2(from, from); -} - -// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler, -// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation -// of the functions, while the latter can only deal with one of them. -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) -namespace { - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, - const float& b) { - return __int_as_float(__float_as_int(a) & __float_as_int(b)); -} -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, - const double& b) { - return __longlong_as_double(__double_as_longlong(a) & - __double_as_longlong(b)); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, - const float& b) { - return __int_as_float(__float_as_int(a) | __float_as_int(b)); -} -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, - const double& b) { - return __longlong_as_double(__double_as_longlong(a) | - __double_as_longlong(b)); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, - const float& b) { - return __int_as_float(__float_as_int(a) ^ __float_as_int(b)); -} -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, - const double& b) { - return __longlong_as_double(__double_as_longlong(a) ^ - __double_as_longlong(b)); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, - const float& b) { - return __int_as_float(__float_as_int(a) & ~__float_as_int(b)); -} -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, - const double& b) { - return __longlong_as_double(__double_as_longlong(a) & - ~__double_as_longlong(b)); -} -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, - const float& b) { - return __int_as_float(a == b ? 0xffffffffu : 0u); -} -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, - const double& b) { - return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, - const float& b) { - return __int_as_float(a < b ? 0xffffffffu : 0u); -} -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, - const double& b) { - return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull); -} - -} // namespace - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand(const float4& a, - const float4& b) { - return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), - bitwise_and(a.z, b.z), bitwise_and(a.w, b.w)); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand(const double2& a, - const double2& b) { - return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y)); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por(const float4& a, - const float4& b) { - return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), - bitwise_or(a.z, b.z), bitwise_or(a.w, b.w)); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por(const double2& a, - const double2& b) { - return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y)); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor(const float4& a, - const float4& b) { - return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), - bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w)); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor(const double2& a, - const double2& b) { - return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y)); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot(const float4& a, - const float4& b) { - return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), - bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w)); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 -pandnot(const double2& a, const double2& b) { - return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y)); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq(const float4& a, - const float4& b) { - return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), - eq_mask(a.w, b.w)); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt(const float4& a, - const float4& b) { - return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), - lt_mask(a.w, b.w)); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 -pcmp_eq(const double2& a, const double2& b) { - return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y)); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 -pcmp_lt(const double2& a, const double2& b) { - return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y)); -} -#endif // EIGEN_CUDA_ARCH || defined(EIGEN_HIP_DEVICE_COMPILE) - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset(const float& a) { - return make_float4(a, a+1, a+2, a+3); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset(const double& a) { - return make_double2(a, a+1); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd(const float4& a, const float4& b) { - return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd(const double2& a, const double2& b) { - return make_double2(a.x+b.x, a.y+b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub(const float4& a, const float4& b) { - return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub(const double2& a, const double2& b) { - return make_double2(a.x-b.x, a.y-b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) { - return make_float4(-a.x, -a.y, -a.z, -a.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) { - return make_double2(-a.x, -a.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; } - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul(const float4& a, const float4& b) { - return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul(const double2& a, const double2& b) { - return make_double2(a.x*b.x, a.y*b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv(const float4& a, const float4& b) { - return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv(const double2& a, const double2& b) { - return make_double2(a.x/b.x, a.y/b.y); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin(const float4& a, const float4& b) { - return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w)); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin(const double2& a, const double2& b) { - return make_double2(fmin(a.x, b.x), fmin(a.y, b.y)); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax(const float4& a, const float4& b) { - return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w)); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax(const double2& a, const double2& b) { - return make_double2(fmax(a.x, b.x), fmax(a.y, b.y)); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload(const float* from) { - return *reinterpret_cast(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload(const double* from) { - return *reinterpret_cast(from); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu(const float* from) { - return make_float4(from[0], from[1], from[2], from[3]); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu(const double* from) { - return make_double2(from[0], from[1]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup(const float* from) { - return make_float4(from[0], from[0], from[1], from[1]); -} -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup(const double* from) { - return make_double2(from[0], from[0]); -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(float* to, const float4& from) { - *reinterpret_cast(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(double* to, const double2& from) { - *reinterpret_cast(to) = from; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(float* to, const float4& from) { - to[0] = from.x; - to[1] = from.y; - to[2] = from.z; - to[3] = from.w; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(double* to, const double2& from) { - to[0] = from.x; - to[1] = from.y; -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 - return __ldg((const float4*)from); -#else - return make_float4(from[0], from[1], from[2], from[3]); -#endif -} -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 - return __ldg((const double2*)from); -#else - return make_double2(from[0], from[1]); -#endif -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro(const float* from) { -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 - return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3)); -#else - return make_float4(from[0], from[1], from[2], from[3]); -#endif -} -template<> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro(const double* from) { -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 - return make_double2(__ldg(from+0), __ldg(from+1)); -#else - return make_double2(from[0], from[1]); -#endif -} - -template<> EIGEN_DEVICE_FUNC inline float4 pgather(const float* from, Index stride) { - return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]); -} - -template<> EIGEN_DEVICE_FUNC inline double2 pgather(const double* from, Index stride) { - return make_double2(from[0*stride], from[1*stride]); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const float4& from, Index stride) { - to[stride*0] = from.x; - to[stride*1] = from.y; - to[stride*2] = from.z; - to[stride*3] = from.w; -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const double2& from, Index stride) { - to[stride*0] = from.x; - to[stride*1] = from.y; -} - -template<> EIGEN_DEVICE_FUNC inline float pfirst(const float4& a) { - return a.x; -} -template<> EIGEN_DEVICE_FUNC inline double pfirst(const double2& a) { - return a.x; -} - -template<> EIGEN_DEVICE_FUNC inline float predux(const float4& a) { - return a.x + a.y + a.z + a.w; -} -template<> EIGEN_DEVICE_FUNC inline double predux(const double2& a) { - return a.x + a.y; -} - -template<> EIGEN_DEVICE_FUNC inline float predux_max(const float4& a) { - return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double predux_max(const double2& a) { - return fmax(a.x, a.y); -} - -template<> EIGEN_DEVICE_FUNC inline float predux_min(const float4& a) { - return fminf(fminf(a.x, a.y), fminf(a.z, a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double predux_min(const double2& a) { - return fmin(a.x, a.y); -} - -template<> EIGEN_DEVICE_FUNC inline float predux_mul(const float4& a) { - return a.x * a.y * a.z * a.w; -} -template<> EIGEN_DEVICE_FUNC inline double predux_mul(const double2& a) { - return a.x * a.y; -} - -template<> EIGEN_DEVICE_FUNC inline float4 pabs(const float4& a) { - return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { - return make_double2(fabs(a.x), fabs(a.y)); -} - -template<> EIGEN_DEVICE_FUNC inline float4 pfloor(const float4& a) { - return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w)); -} -template<> EIGEN_DEVICE_FUNC inline double2 pfloor(const double2& a) { - return make_double2(floor(a.x), floor(a.y)); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - float tmp = kernel.packet[0].y; - kernel.packet[0].y = kernel.packet[1].x; - kernel.packet[1].x = tmp; - - tmp = kernel.packet[0].z; - kernel.packet[0].z = kernel.packet[2].x; - kernel.packet[2].x = tmp; - - tmp = kernel.packet[0].w; - kernel.packet[0].w = kernel.packet[3].x; - kernel.packet[3].x = tmp; - - tmp = kernel.packet[1].z; - kernel.packet[1].z = kernel.packet[2].y; - kernel.packet[2].y = tmp; - - tmp = kernel.packet[1].w; - kernel.packet[1].w = kernel.packet[3].y; - kernel.packet[3].y = tmp; - - tmp = kernel.packet[2].w; - kernel.packet[2].w = kernel.packet[3].z; - kernel.packet[3].z = tmp; -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - double tmp = kernel.packet[0].y; - kernel.packet[0].y = kernel.packet[1].x; - kernel.packet[1].x = tmp; -} - -#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) - -// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning -// its corresponding packet_traits must be visible on host. -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC)) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIPCC)) || \ - (defined(EIGEN_HAS_CUDA_FP16) && defined(__clang__) && defined(__CUDA__)) - -typedef ulonglong2 Packet4h2; -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; }; -template<> struct is_arithmetic { enum { value = true }; }; - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; }; -template<> struct is_arithmetic { enum { value = true }; }; - -template<> struct packet_traits : default_packet_traits -{ - typedef Packet4h2 type; - typedef Packet4h2 half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size=8, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasExp = 1, - HasExpm1 = 1, - HasLog = 1, - HasLog1p = 1 - }; -}; - -template<> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC) - half2 r; - r.x = from; - r.y = from; - return r; -#elif defined(EIGEN_HIPCC) - return __half2{from,from}; -#else - return __half2half2(from); -#endif -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -pset1(const Eigen::half& from) { - Packet4h2 r; - half2* p_alias = reinterpret_cast(&r); - p_alias[0] = pset1(from); - p_alias[1] = pset1(from); - p_alias[2] = pset1(from); - p_alias[3] = pset1(from); - return r; -} - -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) -namespace { - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { - return *reinterpret_cast(from); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return __halves2half2(from[0], from[1]); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return __halves2half2(from[0], from[0]); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, - const half2& from) { - *reinterpret_cast(to) = from; -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, - const half2& from) { -#if !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_HIPCC) - to[0] = from.x; - to[1] = from.y; -#else - to[0] = __low2half(from); - to[1] = __high2half(from); -#endif -} - - -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned( - const Eigen::half* from) { - -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __ldg((const half2*)from); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 350 - return __ldg((const half2*)from); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned( - const Eigen::half* from) { - -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __halves2half2(__ldg(from+0), __ldg(from+1)); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 350 - return __halves2half2(__ldg(from+0), __ldg(from+1)); -#else - return __halves2half2(*(from+0), *(from+1)); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, - Index stride) { - return __halves2half2(from[0*stride], from[1*stride]); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( - Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = __low2half(from); - to[stride*1] = __high2half(from); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return __low2half(a); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); - half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); - return __halves2half2(result1, result2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& a) { - half true_half = half_impl::raw_uint16_to_half(0xffffu); - return pset1(true_half); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& a) { - half false_half = half_impl::raw_uint16_to_half(0x0000u); - return pset1(false_half); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __half a1 = __low2half(kernel.packet[0]); - __half a2 = __high2half(kernel.packet[0]); - __half b1 = __low2half(kernel.packet[1]); - __half b2 = __high2half(kernel.packet[1]); - kernel.packet[0] = __halves2half2(a1, b1); - kernel.packet[1] = __halves2half2(a2, b2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __halves2half2(a, __hadd(a, __float2half(1.0f))); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __halves2half2(a, __hadd(a, __float2half(1.0f))); -#else - float f = __half2float(a) + 1.0f; - return __halves2half2(a, __float2half(f)); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, - const half2& a, - const half2& b) { - half mask_low = __low2half(mask); - half mask_high = __high2half(mask); - half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); - half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a); - return __halves2half2(result_low, result_high); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, - const half2& b) { - half true_half = half_impl::raw_uint16_to_half(0xffffu); - half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; - half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half; - return __halves2half2(eq1, eq2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, - const half2& b) { - half true_half = half_impl::raw_uint16_to_half(0xffffu); - half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half; - half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half; - return __halves2half2(eq1, eq2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); - return __halves2half2(result1, result2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); - return __halves2half2(result1, result2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); - return __halves2half2(result1, result2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, - const half2& b) { - half a1 = __low2half(a); - half a2 = __high2half(a); - half b1 = __low2half(b); - half b2 = __high2half(b); - half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); - half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); - return __halves2half2(result1, result2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, - const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hadd2(a, b); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hadd2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 + b1; - float r2 = a2 + b2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, - const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hsub2(a, b); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hsub2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 - b1; - float r2 = a2 - b2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hneg2(a); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hneg2(a); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return __floats2half2_rn(-a1, -a2); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; } - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, - const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hmul2(a, b); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hmul2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 * b1; - float r2 = a2 * b2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, - const half2& b, - const half2& c) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hfma2(a, b, c); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hfma2(a, b, c); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float c1 = __low2float(c); - float c2 = __high2float(c); - float r1 = a1 * b1 + c1; - float r2 = a2 * b2 + c2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, - const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __h2div(a, b); - -#else // EIGEN_CUDA_ARCH - - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 / b1; - float r2 = a2 / b2; - return __floats2half2_rn(r1, r2); - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, - const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, - const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hadd(__low2half(a), __high2half(a)); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hadd(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half(a1 + a2)); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - __half first = __low2half(a); - __half second = __high2half(a); - return __hgt(first, second) ? first : second; - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hgt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 > a2 ? __low2half(a) : __high2half(a); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - __half first = __low2half(a); - __half second = __high2half(a); - return __hlt(first, second) ? first : second; - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - __half first = __low2half(a); - __half second = __high2half(a); - return __hlt(first, second) ? first : second; -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return a1 < a2 ? __low2half(a) : __high2half(a); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hmul(__low2half(a), __high2half(a)); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hmul(__low2half(a), __high2half(a)); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - return Eigen::half(__float2half(a1 * a2)); -#endif - -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = log1pf(a1); - float r2 = log1pf(a2); - return __floats2half2_rn(r1, r2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = expm1f(a1); - float r2 = expm1f(a2); - return __floats2half2_rn(r1, r2); -} - -#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \ - defined(EIGEN_HIP_DEVICE_COMPILE) - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 plog(const half2& a) { - return h2log(a); -} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 pexp(const half2& a) { - return h2exp(a); -} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 psqrt(const half2& a) { - return h2sqrt(a); -} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -half2 prsqrt(const half2& a) { - return h2rsqrt(a); -} - -#else - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = logf(a1); - float r2 = logf(a2); - return __floats2half2_rn(r1, r2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = expf(a1); - float r2 = expf(a2); - return __floats2half2_rn(r1, r2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = sqrtf(a1); - float r2 = sqrtf(a2); - return __floats2half2_rn(r1, r2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float r1 = rsqrtf(a1); - float r2 = rsqrtf(a2); - return __floats2half2_rn(r1, r2); -} -#endif -} // namespace - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -pload(const Eigen::half* from) { - return *reinterpret_cast(from); -} - -// unaligned load; -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -ploadu(const Eigen::half* from) { - Packet4h2 r; - half2* p_alias = reinterpret_cast(&r); - p_alias[0] = ploadu(from + 0); - p_alias[1] = ploadu(from + 2); - p_alias[2] = ploadu(from + 4); - p_alias[3] = ploadu(from + 6); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -ploaddup(const Eigen::half* from) { - Packet4h2 r; - half2* p_alias = reinterpret_cast(&r); - p_alias[0] = ploaddup(from + 0); - p_alias[1] = ploaddup(from + 1); - p_alias[2] = ploaddup(from + 2); - p_alias[3] = ploaddup(from + 3); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore( - Eigen::half* to, const Packet4h2& from) { - *reinterpret_cast(to) = from; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu( - Eigen::half* to, const Packet4h2& from) { - const half2* from_alias = reinterpret_cast(&from); - pstoreu(to + 0,from_alias[0]); - pstoreu(to + 2,from_alias[1]); - pstoreu(to + 4,from_alias[2]); - pstoreu(to + 6,from_alias[3]); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 -ploadt_ro(const Eigen::half* from) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - Packet4h2 r; - r = __ldg((const Packet4h2*)from); - return r; -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 350 - Packet4h2 r; - r = __ldg((const Packet4h2*)from); - return r; -#else - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - r_alias[0] = ploadt_ro_aligned(from + 0); - r_alias[1] = ploadt_ro_aligned(from + 2); - r_alias[2] = ploadt_ro_aligned(from + 4); - r_alias[3] = ploadt_ro_aligned(from + 6); - return r; -#endif - -#endif -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 -ploadt_ro(const Eigen::half* from) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - r_alias[0] = ploadt_ro_unaligned(from + 0); - r_alias[1] = ploadt_ro_unaligned(from + 2); - r_alias[2] = ploadt_ro_unaligned(from + 4); - r_alias[3] = ploadt_ro_unaligned(from + 6); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -pgather(const Eigen::half* from, Index stride) { - Packet4h2 r; - half2* p_alias = reinterpret_cast(&r); - p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]); - p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]); - p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]); - p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( - Eigen::half* to, const Packet4h2& from, Index stride) { - const half2* from_alias = reinterpret_cast(&from); - pscatter(to + stride * 0, from_alias[0], stride); - pscatter(to + stride * 2, from_alias[1], stride); - pscatter(to + stride * 4, from_alias[2], stride); - pscatter(to + stride * 6, from_alias[3], stride); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst( - const Packet4h2& a) { - return pfirst(*(reinterpret_cast(&a))); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs( - const Packet4h2& a) { - Packet4h2 r; - half2* p_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - p_alias[0] = pabs(a_alias[0]); - p_alias[1] = pabs(a_alias[1]); - p_alias[2] = pabs(a_alias[2]); - p_alias[3] = pabs(a_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue( - const Packet4h2& a) { - half true_half = half_impl::raw_uint16_to_half(0xffffu); - return pset1(true_half); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero(const Packet4h2& a) { - half false_half = half_impl::raw_uint16_to_half(0x0000u); - return pset1(false_half); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double( - double* d_row0, double* d_row1, double* d_row2, double* d_row3, - double* d_row4, double* d_row5, double* d_row6, double* d_row7) { - double d_tmp; - d_tmp = d_row0[1]; - d_row0[1] = d_row4[0]; - d_row4[0] = d_tmp; - - d_tmp = d_row1[1]; - d_row1[1] = d_row5[0]; - d_row5[0] = d_tmp; - - d_tmp = d_row2[1]; - d_row2[1] = d_row6[0]; - d_row6[0] = d_tmp; - - d_tmp = d_row3[1]; - d_row3[1] = d_row7[0]; - d_row7[0] = d_tmp; -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2( - half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) { - half2 f_tmp; - f_tmp = f_row0[1]; - f_row0[1] = f_row2[0]; - f_row2[0] = f_tmp; - - f_tmp = f_row1[1]; - f_row1[1] = f_row3[0]; - f_row3[0] = f_tmp; -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void -ptranspose_half(half2& f0, half2& f1) { - __half a1 = __low2half(f0); - __half a2 = __high2half(f0); - __half b1 = __low2half(f1); - __half b2 = __high2half(f1); - f0 = __halves2half2(a1, b1); - f1 = __halves2half2(a2, b2); -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - double* d_row0 = reinterpret_cast(&kernel.packet[0]); - double* d_row1 = reinterpret_cast(&kernel.packet[1]); - double* d_row2 = reinterpret_cast(&kernel.packet[2]); - double* d_row3 = reinterpret_cast(&kernel.packet[3]); - double* d_row4 = reinterpret_cast(&kernel.packet[4]); - double* d_row5 = reinterpret_cast(&kernel.packet[5]); - double* d_row6 = reinterpret_cast(&kernel.packet[6]); - double* d_row7 = reinterpret_cast(&kernel.packet[7]); - ptranspose_double(d_row0, d_row1, d_row2, d_row3, - d_row4, d_row5, d_row6, d_row7); - - - half2* f_row0 = reinterpret_cast(d_row0); - half2* f_row1 = reinterpret_cast(d_row1); - half2* f_row2 = reinterpret_cast(d_row2); - half2* f_row3 = reinterpret_cast(d_row3); - ptranspose_half2(f_row0, f_row1, f_row2, f_row3); - ptranspose_half(f_row0[0], f_row1[0]); - ptranspose_half(f_row0[1], f_row1[1]); - ptranspose_half(f_row2[0], f_row3[0]); - ptranspose_half(f_row2[1], f_row3[1]); - - f_row0 = reinterpret_cast(d_row0 + 1); - f_row1 = reinterpret_cast(d_row1 + 1); - f_row2 = reinterpret_cast(d_row2 + 1); - f_row3 = reinterpret_cast(d_row3 + 1); - ptranspose_half2(f_row0, f_row1, f_row2, f_row3); - ptranspose_half(f_row0[0], f_row1[0]); - ptranspose_half(f_row0[1], f_row1[1]); - ptranspose_half(f_row2[0], f_row3[0]); - ptranspose_half(f_row2[1], f_row3[1]); - - f_row0 = reinterpret_cast(d_row4); - f_row1 = reinterpret_cast(d_row5); - f_row2 = reinterpret_cast(d_row6); - f_row3 = reinterpret_cast(d_row7); - ptranspose_half2(f_row0, f_row1, f_row2, f_row3); - ptranspose_half(f_row0[0], f_row1[0]); - ptranspose_half(f_row0[1], f_row1[1]); - ptranspose_half(f_row2[0], f_row3[0]); - ptranspose_half(f_row2[1], f_row3[1]); - - f_row0 = reinterpret_cast(d_row4 + 1); - f_row1 = reinterpret_cast(d_row5 + 1); - f_row2 = reinterpret_cast(d_row6 + 1); - f_row3 = reinterpret_cast(d_row7 + 1); - ptranspose_half2(f_row0, f_row1, f_row2, f_row3); - ptranspose_half(f_row0[0], f_row1[0]); - ptranspose_half(f_row0[1], f_row1[1]); - ptranspose_half(f_row2[0], f_row3[0]); - ptranspose_half(f_row2[1], f_row3[1]); - -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -plset(const Eigen::half& a) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - Packet4h2 r; - half2* p_alias = reinterpret_cast(&r); - p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f))); - p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), - __hadd(a, __float2half(3.0f))); - p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), - __hadd(a, __float2half(5.0f))); - p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), - __hadd(a, __float2half(7.0f))); - return r; -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - - half2 b = pset1(a); - half2 c; - half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f)); - half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f)); - - c = __hadd2(b, half_offset0); - r_alias[0] = plset(__low2half(c)); - r_alias[1] = plset(__high2half(c)); - - c = __hadd2(b, half_offset1); - r_alias[2] = plset(__low2half(c)); - r_alias[3] = plset(__high2half(c)); - - return r; - -#else - float f = __half2float(a); - Packet4h2 r; - half2* p_alias = reinterpret_cast(&r); - p_alias[0] = __halves2half2(a, __float2half(f + 1.0f)); - p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f)); - p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f)); - p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f)); - return r; -#endif - -#endif -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -pselect(const Packet4h2& mask, const Packet4h2& a, - const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* mask_alias = reinterpret_cast(&mask); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]); - r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]); - r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]); - r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -pcmp_eq(const Packet4h2& a, const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]); - r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]); - r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]); - r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand( - const Packet4h2& a, const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = pand(a_alias[0], b_alias[0]); - r_alias[1] = pand(a_alias[1], b_alias[1]); - r_alias[2] = pand(a_alias[2], b_alias[2]); - r_alias[3] = pand(a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por( - const Packet4h2& a, const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = por(a_alias[0], b_alias[0]); - r_alias[1] = por(a_alias[1], b_alias[1]); - r_alias[2] = por(a_alias[2], b_alias[2]); - r_alias[3] = por(a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor( - const Packet4h2& a, const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = pxor(a_alias[0], b_alias[0]); - r_alias[1] = pxor(a_alias[1], b_alias[1]); - r_alias[2] = pxor(a_alias[2], b_alias[2]); - r_alias[3] = pxor(a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -pandnot(const Packet4h2& a, const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = pandnot(a_alias[0], b_alias[0]); - r_alias[1] = pandnot(a_alias[1], b_alias[1]); - r_alias[2] = pandnot(a_alias[2], b_alias[2]); - r_alias[3] = pandnot(a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd( - const Packet4h2& a, const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = padd(a_alias[0], b_alias[0]); - r_alias[1] = padd(a_alias[1], b_alias[1]); - r_alias[2] = padd(a_alias[2], b_alias[2]); - r_alias[3] = padd(a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub( - const Packet4h2& a, const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = psub(a_alias[0], b_alias[0]); - r_alias[1] = psub(a_alias[1], b_alias[1]); - r_alias[2] = psub(a_alias[2], b_alias[2]); - r_alias[3] = psub(a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - r_alias[0] = pnegate(a_alias[0]); - r_alias[1] = pnegate(a_alias[1]); - r_alias[2] = pnegate(a_alias[2]); - r_alias[3] = pnegate(a_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) { - return a; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul( - const Packet4h2& a, const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = pmul(a_alias[0], b_alias[0]); - r_alias[1] = pmul(a_alias[1], b_alias[1]); - r_alias[2] = pmul(a_alias[2], b_alias[2]); - r_alias[3] = pmul(a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd( - const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - const half2* c_alias = reinterpret_cast(&c); - r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]); - r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]); - r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]); - r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv( - const Packet4h2& a, const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = pdiv(a_alias[0], b_alias[0]); - r_alias[1] = pdiv(a_alias[1], b_alias[1]); - r_alias[2] = pdiv(a_alias[2], b_alias[2]); - r_alias[3] = pdiv(a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin( - const Packet4h2& a, const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = pmin(a_alias[0], b_alias[0]); - r_alias[1] = pmin(a_alias[1], b_alias[1]); - r_alias[2] = pmin(a_alias[2], b_alias[2]); - r_alias[3] = pmin(a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax( - const Packet4h2& a, const Packet4h2& b) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - const half2* b_alias = reinterpret_cast(&b); - r_alias[0] = pmax(a_alias[0], b_alias[0]); - r_alias[1] = pmax(a_alias[1], b_alias[1]); - r_alias[2] = pmax(a_alias[2], b_alias[2]); - r_alias[3] = pmax(a_alias[3], b_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux( - const Packet4h2& a) { - const half2* a_alias = reinterpret_cast(&a); - - return predux(a_alias[0]) + predux(a_alias[1]) + - predux(a_alias[2]) + predux(a_alias[3]); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max( - const Packet4h2& a) { - const half2* a_alias = reinterpret_cast(&a); - half2 m0 = __halves2half2(predux_max(a_alias[0]), - predux_max(a_alias[1])); - half2 m1 = __halves2half2(predux_max(a_alias[2]), - predux_max(a_alias[3])); - __half first = predux_max(m0); - __half second = predux_max(m1); -#if EIGEN_CUDA_ARCH >= 530 - return (__hgt(first, second) ? first : second); -#else - float ffirst = __half2float(first); - float fsecond = __half2float(second); - return (ffirst > fsecond)? first: second; -#endif -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min( - const Packet4h2& a) { - const half2* a_alias = reinterpret_cast(&a); - half2 m0 = __halves2half2(predux_min(a_alias[0]), - predux_min(a_alias[1])); - half2 m1 = __halves2half2(predux_min(a_alias[2]), - predux_min(a_alias[3])); - __half first = predux_min(m0); - __half second = predux_min(m1); -#if EIGEN_CUDA_ARCH >= 530 - return (__hlt(first, second) ? first : second); -#else - float ffirst = __half2float(first); - float fsecond = __half2float(second); - return (ffirst < fsecond)? first: second; -#endif -} - -// likely overflow/underflow -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul( - const Packet4h2& a) { - const half2* a_alias = reinterpret_cast(&a); - return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), - pmul(a_alias[2], a_alias[3]))); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -plog1p(const Packet4h2& a) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - r_alias[0] = plog1p(a_alias[0]); - r_alias[1] = plog1p(a_alias[1]); - r_alias[2] = plog1p(a_alias[2]); - r_alias[3] = plog1p(a_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -pexpm1(const Packet4h2& a) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - r_alias[0] = pexpm1(a_alias[0]); - r_alias[1] = pexpm1(a_alias[1]); - r_alias[2] = pexpm1(a_alias[2]); - r_alias[3] = pexpm1(a_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog(const Packet4h2& a) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - r_alias[0] = plog(a_alias[0]); - r_alias[1] = plog(a_alias[1]); - r_alias[2] = plog(a_alias[2]); - r_alias[3] = plog(a_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp(const Packet4h2& a) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - r_alias[0] = pexp(a_alias[0]); - r_alias[1] = pexp(a_alias[1]); - r_alias[2] = pexp(a_alias[2]); - r_alias[3] = pexp(a_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt(const Packet4h2& a) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - r_alias[0] = psqrt(a_alias[0]); - r_alias[1] = psqrt(a_alias[1]); - r_alias[2] = psqrt(a_alias[2]); - r_alias[3] = psqrt(a_alias[3]); - return r; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 -prsqrt(const Packet4h2& a) { - Packet4h2 r; - half2* r_alias = reinterpret_cast(&r); - const half2* a_alias = reinterpret_cast(&a); - r_alias[0] = prsqrt(a_alias[0]); - r_alias[1] = prsqrt(a_alias[1]); - r_alias[2] = prsqrt(a_alias[2]); - r_alias[3] = prsqrt(a_alias[3]); - return r; -} - -// The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for -// the implementation of GPU half reduction. -template<> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, - const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hadd2(a, b); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hadd2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 + b1; - float r2 = a2 + b2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, - const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __hmul2(a, b); - -#else // EIGEN_CUDA_ARCH - -#if EIGEN_CUDA_ARCH >= 530 - return __hmul2(a, b); -#else - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 * b1; - float r2 = a2 * b2; - return __floats2half2_rn(r1, r2); -#endif - -#endif -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, - const half2& b) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - - return __h2div(a, b); - -#else // EIGEN_CUDA_ARCH - - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - float r1 = a1 / b1; - float r2 = a2 / b2; - return __floats2half2_rn(r1, r2); - -#endif -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, - const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -template<> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, - const half2& b) { - float a1 = __low2float(a); - float a2 = __high2float(a); - float b1 = __low2float(b); - float b2 = __high2float(b); - __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); - __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); - return __halves2half2(r1, r2); -} - -#endif // defined(EIGEN_CUDA_ARCH) - -#endif // defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC) - -} // end namespace internal - -} // end namespace Eigen - - -#endif // EIGEN_PACKET_MATH_GPU_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/TypeCasting.h deleted file mode 100644 index 754546225..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/GPU/TypeCasting.h +++ /dev/null @@ -1,80 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2016 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_TYPE_CASTING_GPU_H -#define EIGEN_TYPE_CASTING_GPU_H - -namespace Eigen { - -namespace internal { - -#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 2 - }; -}; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const half2& a, const half2& b) { - float2 r1 = __half22float2(a); - float2 r2 = __half22float2(b); - return make_float4(r1.x, r1.y, r2.x, r2.y); -} - - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast(const float4& a, const float4& b) { - Packet4h2 r; - half2* r_alias=reinterpret_cast(&r); - r_alias[0]=__floats2half2_rn(a.x,a.y); - r_alias[1]=__floats2half2_rn(a.z,a.w); - r_alias[2]=__floats2half2_rn(b.x,b.y); - r_alias[3]=__floats2half2_rn(b.z,b.w); - return r; -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 2, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast(const Packet4h2& a) { - // Simply discard the second half of the input - float4 r; - const half2* a_alias=reinterpret_cast(&a); - float2 r1 = __half22float2(a_alias[0]); - float2 r2 = __half22float2(a_alias[1]); - r.x=static_cast(r1.x); - r.y=static_cast(r1.y); - r.z=static_cast(r2.x); - r.w=static_cast(r2.y); - return r; -} - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast(const float4& a) { - // Simply discard the second half of the input - return __floats2half2_rn(a.x, a.y); -} - -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TYPE_CASTING_GPU_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h deleted file mode 100644 index 25375a0a4..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/HIP/hcc/math_constants.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * math_constants.h - - * HIP equivalent of the CUDA header of the same name - */ - -#ifndef __MATH_CONSTANTS_H__ -#define __MATH_CONSTANTS_H__ - -/* single precision constants */ - -#define HIPRT_INF_F __int_as_float(0x7f800000) -#define HIPRT_NAN_F __int_as_float(0x7fffffff) -#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001) -#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff) -#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000) -#define HIPRT_ZERO_F 0.0f -#define HIPRT_ONE_F 1.0f - -/* double precision constants */ -#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000) -#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000) - -#endif diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/Complex.h deleted file mode 100644 index 4877a95a8..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/Complex.h +++ /dev/null @@ -1,720 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2018 Wave Computing, Inc. -// Written by: -// Chris Larsen -// Alexey Frunze (afrunze@wavecomp.com) -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_COMPLEX_MSA_H -#define EIGEN_COMPLEX_MSA_H - -#include - -namespace Eigen { - -namespace internal { - -//---------- float ---------- -struct Packet2cf { - EIGEN_STRONG_INLINE Packet2cf() { - } - EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex& a, - const std::complex& b) { - Packet4f t = { std::real(a), std::imag(a), std::real(b), std::imag(b) }; - v = t; - } - EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) { - } - EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) { - } - EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) { - v = b.v; - return *this; - } - EIGEN_STRONG_INLINE Packet2cf conjugate(void) const { - return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63)); - } - EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) { - Packet4f v1, v2; - - // Get the real values of a | a1_re | a1_re | a2_re | a2_re | - v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v); - // Get the imag values of a | a1_im | a1_im | a2_im | a2_im | - v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v); - // Multiply the real a with b - v1 = pmul(v1, b.v); - // Multiply the imag a with b - v2 = pmul(v2, b.v); - // Conjugate v2 - v2 = Packet2cf(v2).conjugate().v; - // Swap real/imag elements in v2. - v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2)); - // Add and return the result - v = padd(v1, v2); - return *this; - } - EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { - return Packet2cf(*this) *= b; - } - EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) { - v = padd(v, b.v); - return *this; - } - EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { - return Packet2cf(*this) += b; - } - EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) { - v = psub(v, b.v); - return *this; - } - EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { - return Packet2cf(*this) -= b; - } - EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) { - *this *= b.conjugate(); - Packet4f s = pmul(b.v, b.v); - s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); - v = pdiv(v, s); - return *this; - } - EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const { - return Packet2cf(*this) /= b; - } - EIGEN_STRONG_INLINE Packet2cf operator-(void) const { - return Packet2cf(pnegate(v)); - } - - Packet4f v; -}; - -inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) { - os << "[ (" << value.v[0] << ", " << value.v[1] - << "i)," - " (" - << value.v[2] << ", " << value.v[3] << "i) ]"; - return os; -} - -template <> -struct packet_traits > : default_packet_traits { - typedef Packet2cf type; - typedef Packet2cf half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 2, - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasSetLinear = 0, - HasBlend = 1 - }; -}; - -template <> -struct unpacket_traits { - typedef std::complex type; - enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; - typedef Packet2cf half; -}; - -template <> -EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { - EIGEN_MSA_DEBUG; - - float f0 = from.real(), f1 = from.imag(); - Packet4f v0 = { f0, f0, f0, f0 }; - Packet4f v1 = { f1, f1, f1, f1 }; - return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0)); -} - -template <> -EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { - EIGEN_MSA_DEBUG; - - return a + b; -} - -template <> -EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { - EIGEN_MSA_DEBUG; - - return a - b; -} - -template <> -EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { - EIGEN_MSA_DEBUG; - - return -a; -} - -template <> -EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - EIGEN_MSA_DEBUG; - - return a.conjugate(); -} - -template <> -EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { - EIGEN_MSA_DEBUG; - - return a * b; -} - -template <> -EIGEN_STRONG_INLINE Packet2cf pand(const Packet2cf& a, const Packet2cf& b) { - EIGEN_MSA_DEBUG; - - return Packet2cf(pand(a.v, b.v)); -} - -template <> -EIGEN_STRONG_INLINE Packet2cf por(const Packet2cf& a, const Packet2cf& b) { - EIGEN_MSA_DEBUG; - - return Packet2cf(por(a.v, b.v)); -} - -template <> -EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b) { - EIGEN_MSA_DEBUG; - - return Packet2cf(pxor(a.v, b.v)); -} - -template <> -EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { - EIGEN_MSA_DEBUG; - - return Packet2cf(pandnot(a.v, b.v)); -} - -template <> -EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); -} - -template <> -EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); -} - -template <> -EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { - EIGEN_MSA_DEBUG; - - return pset1(*from); -} - -template <> -EIGEN_STRONG_INLINE void pstore >(std::complex* to, - const Packet2cf& from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, - const Packet2cf& from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); -} - -template <> -EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>( - const std::complex* from, Index stride) { - EIGEN_MSA_DEBUG; - - return Packet2cf(from[0 * stride], from[1 * stride]); -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, - const Packet2cf& from, - Index stride) { - EIGEN_MSA_DEBUG; - - *to = std::complex(from.v[0], from.v[1]); - to += stride; - *to = std::complex(from.v[2], from.v[3]); -} - -template <> -EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { - EIGEN_MSA_DEBUG; - - prefetch(reinterpret_cast(addr)); -} - -template <> -EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - EIGEN_MSA_DEBUG; - - return std::complex(a.v[0], a.v[1]); -} - -template <> -EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { - EIGEN_MSA_DEBUG; - - return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); -} - -template <> -EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) { - EIGEN_MSA_DEBUG; - - return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); -} - -template <> -EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { - EIGEN_MSA_DEBUG; - - Packet4f value = (Packet4f)preverse((Packet2d)a.v); - value += a.v; - return std::complex(value[0], value[1]); -} - -template <> -EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { - EIGEN_MSA_DEBUG; - - return std::complex((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]), - (a.v[0] * a.v[3]) + (a.v[1] * a.v[2])); -} - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return internal::pmul(a, pconj(b)); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return internal::pmul(pconj(a), b); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return pconj(internal::pmul(a, b)); - } -}; - -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f) - -template <> -EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { - EIGEN_MSA_DEBUG; - - return a / b; -} - -inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { - os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]"; - return os; -} - -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - EIGEN_MSA_DEBUG; - - Packet4f tmp = - (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v); - kernel.packet[0].v = - (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v); - kernel.packet[1].v = tmp; -} - -template <> -EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, - const Packet2cf& elsePacket) { - return (Packet2cf)(Packet4f)pblend(ifPacket, (Packet2d)thenPacket.v, - (Packet2d)elsePacket.v); -} - -//---------- double ---------- - -struct Packet1cd { - EIGEN_STRONG_INLINE Packet1cd() { - } - EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex& a) { - v[0] = std::real(a); - v[1] = std::imag(a); - } - EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) { - } - EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) { - } - EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) { - v = b.v; - return *this; - } - EIGEN_STRONG_INLINE Packet1cd conjugate(void) const { - static const v2u64 p2ul_CONJ_XOR = { 0x0, 0x8000000000000000 }; - return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR); - } - EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) { - Packet2d v1, v2; - - // Get the real values of a | a1_re | a1_re - v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v); - // Get the imag values of a | a1_im | a1_im - v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v); - // Multiply the real a with b - v1 = pmul(v1, b.v); - // Multiply the imag a with b - v2 = pmul(v2, b.v); - // Conjugate v2 - v2 = Packet1cd(v2).conjugate().v; - // Swap real/imag elements in v2. - v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); - // Add and return the result - v = padd(v1, v2); - return *this; - } - EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { - return Packet1cd(*this) *= b; - } - EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) { - v = padd(v, b.v); - return *this; - } - EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { - return Packet1cd(*this) += b; - } - EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) { - v = psub(v, b.v); - return *this; - } - EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { - return Packet1cd(*this) -= b; - } - EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) { - *this *= b.conjugate(); - Packet2d s = pmul(b.v, b.v); - s = padd(s, preverse(s)); - v = pdiv(v, s); - return *this; - } - EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const { - return Packet1cd(*this) /= b; - } - EIGEN_STRONG_INLINE Packet1cd operator-(void) const { - return Packet1cd(pnegate(v)); - } - - Packet2d v; -}; - -inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) { - os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]"; - return os; -} - -template <> -struct packet_traits > : default_packet_traits { - typedef Packet1cd type; - typedef Packet1cd half; - enum { - Vectorizable = 1, - AlignedOnScalar = 0, - size = 1, - HasHalfPacket = 0, - - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasSetLinear = 0 - }; -}; - -template <> -struct unpacket_traits { - typedef std::complex type; - enum { size = 1, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; - typedef Packet1cd half; -}; - -template <> -EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); -} - -template <> -EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); -} - -template <> -EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { - EIGEN_MSA_DEBUG; - - return Packet1cd(from); -} - -template <> -EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { - EIGEN_MSA_DEBUG; - - return a + b; -} - -template <> -EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { - EIGEN_MSA_DEBUG; - - return a - b; -} - -template <> -EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { - EIGEN_MSA_DEBUG; - - return -a; -} - -template <> -EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { - EIGEN_MSA_DEBUG; - - return a.conjugate(); -} - -template <> -EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { - EIGEN_MSA_DEBUG; - - return a * b; -} - -template <> -EIGEN_STRONG_INLINE Packet1cd pand(const Packet1cd& a, const Packet1cd& b) { - EIGEN_MSA_DEBUG; - - return Packet1cd(pand(a.v, b.v)); -} - -template <> -EIGEN_STRONG_INLINE Packet1cd por(const Packet1cd& a, const Packet1cd& b) { - EIGEN_MSA_DEBUG; - - return Packet1cd(por(a.v, b.v)); -} - -template <> -EIGEN_STRONG_INLINE Packet1cd pxor(const Packet1cd& a, const Packet1cd& b) { - EIGEN_MSA_DEBUG; - - return Packet1cd(pxor(a.v, b.v)); -} - -template <> -EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { - EIGEN_MSA_DEBUG; - - return Packet1cd(pandnot(a.v, b.v)); -} - -template <> -EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { - EIGEN_MSA_DEBUG; - - return pset1(*from); -} - -template <> -EIGEN_STRONG_INLINE void pstore >(std::complex* to, - const Packet1cd& from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, - const Packet1cd& from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); -} - -template <> -EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { - EIGEN_MSA_DEBUG; - - prefetch(reinterpret_cast(addr)); -} - -template <> -EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>( - const std::complex* from, Index stride __attribute__((unused))) { - EIGEN_MSA_DEBUG; - - Packet1cd res; - res.v[0] = std::real(from[0]); - res.v[1] = std::imag(from[0]); - return res; -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, - const Packet1cd& from, - Index stride - __attribute__((unused))) { - EIGEN_MSA_DEBUG; - - pstore(to, from); -} - -template <> -EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - EIGEN_MSA_DEBUG; - - return std::complex(a.v[0], a.v[1]); -} - -template <> -EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { - EIGEN_MSA_DEBUG; - - return a; -} - -template <> -EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { - EIGEN_MSA_DEBUG; - - return pfirst(a); -} - -template <> -EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { - EIGEN_MSA_DEBUG; - - return pfirst(a); -} - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return internal::pmul(a, pconj(b)); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return internal::pmul(pconj(a), b); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return pconj(internal::pmul(a, b)); - } -}; - -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d) - -template <> -EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { - EIGEN_MSA_DEBUG; - - return a / b; -} - -EIGEN_STRONG_INLINE Packet1cd pcplxflip /**/ (const Packet1cd& x) { - EIGEN_MSA_DEBUG; - - return Packet1cd(preverse(Packet2d(x.v))); -} - -inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { - os << "[ " << value.packet[0] << ", " << std::endl << " " << value.packet[1] << " ]"; - return os; -} - -EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - EIGEN_MSA_DEBUG; - - Packet2d v1, v2; - - v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v); - // Get the imag values of a - v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v); - - kernel.packet[0].v = v1; - kernel.packet[1].v = v2; -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_COMPLEX_MSA_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/MathFunctions.h deleted file mode 100644 index f5181b90e..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/MathFunctions.h +++ /dev/null @@ -1,387 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2007 Julien Pommier -// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com) -// Copyright (C) 2016 Gael Guennebaud -// -// Copyright (C) 2018 Wave Computing, Inc. -// Written by: -// Chris Larsen -// Alexey Frunze (afrunze@wavecomp.com) -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/* The sin, cos, exp, and log functions of this file come from - * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ - */ - -/* The tanh function of this file is an adaptation of - * template T generic_fast_tanh_float(const T&) - * from MathFunctionsImpl.h. - */ - -#ifndef EIGEN_MATH_FUNCTIONS_MSA_H -#define EIGEN_MATH_FUNCTIONS_MSA_H - -namespace Eigen { - -namespace internal { - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f -plog(const Packet4f& _x) { - static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); - - // Convert negative argument into NAN (quiet negative, to be specific). - Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0); - Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero); - Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero); - Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask); // Add 0.0 or NAN. - Packet4f x = non_neg_x_or_nan; - - // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0. - // N.B. the exponent is one less of what frexpf() would return. - Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x)); - // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf(). - x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0)); - - /* - if (x < SQRTHF) { - x = x + x - 1.0; - } else { - e += 1; - x = x - 1.0; - } - */ - Packet4f xx = padd(x, x); - Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x); - e_int = psub(e_int, ge_mask); - x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x); - x = psub(x, p4f_1); - Packet4f e = __builtin_msa_ffint_s_w(e_int); - - Packet4f x2 = pmul(x, x); - Packet4f x3 = pmul(x2, x); - - Packet4f y, y1, y2; - y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); - y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); - y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); - y = pmadd(y, x, p4f_cephes_log_p2); - y1 = pmadd(y1, x, p4f_cephes_log_p5); - y2 = pmadd(y2, x, p4f_cephes_log_p8); - y = pmadd(y, x3, y1); - y = pmadd(y, x3, y2); - y = pmul(y, x3); - - y = pmadd(e, p4f_cephes_log_q1, y); - x = __builtin_msa_fmsub_w(x, x2, p4f_half); - x = padd(x, y); - x = pmadd(e, p4f_cephes_log_q2, x); - - // x is now the logarithm result candidate. We still need to handle the - // extreme arguments of zero and positive infinity, though. - // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms - // contain infinities of both signs (see the coefficients and code above). - // INFINITY - INFINITY is NAN. - - // If the argument is +INFINITY, make it the new result candidate. - // To achieve that we choose the smaller of the result candidate and the - // argument. - // This is correct for all finite pairs of values (the logarithm is smaller - // than the argument). - // This is also correct in the special case when the argument is +INFINITY - // and the result candidate is NAN. This is because the fmin.df instruction - // prefers non-NANs to NANs. - x = __builtin_msa_fmin_w(x, non_neg_x_or_nan); - - // If the argument is zero (including -0.0), the result becomes -INFINITY. - Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23); - x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs); - - return x; -} - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f -pexp(const Packet4f& _x) { - // Limiting single-precision pexp's argument to [-128, +128] lets pexp - // reach 0 and INFINITY naturally. - static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f); - static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); - - Packet4f x = _x; - - // Clamp x. - x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x, - (v16u8)p4f_exp_lo); - x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x, - (v16u8)p4f_exp_hi); - - // Round to nearest integer by adding 0.5 (with x's sign) and truncating. - Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0); - Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add); - Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2); - Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int); - - x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1); - x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2); - - Packet4f z = pmul(x, x); - - Packet4f y = p4f_cephes_exp_p0; - y = pmadd(y, x, p4f_cephes_exp_p1); - y = pmadd(y, x, p4f_cephes_exp_p2); - y = pmadd(y, x, p4f_cephes_exp_p3); - y = pmadd(y, x, p4f_cephes_exp_p4); - y = pmadd(y, x, p4f_cephes_exp_p5); - y = pmadd(y, z, x); - y = padd(y, p4f_1); - - // y *= 2**exponent. - y = __builtin_msa_fexp2_w(y, x2_int); - - return y; -} - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f -ptanh(const Packet4f& _x) { - static _EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f); - // The monomial coefficients of the numerator polynomial (odd). - static _EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f); - static _EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f); - // The monomial coefficients of the denominator polynomial (even). - static _EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f); - - Packet4f x = pabs(_x); - Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny); - - // Clamp the inputs to the range [-9, 9] since anything outside - // this range is -/+1.0f in single-precision. - x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x, - (v16u8)p4f_tanh_hi); - - // Since the polynomials are odd/even, we need x**2. - Packet4f x2 = pmul(x, x); - - // Evaluate the numerator polynomial p. - Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11); - p = pmadd(x2, p, p4f_alpha_9); - p = pmadd(x2, p, p4f_alpha_7); - p = pmadd(x2, p, p4f_alpha_5); - p = pmadd(x2, p, p4f_alpha_3); - p = pmadd(x2, p, p4f_alpha_1); - p = pmul(x, p); - - // Evaluate the denominator polynomial q. - Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4); - q = pmadd(x2, q, p4f_beta_2); - q = pmadd(x2, q, p4f_beta_0); - - // Divide the numerator by the denominator. - p = pdiv(p, q); - - // Reinstate the sign. - p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0); - - // When the argument is very small in magnitude it's more accurate to just return it. - p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x); - - return p; -} - -template -Packet4f psincos_inner_msa_float(const Packet4f& _x) { - static _EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f); // Approx. (2**24) / (4/Pi). - static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f); - static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); - static _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f); - static _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f); - static _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f); - static _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f); - static _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f); - static _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4/Pi. - static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); - static _EIGEN_DECLARE_CONST_Packet4f(1, 1.0f); - - Packet4f x = pabs(_x); - - // Translate infinite arguments into NANs. - Packet4f zero_or_nan_if_inf = psub(_x, _x); - x = padd(x, zero_or_nan_if_inf); - // Prevent sin/cos from generating values larger than 1.0 in magnitude - // for very large arguments by setting x to 0.0. - Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg); - x = pand(x, (Packet4f)small_or_nan_mask); - - // Scale x by 4/Pi to find x's octant. - Packet4f y = pmul(x, p4f_cephes_FOPI); - // Get the octant. We'll reduce x by this number of octants or by one more than it. - Packet4i y_int = __builtin_msa_ftrunc_s_w(y); - // x's from even-numbered octants will translate to octant 0: [0, +Pi/4]. - // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0]. - // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1). - Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1); - Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0); // bclri = bit-clear - y = __builtin_msa_ffint_s_w(y_int2); - - // Compute the sign to apply to the polynomial. - Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x) - : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29); - - // Get the polynomial selection mask. - // We'll calculate both (sin and cos) polynomials and then select from the two. - Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0); - - // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4. - // The magic pass: "Extended precision modular arithmetic" - // x = ((x - y * DP1) - y * DP2) - y * DP3 - Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1); - Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2); - Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3); - x = padd(x, tmp1); - x = padd(x, tmp2); - x = padd(x, tmp3); - - // Evaluate the cos(x) polynomial. - y = p4f_coscof_p0; - Packet4f z = pmul(x, x); - y = pmadd(y, z, p4f_coscof_p1); - y = pmadd(y, z, p4f_coscof_p2); - y = pmul(y, z); - y = pmul(y, z); - y = __builtin_msa_fmsub_w(y, z, p4f_half); - y = padd(y, p4f_1); - - // Evaluate the sin(x) polynomial. - Packet4f y2 = p4f_sincof_p0; - y2 = pmadd(y2, z, p4f_sincof_p1); - y2 = pmadd(y2, z, p4f_sincof_p2); - y2 = pmul(y2, z); - y2 = pmadd(y2, x, x); - - // Select the correct result from the two polynomials. - y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2) - : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y); - - // Update the sign. - sign_mask = pxor(sign_mask, (Packet4i)y); - y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0); // binsli = bit-insert-left - return y; -} - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f -psin(const Packet4f& x) { - return psincos_inner_msa_float(x); -} - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f -pcos(const Packet4f& x) { - return psincos_inner_msa_float(x); -} - -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d -pexp(const Packet2d& _x) { - // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp - // reach 0 and INFINITY naturally. - static _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0); - static _EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); - static _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); - static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); - static _EIGEN_DECLARE_CONST_Packet2d(1, 1.0); - static _EIGEN_DECLARE_CONST_Packet2d(2, 2.0); - - Packet2d x = _x; - - // Clamp x. - x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x, - (v16u8)p2d_exp_lo); - x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x, - (v16u8)p2d_exp_hi); - - // Round to nearest integer by adding 0.5 (with x's sign) and truncating. - Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0); - Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add); - Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2); - Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long); - - x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1); - x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2); - - x2 = pmul(x, x); - - Packet2d px = p2d_cephes_exp_p0; - px = pmadd(px, x2, p2d_cephes_exp_p1); - px = pmadd(px, x2, p2d_cephes_exp_p2); - px = pmul(px, x); - - Packet2d qx = p2d_cephes_exp_q0; - qx = pmadd(qx, x2, p2d_cephes_exp_q1); - qx = pmadd(qx, x2, p2d_cephes_exp_q2); - qx = pmadd(qx, x2, p2d_cephes_exp_q3); - - x = pdiv(px, psub(qx, px)); - x = pmadd(p2d_2, x, p2d_1); - - // x *= 2**exponent. - x = __builtin_msa_fexp2_d(x, x2_long); - - return x; -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_MATH_FUNCTIONS_MSA_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/PacketMath.h deleted file mode 100644 index f03cf61ff..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/MSA/PacketMath.h +++ /dev/null @@ -1,1237 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2018 Wave Computing, Inc. -// Written by: -// Chris Larsen -// Alexey Frunze (afrunze@wavecomp.com) -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_PACKET_MATH_MSA_H -#define EIGEN_PACKET_MATH_MSA_H - -#include -#include - -namespace Eigen { - -namespace internal { - -#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 -#endif - -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD -#endif - -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - -#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 -#endif - -#if 0 -#define EIGEN_MSA_DEBUG \ - static bool firstTime = true; \ - do { \ - if (firstTime) { \ - std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \ - firstTime = false; \ - } \ - } while (0) -#else -#define EIGEN_MSA_DEBUG -#endif - -#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a)) - -typedef v4f32 Packet4f; -typedef v4i32 Packet4i; -typedef v4u32 Packet4ui; - -#define _EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = { X, X, X, X } -#define _EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = { X, X, X, X } -#define _EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = { X, X, X, X } - -inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) { - os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) { - os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) { - os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]"; - return os; -} - -template <> -struct packet_traits : default_packet_traits { - typedef Packet4f type; - typedef Packet4f half; // Packet2f intrinsics not implemented yet - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 0, // Packet2f intrinsics not implemented yet - // FIXME check the Has* - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasRound = 1, - HasFloor = 1, - HasCeil = 1, - HasBlend = 1 - }; -}; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet4i type; - typedef Packet4i half; // Packet2i intrinsics not implemented yet - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 0, // Packet2i intrinsics not implemented yet - // FIXME check the Has* - HasDiv = 1, - HasBlend = 1 - }; -}; - -template <> -struct unpacket_traits { - typedef float type; - enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; - typedef Packet4f half; -}; - -template <> -struct unpacket_traits { - typedef int32_t type; - enum { size = 4, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; - typedef Packet4i half; -}; - -template <> -EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { - EIGEN_MSA_DEBUG; - - Packet4f v = { from, from, from, from }; - return v; -} - -template <> -EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fill_w(from); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pload1(const float* from) { - EIGEN_MSA_DEBUG; - - float f = *from; - Packet4f v = { f, f, f, f }; - return v; -} - -template <> -EIGEN_STRONG_INLINE Packet4i pload1(const int32_t* from) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fill_w(*from); -} - -template <> -EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fadd_w(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_addv_w(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet4f plset(const float& a) { - EIGEN_MSA_DEBUG; - - static const Packet4f countdown = { 0.0f, 1.0f, 2.0f, 3.0f }; - return padd(pset1(a), countdown); -} - -template <> -EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { - EIGEN_MSA_DEBUG; - - static const Packet4i countdown = { 0, 1, 2, 3 }; - return padd(pset1(a), countdown); -} - -template <> -EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fsub_w(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_subv_w(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { - EIGEN_MSA_DEBUG; - - return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31); -} - -template <> -EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { - EIGEN_MSA_DEBUG; - - return a; -} - -template <> -EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { - EIGEN_MSA_DEBUG; - - return a; -} - -template <> -EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fmul_w(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_mulv_w(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fdiv_w(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_div_s_w(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fmadd_w(c, a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { - EIGEN_MSA_DEBUG; - - // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug. - Packet4i value = c; - __asm__("maddv.w %w[value], %w[a], %w[b]\n" - // Outputs - : [value] "+f"(value) - // Inputs - : [a] "f"(a), [b] "f"(b)); - return value; -} - -template <> -EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { - EIGEN_MSA_DEBUG; - - return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b); -} - -template <> -EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { - EIGEN_MSA_DEBUG; - - return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b); -} - -template <> -EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { - EIGEN_MSA_DEBUG; - - return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b); -} - -template <> -EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { - EIGEN_MSA_DEBUG; - - return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { - EIGEN_MSA_DEBUG; - - return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b); -} - -template <> -EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { - EIGEN_MSA_DEBUG; - - return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { - EIGEN_MSA_DEBUG; - - return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255)); -} - -template <> -EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { - EIGEN_MSA_DEBUG; - - return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255)); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { - EIGEN_MSA_DEBUG; - -#if EIGEN_FAST_MATH - // This prefers numbers to NaNs. - return __builtin_msa_fmin_w(a, b); -#else - // This prefers NaNs to numbers. - Packet4i aNaN = __builtin_msa_fcun_w(a, a); - Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN); - return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a); -#endif -} - -template <> -EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_min_s_w(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { - EIGEN_MSA_DEBUG; - -#if EIGEN_FAST_MATH - // This prefers numbers to NaNs. - return __builtin_msa_fmax_w(a, b); -#else - // This prefers NaNs to numbers. - Packet4i aNaN = __builtin_msa_fcun_w(a, a); - Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN); - return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a); -#endif -} - -template <> -EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_max_s_w(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pload(const float* from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast(from), 0); -} - -template <> -EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast(from), 0); -} - -template <> -EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast(from), 0); -} - -template <> -EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast(from), 0); -} - -template <> -EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { - EIGEN_MSA_DEBUG; - - float f0 = from[0], f1 = from[1]; - Packet4f v0 = { f0, f0, f0, f0 }; - Packet4f v1 = { f1, f1, f1, f1 }; - return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0); -} - -template <> -EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) { - EIGEN_MSA_DEBUG; - - int32_t i0 = from[0], i1 = from[1]; - Packet4i v0 = { i0, i0, i0, i0 }; - Packet4i v1 = { i1, i1, i1, i1 }; - return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0); -} - -template <> -EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); -} - -template <> -EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0); -} - -template <> -EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - EIGEN_MSA_DEBUG; - - float f = *from; - Packet4f v = { f, f, f, f }; - v[1] = from[stride]; - v[2] = from[2 * stride]; - v[3] = from[3 * stride]; - return v; -} - -template <> -EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) { - EIGEN_MSA_DEBUG; - - int32_t i = *from; - Packet4i v = { i, i, i, i }; - v[1] = from[stride]; - v[2] = from[2 * stride]; - v[3] = from[3 * stride]; - return v; -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, - Index stride) { - EIGEN_MSA_DEBUG; - - *to = from[0]; - to += stride; - *to = from[1]; - to += stride; - *to = from[2]; - to += stride; - *to = from[3]; -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, - Index stride) { - EIGEN_MSA_DEBUG; - - *to = from[0]; - to += stride; - *to = from[1]; - to += stride; - *to = from[2]; - to += stride; - *to = from[3]; -} - -template <> -EIGEN_STRONG_INLINE void prefetch(const float* addr) { - EIGEN_MSA_DEBUG; - - __builtin_prefetch(addr); -} - -template <> -EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { - EIGEN_MSA_DEBUG; - - __builtin_prefetch(addr); -} - -template <> -EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { - EIGEN_MSA_DEBUG; - - return a[0]; -} - -template <> -EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { - EIGEN_MSA_DEBUG; - - return a[0]; -} - -template <> -EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { - EIGEN_MSA_DEBUG; - - return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); -} - -template <> -EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0)); -} - -template <> -EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { - EIGEN_MSA_DEBUG; - - return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31); -} - -template <> -EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { - EIGEN_MSA_DEBUG; - - Packet4i zero = __builtin_msa_ldi_w(0); - return __builtin_msa_add_a_w(zero, a); -} - -template <> -EIGEN_STRONG_INLINE float predux(const Packet4f& a) { - EIGEN_MSA_DEBUG; - - Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); - s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); - return s[0]; -} - - -template <> -EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { - EIGEN_MSA_DEBUG; - - Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); - s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); - return s[0]; -} - -// Other reduction functions: -// mul -template <> -EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { - EIGEN_MSA_DEBUG; - - Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); - p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); - return p[0]; -} - -template <> -EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) { - EIGEN_MSA_DEBUG; - - Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); - p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); - return p[0]; -} - -// min -template <> -EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { - EIGEN_MSA_DEBUG; - - // Swap 64-bit halves of a. - Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); -#if !EIGEN_FAST_MATH - // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit - // masks of all zeroes/ones in low 64 bits. - v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped); - // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes. - unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0); -#endif - // Continue with min computation. - Packet4f v = __builtin_msa_fmin_w(a, swapped); - v = __builtin_msa_fmin_w( - v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); -#if !EIGEN_FAST_MATH - // Based on the mask select between v and 4 qNaNs. - v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000); - v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v); -#endif - return v[0]; -} - -template <> -EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { - EIGEN_MSA_DEBUG; - - Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); - m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); - return m[0]; -} - -// max -template <> -EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { - EIGEN_MSA_DEBUG; - - // Swap 64-bit halves of a. - Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); -#if !EIGEN_FAST_MATH - // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit - // masks of all zeroes/ones in low 64 bits. - v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped); - // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes. - unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0); -#endif - // Continue with max computation. - Packet4f v = __builtin_msa_fmax_w(a, swapped); - v = __builtin_msa_fmax_w( - v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); -#if !EIGEN_FAST_MATH - // Based on the mask select between v and 4 qNaNs. - v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000); - v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v); -#endif - return v[0]; -} - -template <> -EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { - EIGEN_MSA_DEBUG; - - Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1))); - m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2))); - return m[0]; -} - -inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { - os << "[ " << value.packet[0] << "," << std::endl - << " " << value.packet[1] << "," << std::endl - << " " << value.packet[2] << "," << std::endl - << " " << value.packet[3] << " ]"; - return os; -} - -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - EIGEN_MSA_DEBUG; - - v4i32 tmp1, tmp2, tmp3, tmp4; - - tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); - tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); - tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]); - tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]); - - kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); - kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); - kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); - kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); -} - -inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { - os << "[ " << value.packet[0] << "," << std::endl - << " " << value.packet[1] << "," << std::endl - << " " << value.packet[2] << "," << std::endl - << " " << value.packet[3] << " ]"; - return os; -} - -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - EIGEN_MSA_DEBUG; - - v4i32 tmp1, tmp2, tmp3, tmp4; - - tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]); - tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]); - tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]); - tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]); - - kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1); - kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1); - kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3); - kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3); -} - -template <> -EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fsqrt_w(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) { - EIGEN_MSA_DEBUG; - -#if EIGEN_FAST_MATH - return __builtin_msa_frsqrt_w(a); -#else - Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1)); - return pdiv(ones, psqrt(a)); -#endif -} - -template <> -EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { - Packet4f v = a; - int32_t old_mode, new_mode; - asm volatile( - "cfcmsa %[old_mode], $1\n" - "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. - "ctcmsa $1, %[new_mode]\n" - "frint.w %w[v], %w[v]\n" - "ctcmsa $1, %[old_mode]\n" - : // outputs - [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), - [v] "+f"(v) - : // inputs - : // clobbers - ); - return v; -} - -template <> -EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { - Packet4f v = a; - int32_t old_mode, new_mode; - asm volatile( - "cfcmsa %[old_mode], $1\n" - "ori %[new_mode], %[old_mode], 3\n" - "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. - "ctcmsa $1, %[new_mode]\n" - "frint.w %w[v], %w[v]\n" - "ctcmsa $1, %[old_mode]\n" - : // outputs - [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), - [v] "+f"(v) - : // inputs - : // clobbers - ); - return v; -} - -template <> -EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { - Packet4f v = a; - int32_t old_mode, new_mode; - asm volatile( - "cfcmsa %[old_mode], $1\n" - "ori %[new_mode], %[old_mode], 3\n" - "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. - "ctcmsa $1, %[new_mode]\n" - "frint.w %w[v], %w[v]\n" - "ctcmsa $1, %[old_mode]\n" - : // outputs - [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), - [v] "+f"(v) - : // inputs - : // clobbers - ); - return v; -} - -template <> -EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, - const Packet4f& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], - ifPacket.select[3] }; - Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); - return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); -} - -template <> -EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, - const Packet4i& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], - ifPacket.select[3] }; - Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0); - return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); -} - -//---------- double ---------- - -typedef v2f64 Packet2d; -typedef v2i64 Packet2l; -typedef v2u64 Packet2ul; - -#define _EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = { X, X } -#define _EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = { X, X } -#define _EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = { X, X } - -inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) { - os << "[ " << value[0] << ", " << value[1] << " ]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) { - os << "[ " << value[0] << ", " << value[1] << " ]"; - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) { - os << "[ " << value[0] << ", " << value[1] << " ]"; - return os; -} - -template <> -struct packet_traits : default_packet_traits { - typedef Packet2d type; - typedef Packet2d half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 2, - HasHalfPacket = 0, - // FIXME check the Has* - HasDiv = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasRound = 1, - HasFloor = 1, - HasCeil = 1, - HasBlend = 1 - }; -}; - -template <> -struct unpacket_traits { - typedef double type; - enum { size = 2, alignment = Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false }; - typedef Packet2d half; -}; - -template <> -EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { - EIGEN_MSA_DEBUG; - - Packet2d value = { from, from }; - return value; -} - -template <> -EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fadd_d(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet2d plset(const double& a) { - EIGEN_MSA_DEBUG; - - static const Packet2d countdown = { 0.0, 1.0 }; - return padd(pset1(a), countdown); -} - -template <> -EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fsub_d(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { - EIGEN_MSA_DEBUG; - - return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63); -} - -template <> -EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { - EIGEN_MSA_DEBUG; - - return a; -} - -template <> -EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fmul_d(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fdiv_d(a, b); -} - -template <> -EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fmadd_d(c, a, b); -} - -// Logical Operations are not supported for float, so we have to reinterpret casts using MSA -// intrinsics -template <> -EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { - EIGEN_MSA_DEBUG; - - return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b); -} - -template <> -EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { - EIGEN_MSA_DEBUG; - - return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b); -} - -template <> -EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { - EIGEN_MSA_DEBUG; - - return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b); -} - -template <> -EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { - EIGEN_MSA_DEBUG; - - return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255)); -} - -template <> -EIGEN_STRONG_INLINE Packet2d pload(const double* from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast(from), 0); -} - -template <> -EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { - EIGEN_MSA_DEBUG; - -#if EIGEN_FAST_MATH - // This prefers numbers to NaNs. - return __builtin_msa_fmin_d(a, b); -#else - // This prefers NaNs to numbers. - v2i64 aNaN = __builtin_msa_fcun_d(a, a); - v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN); - return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a); -#endif -} - -template <> -EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { - EIGEN_MSA_DEBUG; - -#if EIGEN_FAST_MATH - // This prefers numbers to NaNs. - return __builtin_msa_fmax_d(a, b); -#else - // This prefers NaNs to numbers. - v2i64 aNaN = __builtin_msa_fcun_d(a, a); - v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN); - return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a); -#endif -} - -template <> -EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast(from), 0); -} - -template <> -EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { - EIGEN_MSA_DEBUG; - - Packet2d value = { *from, *from }; - return value; -} - -template <> -EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); -} - -template <> -EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { - EIGEN_MSA_DEBUG; - - EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0); -} - -template <> -EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { - EIGEN_MSA_DEBUG; - - Packet2d value; - value[0] = *from; - from += stride; - value[1] = *from; - return value; -} - -template <> -EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, - Index stride) { - EIGEN_MSA_DEBUG; - - *to = from[0]; - to += stride; - *to = from[1]; -} - -template <> -EIGEN_STRONG_INLINE void prefetch(const double* addr) { - EIGEN_MSA_DEBUG; - - __builtin_prefetch(addr); -} - -template <> -EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { - EIGEN_MSA_DEBUG; - - return a[0]; -} - -template <> -EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { - EIGEN_MSA_DEBUG; - - return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); -} - -template <> -EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { - EIGEN_MSA_DEBUG; - - return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63); -} - -template <> -EIGEN_STRONG_INLINE double predux(const Packet2d& a) { - EIGEN_MSA_DEBUG; - - Packet2d s = padd(a, preverse(a)); - return s[0]; -} - -// Other reduction functions: -// mul -template <> -EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { - EIGEN_MSA_DEBUG; - - Packet2d p = pmul(a, preverse(a)); - return p[0]; -} - -// min -template <> -EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { - EIGEN_MSA_DEBUG; - -#if EIGEN_FAST_MATH - Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); - Packet2d v = __builtin_msa_fmin_d(a, swapped); - return v[0]; -#else - double a0 = a[0], a1 = a[1]; - return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1; -#endif -} - -// max -template <> -EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { - EIGEN_MSA_DEBUG; - -#if EIGEN_FAST_MATH - Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)); - Packet2d v = __builtin_msa_fmax_d(a, swapped); - return v[0]; -#else - double a0 = a[0], a1 = a[1]; - return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1; -#endif -} - -template <> -EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) { - EIGEN_MSA_DEBUG; - - return __builtin_msa_fsqrt_d(a); -} - -template <> -EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { - EIGEN_MSA_DEBUG; - -#if EIGEN_FAST_MATH - return __builtin_msa_frsqrt_d(a); -#else - Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1)); - return pdiv(ones, psqrt(a)); -#endif -} - -inline std::ostream& operator<<(std::ostream& os, const PacketBlock& value) { - os << "[ " << value.packet[0] << "," << std::endl << " " << value.packet[1] << " ]"; - return os; -} - -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - EIGEN_MSA_DEBUG; - - Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); - Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]); - kernel.packet[0] = trn1; - kernel.packet[1] = trn2; -} - -template <> -EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { - Packet2d v = a; - int32_t old_mode, new_mode; - asm volatile( - "cfcmsa %[old_mode], $1\n" - "ori %[new_mode], %[old_mode], 3\n" // 3 = round towards -INFINITY. - "ctcmsa $1, %[new_mode]\n" - "frint.d %w[v], %w[v]\n" - "ctcmsa $1, %[old_mode]\n" - : // outputs - [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), - [v] "+f"(v) - : // inputs - : // clobbers - ); - return v; -} - -template <> -EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { - Packet2d v = a; - int32_t old_mode, new_mode; - asm volatile( - "cfcmsa %[old_mode], $1\n" - "ori %[new_mode], %[old_mode], 3\n" - "xori %[new_mode], %[new_mode], 1\n" // 2 = round towards +INFINITY. - "ctcmsa $1, %[new_mode]\n" - "frint.d %w[v], %w[v]\n" - "ctcmsa $1, %[old_mode]\n" - : // outputs - [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), - [v] "+f"(v) - : // inputs - : // clobbers - ); - return v; -} - -template <> -EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { - Packet2d v = a; - int32_t old_mode, new_mode; - asm volatile( - "cfcmsa %[old_mode], $1\n" - "ori %[new_mode], %[old_mode], 3\n" - "xori %[new_mode], %[new_mode], 3\n" // 0 = round to nearest, ties to even. - "ctcmsa $1, %[new_mode]\n" - "frint.d %w[v], %w[v]\n" - "ctcmsa $1, %[old_mode]\n" - : // outputs - [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode), - [v] "+f"(v) - : // inputs - : // clobbers - ); - return v; -} - -template <> -EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, - const Packet2d& elsePacket) { - Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0); - return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket); -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_PACKET_MATH_MSA_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/Complex.h index 8cd2a5ebe..306a309be 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/Complex.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/Complex.h @@ -15,8 +15,7 @@ namespace Eigen { namespace internal { -inline uint32x4_t p4ui_CONJ_XOR() -{ +inline uint32x4_t p4ui_CONJ_XOR() { // See bug 1325, clang fails to call vld1q_u64. #if EIGEN_COMP_CLANG uint32x4_t ret = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; @@ -27,134 +26,61 @@ inline uint32x4_t p4ui_CONJ_XOR() #endif } -inline uint32x2_t p2ui_CONJ_XOR() -{ +inline uint32x2_t p2ui_CONJ_XOR() { static const uint32_t conj_XOR_DATA[] = { 0x00000000, 0x80000000 }; return vld1_u32( conj_XOR_DATA ); } //---------- float ---------- - -struct Packet1cf -{ - EIGEN_STRONG_INLINE Packet1cf() {} - EIGEN_STRONG_INLINE explicit Packet1cf(const Packet2f& a) : v(a) {} - Packet2f v; -}; struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} - Packet4f v; + Packet4f v; }; -template<> struct packet_traits > : default_packet_traits +template<> struct packet_traits > : default_packet_traits { typedef Packet2cf type; - typedef Packet1cf half; - enum - { + typedef Packet2cf half; + enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, - HasHalfPacket = 1, + HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 1, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasNegate = 1, + HasAbs = 0, + HasAbs2 = 0, + HasMin = 0, + HasMax = 0, HasSetLinear = 0 }; }; -template<> struct unpacket_traits -{ - typedef std::complex type; - typedef Packet1cf half; - enum - { - size = 1, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef std::complex type; - typedef Packet1cf half; - enum - { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; -template<> EIGEN_STRONG_INLINE Packet1cf pcast(const float& a) -{ return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0)); } -template<> EIGEN_STRONG_INLINE Packet2cf pcast(const Packet2f& a) -{ return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a)))); } - -template<> EIGEN_STRONG_INLINE Packet1cf pset1(const std::complex& from) -{ return Packet1cf(vld1_f32(reinterpret_cast(&from))); } -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { - const float32x2_t r64 = vld1_f32(reinterpret_cast(&from)); + float32x2_t r64; + r64 = vld1_f32((const float *)&from); + return Packet2cf(vcombine_f32(r64, r64)); } -template<> EIGEN_STRONG_INLINE Packet1cf padd(const Packet1cf& a, const Packet1cf& b) -{ return Packet1cf(padd(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) -{ return Packet2cf(padd(a.v, b.v)); } - -template<> EIGEN_STRONG_INLINE Packet1cf psub(const Packet1cf& a, const Packet1cf& b) -{ return Packet1cf(psub(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) -{ return Packet2cf(psub(a.v, b.v)); } - -template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } - -template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) -{ - const Packet2ui b = vreinterpret_u32_f32(a.v); - return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR()))); -} template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - const Packet4ui b = vreinterpretq_u32_f32(a.v); + Packet4ui b = vreinterpretq_u32_f32(a.v); return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); } -template<> EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) -{ - Packet2f v1, v2; - - // Get the real values of a | a1_re | a1_re | - v1 = vdup_lane_f32(a.v, 0); - // Get the imag values of a | a1_im | a1_im | - v2 = vdup_lane_f32(a.v, 1); - // Multiply the real a with b - v1 = vmul_f32(v1, b.v); - // Multiply the imag a with b - v2 = vmul_f32(v2, b.v); - // Conjugate v2 - v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR())); - // Swap real/imag elements in v2. - v2 = vrev64_f32(v2); - // Add and return the result - return Packet1cf(vadd_f32(v1, v2)); -} template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) { Packet4f v1, v2; @@ -167,7 +93,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con v1 = vmulq_f32(v1, b.v); // Multiply the imag a with b v2 = vmulq_f32(v2, b.v); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR())); // Swap real/imag elements in v2. v2 = vrev64q_f32(v2); @@ -175,144 +101,98 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con return Packet2cf(vaddq_f32(v1, v2)); } -template<> EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b) +template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { - // Compare real and imaginary parts of a and b to get the mask vector: - // [re(a[0])==re(b[0]), im(a[0])==im(b[0])] - Packet2f eq = pcmp_eq(a.v, b.v); - // Swap real/imag elements in the mask in to get: - // [im(a[0])==im(b[0]), re(a[0])==re(b[0])] - Packet2f eq_swapped = vrev64_f32(eq); - // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped - return Packet1cf(pand(eq, eq_swapped)); + return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); } -template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) +template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { - // Compare real and imaginary parts of a and b to get the mask vector: - // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])] - Packet4f eq = pcmp_eq(a.v, b.v); - // Swap real/imag elements in the mask in to get: - // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])] - Packet4f eq_swapped = vrev64q_f32(eq); - // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped - return Packet2cf(pand(eq, eq_swapped)); + return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) +{ + return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); } - -template<> EIGEN_STRONG_INLINE Packet1cf pand(const Packet1cf& a, const Packet1cf& b) -{ return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } -template<> EIGEN_STRONG_INLINE Packet2cf pand(const Packet2cf& a, const Packet2cf& b) -{ return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } - -template<> EIGEN_STRONG_INLINE Packet1cf por(const Packet1cf& a, const Packet1cf& b) -{ return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } -template<> EIGEN_STRONG_INLINE Packet2cf por(const Packet2cf& a, const Packet2cf& b) -{ return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } - -template<> EIGEN_STRONG_INLINE Packet1cf pxor(const Packet1cf& a, const Packet1cf& b) -{ return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b) -{ return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } - -template<> EIGEN_STRONG_INLINE Packet1cf pandnot(const Packet1cf& a, const Packet1cf& b) -{ return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v)))); } template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) -{ return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v)))); } - -template<> EIGEN_STRONG_INLINE Packet1cf pload(const std::complex* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(reinterpret_cast(from))); } - -template<> EIGEN_STRONG_INLINE Packet1cf ploadu(const std::complex* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(reinterpret_cast(from))); } - -template<> EIGEN_STRONG_INLINE Packet1cf ploaddup(const std::complex* from) -{ return pset1(*from); } -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) -{ return pset1(*from); } - -template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet1cf& from) -{ EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet2cf& from) -{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } - -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet1cf& from) -{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet2cf& from) -{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } - -template<> EIGEN_DEVICE_FUNC inline Packet1cf pgather, Packet1cf>( - const std::complex* from, Index stride) { - const Packet2f tmp = vdup_n_f32(std::real(from[0*stride])); - return Packet1cf(vset_lane_f32(std::imag(from[0*stride]), tmp, 1)); + return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v)))); } -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>( - const std::complex* from, Index stride) + +template<> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } + +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } + +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - Packet4f res = vdupq_n_f32(std::real(from[0*stride])); + Packet4f res = pset1(0.f); + res = vsetq_lane_f32(std::real(from[0*stride]), res, 0); res = vsetq_lane_f32(std::imag(from[0*stride]), res, 1); res = vsetq_lane_f32(std::real(from[1*stride]), res, 2); res = vsetq_lane_f32(std::imag(from[1*stride]), res, 3); return Packet2cf(res); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cf>( - std::complex* to, const Packet1cf& from, Index stride) -{ to[stride*0] = std::complex(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1)); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>( - std::complex* to, const Packet2cf& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { to[stride*0] = std::complex(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1)); to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex *addr) -{ EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const float *)addr); } -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cf& a) +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - EIGEN_ALIGN16 std::complex x; - vst1_f32(reinterpret_cast(&x), a.v); - return x; -} -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) -{ - EIGEN_ALIGN16 std::complex x[2]; - vst1q_f32(reinterpret_cast(x), a.v); + std::complex EIGEN_ALIGN16 x[2]; + vst1q_f32((float *)x, a.v); return x[0]; } -template<> EIGEN_STRONG_INLINE Packet1cf preverse(const Packet1cf& a) { return a; } template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) -{ return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v))); } - -template<> EIGEN_STRONG_INLINE Packet1cf pcplxflip(const Packet1cf& a) -{ return Packet1cf(vrev64_f32(a.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) -{ return Packet2cf(vrev64q_f32(a.v)); } - -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cf& a) { - std::complex s; - vst1_f32((float *)&s, a.v); - return s; + float32x2_t a_lo, a_hi; + Packet4f a_r128; + + a_lo = vget_low_f32(a.v); + a_hi = vget_high_f32(a.v); + a_r128 = vcombine_f32(a_hi, a_lo); + + return Packet2cf(a_r128); } + +template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& a) +{ + return Packet2cf(vrev64q_f32(a.v)); +} + template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) { + float32x2_t a1, a2; std::complex s; - vst1_f32(reinterpret_cast(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v))); + + a1 = vget_low_f32(a.v); + a2 = vget_high_f32(a.v); + a2 = vadd_f32(a1, a2); + vst1_f32((float *)&s, a2); + return s; } -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cf& a) +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) { - std::complex s; - vst1_f32((float *)&s, a.v); - return s; + Packet4f sum1, sum2, sum; + + // Add the first two 64-bit float32x2_t of vecs[0] + sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v)); + sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v)); + sum = vaddq_f32(sum1, sum2); + + return Packet2cf(sum); } + template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { float32x2_t a1, a2, v1, v2, prod; @@ -328,103 +208,80 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P v1 = vmul_f32(v1, a2); // Multiply the imag a with b v2 = vmul_f32(v2, a2); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR())); // Swap real/imag elements in v2. v2 = vrev64_f32(v2); // Add v1, v2 prod = vadd_f32(v1, v2); - vst1_f32(reinterpret_cast(&s), prod); + vst1_f32((float *)&s, prod); return s; } -template<> struct conj_helper +template +struct palign_impl { - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const - { return internal::pmul(a, pconj(b)); } + EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset==1) + { + first.v = vextq_f32(first.v, second.v, 2); + } + } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const - { return internal::pmul(pconj(a), b); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const - { return pconj(internal::pmul(a,b)); } -}; - -template<> struct conj_helper +template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { return internal::pmul(a, pconj(b)); } + { + return internal::pmul(a, pconj(b)); + } }; -template<> struct conj_helper +template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { return internal::pmul(pconj(a), b); } + { + return internal::pmul(pconj(a), b); + } }; -template<> struct conj_helper +template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { return pconj(internal::pmul(a,b)); } + { + return pconj(internal::pmul(a, b)); + } }; -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f) EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) -template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, const Packet1cf& b) -{ - // TODO optimize it for NEON - Packet1cf res = conj_helper().pmul(a,b); - Packet2f s, rev_s; - - // this computes the norm - s = vmul_f32(b.v, b.v); - rev_s = vrev64_f32(s); - - return Packet1cf(pdiv(res.v, vadd_f32(s, rev_s))); -} template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for NEON - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = conj_helper().pmul(a,b); Packet4f s, rev_s; // this computes the norm s = vmulq_f32(b.v, b.v); rev_s = vrev64q_f32(s); - return Packet2cf(pdiv(res.v, vaddq_f32(s, rev_s))); + return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); } -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& /*kernel*/) {} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v)); kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v)); kernel.packet[1].v = tmp; @@ -452,8 +309,7 @@ template<> struct packet_traits > : default_packet_traits { typedef Packet1cd type; typedef Packet1cd half; - enum - { + enum { Vectorizable = 1, AlignedOnScalar = 0, size = 1, @@ -472,49 +328,24 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits -{ - typedef std::complex type; - enum - { - size=1, - alignment=Aligned16, - vectorizable=true, - masked_load_available=false, - masked_store_available=false - }; - typedef Packet1cd half; -}; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; -template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload(reinterpret_cast(from))); } +template<> EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } -template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu(reinterpret_cast(from))); } +template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) +{ /* here we really have to use unaligned loads :( */ return ploadu(&from); } -template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) -{ - /* here we really have to use unaligned loads :( */ - return ploadu(&from); -} - -template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) -{ return Packet1cd(padd(a.v, b.v)); } - -template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) -{ return Packet1cd(psub(a.v, b.v)); } - -template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) -{ return Packet1cd(pnegate(a.v)); } - -template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) -{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } +template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(padd(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(psub(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR))); } template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { Packet2d v1, v2; - // Get the real values of a + // Get the real values of a v1 = vdupq_lane_f64(vget_low_f64(a.v), 0); // Get the imag values of a v2 = vdupq_lane_f64(vget_high_f64(a.v), 0); @@ -522,7 +353,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con v1 = vmulq_f64(v1, b.v); // Multiply the imag a with b v2 = vmulq_f64(v2, b.v); - // Conjugate v2 + // Conjugate v2 v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR)); // Swap real/imag elements in v2. v2 = preverse(v2); @@ -530,44 +361,31 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(vaddq_f64(v1, v2)); } -template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { - // Compare real and imaginary parts of a and b to get the mask vector: - // [re(a)==re(b), im(a)==im(b)] - Packet2d eq = pcmp_eq(a.v, b.v); - // Swap real/imag elements in the mask in to get: - // [im(a)==im(b), re(a)==re(b)] - Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq))); - // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped - return Packet1cd(pand(eq, eq_swapped)); + return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); +} +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) +{ + return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } -template<> EIGEN_STRONG_INLINE Packet1cd pand(const Packet1cd& a, const Packet1cd& b) -{ return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } -template<> EIGEN_STRONG_INLINE Packet1cd por(const Packet1cd& a, const Packet1cd& b) -{ return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE Packet1cd pxor(const Packet1cd& a, const Packet1cd& b) -{ return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const double *)addr); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) -{ return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v),vreinterpretq_u64_f64(b.v)))); } - -template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) -{ return pset1(*from); } - -template<> EIGEN_STRONG_INLINE void pstore >(std::complex *to, const Packet1cd& from) -{ EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); } - -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex *to, const Packet1cd& from) -{ EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } - -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex *addr) -{ EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } - -template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>( - const std::complex* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) { Packet2d res = pset1(0.0); res = vsetq_lane_f64(std::real(from[0*stride]), res, 0); @@ -575,14 +393,17 @@ template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Pack return Packet1cd(res); } -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>( - std::complex* to, const Packet1cd& from, Index stride) -{ to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); } - -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride) { - EIGEN_ALIGN16 std::complex res; + to[stride*0] = std::complex(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1)); +} + + +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) +{ + std::complex EIGEN_ALIGN16 res; pstore >(&res, a); + return res; } @@ -590,15 +411,29 @@ template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } +template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) { return vecs[0]; } + template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return internal::pmul(a, pconj(b)); } + { + return internal::pmul(a, pconj(b)); + } }; template<> struct conj_helper @@ -607,7 +442,9 @@ template<> struct conj_helper { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return internal::pmul(pconj(a), b); } + { + return internal::pmul(pconj(a), b); + } }; template<> struct conj_helper @@ -616,7 +453,9 @@ template<> struct conj_helper { return padd(pmul(x,y),c); } EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return pconj(internal::pmul(a,b)); } + { + return pconj(internal::pmul(a, b)); + } }; EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) @@ -632,7 +471,9 @@ template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, con } EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) -{ return Packet1cd(preverse(Packet2d(x.v))); } +{ + return Packet1cd(preverse(Packet2d(x.v))); +} EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/MathFunctions.h index 1c025618e..6bb05bb92 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/MathFunctions.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/MathFunctions.h @@ -5,6 +5,10 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +/* The sin, cos, exp, and log functions of this file come from + * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ + */ + #ifndef EIGEN_MATH_FUNCTIONS_NEON_H #define EIGEN_MATH_FUNCTIONS_NEON_H @@ -12,31 +16,73 @@ namespace Eigen { namespace internal { -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pexp(const Packet2f& x) -{ return pexp_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& x) -{ return pexp_float(x); } +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet4f pexp(const Packet4f& _x) +{ + Packet4f x = _x; + Packet4f tmp, fx; -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f plog(const Packet2f& x) -{ return plog_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f plog(const Packet4f& x) -{ return plog_float(x); } + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f psin(const Packet2f& x) -{ return psin_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& x) -{ return psin_float(x); } + x = vminq_f32(x, p4f_exp_hi); + x = vmaxq_f32(x, p4f_exp_lo); -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f pcos(const Packet2f& x) -{ return pcos_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos(const Packet4f& x) -{ return pcos_float(x); } + /* express exp(x) as exp(g + n*log(2)) */ + fx = vmlaq_f32(p4f_half, x, p4f_cephes_LOG2EF); -// Hyperbolic Tangent function. -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2f ptanh(const Packet2f& x) -{ return internal::generic_fast_tanh_float(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f ptanh(const Packet4f& x) -{ return internal::generic_fast_tanh_float(x); } + /* perform a floorf */ + tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx)); + + /* if greater, substract 1 */ + Packet4ui mask = vcgtq_f32(tmp, fx); + mask = vandq_u32(mask, vreinterpretq_u32_f32(p4f_1)); + + fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); + + tmp = vmulq_f32(fx, p4f_cephes_exp_C1); + Packet4f z = vmulq_f32(fx, p4f_cephes_exp_C2); + x = vsubq_f32(x, tmp); + x = vsubq_f32(x, z); + + Packet4f y = vmulq_f32(p4f_cephes_exp_p0, x); + z = vmulq_f32(x, x); + y = vaddq_f32(y, p4f_cephes_exp_p1); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p2); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p3); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p4); + y = vmulq_f32(y, x); + y = vaddq_f32(y, p4f_cephes_exp_p5); + + y = vmulq_f32(y, z); + y = vaddq_f32(y, x); + y = vaddq_f32(y, p4f_1); + + /* build 2^n */ + int32x4_t mm; + mm = vcvtq_s32_f32(fx); + mm = vaddq_s32(mm, p4i_0x7f); + mm = vshlq_n_s32(mm, 23); + Packet4f pow2n = vreinterpretq_f32_s32(mm); + + y = vmulq_f32(y, pow2n); + return y; +} } // end namespace internal diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/PacketMath.h index ee5a938b9..3d5ed0d24 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/PacketMath.h @@ -32,7 +32,7 @@ namespace internal { #if EIGEN_ARCH_ARM64 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #else -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif #endif @@ -42,45 +42,34 @@ namespace internal { // are aliases to the same underlying type __n128. // We thus have to wrap them to make them different C++ types. // (See also bug 1428) -typedef eigen_packet_wrapper Packet2f; -typedef eigen_packet_wrapper Packet4f; -typedef eigen_packet_wrapper Packet4c; -typedef eigen_packet_wrapper Packet8c; -typedef eigen_packet_wrapper Packet16c; -typedef eigen_packet_wrapper Packet4uc; -typedef eigen_packet_wrapper Packet8uc; -typedef eigen_packet_wrapper Packet16uc; -typedef eigen_packet_wrapper Packet4s; -typedef eigen_packet_wrapper Packet8s; -typedef eigen_packet_wrapper Packet4us; -typedef eigen_packet_wrapper Packet8us; -typedef eigen_packet_wrapper Packet2i; -typedef eigen_packet_wrapper Packet4i; -typedef eigen_packet_wrapper Packet2ui; -typedef eigen_packet_wrapper Packet4ui; -typedef eigen_packet_wrapper Packet2l; -typedef eigen_packet_wrapper Packet2ul; + +template +struct eigen_packet_wrapper +{ + operator T&() { return m_val; } + operator const T&() const { return m_val; } + eigen_packet_wrapper() {} + eigen_packet_wrapper(const T &v) : m_val(v) {} + eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; +typedef eigen_packet_wrapper Packet2f; +typedef eigen_packet_wrapper Packet4f; +typedef eigen_packet_wrapper Packet4i; +typedef eigen_packet_wrapper Packet2i; +typedef eigen_packet_wrapper Packet4ui; #else -typedef float32x2_t Packet2f; -typedef float32x4_t Packet4f; -typedef eigen_packet_wrapper Packet4c; -typedef int8x8_t Packet8c; -typedef int8x16_t Packet16c; -typedef eigen_packet_wrapper Packet4uc; -typedef uint8x8_t Packet8uc; -typedef uint8x16_t Packet16uc; -typedef int16x4_t Packet4s; -typedef int16x8_t Packet8s; -typedef uint16x4_t Packet4us; -typedef uint16x8_t Packet8us; -typedef int32x2_t Packet2i; -typedef int32x4_t Packet4i; -typedef uint32x2_t Packet2ui; -typedef uint32x4_t Packet4ui; -typedef int64x2_t Packet2l; -typedef uint64x2_t Packet2ul; +typedef float32x2_t Packet2f; +typedef float32x4_t Packet4f; +typedef int32x4_t Packet4i; +typedef int32x2_t Packet2i; +typedef uint32x4_t Packet4ui; #endif // EIGEN_COMP_MSVC @@ -109,809 +98,81 @@ typedef uint64x2_t Packet2ul; #define EIGEN_ARM_PREFETCH(ADDR) #endif -template <> -struct packet_traits : default_packet_traits +template<> struct packet_traits : default_packet_traits { typedef Packet4f type; - typedef Packet2f half; - enum - { + typedef Packet4f half; // Packet2f intrinsics not implemented yet + enum { Vectorizable = 1, AlignedOnScalar = 1, size = 4, - HasHalfPacket = 1, - - HasCast = 1, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasAbsDiff = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasInsert = 1, - - HasDiv = 1, - HasFloor = 1, - - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, + HasHalfPacket=0, // Packet2f intrinsics not implemented yet + + HasDiv = 1, + // FIXME check the Has* + HasSin = 0, + HasCos = 0, + HasLog = 0, HasExp = 1, - HasSqrt = 0, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH + HasSqrt = 0 }; }; - -template <> -struct packet_traits : default_packet_traits -{ - typedef Packet16c type; - typedef Packet8c half; - enum - { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 16, - HasHalfPacket = 1, - - HasCast = 1, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasAbsDiff = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasInsert = 1, - }; -}; - -template <> -struct packet_traits : default_packet_traits -{ - typedef Packet16uc type; - typedef Packet8uc half; - enum - { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 16, - HasHalfPacket = 1, - - HasCast = 1, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 0, - HasAbs = 1, - HasAbsDiff = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasInsert = 1, - - HasSqrt = 1 - }; -}; - -template <> -struct packet_traits : default_packet_traits -{ - typedef Packet8s type; - typedef Packet4s half; - enum - { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 1, - - HasCast = 1, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasAbsDiff = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasInsert = 1, - }; -}; - -template <> -struct packet_traits : default_packet_traits -{ - typedef Packet8us type; - typedef Packet4us half; - enum - { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 8, - HasHalfPacket = 1, - - HasCast = 1, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 0, - HasAbs = 0, - HasAbsDiff = 1, - HasArg = 0, - HasAbs2 = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasInsert = 1, - - HasSqrt = 1 - }; -}; - -template <> -struct packet_traits : default_packet_traits +template<> struct packet_traits : default_packet_traits { typedef Packet4i type; - typedef Packet2i half; - enum - { + typedef Packet4i half; // Packet2i intrinsics not implemented yet + enum { Vectorizable = 1, AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 1, - - HasCast = 1, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasAbsDiff = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasInsert = 1, + size=4, + HasHalfPacket=0 // Packet2i intrinsics not implemented yet + // FIXME check the Has* }; }; -template <> -struct packet_traits : default_packet_traits -{ - typedef Packet4ui type; - typedef Packet2ui half; - enum - { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 1, - - HasCast = 1, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 0, - HasAbs = 0, - HasArg = 0, - HasAbs2 = 1, - HasAbsDiff = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasInsert = 1, - - HasSqrt = 1 - }; -}; - -template <> -struct packet_traits : default_packet_traits -{ - typedef Packet2l type; - typedef Packet2l half; - enum - { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 2, - HasHalfPacket = 1, - - HasCast = 1, - HasCmp = 1, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasAbsDiff = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasInsert = 1, - }; -}; - -template <> -struct packet_traits : default_packet_traits -{ - typedef Packet2ul type; - typedef Packet2ul half; - enum - { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 2, - HasHalfPacket = 1, - - HasCast = 1, - HasCmp = 1, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 0, - HasAbs = 0, - HasArg = 0, - HasAbs2 = 1, - HasAbsDiff = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasInsert = 1, - }; -}; - -#if EIGEN_GNUC_AT_MOST(4, 4) && !EIGEN_COMP_LLVM +#if EIGEN_GNUC_AT_MOST(4,4) && !EIGEN_COMP_LLVM // workaround gcc 4.2, 4.3 and 4.4 compilatin issue EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_f32(const float* x) { return ::vld1_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32(const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } -EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } -EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } +EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE float32x2_t vld1_dup_f32 (const float* x) { return ::vld1_dup_f32 ((const float32_t*)x); } +EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); } +EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif -template<> struct unpacket_traits -{ - typedef float type; - typedef Packet2f half; - typedef Packet2i integer_packet; - enum - { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef float type; - typedef Packet2f half; - typedef Packet4i integer_packet; - enum - { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef int8_t type; - typedef Packet4c half; - enum - { - size = 4, - alignment = Unaligned, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef int8_t type; - typedef Packet4c half; - enum - { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef int8_t type; - typedef Packet8c half; - enum - { - size = 16, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef uint8_t type; - typedef Packet4uc half; - enum - { - size = 4, - alignment = Unaligned, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef uint8_t type; - typedef Packet4uc half; - enum - { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef uint8_t type; - typedef Packet8uc half; - enum - { - size = 16, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false}; -}; -template<> struct unpacket_traits -{ - typedef int16_t type; - typedef Packet4s half; - enum - { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef int16_t type; - typedef Packet4s half; - enum - { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef uint16_t type; - typedef Packet4us half; - enum - { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef uint16_t type; - typedef Packet4us half; - enum - { - size = 8, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef int32_t type; - typedef Packet2i half; - enum - { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef int32_t type; - typedef Packet2i half; - enum - { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef uint32_t type; - typedef Packet2ui half; - enum - { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef uint32_t type; - typedef Packet2ui half; - enum - { - size = 4, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef int64_t type; - typedef Packet2l half; - enum - { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; -template<> struct unpacket_traits -{ - typedef uint64_t type; - typedef Packet2ul half; - enum - { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; -template<> EIGEN_STRONG_INLINE Packet2f pset1(const float& from) { return vdup_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4c pset1(const int8_t& from) -{ return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0); } -template<> EIGEN_STRONG_INLINE Packet8c pset1(const int8_t& from) { return vdup_n_s8(from); } -template<> EIGEN_STRONG_INLINE Packet16c pset1(const int8_t& from) { return vdupq_n_s8(from); } -template<> EIGEN_STRONG_INLINE Packet4uc pset1(const uint8_t& from) -{ return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0); } -template<> EIGEN_STRONG_INLINE Packet8uc pset1(const uint8_t& from) { return vdup_n_u8(from); } -template<> EIGEN_STRONG_INLINE Packet16uc pset1(const uint8_t& from) { return vdupq_n_u8(from); } -template<> EIGEN_STRONG_INLINE Packet4s pset1(const int16_t& from) { return vdup_n_s16(from); } -template<> EIGEN_STRONG_INLINE Packet8s pset1(const int16_t& from) { return vdupq_n_s16(from); } -template<> EIGEN_STRONG_INLINE Packet4us pset1(const uint16_t& from) { return vdup_n_u16(from); } -template<> EIGEN_STRONG_INLINE Packet8us pset1(const uint16_t& from) { return vdupq_n_u16(from); } -template<> EIGEN_STRONG_INLINE Packet2i pset1(const int32_t& from) { return vdup_n_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } -template<> EIGEN_STRONG_INLINE Packet2ui pset1(const uint32_t& from) { return vdup_n_u32(from); } -template<> EIGEN_STRONG_INLINE Packet4ui pset1(const uint32_t& from) { return vdupq_n_u32(from); } -template<> EIGEN_STRONG_INLINE Packet2l pset1(const int64_t& from) { return vdupq_n_s64(from); } -template<> EIGEN_STRONG_INLINE Packet2ul pset1(const uint64_t& from) { return vdupq_n_u64(from); } +template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } -template<> EIGEN_STRONG_INLINE Packet2f pset1frombits(unsigned int from) -{ return vreinterpret_f32_u32(vdup_n_u32(from)); } -template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) -{ return vreinterpretq_f32_u32(vdupq_n_u32(from)); } - -template<> EIGEN_STRONG_INLINE Packet2f plset(const float& a) -{ - const float c[] = {0.0f,1.0f}; - return vadd_f32(pset1(a), vld1_f32(c)); -} template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { - const float c[] = {0.0f,1.0f,2.0f,3.0f}; - return vaddq_f32(pset1(a), vld1q_f32(c)); -} -template<> EIGEN_STRONG_INLINE Packet4c plset(const int8_t& a) -{ return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0); } -template<> EIGEN_STRONG_INLINE Packet8c plset(const int8_t& a) -{ - const int8_t c[] = {0,1,2,3,4,5,6,7}; - return vadd_s8(pset1(a), vld1_s8(c)); -} -template<> EIGEN_STRONG_INLINE Packet16c plset(const int8_t& a) -{ - const int8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; - return vaddq_s8(pset1(a), vld1q_s8(c)); -} -template<> EIGEN_STRONG_INLINE Packet4uc plset(const uint8_t& a) -{ return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0); } -template<> EIGEN_STRONG_INLINE Packet8uc plset(const uint8_t& a) -{ - const uint8_t c[] = {0,1,2,3,4,5,6,7}; - return vadd_u8(pset1(a), vld1_u8(c)); -} -template<> EIGEN_STRONG_INLINE Packet16uc plset(const uint8_t& a) -{ - const uint8_t c[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; - return vaddq_u8(pset1(a), vld1q_u8(c)); -} -template<> EIGEN_STRONG_INLINE Packet4s plset(const int16_t& a) -{ - const int16_t c[] = {0,1,2,3}; - return vadd_s16(pset1(a), vld1_s16(c)); -} -template<> EIGEN_STRONG_INLINE Packet4us plset(const uint16_t& a) -{ - const uint16_t c[] = {0,1,2,3}; - return vadd_u16(pset1(a), vld1_u16(c)); -} -template<> EIGEN_STRONG_INLINE Packet8s plset(const int16_t& a) -{ - const int16_t c[] = {0,1,2,3,4,5,6,7}; - return vaddq_s16(pset1(a), vld1q_s16(c)); -} -template<> EIGEN_STRONG_INLINE Packet8us plset(const uint16_t& a) -{ - const uint16_t c[] = {0,1,2,3,4,5,6,7}; - return vaddq_u16(pset1(a), vld1q_u16(c)); -} -template<> EIGEN_STRONG_INLINE Packet2i plset(const int32_t& a) -{ - const int32_t c[] = {0,1}; - return vadd_s32(pset1(a), vld1_s32(c)); + const float f[] = {0, 1, 2, 3}; + Packet4f countdown = vld1q_f32(f); + return vaddq_f32(pset1(a), countdown); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { - const int32_t c[] = {0,1,2,3}; - return vaddq_s32(pset1(a), vld1q_s32(c)); -} -template<> EIGEN_STRONG_INLINE Packet2ui plset(const uint32_t& a) -{ - const uint32_t c[] = {0,1}; - return vadd_u32(pset1(a), vld1_u32(c)); -} -template<> EIGEN_STRONG_INLINE Packet4ui plset(const uint32_t& a) -{ - const uint32_t c[] = {0,1,2,3}; - return vaddq_u32(pset1(a), vld1q_u32(c)); -} -template<> EIGEN_STRONG_INLINE Packet2l plset(const int64_t& a) -{ - const int64_t c[] = {0,1}; - return vaddq_s64(pset1(a), vld1q_s64(c)); -} -template<> EIGEN_STRONG_INLINE Packet2ul plset(const uint64_t& a) -{ - const uint64_t c[] = {0,1}; - return vaddq_u64(pset1(a), vld1q_u64(c)); + const int32_t i[] = {0, 1, 2, 3}; + Packet4i countdown = vld1q_s32(i); + return vaddq_s32(pset1(a), countdown); } -template<> EIGEN_STRONG_INLINE Packet2f padd(const Packet2f& a, const Packet2f& b) { return vadd_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4c padd(const Packet4c& a, const Packet4c& b) -{ - return vget_lane_s32(vreinterpret_s32_s8(vadd_s8( - vreinterpret_s8_s32(vdup_n_s32(a)), - vreinterpret_s8_s32(vdup_n_s32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8c padd(const Packet8c& a, const Packet8c& b) { return vadd_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16c padd(const Packet16c& a, const Packet16c& b) { return vaddq_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4uc padd(const Packet4uc& a, const Packet4uc& b) -{ - return vget_lane_u32(vreinterpret_u32_u8(vadd_u8( - vreinterpret_u8_u32(vdup_n_u32(a)), - vreinterpret_u8_u32(vdup_n_u32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8uc padd(const Packet8uc& a, const Packet8uc& b) { return vadd_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc padd(const Packet16uc& a, const Packet16uc& b) { return vaddq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s padd(const Packet4s& a, const Packet4s& b) { return vadd_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) { return vaddq_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet4us padd(const Packet4us& a, const Packet4us& b) { return vadd_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us padd(const Packet8us& a, const Packet8us& b) { return vaddq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i padd(const Packet2i& a, const Packet2i& b) { return vadd_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ui padd(const Packet2ui& a, const Packet2ui& b) { return vadd_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui padd(const Packet4ui& a, const Packet4ui& b) { return vaddq_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2l padd(const Packet2l& a, const Packet2l& b) { return vaddq_s64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ul padd(const Packet2ul& a, const Packet2ul& b) { return vaddq_u64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2f psub(const Packet2f& a, const Packet2f& b) { return vsub_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4c psub(const Packet4c& a, const Packet4c& b) -{ - return vget_lane_s32(vreinterpret_s32_s8(vsub_s8( - vreinterpret_s8_s32(vdup_n_s32(a)), - vreinterpret_s8_s32(vdup_n_s32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8c psub(const Packet8c& a, const Packet8c& b) { return vsub_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16c psub(const Packet16c& a, const Packet16c& b) { return vsubq_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4uc psub(const Packet4uc& a, const Packet4uc& b) -{ - return vget_lane_u32(vreinterpret_u32_u8(vsub_u8( - vreinterpret_u8_u32(vdup_n_u32(a)), - vreinterpret_u8_u32(vdup_n_u32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8uc psub(const Packet8uc& a, const Packet8uc& b) { return vsub_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return vsubq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s psub(const Packet4s& a, const Packet4s& b) { return vsub_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) { return vsubq_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet4us psub(const Packet4us& a, const Packet4us& b) { return vsub_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us psub(const Packet8us& a, const Packet8us& b) { return vsubq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i psub(const Packet2i& a, const Packet2i& b) { return vsub_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ui psub(const Packet2ui& a, const Packet2ui& b) { return vsub_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui psub(const Packet4ui& a, const Packet4ui& b) { return vsubq_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2l psub(const Packet2l& a, const Packet2l& b) { return vsubq_s64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ul psub(const Packet2ul& a, const Packet2ul& b) { return vsubq_u64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) { return vneg_f32(a); } template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); } -template<> EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) -{ return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } -template<> EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) { return vneg_s8(a); } -template<> EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) { return vnegq_s8(a); } -template<> EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) { return vneg_s16(a); } -template<> EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) { return vnegq_s16(a); } -template<> EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) { return vneg_s32(a); } template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); } -template<> EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) { -#if EIGEN_ARCH_ARM64 - return vnegq_s64(a); -#else - return vcombine_s64( - vdup_n_s64(-vgetq_lane_s64(a, 0)), - vdup_n_s64(-vgetq_lane_s64(a, 1))); -#endif -} -template<> EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2f pmul(const Packet2f& a, const Packet2f& b) { return vmul_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4c pmul(const Packet4c& a, const Packet4c& b) -{ - return vget_lane_s32(vreinterpret_s32_s8(vmul_s8( - vreinterpret_s8_s32(vdup_n_s32(a)), - vreinterpret_s8_s32(vdup_n_s32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8c pmul(const Packet8c& a, const Packet8c& b) { return vmul_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16c pmul(const Packet16c& a, const Packet16c& b) { return vmulq_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4uc pmul(const Packet4uc& a, const Packet4uc& b) -{ - return vget_lane_u32(vreinterpret_u32_u8(vmul_u8( - vreinterpret_u8_u32(vdup_n_u32(a)), - vreinterpret_u8_u32(vdup_n_u32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8uc pmul(const Packet8uc& a, const Packet8uc& b) { return vmul_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pmul(const Packet16uc& a, const Packet16uc& b) { return vmulq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s pmul(const Packet4s& a, const Packet4s& b) { return vmul_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8s pmul(const Packet8s& a, const Packet8s& b) { return vmulq_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet4us pmul(const Packet4us& a, const Packet4us& b) { return vmul_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us pmul(const Packet8us& a, const Packet8us& b) { return vmulq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i pmul(const Packet2i& a, const Packet2i& b) { return vmul_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ui pmul(const Packet2ui& a, const Packet2ui& b) { return vmul_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui pmul(const Packet4ui& a, const Packet4ui& b) { return vmulq_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) -{ -#if EIGEN_ARCH_ARM64 - return vdiv_f32(a,b); -#else - Packet2f inv, restep, div; - - // NEON does not offer a divide instruction, we have to do a reciprocal approximation - // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers - // a reciprocal estimate AND a reciprocal step -which saves a few instructions - // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with - // Newton-Raphson and vrecpsq_f32() - inv = vrecpe_f32(b); - - // This returns a differential, by which we will have to multiply inv to get a better - // approximation of 1/b. - restep = vrecps_f32(b, inv); - inv = vmul_f32(restep, inv); - - // Finally, multiply a by 1/b and get the wanted result of the division. - div = vmul_f32(a, inv); - - return div; -#endif -} template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { #if EIGEN_ARCH_ARM64 @@ -938,86 +199,10 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const #endif } -template<> EIGEN_STRONG_INLINE Packet4c pdiv(const Packet4c& /*a*/, const Packet4c& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet8c pdiv(const Packet8c& /*a*/, const Packet8c& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet16c pdiv(const Packet16c& /*a*/, const Packet16c& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet4uc pdiv(const Packet4uc& /*a*/, const Packet4uc& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet8uc pdiv(const Packet8uc& /*a*/, const Packet8uc& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet16uc pdiv(const Packet16uc& /*a*/, const Packet16uc& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet4s pdiv(const Packet4s& /*a*/, const Packet4s& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet8s pdiv(const Packet8s& /*a*/, const Packet8s& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet4us pdiv(const Packet4us& /*a*/, const Packet4us& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet8us pdiv(const Packet8us& /*a*/, const Packet8us& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet2i pdiv(const Packet2i& /*a*/, const Packet2i& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& /*a*/, const Packet4i& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); +{ eigen_assert(false && "packet integer division are not supported by NEON"); return pset1(0); } -template<> EIGEN_STRONG_INLINE Packet2ui pdiv(const Packet2ui& /*a*/, const Packet2ui& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet4ui pdiv(const Packet4ui& /*a*/, const Packet4ui& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0); -} -template<> EIGEN_STRONG_INLINE Packet2l pdiv(const Packet2l& /*a*/, const Packet2l& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0LL); -} -template<> EIGEN_STRONG_INLINE Packet2ul pdiv(const Packet2ul& /*a*/, const Packet2ul& /*b*/) -{ - eigen_assert(false && "packet integer division are not supported by NEON"); - return pset1(0ULL); -} // Clang/ARM wrongly advertises __ARM_FEATURE_FMA even when it's not available, // then implements a slow software scalar fallback calling fmaf()! @@ -1030,11 +215,9 @@ template<> EIGEN_STRONG_INLINE Packet2ul pdiv(const Packet2ul& /*a*/, // MLA is not fused i.e. does 2 roundings. // In addition to giving better accuracy, FMA also gives better performance here on a Krait (Nexus 4): // MLA: 10 GFlop/s ; FMA: 12 GFlops/s. -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -{ return vfmaq_f32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vfmaq_f32(c,a,b); } #else -template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) -{ +template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { #if EIGEN_COMP_CLANG && EIGEN_ARCH_ARM // Clang/ARM will replace VMLA by VMUL+VADD at least for some values of -mcpu, // at least -mcpu=cortex-a8 and -mcpu=cortex-a7. Since the former is the default on @@ -1058,2112 +241,316 @@ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& #endif // No FMA instruction for int, so use MLA unconditionally. -template<> EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c) -{ - return vget_lane_s32(vreinterpret_s32_s8(vmla_s8( - vreinterpret_s8_s32(vdup_n_s32(c)), - vreinterpret_s8_s32(vdup_n_s32(a)), - vreinterpret_s8_s32(vdup_n_s32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c) -{ return vmla_s8(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) -{ return vmlaq_s8(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c) -{ - return vget_lane_u32(vreinterpret_u32_u8(vmla_u8( - vreinterpret_u8_u32(vdup_n_u32(c)), - vreinterpret_u8_u32(vdup_n_u32(a)), - vreinterpret_u8_u32(vdup_n_u32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c) -{ return vmla_u8(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) -{ return vmlaq_u8(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c) -{ return vmla_s16(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) -{ return vmlaq_s16(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c) -{ return vmla_u16(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) -{ return vmlaq_u16(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c) -{ return vmla_s32(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) -{ return vmlaq_s32(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c) -{ return vmla_u32(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) -{ return vmlaq_u32(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); } -template<> EIGEN_STRONG_INLINE Packet2f pabsdiff(const Packet2f& a, const Packet2f& b) -{ return vabd_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pabsdiff(const Packet4f& a, const Packet4f& b) -{ return vabdq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4c pabsdiff(const Packet4c& a, const Packet4c& b) -{ - return vget_lane_s32(vreinterpret_s32_s8(vabd_s8( - vreinterpret_s8_s32(vdup_n_s32(a)), - vreinterpret_s8_s32(vdup_n_s32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8c pabsdiff(const Packet8c& a, const Packet8c& b) -{ return vabd_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16c pabsdiff(const Packet16c& a, const Packet16c& b) -{ return vabdq_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4uc pabsdiff(const Packet4uc& a, const Packet4uc& b) -{ - return vget_lane_u32(vreinterpret_u32_u8(vabd_u8( - vreinterpret_u8_u32(vdup_n_u32(a)), - vreinterpret_u8_u32(vdup_n_u32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8uc pabsdiff(const Packet8uc& a, const Packet8uc& b) -{ return vabd_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pabsdiff(const Packet16uc& a, const Packet16uc& b) -{ return vabdq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s pabsdiff(const Packet4s& a, const Packet4s& b) -{ return vabd_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8s pabsdiff(const Packet8s& a, const Packet8s& b) -{ return vabdq_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet4us pabsdiff(const Packet4us& a, const Packet4us& b) -{ return vabd_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us pabsdiff(const Packet8us& a, const Packet8us& b) -{ return vabdq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i pabsdiff(const Packet2i& a, const Packet2i& b) -{ return vabd_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pabsdiff(const Packet4i& a, const Packet4i& b) -{ return vabdq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ui pabsdiff(const Packet2ui& a, const Packet2ui& b) -{ return vabd_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui pabsdiff(const Packet4ui& a, const Packet4ui& b) -{ return vabdq_u32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet2f pmin(const Packet2f& a, const Packet2f& b) { return vmin_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4c pmin(const Packet4c& a, const Packet4c& b) -{ - return vget_lane_s32(vreinterpret_s32_s8(vmin_s8( - vreinterpret_s8_s32(vdup_n_s32(a)), - vreinterpret_s8_s32(vdup_n_s32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8c pmin(const Packet8c& a, const Packet8c& b) { return vmin_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16c pmin(const Packet16c& a, const Packet16c& b) { return vminq_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4uc pmin(const Packet4uc& a, const Packet4uc& b) -{ - return vget_lane_u32(vreinterpret_u32_u8(vmin_u8( - vreinterpret_u8_u32(vdup_n_u32(a)), - vreinterpret_u8_u32(vdup_n_u32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8uc pmin(const Packet8uc& a, const Packet8uc& b) { return vmin_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pmin(const Packet16uc& a, const Packet16uc& b) { return vminq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s pmin(const Packet4s& a, const Packet4s& b) { return vmin_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8s pmin(const Packet8s& a, const Packet8s& b) { return vminq_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet4us pmin(const Packet4us& a, const Packet4us& b) { return vmin_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us pmin(const Packet8us& a, const Packet8us& b) { return vminq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i pmin(const Packet2i& a, const Packet2i& b) { return vmin_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ui pmin(const Packet2ui& a, const Packet2ui& b) { return vmin_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui pmin(const Packet4ui& a, const Packet4ui& b) { return vminq_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2l pmin(const Packet2l& a, const Packet2l& b) { - return vcombine_s64( - vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))), - vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1)))); -} -template<> EIGEN_STRONG_INLINE Packet2ul pmin(const Packet2ul& a, const Packet2ul& b) { - return vcombine_u64( - vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))), - vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)))); -} -template<> EIGEN_STRONG_INLINE Packet2f pmax(const Packet2f& a, const Packet2f& b) { return vmax_f32(a,b); } template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4c pmax(const Packet4c& a, const Packet4c& b) -{ - return vget_lane_s32(vreinterpret_s32_s8(vmax_s8( - vreinterpret_s8_s32(vdup_n_s32(a)), - vreinterpret_s8_s32(vdup_n_s32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8c pmax(const Packet8c& a, const Packet8c& b) { return vmax_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, const Packet16c& b) { return vmaxq_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4uc pmax(const Packet4uc& a, const Packet4uc& b) -{ - return vget_lane_u32(vreinterpret_u32_u8(vmax_u8( - vreinterpret_u8_u32(vdup_n_u32(a)), - vreinterpret_u8_u32(vdup_n_u32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8uc pmax(const Packet8uc& a, const Packet8uc& b) { return vmax_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { return vmaxq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s pmax(const Packet4s& a, const Packet4s& b) { return vmax_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8s pmax(const Packet8s& a, const Packet8s& b) { return vmaxq_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet4us pmax(const Packet4us& a, const Packet4us& b) { return vmax_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us pmax(const Packet8us& a, const Packet8us& b) { return vmaxq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i pmax(const Packet2i& a, const Packet2i& b) { return vmax_s32(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ui pmax(const Packet2ui& a, const Packet2ui& b) { return vmax_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui pmax(const Packet4ui& a, const Packet4ui& b) { return vmaxq_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2l pmax(const Packet2l& a, const Packet2l& b) { - return vcombine_s64( - vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))), - vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1)))); -} -template<> EIGEN_STRONG_INLINE Packet2ul pmax(const Packet2ul& a, const Packet2ul& b) { - return vcombine_u64( - vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))), - vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1)))); -} - -template<> EIGEN_STRONG_INLINE Packet2f pcmp_le(const Packet2f& a, const Packet2f& b) -{ return vreinterpret_f32_u32(vcle_f32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) -{ return vreinterpretq_f32_u32(vcleq_f32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4c pcmp_le(const Packet4c& a, const Packet4c& b) -{ - return vget_lane_s32(vreinterpret_s32_u8(vcle_s8( - vreinterpret_s8_s32(vdup_n_s32(a)), - vreinterpret_s8_s32(vdup_n_s32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8c pcmp_le(const Packet8c& a, const Packet8c& b) -{ return vreinterpret_s8_u8(vcle_s8(a,b)); } -template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) -{ return vreinterpretq_s8_u8(vcleq_s8(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4uc pcmp_le(const Packet4uc& a, const Packet4uc& b) -{ - return vget_lane_u32(vreinterpret_u32_u8(vcle_u8( - vreinterpret_u8_u32(vdup_n_u32(a)), - vreinterpret_u8_u32(vdup_n_u32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8uc pcmp_le(const Packet8uc& a, const Packet8uc& b) -{ return vcle_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) -{ return vcleq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s pcmp_le(const Packet4s& a, const Packet4s& b) -{ return vreinterpret_s16_u16(vcle_s16(a,b)); } -template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) -{ return vreinterpretq_s16_u16(vcleq_s16(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4us pcmp_le(const Packet4us& a, const Packet4us& b) -{ return vcle_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) -{ return vcleq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i pcmp_le(const Packet2i& a, const Packet2i& b) -{ return vreinterpret_s32_u32(vcle_s32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) -{ return vreinterpretq_s32_u32(vcleq_s32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2ui pcmp_le(const Packet2ui& a, const Packet2ui& b) -{ return vcle_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) -{ return vcleq_u32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt(const Packet2f& a, const Packet2f& b) -{ return vreinterpret_f32_u32(vclt_f32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) -{ return vreinterpretq_f32_u32(vcltq_f32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4c pcmp_lt(const Packet4c& a, const Packet4c& b) -{ - return vget_lane_s32(vreinterpret_s32_u8(vclt_s8( - vreinterpret_s8_s32(vdup_n_s32(a)), - vreinterpret_s8_s32(vdup_n_s32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8c pcmp_lt(const Packet8c& a, const Packet8c& b) -{ return vreinterpret_s8_u8(vclt_s8(a,b)); } -template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) -{ return vreinterpretq_s8_u8(vcltq_s8(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4uc pcmp_lt(const Packet4uc& a, const Packet4uc& b) -{ - return vget_lane_u32(vreinterpret_u32_u8(vclt_u8( - vreinterpret_u8_u32(vdup_n_u32(a)), - vreinterpret_u8_u32(vdup_n_u32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8uc pcmp_lt(const Packet8uc& a, const Packet8uc& b) -{ return vclt_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) -{ return vcltq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s pcmp_lt(const Packet4s& a, const Packet4s& b) -{ return vreinterpret_s16_u16(vclt_s16(a,b)); } -template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) -{ return vreinterpretq_s16_u16(vcltq_s16(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4us pcmp_lt(const Packet4us& a, const Packet4us& b) -{ return vclt_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) -{ return vcltq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i pcmp_lt(const Packet2i& a, const Packet2i& b) -{ return vreinterpret_s32_u32(vclt_s32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) -{ return vreinterpretq_s32_u32(vcltq_s32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2ui pcmp_lt(const Packet2ui& a, const Packet2ui& b) -{ return vclt_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) -{ return vcltq_u32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet2f pcmp_eq(const Packet2f& a, const Packet2f& b) -{ return vreinterpret_f32_u32(vceq_f32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) -{ return vreinterpretq_f32_u32(vceqq_f32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4c pcmp_eq(const Packet4c& a, const Packet4c& b) -{ - return vget_lane_s32(vreinterpret_s32_u8(vceq_s8( - vreinterpret_s8_s32(vdup_n_s32(a)), - vreinterpret_s8_s32(vdup_n_s32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8c pcmp_eq(const Packet8c& a, const Packet8c& b) -{ return vreinterpret_s8_u8(vceq_s8(a,b)); } -template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) -{ return vreinterpretq_s8_u8(vceqq_s8(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4uc pcmp_eq(const Packet4uc& a, const Packet4uc& b) -{ - return vget_lane_u32(vreinterpret_u32_u8(vceq_u8( - vreinterpret_u8_u32(vdup_n_u32(a)), - vreinterpret_u8_u32(vdup_n_u32(b)))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8uc pcmp_eq(const Packet8uc& a, const Packet8uc& b) -{ return vceq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) -{ return vceqq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s pcmp_eq(const Packet4s& a, const Packet4s& b) -{ return vreinterpret_s16_u16(vceq_s16(a,b)); } -template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) -{ return vreinterpretq_s16_u16(vceqq_s16(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4us pcmp_eq(const Packet4us& a, const Packet4us& b) -{ return vceq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) -{ return vceqq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i pcmp_eq(const Packet2i& a, const Packet2i& b) -{ return vreinterpret_s32_u32(vceq_s32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) -{ return vreinterpretq_s32_u32(vceqq_s32(a,b)); } -template<> EIGEN_STRONG_INLINE Packet2ui pcmp_eq(const Packet2ui& a, const Packet2ui& b) -{ return vceq_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) -{ return vceqq_u32(a,b); } - -template<> EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan(const Packet2f& a, const Packet2f& b) -{ return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a,b))); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) -{ return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a,b))); } - -template<> EIGEN_STRONG_INLINE Packet2f pfloor(const Packet2f& a) -{ - const Packet2f cst_1 = pset1(1.0f); - /* perform a floorf */ - Packet2f tmp = vcvt_f32_s32(vcvt_s32_f32(a)); - - /* if greater, substract 1 */ - Packet2ui mask = vcgt_f32(tmp, a); - mask = vand_u32(mask, vreinterpret_u32_f32(cst_1)); - return vsub_f32(tmp, vreinterpret_f32_u32(mask)); -} -template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) -{ - const Packet4f cst_1 = pset1(1.0f); - /* perform a floorf */ - Packet4f tmp = vcvtq_f32_s32(vcvtq_s32_f32(a)); - - /* if greater, substract 1 */ - Packet4ui mask = vcgtq_f32(tmp, a); - mask = vandq_u32(mask, vreinterpretq_u32_f32(cst_1)); - return vsubq_f32(tmp, vreinterpretq_f32_u32(mask)); -} // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics -template<> EIGEN_STRONG_INLINE Packet2f pand(const Packet2f& a, const Packet2f& b) -{ return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) -{ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } -template<> EIGEN_STRONG_INLINE Packet4c pand(const Packet4c& a, const Packet4c& b) -{ return a & b; } -template<> EIGEN_STRONG_INLINE Packet8c pand(const Packet8c& a, const Packet8c& b) -{ return vand_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16c pand(const Packet16c& a, const Packet16c& b) -{ return vandq_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4uc pand(const Packet4uc& a, const Packet4uc& b) -{ return a & b; } -template<> EIGEN_STRONG_INLINE Packet8uc pand(const Packet8uc& a, const Packet8uc& b) -{ return vand_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pand(const Packet16uc& a, const Packet16uc& b) -{ return vandq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s pand(const Packet4s& a, const Packet4s& b) { return vand_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8s pand(const Packet8s& a, const Packet8s& b) { return vandq_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet4us pand(const Packet4us& a, const Packet4us& b) -{ return vand_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us pand(const Packet8us& a, const Packet8us& b) -{ return vandq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i pand(const Packet2i& a, const Packet2i& b) { return vand_s32(a,b); } +{ + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +} template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ui pand(const Packet2ui& a, const Packet2ui& b) -{ return vand_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui pand(const Packet4ui& a, const Packet4ui& b) -{ return vandq_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2l pand(const Packet2l& a, const Packet2l& b) { return vandq_s64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ul pand(const Packet2ul& a, const Packet2ul& b) -{ return vandq_u64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2f por(const Packet2f& a, const Packet2f& b) -{ return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) -{ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } -template<> EIGEN_STRONG_INLINE Packet4c por(const Packet4c& a, const Packet4c& b) -{ return a | b; } -template<> EIGEN_STRONG_INLINE Packet8c por(const Packet8c& a, const Packet8c& b) { return vorr_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16c por(const Packet16c& a, const Packet16c& b) -{ return vorrq_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4uc por(const Packet4uc& a, const Packet4uc& b) -{ return a | b; } -template<> EIGEN_STRONG_INLINE Packet8uc por(const Packet8uc& a, const Packet8uc& b) -{ return vorr_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc por(const Packet16uc& a, const Packet16uc& b) -{ return vorrq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s por(const Packet4s& a, const Packet4s& b) -{ return vorr_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8s por(const Packet8s& a, const Packet8s& b) -{ return vorrq_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet4us por(const Packet4us& a, const Packet4us& b) -{ return vorr_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us por(const Packet8us& a, const Packet8us& b) -{ return vorrq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i por(const Packet2i& a, const Packet2i& b) { return vorr_s32(a,b); } +{ + return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +} template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ui por(const Packet2ui& a, const Packet2ui& b) -{ return vorr_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui por(const Packet4ui& a, const Packet4ui& b) -{ return vorrq_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2l por(const Packet2l& a, const Packet2l& b) -{ return vorrq_s64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ul por(const Packet2ul& a, const Packet2ul& b) -{ return vorrq_u64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b) -{ return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) -{ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } -template<> EIGEN_STRONG_INLINE Packet4c pxor(const Packet4c& a, const Packet4c& b) -{ return a ^ b; } -template<> EIGEN_STRONG_INLINE Packet8c pxor(const Packet8c& a, const Packet8c& b) -{ return veor_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16c pxor(const Packet16c& a, const Packet16c& b) -{ return veorq_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4uc pxor(const Packet4uc& a, const Packet4uc& b) -{ return a ^ b; } -template<> EIGEN_STRONG_INLINE Packet8uc pxor(const Packet8uc& a, const Packet8uc& b) -{ return veor_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pxor(const Packet16uc& a, const Packet16uc& b) -{ return veorq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s pxor(const Packet4s& a, const Packet4s& b) { return veor_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8s pxor(const Packet8s& a, const Packet8s& b) { return veorq_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet4us pxor(const Packet4us& a, const Packet4us& b) -{ return veor_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us pxor(const Packet8us& a, const Packet8us& b) -{ return veorq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i pxor(const Packet2i& a, const Packet2i& b) { return veor_s32(a,b); } +{ + return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); +} template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ui pxor(const Packet2ui& a, const Packet2ui& b) -{ return veor_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui pxor(const Packet4ui& a, const Packet4ui& b) -{ return veorq_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2l pxor(const Packet2l& a, const Packet2l& b) -{ return veorq_s64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ul pxor(const Packet2ul& a, const Packet2ul& b) -{ return veorq_u64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2f pandnot(const Packet2f& a, const Packet2f& b) -{ return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a),vreinterpret_u32_f32(b))); } template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) -{ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } -template<> EIGEN_STRONG_INLINE Packet4c pandnot(const Packet4c& a, const Packet4c& b) -{ return a & ~b; } -template<> EIGEN_STRONG_INLINE Packet8c pandnot(const Packet8c& a, const Packet8c& b) { return vbic_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16c pandnot(const Packet16c& a, const Packet16c& b) { return vbicq_s8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4uc pandnot(const Packet4uc& a, const Packet4uc& b) -{ return a & ~b; } -template<> EIGEN_STRONG_INLINE Packet8uc pandnot(const Packet8uc& a, const Packet8uc& b) -{ return vbic_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet16uc pandnot(const Packet16uc& a, const Packet16uc& b) -{ return vbicq_u8(a,b); } -template<> EIGEN_STRONG_INLINE Packet4s pandnot(const Packet4s& a, const Packet4s& b) -{ return vbic_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8s pandnot(const Packet8s& a, const Packet8s& b) -{ return vbicq_s16(a,b); } -template<> EIGEN_STRONG_INLINE Packet4us pandnot(const Packet4us& a, const Packet4us& b) -{ return vbic_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet8us pandnot(const Packet8us& a, const Packet8us& b) -{ return vbicq_u16(a,b); } -template<> EIGEN_STRONG_INLINE Packet2i pandnot(const Packet2i& a, const Packet2i& b) -{ return vbic_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) -{ return vbicq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ui pandnot(const Packet2ui& a, const Packet2ui& b) -{ return vbic_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4ui pandnot(const Packet4ui& a, const Packet4ui& b) -{ return vbicq_u32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2l pandnot(const Packet2l& a, const Packet2l& b) -{ return vbicq_s64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2ul pandnot(const Packet2ul& a, const Packet2ul& b) -{ return vbicq_u64(a,b); } - -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pnot(const Packet2f& a) -{ return vreinterpret_f32_u32(vmvn_u32(vreinterpret_u32_f32(a))); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pnot(const Packet4f& a) -{ return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(a))); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pnot(const Packet4c& a) -{ return ~a; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pnot(const Packet8c& a) -{ return vmvn_s8(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pnot(const Packet16c& a) -{ return vmvnq_s8(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pnot(const Packet4uc& a) -{ return ~a; } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pnot(const Packet8uc& a) -{ return vmvn_u8(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pnot(const Packet16uc& a) -{ return vmvnq_u8(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pnot(const Packet4s& a) -{ return vmvn_s16(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pnot(const Packet8s& a) -{ return vmvnq_s16(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pnot(const Packet4us& a) -{ return vmvn_u16(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pnot(const Packet8us& a) -{ return vmvnq_u16(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pnot(const Packet2i& a) -{ return vmvn_s32(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pnot(const Packet4i& a) -{ return vmvnq_s32(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pnot(const Packet2ui& a) -{ return vmvn_u32(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pnot(const Packet4ui& a) -{ return vmvnq_u32(a); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pnot(const Packet2l& a) -{ return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a))); } -template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pnot(const Packet2ul& a) -{ return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(a))); } - -template EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) -{ return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); } -template EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) { return vshr_n_s8(a,N); } -template EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) { return vshrq_n_s8(a,N); } -template EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a) -{ return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); } -template EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) { return vshr_n_u8(a,N); } -template EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); } -template EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) { return vshr_n_s16(a,N); } -template EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) { return vshrq_n_s16(a,N); } -template EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) { return vshr_n_u16(a,N); } -template EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) { return vshrq_n_u16(a,N); } -template EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) { return vshr_n_s32(a,N); } -template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return vshrq_n_s32(a,N); } -template EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) { return vshr_n_u32(a,N); } -template EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); } -template EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) { return vshrq_n_s64(a,N); } -template EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); } - -template EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a) -{ return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0); } -template EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a) -{ return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a),N)); } -template EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a) -{ return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a),N)); } -template EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a) -{ return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0); } -template EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) { return vshr_n_u8(a,N); } -template EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) { return vshrq_n_u8(a,N); } -template EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a) -{ return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a),N)); } -template EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) -{ return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a),N)); } -template EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) { return vshr_n_u16(a,N); } -template EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) { return vshrq_n_u16(a,N); } -template EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a) -{ return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a),N)); } -template EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) -{ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a),N)); } -template EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) { return vshr_n_u32(a,N); } -template EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) { return vshrq_n_u32(a,N); } -template EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) -{ return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a),N)); } -template EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) { return vshrq_n_u64(a,N); } - -template EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a) -{ return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0); } -template EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) { return vshl_n_s8(a,N); } -template EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) { return vshlq_n_s8(a,N); } -template EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a) -{ return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0); } -template EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) { return vshl_n_u8(a,N); } -template EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) { return vshlq_n_u8(a,N); } -template EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) { return vshl_n_s16(a,N); } -template EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) { return vshlq_n_s16(a,N); } -template EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) { return vshl_n_u16(a,N); } -template EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) { return vshlq_n_u16(a,N); } -template EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) { return vshl_n_s32(a,N); } -template EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return vshlq_n_s32(a,N); } -template EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) { return vshl_n_u32(a,N); } -template EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) { return vshlq_n_u32(a,N); } -template EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) { return vshlq_n_s64(a,N); } -template EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { return vshlq_n_u64(a,N); } - -template<> EIGEN_STRONG_INLINE Packet2f pload(const float* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4c pload(const int8_t* from) { - Packet4c res; - memcpy(&res, from, sizeof(Packet4c)); - return res; + return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b))); } -template<> EIGEN_STRONG_INLINE Packet8c pload(const int8_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); } -template<> EIGEN_STRONG_INLINE Packet16c pload(const int8_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); } -template<> EIGEN_STRONG_INLINE Packet4uc pload(const uint8_t* from) -{ - Packet4uc res; - memcpy(&res, from, sizeof(Packet4uc)); - return res; -} -template<> EIGEN_STRONG_INLINE Packet8uc pload(const uint8_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); } -template<> EIGEN_STRONG_INLINE Packet16uc pload(const uint8_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); } -template<> EIGEN_STRONG_INLINE Packet4s pload(const int16_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); } -template<> EIGEN_STRONG_INLINE Packet8s pload(const int16_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); } -template<> EIGEN_STRONG_INLINE Packet4us pload(const uint16_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); } -template<> EIGEN_STRONG_INLINE Packet8us pload(const uint16_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); } -template<> EIGEN_STRONG_INLINE Packet2i pload(const int32_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet2ui pload(const uint32_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); } -template<> EIGEN_STRONG_INLINE Packet4ui pload(const uint32_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); } -template<> EIGEN_STRONG_INLINE Packet2l pload(const int64_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); } -template<> EIGEN_STRONG_INLINE Packet2ul pload(const uint64_t* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet2f ploadu(const float* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4c ploadu(const int8_t* from) -{ - Packet4c res; - memcpy(&res, from, sizeof(Packet4c)); - return res; -} -template<> EIGEN_STRONG_INLINE Packet8c ploadu(const int8_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from); } -template<> EIGEN_STRONG_INLINE Packet16c ploadu(const int8_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from); } -template<> EIGEN_STRONG_INLINE Packet4uc ploadu(const uint8_t* from) -{ - Packet4uc res; - memcpy(&res, from, sizeof(Packet4uc)); - return res; -} -template<> EIGEN_STRONG_INLINE Packet8uc ploadu(const uint8_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from); } -template<> EIGEN_STRONG_INLINE Packet16uc ploadu(const uint8_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from); } -template<> EIGEN_STRONG_INLINE Packet4s ploadu(const int16_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from); } -template<> EIGEN_STRONG_INLINE Packet8s ploadu(const int16_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from); } -template<> EIGEN_STRONG_INLINE Packet4us ploadu(const uint16_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from); } -template<> EIGEN_STRONG_INLINE Packet8us ploadu(const uint16_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from); } -template<> EIGEN_STRONG_INLINE Packet2i ploadu(const int32_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet2ui ploadu(const uint32_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from); } -template<> EIGEN_STRONG_INLINE Packet4ui ploadu(const uint32_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from); } -template<> EIGEN_STRONG_INLINE Packet2l ploadu(const int64_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from); } -template<> EIGEN_STRONG_INLINE Packet2ul ploadu(const uint64_t* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from); } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } + +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet2f ploaddup(const float* from) -{ return vld1_dup_f32(from); } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from+1)); } -template<> EIGEN_STRONG_INLINE Packet4c ploaddup(const int8_t* from) { - const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload(from))); - return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a,a).val[0]), 0); + float32x2_t lo, hi; + lo = vld1_dup_f32(from); + hi = vld1_dup_f32(from+1); + return vcombine_f32(lo, hi); } -template<> EIGEN_STRONG_INLINE Packet8c ploaddup(const int8_t* from) -{ - const int8x8_t a = vld1_s8(from); - return vzip_s8(a,a).val[0]; -} -template<> EIGEN_STRONG_INLINE Packet16c ploaddup(const int8_t* from) -{ - const int8x8_t a = vld1_s8(from); - const int8x8x2_t b = vzip_s8(a,a); - return vcombine_s8(b.val[0], b.val[1]); -} -template<> EIGEN_STRONG_INLINE Packet4uc ploaddup(const uint8_t* from) -{ - const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload(from))); - return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a,a).val[0]), 0); -} -template<> EIGEN_STRONG_INLINE Packet8uc ploaddup(const uint8_t* from) -{ - const uint8x8_t a = vld1_u8(from); - return vzip_u8(a,a).val[0]; -} -template<> EIGEN_STRONG_INLINE Packet16uc ploaddup(const uint8_t* from) -{ - const uint8x8_t a = vld1_u8(from); - const uint8x8x2_t b = vzip_u8(a,a); - return vcombine_u8(b.val[0], b.val[1]); -} -template<> EIGEN_STRONG_INLINE Packet4s ploaddup(const int16_t* from) -{ - return vreinterpret_s16_u32(vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), - vreinterpret_u32_s16(vld1_dup_s16(from+1))).val[0]); -} -template<> EIGEN_STRONG_INLINE Packet8s ploaddup(const int16_t* from) -{ - const int16x4_t a = vld1_s16(from); - const int16x4x2_t b = vzip_s16(a,a); - return vcombine_s16(b.val[0], b.val[1]); -} -template<> EIGEN_STRONG_INLINE Packet4us ploaddup(const uint16_t* from) -{ - return vreinterpret_u16_u32(vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), - vreinterpret_u32_u16(vld1_dup_u16(from+1))).val[0]); -} -template<> EIGEN_STRONG_INLINE Packet8us ploaddup(const uint16_t* from) -{ - const uint16x4_t a = vld1_u16(from); - const uint16x4x2_t b = vzip_u16(a,a); - return vcombine_u16(b.val[0], b.val[1]); -} -template<> EIGEN_STRONG_INLINE Packet2i ploaddup(const int32_t* from) -{ return vld1_dup_s32(from); } template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) -{ return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from+1)); } -template<> EIGEN_STRONG_INLINE Packet2ui ploaddup(const uint32_t* from) -{ return vld1_dup_u32(from); } -template<> EIGEN_STRONG_INLINE Packet4ui ploaddup(const uint32_t* from) -{ return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from+1)); } -template<> EIGEN_STRONG_INLINE Packet2l ploaddup(const int64_t* from) -{ return vld1q_dup_s64(from); } -template<> EIGEN_STRONG_INLINE Packet2ul ploaddup(const uint64_t* from) -{ return vld1q_dup_u64(from); } +{ + int32x2_t lo, hi; + lo = vld1_dup_s32(from); + hi = vld1_dup_s32(from+1); + return vcombine_s32(lo, hi); +} -template<> EIGEN_STRONG_INLINE Packet4f ploadquad(const float* from) { return vld1q_dup_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4c ploadquad(const int8_t* from) -{ return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0); } -template<> EIGEN_STRONG_INLINE Packet8c ploadquad(const int8_t* from) -{ - return vreinterpret_s8_u32(vzip_u32( - vreinterpret_u32_s8(vld1_dup_s8(from)), - vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]); -} -template<> EIGEN_STRONG_INLINE Packet16c ploadquad(const int8_t* from) -{ - const int8x8_t a = vreinterpret_s8_u32(vzip_u32( - vreinterpret_u32_s8(vld1_dup_s8(from)), - vreinterpret_u32_s8(vld1_dup_s8(from+1))).val[0]); - const int8x8_t b = vreinterpret_s8_u32(vzip_u32( - vreinterpret_u32_s8(vld1_dup_s8(from+2)), - vreinterpret_u32_s8(vld1_dup_s8(from+3))).val[0]); - return vcombine_s8(a,b); -} -template<> EIGEN_STRONG_INLINE Packet4uc ploadquad(const uint8_t* from) -{ return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0); } -template<> EIGEN_STRONG_INLINE Packet8uc ploadquad(const uint8_t* from) -{ - return vreinterpret_u8_u32(vzip_u32( - vreinterpret_u32_u8(vld1_dup_u8(from)), - vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]); -} -template<> EIGEN_STRONG_INLINE Packet16uc ploadquad(const uint8_t* from) -{ - const uint8x8_t a = vreinterpret_u8_u32(vzip_u32( - vreinterpret_u32_u8(vld1_dup_u8(from)), - vreinterpret_u32_u8(vld1_dup_u8(from+1))).val[0]); - const uint8x8_t b = vreinterpret_u8_u32(vzip_u32( - vreinterpret_u32_u8(vld1_dup_u8(from+2)), - vreinterpret_u32_u8(vld1_dup_u8(from+3))).val[0]); - return vcombine_u8(a,b); -} -template<> EIGEN_STRONG_INLINE Packet8s ploadquad(const int16_t* from) -{ return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from+1)); } -template<> EIGEN_STRONG_INLINE Packet8us ploadquad(const uint16_t* from) -{ return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from+1)); } -template<> EIGEN_STRONG_INLINE Packet4i ploadquad(const int32_t* from) { return vld1q_dup_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4ui ploadquad(const uint32_t* from) { return vld1q_dup_u32(from); } +template<> EIGEN_STRONG_INLINE void pstore (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet2f& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet4c& from) -{ memcpy(to, &from, sizeof(from)); } -template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet8c& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet16c& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet4uc& from) -{ memcpy(to, &from, sizeof(from)); } -template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet8uc& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet16uc& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet4s& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet8s& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet4us& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet8us& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet2i& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet2ui& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet4ui& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(int64_t* to, const Packet2l& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to,from); } -template<> EIGEN_STRONG_INLINE void pstore(uint64_t* to, const Packet2ul& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet2f& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet4c& from) -{ memcpy(to, &from, sizeof(from)); } -template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet8c& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int8_t* to, const Packet16c& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet4uc& from) -{ memcpy(to, &from, sizeof(from)); } -template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet8uc& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(uint8_t* to, const Packet16uc& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int16_t* to, const Packet4s& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int16_t* to, const Packet8s& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(uint16_t* to, const Packet4us& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(uint16_t* to, const Packet8us& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet2i& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(uint32_t* to, const Packet2ui& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(uint32_t* to, const Packet4ui& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int64_t* to, const Packet2l& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to,from); } -template<> EIGEN_STRONG_INLINE void pstoreu(uint64_t* to, const Packet2ul& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to,from); } - -template<> EIGEN_DEVICE_FUNC inline Packet2f pgather(const float* from, Index stride) -{ - Packet2f res = vld1_dup_f32(from); - res = vld1_lane_f32(from + 1*stride, res, 1); - return res; -} template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - Packet4f res = vld1q_dup_f32(from); - res = vld1q_lane_f32(from + 1*stride, res, 1); - res = vld1q_lane_f32(from + 2*stride, res, 2); - res = vld1q_lane_f32(from + 3*stride, res, 3); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet4c pgather(const int8_t* from, Index stride) -{ - Packet4c res; - for (int i = 0; i != 4; i++) - reinterpret_cast(&res)[i] = *(from + i * stride); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet8c pgather(const int8_t* from, Index stride) -{ - Packet8c res = vld1_dup_s8(from); - res = vld1_lane_s8(from + 1*stride, res, 1); - res = vld1_lane_s8(from + 2*stride, res, 2); - res = vld1_lane_s8(from + 3*stride, res, 3); - res = vld1_lane_s8(from + 4*stride, res, 4); - res = vld1_lane_s8(from + 5*stride, res, 5); - res = vld1_lane_s8(from + 6*stride, res, 6); - res = vld1_lane_s8(from + 7*stride, res, 7); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet16c pgather(const int8_t* from, Index stride) -{ - Packet16c res = vld1q_dup_s8(from); - res = vld1q_lane_s8(from + 1*stride, res, 1); - res = vld1q_lane_s8(from + 2*stride, res, 2); - res = vld1q_lane_s8(from + 3*stride, res, 3); - res = vld1q_lane_s8(from + 4*stride, res, 4); - res = vld1q_lane_s8(from + 5*stride, res, 5); - res = vld1q_lane_s8(from + 6*stride, res, 6); - res = vld1q_lane_s8(from + 7*stride, res, 7); - res = vld1q_lane_s8(from + 8*stride, res, 8); - res = vld1q_lane_s8(from + 9*stride, res, 9); - res = vld1q_lane_s8(from + 10*stride, res, 10); - res = vld1q_lane_s8(from + 11*stride, res, 11); - res = vld1q_lane_s8(from + 12*stride, res, 12); - res = vld1q_lane_s8(from + 13*stride, res, 13); - res = vld1q_lane_s8(from + 14*stride, res, 14); - res = vld1q_lane_s8(from + 15*stride, res, 15); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet4uc pgather(const uint8_t* from, Index stride) -{ - Packet4uc res; - for (int i = 0; i != 4; i++) - reinterpret_cast(&res)[i] = *(from + i * stride); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet8uc pgather(const uint8_t* from, Index stride) -{ - Packet8uc res = vld1_dup_u8(from); - res = vld1_lane_u8(from + 1*stride, res, 1); - res = vld1_lane_u8(from + 2*stride, res, 2); - res = vld1_lane_u8(from + 3*stride, res, 3); - res = vld1_lane_u8(from + 4*stride, res, 4); - res = vld1_lane_u8(from + 5*stride, res, 5); - res = vld1_lane_u8(from + 6*stride, res, 6); - res = vld1_lane_u8(from + 7*stride, res, 7); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet16uc pgather(const uint8_t* from, Index stride) -{ - Packet16uc res = vld1q_dup_u8(from); - res = vld1q_lane_u8(from + 1*stride, res, 1); - res = vld1q_lane_u8(from + 2*stride, res, 2); - res = vld1q_lane_u8(from + 3*stride, res, 3); - res = vld1q_lane_u8(from + 4*stride, res, 4); - res = vld1q_lane_u8(from + 5*stride, res, 5); - res = vld1q_lane_u8(from + 6*stride, res, 6); - res = vld1q_lane_u8(from + 7*stride, res, 7); - res = vld1q_lane_u8(from + 8*stride, res, 8); - res = vld1q_lane_u8(from + 9*stride, res, 9); - res = vld1q_lane_u8(from + 10*stride, res, 10); - res = vld1q_lane_u8(from + 11*stride, res, 11); - res = vld1q_lane_u8(from + 12*stride, res, 12); - res = vld1q_lane_u8(from + 13*stride, res, 13); - res = vld1q_lane_u8(from + 14*stride, res, 14); - res = vld1q_lane_u8(from + 15*stride, res, 15); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet4s pgather(const int16_t* from, Index stride) -{ - Packet4s res = vld1_dup_s16(from); - res = vld1_lane_s16(from + 1*stride, res, 1); - res = vld1_lane_s16(from + 2*stride, res, 2); - res = vld1_lane_s16(from + 3*stride, res, 3); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet8s pgather(const int16_t* from, Index stride) -{ - Packet8s res = vld1q_dup_s16(from); - res = vld1q_lane_s16(from + 1*stride, res, 1); - res = vld1q_lane_s16(from + 2*stride, res, 2); - res = vld1q_lane_s16(from + 3*stride, res, 3); - res = vld1q_lane_s16(from + 4*stride, res, 4); - res = vld1q_lane_s16(from + 5*stride, res, 5); - res = vld1q_lane_s16(from + 6*stride, res, 6); - res = vld1q_lane_s16(from + 7*stride, res, 7); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet4us pgather(const uint16_t* from, Index stride) -{ - Packet4us res = vld1_dup_u16(from); - res = vld1_lane_u16(from + 1*stride, res, 1); - res = vld1_lane_u16(from + 2*stride, res, 2); - res = vld1_lane_u16(from + 3*stride, res, 3); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet8us pgather(const uint16_t* from, Index stride) -{ - Packet8us res = vld1q_dup_u16(from); - res = vld1q_lane_u16(from + 1*stride, res, 1); - res = vld1q_lane_u16(from + 2*stride, res, 2); - res = vld1q_lane_u16(from + 3*stride, res, 3); - res = vld1q_lane_u16(from + 4*stride, res, 4); - res = vld1q_lane_u16(from + 5*stride, res, 5); - res = vld1q_lane_u16(from + 6*stride, res, 6); - res = vld1q_lane_u16(from + 7*stride, res, 7); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet2i pgather(const int32_t* from, Index stride) -{ - Packet2i res = vld1_dup_s32(from); - res = vld1_lane_s32(from + 1*stride, res, 1); + Packet4f res = pset1(0.f); + res = vsetq_lane_f32(from[0*stride], res, 0); + res = vsetq_lane_f32(from[1*stride], res, 1); + res = vsetq_lane_f32(from[2*stride], res, 2); + res = vsetq_lane_f32(from[3*stride], res, 3); return res; } template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) { - Packet4i res = vld1q_dup_s32(from); - res = vld1q_lane_s32(from + 1*stride, res, 1); - res = vld1q_lane_s32(from + 2*stride, res, 2); - res = vld1q_lane_s32(from + 3*stride, res, 3); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet2ui pgather(const uint32_t* from, Index stride) -{ - Packet2ui res = vld1_dup_u32(from); - res = vld1_lane_u32(from + 1*stride, res, 1); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet4ui pgather(const uint32_t* from, Index stride) -{ - Packet4ui res = vld1q_dup_u32(from); - res = vld1q_lane_u32(from + 1*stride, res, 1); - res = vld1q_lane_u32(from + 2*stride, res, 2); - res = vld1q_lane_u32(from + 3*stride, res, 3); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet2l pgather(const int64_t* from, Index stride) -{ - Packet2l res = vld1q_dup_s64(from); - res = vld1q_lane_s64(from + 1*stride, res, 1); - return res; -} -template<> EIGEN_DEVICE_FUNC inline Packet2ul pgather(const uint64_t* from, Index stride) -{ - Packet2ul res = vld1q_dup_u64(from); - res = vld1q_lane_u64(from + 1*stride, res, 1); + Packet4i res = pset1(0); + res = vsetq_lane_s32(from[0*stride], res, 0); + res = vsetq_lane_s32(from[1*stride], res, 1); + res = vsetq_lane_s32(from[2*stride], res, 2); + res = vsetq_lane_s32(from[3*stride], res, 3); return res; } -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet2f& from, Index stride) -{ - vst1_lane_f32(to + stride*0, from, 0); - vst1_lane_f32(to + stride*1, from, 1); -} template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - vst1q_lane_f32(to + stride*0, from, 0); - vst1q_lane_f32(to + stride*1, from, 1); - vst1q_lane_f32(to + stride*2, from, 2); - vst1q_lane_f32(to + stride*3, from, 3); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int8_t* to, const Packet4c& from, Index stride) -{ - for (int i = 0; i != 4; i++) - *(to + i * stride) = reinterpret_cast(&from)[i]; -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int8_t* to, const Packet8c& from, Index stride) -{ - vst1_lane_s8(to + stride*0, from, 0); - vst1_lane_s8(to + stride*1, from, 1); - vst1_lane_s8(to + stride*2, from, 2); - vst1_lane_s8(to + stride*3, from, 3); - vst1_lane_s8(to + stride*4, from, 4); - vst1_lane_s8(to + stride*5, from, 5); - vst1_lane_s8(to + stride*6, from, 6); - vst1_lane_s8(to + stride*7, from, 7); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int8_t* to, const Packet16c& from, Index stride) -{ - vst1q_lane_s8(to + stride*0, from, 0); - vst1q_lane_s8(to + stride*1, from, 1); - vst1q_lane_s8(to + stride*2, from, 2); - vst1q_lane_s8(to + stride*3, from, 3); - vst1q_lane_s8(to + stride*4, from, 4); - vst1q_lane_s8(to + stride*5, from, 5); - vst1q_lane_s8(to + stride*6, from, 6); - vst1q_lane_s8(to + stride*7, from, 7); - vst1q_lane_s8(to + stride*8, from, 8); - vst1q_lane_s8(to + stride*9, from, 9); - vst1q_lane_s8(to + stride*10, from, 10); - vst1q_lane_s8(to + stride*11, from, 11); - vst1q_lane_s8(to + stride*12, from, 12); - vst1q_lane_s8(to + stride*13, from, 13); - vst1q_lane_s8(to + stride*14, from, 14); - vst1q_lane_s8(to + stride*15, from, 15); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(uint8_t* to, const Packet4uc& from, Index stride) -{ - for (int i = 0; i != 4; i++) - *(to + i * stride) = reinterpret_cast(&from)[i]; -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(uint8_t* to, const Packet8uc& from, Index stride) -{ - vst1_lane_u8(to + stride*0, from, 0); - vst1_lane_u8(to + stride*1, from, 1); - vst1_lane_u8(to + stride*2, from, 2); - vst1_lane_u8(to + stride*3, from, 3); - vst1_lane_u8(to + stride*4, from, 4); - vst1_lane_u8(to + stride*5, from, 5); - vst1_lane_u8(to + stride*6, from, 6); - vst1_lane_u8(to + stride*7, from, 7); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(uint8_t* to, const Packet16uc& from, Index stride) -{ - vst1q_lane_u8(to + stride*0, from, 0); - vst1q_lane_u8(to + stride*1, from, 1); - vst1q_lane_u8(to + stride*2, from, 2); - vst1q_lane_u8(to + stride*3, from, 3); - vst1q_lane_u8(to + stride*4, from, 4); - vst1q_lane_u8(to + stride*5, from, 5); - vst1q_lane_u8(to + stride*6, from, 6); - vst1q_lane_u8(to + stride*7, from, 7); - vst1q_lane_u8(to + stride*8, from, 8); - vst1q_lane_u8(to + stride*9, from, 9); - vst1q_lane_u8(to + stride*10, from, 10); - vst1q_lane_u8(to + stride*11, from, 11); - vst1q_lane_u8(to + stride*12, from, 12); - vst1q_lane_u8(to + stride*13, from, 13); - vst1q_lane_u8(to + stride*14, from, 14); - vst1q_lane_u8(to + stride*15, from, 15); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int16_t* to, const Packet4s& from, Index stride) -{ - vst1_lane_s16(to + stride*0, from, 0); - vst1_lane_s16(to + stride*1, from, 1); - vst1_lane_s16(to + stride*2, from, 2); - vst1_lane_s16(to + stride*3, from, 3); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int16_t* to, const Packet8s& from, Index stride) -{ - vst1q_lane_s16(to + stride*0, from, 0); - vst1q_lane_s16(to + stride*1, from, 1); - vst1q_lane_s16(to + stride*2, from, 2); - vst1q_lane_s16(to + stride*3, from, 3); - vst1q_lane_s16(to + stride*4, from, 4); - vst1q_lane_s16(to + stride*5, from, 5); - vst1q_lane_s16(to + stride*6, from, 6); - vst1q_lane_s16(to + stride*7, from, 7); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(uint16_t* to, const Packet4us& from, Index stride) -{ - vst1_lane_u16(to + stride*0, from, 0); - vst1_lane_u16(to + stride*1, from, 1); - vst1_lane_u16(to + stride*2, from, 2); - vst1_lane_u16(to + stride*3, from, 3); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(uint16_t* to, const Packet8us& from, Index stride) -{ - vst1q_lane_u16(to + stride*0, from, 0); - vst1q_lane_u16(to + stride*1, from, 1); - vst1q_lane_u16(to + stride*2, from, 2); - vst1q_lane_u16(to + stride*3, from, 3); - vst1q_lane_u16(to + stride*4, from, 4); - vst1q_lane_u16(to + stride*5, from, 5); - vst1q_lane_u16(to + stride*6, from, 6); - vst1q_lane_u16(to + stride*7, from, 7); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet2i& from, Index stride) -{ - vst1_lane_s32(to + stride*0, from, 0); - vst1_lane_s32(to + stride*1, from, 1); + to[stride*0] = vgetq_lane_f32(from, 0); + to[stride*1] = vgetq_lane_f32(from, 1); + to[stride*2] = vgetq_lane_f32(from, 2); + to[stride*3] = vgetq_lane_f32(from, 3); } template<> EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, Index stride) { - vst1q_lane_s32(to + stride*0, from, 0); - vst1q_lane_s32(to + stride*1, from, 1); - vst1q_lane_s32(to + stride*2, from, 2); - vst1q_lane_s32(to + stride*3, from, 3); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(uint32_t* to, const Packet2ui& from, Index stride) -{ - vst1_lane_u32(to + stride*0, from, 0); - vst1_lane_u32(to + stride*1, from, 1); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(uint32_t* to, const Packet4ui& from, Index stride) -{ - vst1q_lane_u32(to + stride*0, from, 0); - vst1q_lane_u32(to + stride*1, from, 1); - vst1q_lane_u32(to + stride*2, from, 2); - vst1q_lane_u32(to + stride*3, from, 3); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(int64_t* to, const Packet2l& from, Index stride) -{ - vst1q_lane_s64(to + stride*0, from, 0); - vst1q_lane_s64(to + stride*1, from, 1); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter(uint64_t* to, const Packet2ul& from, Index stride) -{ - vst1q_lane_u64(to + stride*0, from, 0); - vst1q_lane_u64(to + stride*1, from, 1); + to[stride*0] = vgetq_lane_s32(from, 0); + to[stride*1] = vgetq_lane_s32(from, 1); + to[stride*2] = vgetq_lane_s32(from, 2); + to[stride*3] = vgetq_lane_s32(from, 3); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int8_t* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const uint8_t* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int16_t* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const uint16_t* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const uint32_t* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int64_t* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const uint64_t* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch (const float* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet2f& a) { return vget_lane_f32(a,0); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return vgetq_lane_f32(a,0); } -template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet4c& a) { return static_cast(a & 0xff); } -template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet8c& a) { return vget_lane_s8(a,0); } -template<> EIGEN_STRONG_INLINE int8_t pfirst(const Packet16c& a) { return vgetq_lane_s8(a,0); } -template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet4uc& a) { return static_cast(a & 0xff); } -template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet8uc& a) { return vget_lane_u8(a,0); } -template<> EIGEN_STRONG_INLINE uint8_t pfirst(const Packet16uc& a) { return vgetq_lane_u8(a,0); } -template<> EIGEN_STRONG_INLINE int16_t pfirst(const Packet4s& a) { return vget_lane_s16(a,0); } -template<> EIGEN_STRONG_INLINE int16_t pfirst(const Packet8s& a) { return vgetq_lane_s16(a,0); } -template<> EIGEN_STRONG_INLINE uint16_t pfirst(const Packet4us& a) { return vget_lane_u16(a,0); } -template<> EIGEN_STRONG_INLINE uint16_t pfirst(const Packet8us& a) { return vgetq_lane_u16(a,0); } -template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet2i& a) { return vget_lane_s32(a,0); } -template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { return vgetq_lane_s32(a,0); } -template<> EIGEN_STRONG_INLINE uint32_t pfirst(const Packet2ui& a) { return vget_lane_u32(a,0); } -template<> EIGEN_STRONG_INLINE uint32_t pfirst(const Packet4ui& a) { return vgetq_lane_u32(a,0); } -template<> EIGEN_STRONG_INLINE int64_t pfirst(const Packet2l& a) { return vgetq_lane_s64(a,0); } -template<> EIGEN_STRONG_INLINE uint64_t pfirst(const Packet2ul& a) { return vgetq_lane_u64(a,0); } +// FIXME only store the 2 first elements ? +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) { return vrev64_f32(a); } -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ - const float32x4_t a_r64 = vrev64q_f32(a); - return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64)); -} -template<> EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a) -{ return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } -template<> EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) { return vrev64_s8(a); } -template<> EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) -{ - const int8x16_t a_r64 = vrev64q_s8(a); - return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64)); -} -template<> EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a) -{ return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0); } -template<> EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) { return vrev64_u8(a); } -template<> EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) -{ - const uint8x16_t a_r64 = vrev64q_u8(a); - return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64)); -} -template<> EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) { return vrev64_s16(a); } -template<> EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) -{ - const int16x8_t a_r64 = vrev64q_s16(a); - return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64)); -} -template<> EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) { return vrev64_u16(a); } -template<> EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) -{ - const uint16x8_t a_r64 = vrev64q_u16(a); - return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64)); -} -template<> EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) { return vrev64_s32(a); } -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) -{ - const int32x4_t a_r64 = vrev64q_s32(a); - return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64)); -} -template<> EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) { return vrev64_u32(a); } -template<> EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) -{ - const uint32x4_t a_r64 = vrev64q_u32(a); - return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64)); -} -template<> EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) -{ return vcombine_s64(vget_high_s64(a), vget_low_s64(a)); } -template<> EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) -{ return vcombine_u64(vget_high_u64(a), vget_low_u64(a)); } +template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { + float32x2_t a_lo, a_hi; + Packet4f a_r64; + + a_r64 = vrev64q_f32(a); + a_lo = vget_low_f32(a_r64); + a_hi = vget_high_f32(a_r64); + return vcombine_f32(a_hi, a_lo); +} +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { + int32x2_t a_lo, a_hi; + Packet4i a_r64; + + a_r64 = vrev64q_s32(a); + a_lo = vget_low_s32(a_r64); + a_hi = vget_high_s32(a_r64); + return vcombine_s32(a_hi, a_lo); +} -template<> EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) { return vabs_f32(a); } template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); } -template<> EIGEN_STRONG_INLINE Packet4c pabs(const Packet4c& a) -{ return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0); } -template<> EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) { return vabs_s8(a); } -template<> EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) { return vabsq_s8(a); } -template<> EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) { return vabs_s16(a); } -template<> EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) { return vabsq_s16(a); } -template<> EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) { return vabs_s32(a); } template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); } -template<> EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) { -#if EIGEN_ARCH_ARM64 - return vabsq_s64(a); -#else - return vcombine_s64( - vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), - vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1)))); -#endif -} -template<> EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2f pfrexp(const Packet2f& a, Packet2f& exponent) -{ return pfrexp_float(a,exponent); } -template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) -{ return pfrexp_float(a,exponent); } - -template<> EIGEN_STRONG_INLINE Packet2f pldexp(const Packet2f& a, const Packet2f& exponent) -{ return pldexp_float(a,exponent); } -template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) -{ return pldexp_float(a,exponent); } - -template<> EIGEN_STRONG_INLINE float predux(const Packet2f& a) { return vget_lane_f32(vpadd_f32(a,a), 0); } template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { - const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a)); - return vget_lane_f32(vpadd_f32(sum, sum), 0); + float32x2_t a_lo, a_hi, sum; + + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + sum = vpadd_f32(a_lo, a_hi); + sum = vpadd_f32(sum, sum); + return vget_lane_f32(sum, 0); } -template<> EIGEN_STRONG_INLINE int8_t predux(const Packet4c& a) + +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) { - const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); - int8x8_t sum = vpadd_s8(a_dup, a_dup); - sum = vpadd_s8(sum, sum); - return vget_lane_s8(sum, 0); + float32x4x2_t vtrn1, vtrn2, res1, res2; + Packet4f sum1, sum2, sum; + + // NEON zip performs interleaving of the supplied vectors. + // We perform two interleaves in a row to acquire the transposed vector + vtrn1 = vzipq_f32(vecs[0], vecs[2]); + vtrn2 = vzipq_f32(vecs[1], vecs[3]); + res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]); + res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]); + + // Do the addition of the resulting vectors + sum1 = vaddq_f32(res1.val[0], res1.val[1]); + sum2 = vaddq_f32(res2.val[0], res2.val[1]); + sum = vaddq_f32(sum1, sum2); + + return sum; } -template<> EIGEN_STRONG_INLINE int8_t predux(const Packet8c& a) -{ - int8x8_t sum = vpadd_s8(a,a); - sum = vpadd_s8(sum, sum); - sum = vpadd_s8(sum, sum); - return vget_lane_s8(sum, 0); -} -template<> EIGEN_STRONG_INLINE int8_t predux(const Packet16c& a) -{ - int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a)); - sum = vpadd_s8(sum, sum); - sum = vpadd_s8(sum, sum); - sum = vpadd_s8(sum, sum); - return vget_lane_s8(sum, 0); -} -template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet4uc& a) -{ - const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); - uint8x8_t sum = vpadd_u8(a_dup, a_dup); - sum = vpadd_u8(sum, sum); - return vget_lane_u8(sum, 0); -} -template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet8uc& a) -{ - uint8x8_t sum = vpadd_u8(a,a); - sum = vpadd_u8(sum, sum); - sum = vpadd_u8(sum, sum); - return vget_lane_u8(sum, 0); -} -template<> EIGEN_STRONG_INLINE uint8_t predux(const Packet16uc& a) -{ - uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a)); - sum = vpadd_u8(sum, sum); - sum = vpadd_u8(sum, sum); - sum = vpadd_u8(sum, sum); - return vget_lane_u8(sum, 0); -} -template<> EIGEN_STRONG_INLINE int16_t predux(const Packet4s& a) -{ - const int16x4_t sum = vpadd_s16(a,a); - return vget_lane_s16(vpadd_s16(sum, sum), 0); -} -template<> EIGEN_STRONG_INLINE int16_t predux(const Packet8s& a) -{ - int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a)); - sum = vpadd_s16(sum, sum); - sum = vpadd_s16(sum, sum); - return vget_lane_s16(sum, 0); -} -template<> EIGEN_STRONG_INLINE uint16_t predux(const Packet4us& a) -{ - const uint16x4_t sum = vpadd_u16(a,a); - return vget_lane_u16(vpadd_u16(sum, sum), 0); -} -template<> EIGEN_STRONG_INLINE uint16_t predux(const Packet8us& a) -{ - uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a)); - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - return vget_lane_u16(sum, 0); -} -template<> EIGEN_STRONG_INLINE int32_t predux(const Packet2i& a) { return vget_lane_s32(vpadd_s32(a,a), 0); } + template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { - const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a)); - return vget_lane_s32(vpadd_s32(sum, sum), 0); -} -template<> EIGEN_STRONG_INLINE uint32_t predux(const Packet2ui& a) { return vget_lane_u32(vpadd_u32(a,a), 0); } -template<> EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) -{ - const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a)); - return vget_lane_u32(vpadd_u32(sum, sum), 0); -} -template<> EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) -{ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); } -template<> EIGEN_STRONG_INLINE uint64_t predux(const Packet2ul& a) -{ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); } + int32x2_t a_lo, a_hi, sum; -template<> EIGEN_DEVICE_FUNC inline Packet4c predux_half_dowto4(const Packet8c& a) -{ - return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, - vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0); + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + sum = vpadd_s32(a_lo, a_hi); + sum = vpadd_s32(sum, sum); + return vget_lane_s32(sum, 0); } -template<> EIGEN_DEVICE_FUNC inline Packet8c predux_half_dowto4(const Packet16c& a) -{ return vadd_s8(vget_high_s8(a), vget_low_s8(a)); } -template<> EIGEN_DEVICE_FUNC inline Packet4uc predux_half_dowto4(const Packet8uc& a) + +template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) { - return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, - vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0); + int32x4x2_t vtrn1, vtrn2, res1, res2; + Packet4i sum1, sum2, sum; + + // NEON zip performs interleaving of the supplied vectors. + // We perform two interleaves in a row to acquire the transposed vector + vtrn1 = vzipq_s32(vecs[0], vecs[2]); + vtrn2 = vzipq_s32(vecs[1], vecs[3]); + res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]); + res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]); + + // Do the addition of the resulting vectors + sum1 = vaddq_s32(res1.val[0], res1.val[1]); + sum2 = vaddq_s32(res2.val[0], res2.val[1]); + sum = vaddq_s32(sum1, sum2); + + return sum; } -template<> EIGEN_DEVICE_FUNC inline Packet8uc predux_half_dowto4(const Packet16uc& a) -{ return vadd_u8(vget_high_u8(a), vget_low_u8(a)); } -template<> EIGEN_DEVICE_FUNC inline Packet4s predux_half_dowto4(const Packet8s& a) -{ return vadd_s16(vget_high_s16(a), vget_low_s16(a)); } -template<> EIGEN_DEVICE_FUNC inline Packet4us predux_half_dowto4(const Packet8us& a) -{ return vadd_u16(vget_high_u16(a), vget_low_u16(a)); } // Other reduction functions: // mul -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet2f& a) -{ return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); } template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); } -template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet4c& a) { - int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a)); - prod = vmul_s8(prod, vrev16_s8(prod)); - return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2); -} -template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet8c& a) -{ - int8x8_t prod = vmul_s8(a, vrev16_s8(a)); - prod = vmul_s8(prod, vrev32_s8(prod)); - return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4); -} -template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet16c& a) -{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); } -template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet4uc& a) -{ - uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a)); - prod = vmul_u8(prod, vrev16_u8(prod)); - return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2); -} -template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet8uc& a) -{ - uint8x8_t prod = vmul_u8(a, vrev16_u8(a)); - prod = vmul_u8(prod, vrev32_u8(prod)); - return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4); -} -template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet16uc& a) -{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); } -template<> EIGEN_STRONG_INLINE int16_t predux_mul(const Packet4s& a) -{ - const int16x4_t prod = vmul_s16(a, vrev32_s16(a)); - return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2); -} -template<> EIGEN_STRONG_INLINE int16_t predux_mul(const Packet8s& a) -{ - int16x4_t prod; + float32x2_t a_lo, a_hi, prod; - // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8| - prod = vmul_s16(vget_low_s16(a), vget_high_s16(a)); - // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8| - prod = vmul_s16(prod, vrev32_s16(prod)); - // Multiply |a1*a5*a2*a6*a3*a7*a4*a8| - return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2); -} -template<> EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet4us& a) -{ - const uint16x4_t prod = vmul_u16(a, vrev32_u16(a)); - return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2); -} -template<> EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet8us& a) -{ - uint16x4_t prod; + // Get a_lo = |a1|a2| and a_hi = |a3|a4| + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| + prod = vmul_f32(a_lo, a_hi); + // Multiply prod with its swapped value |a2*a4|a1*a3| + prod = vmul_f32(prod, vrev64_f32(prod)); - // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8| - prod = vmul_u16(vget_low_u16(a), vget_high_u16(a)); - // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8| - prod = vmul_u16(prod, vrev32_u16(prod)); - // Multiply |a1*a5*a2*a6*a3*a7*a4*a8| - return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2); + return vget_lane_f32(prod, 0); } -template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet2i& a) -{ return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); } template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) -{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); } -template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet2ui& a) -{ return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); } -template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) -{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); } -template<> EIGEN_STRONG_INLINE int64_t predux_mul(const Packet2l& a) -{ return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); } -template<> EIGEN_STRONG_INLINE uint64_t predux_mul(const Packet2ul& a) -{ return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1); } +{ + int32x2_t a_lo, a_hi, prod; + + // Get a_lo = |a1|a2| and a_hi = |a3|a4| + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + // Get the product of a_lo * a_hi -> |a1*a3|a2*a4| + prod = vmul_s32(a_lo, a_hi); + // Multiply prod with its swapped value |a2*a4|a1*a3| + prod = vmul_s32(prod, vrev64_s32(prod)); + + return vget_lane_s32(prod, 0); +} // min -template<> EIGEN_STRONG_INLINE float predux_min(const Packet2f& a) -{ return vget_lane_f32(vpmin_f32(a,a), 0); } template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { - const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a)); - return vget_lane_f32(vpmin_f32(min, min), 0); + float32x2_t a_lo, a_hi, min; + + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + min = vpmin_f32(a_lo, a_hi); + min = vpmin_f32(min, min); + + return vget_lane_f32(min, 0); } -template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet4c& a) -{ - const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); - int8x8_t min = vpmin_s8(a_dup, a_dup); - min = vpmin_s8(min, min); - return vget_lane_s8(min, 0); -} -template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet8c& a) -{ - int8x8_t min = vpmin_s8(a,a); - min = vpmin_s8(min, min); - min = vpmin_s8(min, min); - return vget_lane_s8(min, 0); -} -template<> EIGEN_STRONG_INLINE int8_t predux_min(const Packet16c& a) -{ - int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a)); - min = vpmin_s8(min, min); - min = vpmin_s8(min, min); - min = vpmin_s8(min, min); - return vget_lane_s8(min, 0); -} -template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet4uc& a) -{ - const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); - uint8x8_t min = vpmin_u8(a_dup, a_dup); - min = vpmin_u8(min, min); - return vget_lane_u8(min, 0); -} -template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet8uc& a) -{ - uint8x8_t min = vpmin_u8(a,a); - min = vpmin_u8(min, min); - min = vpmin_u8(min, min); - return vget_lane_u8(min, 0); -} -template<> EIGEN_STRONG_INLINE uint8_t predux_min(const Packet16uc& a) -{ - uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a)); - min = vpmin_u8(min, min); - min = vpmin_u8(min, min); - min = vpmin_u8(min, min); - return vget_lane_u8(min, 0); -} -template<> EIGEN_STRONG_INLINE int16_t predux_min(const Packet4s& a) -{ - const int16x4_t min = vpmin_s16(a,a); - return vget_lane_s16(vpmin_s16(min, min), 0); -} -template<> EIGEN_STRONG_INLINE int16_t predux_min(const Packet8s& a) -{ - int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a)); - min = vpmin_s16(min, min); - min = vpmin_s16(min, min); - return vget_lane_s16(min, 0); -} -template<> EIGEN_STRONG_INLINE uint16_t predux_min(const Packet4us& a) -{ - const uint16x4_t min = vpmin_u16(a,a); - return vget_lane_u16(vpmin_u16(min, min), 0); -} -template<> EIGEN_STRONG_INLINE uint16_t predux_min(const Packet8us& a) -{ - uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a)); - min = vpmin_u16(min, min); - min = vpmin_u16(min, min); - return vget_lane_u16(min, 0); -} -template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet2i& a) -{ return vget_lane_s32(vpmin_s32(a,a), 0); } + template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { - const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a)); - return vget_lane_s32(vpmin_s32(min, min), 0); + int32x2_t a_lo, a_hi, min; + + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + min = vpmin_s32(a_lo, a_hi); + min = vpmin_s32(min, min); + + return vget_lane_s32(min, 0); } -template<> EIGEN_STRONG_INLINE uint32_t predux_min(const Packet2ui& a) -{ return vget_lane_u32(vpmin_u32(a,a), 0); } -template<> EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) -{ - const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a)); - return vget_lane_u32(vpmin_u32(min, min), 0); -} -template<> EIGEN_STRONG_INLINE int64_t predux_min(const Packet2l& a) -{ return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); } -template<> EIGEN_STRONG_INLINE uint64_t predux_min(const Packet2ul& a) -{ return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); } // max -template<> EIGEN_STRONG_INLINE float predux_max(const Packet2f& a) -{ return vget_lane_f32(vpmax_f32(a,a), 0); } template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { - const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a)); - return vget_lane_f32(vpmax_f32(max, max), 0); + float32x2_t a_lo, a_hi, max; + + a_lo = vget_low_f32(a); + a_hi = vget_high_f32(a); + max = vpmax_f32(a_lo, a_hi); + max = vpmax_f32(max, max); + + return vget_lane_f32(max, 0); } -template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet4c& a) -{ - const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a)); - int8x8_t max = vpmax_s8(a_dup, a_dup); - max = vpmax_s8(max, max); - return vget_lane_s8(max, 0); -} -template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet8c& a) -{ - int8x8_t max = vpmax_s8(a,a); - max = vpmax_s8(max, max); - max = vpmax_s8(max, max); - return vget_lane_s8(max, 0); -} -template<> EIGEN_STRONG_INLINE int8_t predux_max(const Packet16c& a) -{ - int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a)); - max = vpmax_s8(max, max); - max = vpmax_s8(max, max); - max = vpmax_s8(max, max); - return vget_lane_s8(max, 0); -} -template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet4uc& a) -{ - const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a)); - uint8x8_t max = vpmax_u8(a_dup, a_dup); - max = vpmax_u8(max, max); - return vget_lane_u8(max, 0); -} -template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet8uc& a) -{ - uint8x8_t max = vpmax_u8(a,a); - max = vpmax_u8(max, max); - max = vpmax_u8(max, max); - return vget_lane_u8(max, 0); -} -template<> EIGEN_STRONG_INLINE uint8_t predux_max(const Packet16uc& a) -{ - uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a)); - max = vpmax_u8(max, max); - max = vpmax_u8(max, max); - max = vpmax_u8(max, max); - return vget_lane_u8(max, 0); -} -template<> EIGEN_STRONG_INLINE int16_t predux_max(const Packet4s& a) -{ - const int16x4_t max = vpmax_s16(a,a); - return vget_lane_s16(vpmax_s16(max, max), 0); -} -template<> EIGEN_STRONG_INLINE int16_t predux_max(const Packet8s& a) -{ - int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a)); - max = vpmax_s16(max, max); - max = vpmax_s16(max, max); - return vget_lane_s16(max, 0); -} -template<> EIGEN_STRONG_INLINE uint16_t predux_max(const Packet4us& a) -{ - const uint16x4_t max = vpmax_u16(a,a); - return vget_lane_u16(vpmax_u16(max, max), 0); -} -template<> EIGEN_STRONG_INLINE uint16_t predux_max(const Packet8us& a) -{ - uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a)); - max = vpmax_u16(max, max); - max = vpmax_u16(max, max); - return vget_lane_u16(max, 0); -} -template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet2i& a) -{ return vget_lane_s32(vpmax_s32(a,a), 0); } + template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { - const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a)); - return vget_lane_s32(vpmax_s32(max, max), 0); -} -template<> EIGEN_STRONG_INLINE uint32_t predux_max(const Packet2ui& a) -{ return vget_lane_u32(vpmax_u32(a,a), 0); } -template<> EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) -{ - const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a)); - return vget_lane_u32(vpmax_u32(max, max), 0); -} -template<> EIGEN_STRONG_INLINE int64_t predux_max(const Packet2l& a) -{ return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1)); } -template<> EIGEN_STRONG_INLINE uint64_t predux_max(const Packet2ul& a) -{ return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1)); } + int32x2_t a_lo, a_hi, max; -template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) -{ - uint32x2_t tmp = vorr_u32(vget_low_u32( vreinterpretq_u32_f32(x)), - vget_high_u32(vreinterpretq_u32_f32(x))); - return vget_lane_u32(vpmax_u32(tmp, tmp), 0); + a_lo = vget_low_s32(a); + a_hi = vget_high_s32(a); + max = vpmax_s32(a_lo, a_hi); + max = vpmax_s32(max, max); + + return vget_lane_s32(max, 0); } -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const float32x2x2_t z = vzip_f32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); - const float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); +// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, +// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 +#define PALIGN_NEON(Offset,Type,Command) \ +template<>\ +struct palign_impl\ +{\ + EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ + {\ + if (Offset!=0)\ + first = Command(first, second, Offset);\ + }\ +};\ + +PALIGN_NEON(0,Packet4f,vextq_f32) +PALIGN_NEON(1,Packet4f,vextq_f32) +PALIGN_NEON(2,Packet4f,vextq_f32) +PALIGN_NEON(3,Packet4f,vextq_f32) +PALIGN_NEON(0,Packet4i,vextq_s32) +PALIGN_NEON(1,Packet4i,vextq_s32) +PALIGN_NEON(2,Packet4i,vextq_s32) +PALIGN_NEON(3,Packet4i,vextq_s32) + +#undef PALIGN_NEON + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); + float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1])); kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); } -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1)); - const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1)); - - const int8x8x2_t zip8 = vzip_s8(a,b); - const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1])); - - kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0); - kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1); - kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0); - kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1); -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - int8x8x2_t zip8[4]; - uint16x4x2_t zip16[4]; - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - zip8[i] = vzip_s8(kernel.packet[i*2], kernel.packet[i*2+1]); - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzip_u16(vreinterpret_u16_s8(zip8[i*2].val[j]), vreinterpret_u16_s8(zip8[i*2+1].val[j])); - } - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - const uint32x2x2_t z = vzip_u32(vreinterpret_u32_u16(zip16[i].val[j]), vreinterpret_u32_u16(zip16[i+2].val[j])); - EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - kernel.packet[i*4+j*2+k] = vreinterpret_s8_u32(z.val[k]); - } - } -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - int8x16x2_t zip8[8]; - uint16x8x2_t zip16[8]; - uint32x4x2_t zip32[8]; - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 8; i++) - zip8[i] = vzipq_s8(kernel.packet[i*2], kernel.packet[i*2+1]); - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - zip16[i*2+j] = vzipq_u16(vreinterpretq_u16_s8(zip8[i*2].val[j]), - vreinterpretq_u16_s8(zip8[i*2+1].val[j])); - } - } - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - zip32[i*4+j*2+k] = vzipq_u32(vreinterpretq_u32_u16(zip16[i*4+j].val[k]), - vreinterpretq_u32_u16(zip16[i*4+j+2].val[k])); - } - } - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - kernel.packet[i*4+j*2] = vreinterpretq_s8_u32(vcombine_u32(vget_low_u32(zip32[i].val[j]), - vget_low_u32(zip32[i+4].val[j]))); - kernel.packet[i*4+j*2+1] = vreinterpretq_s8_u32(vcombine_u32(vget_high_u32(zip32[i].val[j]), - vget_high_u32(zip32[i+4].val[j]))); - } - } -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1)); - const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1)); - - const uint8x8x2_t zip8 = vzip_u8(a,b); - const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1])); - - kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0); - kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1); - kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0); - kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1); -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - uint8x8x2_t zip8[4]; - uint16x4x2_t zip16[4]; - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - zip8[i] = vzip_u8(kernel.packet[i*2], kernel.packet[i*2+1]); - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzip_u16(vreinterpret_u16_u8(zip8[i*2].val[j]), vreinterpret_u16_u8(zip8[i*2+1].val[j])); - } - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - const uint32x2x2_t z = vzip_u32(vreinterpret_u32_u16(zip16[i].val[j]), vreinterpret_u32_u16(zip16[i+2].val[j])); - EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - kernel.packet[i*4+j*2+k] = vreinterpret_u8_u32(z.val[k]); - } - } -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - uint8x16x2_t zip8[8]; - uint16x8x2_t zip16[8]; - uint32x4x2_t zip32[8]; - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 8; i++) - zip8[i] = vzipq_u8(kernel.packet[i*2], kernel.packet[i*2+1]); - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzipq_u16(vreinterpretq_u16_u8(zip8[i*2].val[j]), - vreinterpretq_u16_u8(zip8[i*2+1].val[j])); - } - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - zip32[i*4+j*2+k] = vzipq_u32(vreinterpretq_u32_u16(zip16[i*4+j].val[k]), - vreinterpretq_u32_u16(zip16[i*4+j+2].val[k])); - } - } - - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - kernel.packet[i*4+j*2] = vreinterpretq_u8_u32(vcombine_u32(vget_low_u32(zip32[i].val[j]), - vget_low_u32(zip32[i+4].val[j]))); - kernel.packet[i*4+j*2+1] = vreinterpretq_u8_u32(vcombine_u32(vget_high_u32(zip32[i].val[j]), - vget_high_u32(zip32[i+4].val[j]))); - } - } -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const int16x4x2_t zip16_1 = vzip_s16(kernel.packet[0], kernel.packet[1]); - const int16x4x2_t zip16_2 = vzip_s16(kernel.packet[2], kernel.packet[3]); - - const uint32x2x2_t zip32_1 = vzip_u32(vreinterpret_u32_s16(zip16_1.val[0]), vreinterpret_u32_s16(zip16_2.val[0])); - const uint32x2x2_t zip32_2 = vzip_u32(vreinterpret_u32_s16(zip16_1.val[1]), vreinterpret_u32_s16(zip16_2.val[1])); - - kernel.packet[0] = vreinterpret_s16_u32(zip32_1.val[0]); - kernel.packet[1] = vreinterpret_s16_u32(zip32_1.val[1]); - kernel.packet[2] = vreinterpret_s16_u32(zip32_2.val[0]); - kernel.packet[3] = vreinterpret_s16_u32(zip32_2.val[1]); -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]); - const int16x8x2_t zip16_2 = vzipq_s16(kernel.packet[2], kernel.packet[3]); - const int16x8x2_t zip16_3 = vzipq_s16(kernel.packet[4], kernel.packet[5]); - const int16x8x2_t zip16_4 = vzipq_s16(kernel.packet[6], kernel.packet[7]); - - const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[0]), vreinterpretq_u32_s16(zip16_2.val[0])); - const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[1]), vreinterpretq_u32_s16(zip16_2.val[1])); - const uint32x4x2_t zip32_3 = vzipq_u32(vreinterpretq_u32_s16(zip16_3.val[0]), vreinterpretq_u32_s16(zip16_4.val[0])); - const uint32x4x2_t zip32_4 = vzipq_u32(vreinterpretq_u32_s16(zip16_3.val[1]), vreinterpretq_u32_s16(zip16_4.val[1])); - - kernel.packet[0] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_1.val[0]), vget_low_u32(zip32_3.val[0]))); - kernel.packet[1] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_1.val[0]), vget_high_u32(zip32_3.val[0]))); - kernel.packet[2] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_1.val[1]), vget_low_u32(zip32_3.val[1]))); - kernel.packet[3] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_1.val[1]), vget_high_u32(zip32_3.val[1]))); - kernel.packet[4] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_2.val[0]), vget_low_u32(zip32_4.val[0]))); - kernel.packet[5] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_2.val[0]), vget_high_u32(zip32_4.val[0]))); - kernel.packet[6] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1]))); - kernel.packet[7] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1]))); -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const uint16x4x2_t zip16_1 = vzip_u16(kernel.packet[0], kernel.packet[1]); - const uint16x4x2_t zip16_2 = vzip_u16(kernel.packet[2], kernel.packet[3]); - - const uint32x2x2_t zip32_1 = vzip_u32(vreinterpret_u32_u16(zip16_1.val[0]), vreinterpret_u32_u16(zip16_2.val[0])); - const uint32x2x2_t zip32_2 = vzip_u32(vreinterpret_u32_u16(zip16_1.val[1]), vreinterpret_u32_u16(zip16_2.val[1])); - - kernel.packet[0] = vreinterpret_u16_u32(zip32_1.val[0]); - kernel.packet[1] = vreinterpret_u16_u32(zip32_1.val[1]); - kernel.packet[2] = vreinterpret_u16_u32(zip32_2.val[0]); - kernel.packet[3] = vreinterpret_u16_u32(zip32_2.val[1]); -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const uint16x8x2_t zip16_1 = vzipq_u16(kernel.packet[0], kernel.packet[1]); - const uint16x8x2_t zip16_2 = vzipq_u16(kernel.packet[2], kernel.packet[3]); - const uint16x8x2_t zip16_3 = vzipq_u16(kernel.packet[4], kernel.packet[5]); - const uint16x8x2_t zip16_4 = vzipq_u16(kernel.packet[6], kernel.packet[7]); - - const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_u16(zip16_1.val[0]), vreinterpretq_u32_u16(zip16_2.val[0])); - const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_u16(zip16_1.val[1]), vreinterpretq_u32_u16(zip16_2.val[1])); - const uint32x4x2_t zip32_3 = vzipq_u32(vreinterpretq_u32_u16(zip16_3.val[0]), vreinterpretq_u32_u16(zip16_4.val[0])); - const uint32x4x2_t zip32_4 = vzipq_u32(vreinterpretq_u32_u16(zip16_3.val[1]), vreinterpretq_u32_u16(zip16_4.val[1])); - - kernel.packet[0] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_1.val[0]), vget_low_u32(zip32_3.val[0]))); - kernel.packet[1] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_1.val[0]), vget_high_u32(zip32_3.val[0]))); - kernel.packet[2] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_1.val[1]), vget_low_u32(zip32_3.val[1]))); - kernel.packet[3] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_1.val[1]), vget_high_u32(zip32_3.val[1]))); - kernel.packet[4] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_2.val[0]), vget_low_u32(zip32_4.val[0]))); - kernel.packet[5] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_2.val[0]), vget_high_u32(zip32_4.val[0]))); - kernel.packet[6] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1]))); - kernel.packet[7] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1]))); -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const int32x2x2_t z = vzip_s32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); - const int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); + int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); } -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const uint32x2x2_t z = vzip_u32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; -} -EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) -{ - const uint32x4x2_t tmp1 = vzipq_u32(kernel.packet[0], kernel.packet[1]); - const uint32x4x2_t tmp2 = vzipq_u32(kernel.packet[2], kernel.packet[3]); - - kernel.packet[0] = vcombine_u32(vget_low_u32(tmp1.val[0]), vget_low_u32(tmp2.val[0])); - kernel.packet[1] = vcombine_u32(vget_high_u32(tmp1.val[0]), vget_high_u32(tmp2.val[0])); - kernel.packet[2] = vcombine_u32(vget_low_u32(tmp1.val[1]), vget_low_u32(tmp2.val[1])); - kernel.packet[3] = vcombine_u32(vget_high_u32(tmp1.val[1]), vget_high_u32(tmp2.val[1])); -} -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) -{ -#if EIGEN_ARCH_ARM64 - const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]); - const int64x2_t tmp2 = vzip2q_s64(kernel.packet[0], kernel.packet[1]); - - kernel.packet[0] = tmp1; - kernel.packet[1] = tmp2; -#else - const int64x1_t tmp[2][2] = { - { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) }, - { vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1]) } - }; - - kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]); - kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]); -#endif -} -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) -{ -#if EIGEN_ARCH_ARM64 - const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]); - const uint64x2_t tmp2 = vzip2q_u64(kernel.packet[0], kernel.packet[1]); - - kernel.packet[0] = tmp1; - kernel.packet[1] = tmp2; -#else - const uint64x1_t tmp[2][2] = { - { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) }, - { vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1]) } - }; - - kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]); - kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]); -#endif -} - -template<> EIGEN_DEVICE_FUNC inline Packet2f pselect( const Packet2f& mask, const Packet2f& a, const Packet2f& b) -{ return vbsl_f32(vreinterpret_u32_f32(mask), a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) -{ return vbslq_f32(vreinterpretq_u32_f32(mask), a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b) -{ return vbsl_s8(vreinterpret_u8_s8(mask), a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) -{ return vbslq_s8(vreinterpretq_u8_s8(mask), a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b) -{ return vbsl_u8(mask, a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a, const Packet16uc& b) -{ return vbslq_u8(mask, a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b) -{ return vbsl_s16(vreinterpret_u16_s16(mask), a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) -{ return vbslq_s16(vreinterpretq_u16_s16(mask), a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b) -{ return vbsl_u16(mask, a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) -{ return vbslq_u16(mask, a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b) -{ return vbsl_s32(vreinterpret_u32_s32(mask), a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) -{ return vbslq_s32(vreinterpretq_u32_s32(mask), a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b) -{ return vbsl_u32(mask, a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) -{ return vbslq_u32(mask, a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) -{ return vbslq_s64(vreinterpretq_u64_s64(mask), a, b); } -template<> EIGEN_DEVICE_FUNC inline Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) -{ return vbslq_u64(mask, a, b); } - -/** - * Computes the integer square root - * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result - * and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument - * value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf . - */ -template<> EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) { - uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a)); - uint8x8_t res = vdup_n_u8(0); - uint8x8_t add = vdup_n_u8(0x8); - for (int i = 0; i < 4; i++) - { - const uint8x8_t temp = vorr_u8(res, add); - res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res); - add = vshr_n_u8(add, 1); - } - return vget_lane_u32(vreinterpret_u32_u8(res), 0); -} -/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) -template<> EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) { - uint8x8_t res = vdup_n_u8(0); - uint8x8_t add = vdup_n_u8(0x8); - for (int i = 0; i < 4; i++) - { - const uint8x8_t temp = vorr_u8(res, add); - res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res); - add = vshr_n_u8(add, 1); - } - return res; -} -/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) -template<> EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) { - uint8x16_t res = vdupq_n_u8(0); - uint8x16_t add = vdupq_n_u8(0x8); - for (int i = 0; i < 4; i++) - { - const uint8x16_t temp = vorrq_u8(res, add); - res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res); - add = vshrq_n_u8(add, 1); - } - return res; -} -/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) -template<> EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) { - uint16x4_t res = vdup_n_u16(0); - uint16x4_t add = vdup_n_u16(0x80); - for (int i = 0; i < 8; i++) - { - const uint16x4_t temp = vorr_u16(res, add); - res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res); - add = vshr_n_u16(add, 1); - } - return res; -} -/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) -template<> EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) { - uint16x8_t res = vdupq_n_u16(0); - uint16x8_t add = vdupq_n_u16(0x80); - for (int i = 0; i < 8; i++) - { - const uint16x8_t temp = vorrq_u16(res, add); - res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res); - add = vshrq_n_u16(add, 1); - } - return res; -} -/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) -template<> EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) { - uint32x2_t res = vdup_n_u32(0); - uint32x2_t add = vdup_n_u32(0x8000); - for (int i = 0; i < 16; i++) - { - const uint32x2_t temp = vorr_u32(res, add); - res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res); - add = vshr_n_u32(add, 1); - } - return res; -} -/// @copydoc Eigen::internal::psqrt(const Packet4uc& a) -template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) { - uint32x4_t res = vdupq_n_u32(0); - uint32x4_t add = vdupq_n_u32(0x8000); - for (int i = 0; i < 16; i++) - { - const uint32x4_t temp = vorrq_u32(res, add); - res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res); - add = vshrq_n_u32(add, 1); - } - return res; -} //---------- double ---------- @@ -3184,9 +571,17 @@ template<> EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) { // Defining these functions as templates ensures that if these intrinsics are // already defined in arm_neon.h, then our workaround doesn't cause a conflict // and has lower priority in overload resolution. -template uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; } +template +uint64x2_t vreinterpretq_u64_f64(T a) +{ + return (uint64x2_t) a; +} -template float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; } +template +float64x2_t vreinterpretq_f64_u64(T a) +{ + return (float64x2_t) a; +} typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; @@ -3195,67 +590,32 @@ template<> struct packet_traits : default_packet_traits { typedef Packet2d type; typedef Packet2d half; - enum - { + enum { Vectorizable = 1, AlignedOnScalar = 1, size = 2, - HasHalfPacket = 0, - - HasCast = 1, - HasCmp = 1, - HasAdd = 1, - HasSub = 1, - HasShift = 1, - HasMul = 1, - HasNegate = 1, - HasAbs = 1, - HasArg = 0, - HasAbs2 = 1, - HasAbsDiff = 1, - HasMin = 1, - HasMax = 1, - HasConj = 1, - HasSetLinear = 0, - HasBlend = 0, - HasInsert = 1, - - HasDiv = 1, - HasFloor = 1, - + HasHalfPacket=0, + + HasDiv = 1, + // FIXME check the Has* HasSin = 0, HasCos = 0, HasLog = 0, HasExp = 0, - HasSqrt = 0, - HasTanh = 0, - HasErf = 0 + HasSqrt = 0 }; }; -template<> struct unpacket_traits -{ - typedef double type; - typedef Packet2d half; - typedef Packet2l integer_packet; - enum - { - size = 2, - alignment = Aligned16, - vectorizable = true, - masked_load_available = false, - masked_store_available = false - }; -}; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return vdupq_n_f64(from); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { - const double c[] = {0.0,1.0}; - return vaddq_f64(pset1(a), vld1q_f64(c)); + const double countdown_raw[] = {0.0,1.0}; + const Packet2d countdown = vld1q_f64(countdown_raw); + return vaddq_f64(pset1(a), countdown); } - template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return vaddq_f64(a,b); } template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return vsubq_f64(a,b); } @@ -3270,130 +630,128 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const #ifdef __ARM_FEATURE_FMA // See bug 936. See above comment about FMA for float. -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) -{ return vfmaq_f64(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vfmaq_f64(c,a,b); } #else -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) -{ return vmlaq_f64(c,a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vmlaq_f64(c,a,b); } #endif template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vminq_f64(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vmaxq_f64(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) -{ - const Packet2d cst_1 = pset1(1.0); - /* perform a floorf */ - const Packet2d tmp = vcvtq_f64_s64(vcvtq_s64_f64(a)); - - /* if greater, substract 1 */ - uint64x2_t mask = vcgtq_f64(tmp, a); - mask = vandq_u64(mask, vreinterpretq_u64_f64(cst_1)); - return vsubq_f64(tmp, vreinterpretq_f64_u64(mask)); -} - // Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) -{ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } +{ + return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) -{ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } +{ + return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) -{ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } +{ + return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) -{ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); } +{ + return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a),vreinterpretq_u64_f64(b))); +} -template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) -{ return vreinterpretq_f64_u64(vcleq_f64(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) -{ return vreinterpretq_f64_u64(vcltq_f64(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) -{ return vreinterpretq_f64_u64(vceqq_f64(a,b)); } +template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) +{ + return vld1q_dup_f64(from); +} +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); } -template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) -{ EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } - -template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) -{ EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from); } - -template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { return vld1q_dup_f64(from); } -template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) -{ EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to,from); } - -template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) -{ EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to,from); } +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from); } template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { Packet2d res = pset1(0.0); - res = vld1q_lane_f64(from + 0*stride, res, 0); - res = vld1q_lane_f64(from + 1*stride, res, 1); + res = vsetq_lane_f64(from[0*stride], res, 0); + res = vsetq_lane_f64(from[1*stride], res, 1); return res; } - template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - vst1q_lane_f64(to + stride*0, from, 0); - vst1q_lane_f64(to + stride*1, from, 1); + to[stride*0] = vgetq_lane_f64(from, 0); + to[stride*1] = vgetq_lane_f64(from, 1); } - template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ARM_PREFETCH(addr); } // FIXME only store the 2 first elements ? -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return vgetq_lane_f64(a,0); } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return vgetq_lane_f64(a, 0); } -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) -{ return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) { return vcombine_f64(vget_high_f64(a), vget_low_f64(a)); } template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vabsq_f64(a); } #if EIGEN_COMP_CLANG && defined(__apple_build_version__) // workaround ICE, see bug 907 -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) -{ return (vget_low_f64(a) + vget_high_f64(a))[0]; } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return (vget_low_f64(a) + vget_high_f64(a))[0]; } #else -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) -{ return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); } #endif +template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) +{ + float64x2_t trn1, trn2; + + // NEON zip performs interleaving of the supplied vectors. + // We perform two interleaves in a row to acquire the transposed vector + trn1 = vzip1q_f64(vecs[0], vecs[1]); + trn2 = vzip2q_f64(vecs[0], vecs[1]); + + // Do the addition of the resulting vectors + return vaddq_f64(trn1, trn2); +} // Other reduction functions: // mul #if EIGEN_COMP_CLANG && defined(__apple_build_version__) -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) -{ return (vget_low_f64(a) * vget_high_f64(a))[0]; } +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } #else -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) -{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } #endif // min -template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) -{ return vgetq_lane_f64(vpminq_f64(a,a), 0); } +template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) { return vgetq_lane_f64(vpminq_f64(a, a), 0); } // max -template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) -{ return vgetq_lane_f64(vpmaxq_f64(a,a), 0); } +template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) { return vgetq_lane_f64(vpmaxq_f64(a, a), 0); } +// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors, +// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074 +#define PALIGN_NEON(Offset,Type,Command) \ +template<>\ +struct palign_impl\ +{\ + EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\ + {\ + if (Offset!=0)\ + first = Command(first, second, Offset);\ + }\ +};\ + +PALIGN_NEON(0,Packet2d,vextq_f64) +PALIGN_NEON(1,Packet2d,vextq_f64) +#undef PALIGN_NEON EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) -{ - const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); - const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); +ptranspose(PacketBlock& kernel) { + float64x2_t trn1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]); + float64x2_t trn2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = tmp1; - kernel.packet[1] = tmp2; + kernel.packet[0] = trn1; + kernel.packet[1] = trn2; } - -template<> EIGEN_DEVICE_FUNC inline Packet2d pselect( const Packet2d& mask, const Packet2d& a, const Packet2d& b) -{ return vbslq_f64(vreinterpretq_u64_f64(mask), a, b); } - -#endif // EIGEN_ARCH_ARM64 +#endif // EIGEN_ARCH_ARM64 } // end namespace internal diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/TypeCasting.h deleted file mode 100644 index 68d24dc5c..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/NEON/TypeCasting.h +++ /dev/null @@ -1,278 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2018 Rasmus Munk Larsen -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_TYPE_CASTING_NEON_H -#define EIGEN_TYPE_CASTING_NEON_H - -namespace Eigen { - -namespace internal { - -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; -template<> struct type_casting_traits -{ enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; }; - -template<> EIGEN_STRONG_INLINE Packet2f pcast(const Packet2i& a) { return vcvt_f32_s32(a); } -template<> EIGEN_STRONG_INLINE Packet2f pcast(const Packet2ui& a) { return vcvt_f32_u32(a); } -template<> EIGEN_STRONG_INLINE Packet2f pcast(const Packet2l& a) -{ return vcvt_f32_s32(vmovn_s64(a)); } -template<> EIGEN_STRONG_INLINE Packet2f pcast(const Packet2ul& a) -{ return vcvt_f32_u32(vmovn_u64(a)); } -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4c& a) -{ return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a)))))); } -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4uc& a) -{ return vcvtq_f32_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))))))); } -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4s& a) -{ return vcvtq_f32_s32(vmovl_s16(a)); } -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4us& a) -{ return vcvtq_f32_s32(vreinterpretq_s32_u32(vmovl_u16(a))); } -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { return vcvtq_f32_s32(a); } -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4ui& a) { return vcvtq_f32_u32(a); } -template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4f& a) -{ - const int16x4_t b = vmovn_s32(vcvtq_s32_f32(a)); - return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0); -} -template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4uc& a) -{ return static_cast(a); } -template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4s& a) -{ return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(a, a))), 0); } -template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4us& a) -{ - const int16x4_t b = vreinterpret_s16_u16(a); - return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0); -} -template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4i& a) -{ - const int16x4_t b = vmovn_s32(a); - return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0); -} -template<> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4ui& a) -{ - const int16x4_t b = vmovn_s32(vreinterpretq_s32_u32(a)); - return vget_lane_s32(vreinterpret_s32_s8(vmovn_s16(vcombine_s16(b, b))), 0); -} -template<> EIGEN_STRONG_INLINE Packet8c pcast(const Packet8uc& a) { return vreinterpret_s8_u8(a); } -template<> EIGEN_STRONG_INLINE Packet8c pcast(const Packet8s& a) { return vmovn_s16(a); } -template<> EIGEN_STRONG_INLINE Packet8c pcast(const Packet8us& a) -{ return vreinterpret_s8_u8(vmovn_u16(a)); } -template<> EIGEN_STRONG_INLINE Packet16c pcast(const Packet16uc& a) -{ return vreinterpretq_s8_u8(a); } -template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4f& a) -{ - const uint16x4_t b = vmovn_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(a))); - return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0); -} -template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4i& a) -{ - const uint16x4_t b = vmovn_u32(vreinterpretq_u32_s32(a)); - return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0); -} -template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4ui& a) -{ - const uint16x4_t b = vmovn_u32(a); - return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0); -} -template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4c& a) -{ return static_cast(a); } -template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4s& a) -{ - const uint16x4_t b = vreinterpret_u16_s16(a); - return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(b, b))), 0); -} -template<> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4us& a) -{ return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(a, a))), 0); } -template<> EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8c& a) { return vreinterpret_u8_s8(a); } -template<> EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8s& a) -{ return vreinterpret_u8_s8(vmovn_s16(a)); } -template<> EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8us& a) { return vmovn_u16(a); } -template<> EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16c& a) -{ return vreinterpretq_u8_s8(a); } -template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4f& a) -{ return vmovn_s32(vcvtq_s32_f32(a)); } -template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4c& a) -{ return vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a)))); } -template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4uc& a) -{ return vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))))); } -template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4us& a) -{ return vreinterpret_s16_u16(a); } -template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4i& a) { return vmovn_s32(a); } -template<> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4ui& a) -{ return vmovn_s32(vreinterpretq_s32_u32(a)); } -template<> EIGEN_STRONG_INLINE Packet8s pcast(const Packet8uc& a) -{ return vreinterpretq_s16_u16(vmovl_u8(a)); } -template<> EIGEN_STRONG_INLINE Packet8s pcast(const Packet8c& a) { return vmovl_s8(a); } -template<> EIGEN_STRONG_INLINE Packet8s pcast(const Packet8us& a) -{ return vreinterpretq_s16_u16(a); } -template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4f& a) -{ return vmovn_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(a))); } -template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4c& a) -{ return vget_low_u16(vreinterpretq_u16_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a))))); } -template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4uc& a) -{ return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a)))); } -template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4s& a) -{ return vreinterpret_u16_s16(a); } -template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4i& a) -{ return vmovn_u32(vreinterpretq_u32_s32(a)); } -template<> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4ui& a) { return vmovn_u32(a); } -template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8c& a) -{ return vreinterpretq_u16_s16(vmovl_s8(a)); } -template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8uc& a) { return vmovl_u8(a); } -template<> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8s& a) -{ return vreinterpretq_u16_s16(a); } -template<> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2f& a) { return vcvt_s32_f32(a); } -template<> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ui& a) -{ return vreinterpret_s32_u32(a); } -template<> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2l& a) -{ return vmovn_s64(a); } -template<> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ul& a) -{ return vmovn_s64(vreinterpretq_s64_u64(a)); } -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { return vcvtq_s32_f32(a); } -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4c& a) -{ return vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a))))); } -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4uc& a) -{ return vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a)))))); } -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4s& a) { return vmovl_s16(a); } -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4us& a) -{ return vreinterpretq_s32_u32(vmovl_u16(a)); } -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4ui& a) -{ return vreinterpretq_s32_u32(a); } -template<> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2f& a) { return vcvt_u32_f32(a); } -template<> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2i& a) -{ return vreinterpret_u32_s32(a); } -template<> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2l& a) -{ return vmovn_u64(vreinterpretq_u64_s64(a)); } -template<> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2ul& a) -{ return vmovn_u64(a); } -template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4f& a) { return vcvtq_u32_f32(a); } -template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4c& a) -{ return vreinterpretq_u32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a)))))); } -template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4uc& a) -{ return vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))))); } -template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4s& a) -{ return vreinterpretq_u32_s32(vmovl_s16(a)); } -template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4us& a) { return vmovl_u16(a); } -template<> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4i& a) -{ return vreinterpretq_u32_s32(a); } -template<> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2f& a) -{ return vmovl_s32(vcvt_s32_f32(a)); } -template<> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2i& a) -{ return vmovl_s32(a); } -template<> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ui& a) -{ return vreinterpretq_s64_u64(vmovl_u32(a)); } -template<> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ul& a) -{ return vreinterpretq_s64_u64(a); } -template<> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2f& a) -{ return vmovl_u32(vcvt_u32_f32(a)); } -template<> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2i& a) -{ return vreinterpretq_u64_s64(vmovl_s32(a)); } -template<> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2ui& a) -{ return vmovl_u32(a); } -template<> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2l& a) -{ return vreinterpretq_u64_s64(a); } - -template<> EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) -{ return vreinterpret_f32_s32(a); } -template<> EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) -{ return vreinterpret_f32_u32(a); } -template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) -{ return vreinterpretq_f32_s32(a); } -template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) -{ return vreinterpretq_f32_u32(a); } -template<> EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) -{ return static_cast(a); } -template<> EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) -{ return vreinterpret_s8_u8(a); } -template<> EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) -{ return vreinterpretq_s8_u8(a); } -template<> EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) -{ return static_cast(a); } -template<> EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) -{ return vreinterpret_u8_s8(a); } -template<> EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) -{ return vreinterpretq_u8_s8(a); } -template<> EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) -{ return vreinterpret_s16_u16(a); } -template<> EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) -{ return vreinterpretq_s16_u16(a); } -template<> EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) -{ return vreinterpret_u16_s16(a); } -template<> EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) -{ return vreinterpretq_u16_s16(a); } -template<> EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) -{ return vreinterpret_s32_f32(a); } -template<> EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) -{ return vreinterpret_s32_u32(a); } -template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) -{ return vreinterpretq_s32_f32(a); } -template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) -{ return vreinterpretq_s32_u32(a); } -template<> EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) -{ return vreinterpret_u32_f32(a); } -template<> EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) -{ return vreinterpret_u32_s32(a); } -template<> EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) -{ return vreinterpretq_u32_f32(a); } -template<> EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) -{ return vreinterpretq_u32_s32(a); } -template<> EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) -{ return vreinterpretq_s64_u64(a); } -template<> EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) -{ return vreinterpretq_u64_s64(a); } - -#if EIGEN_ARCH_ARM64 - -template<> EIGEN_STRONG_INLINE Packet2f pcast(const Packet2d& a) { return vcvt_f32_f64(a); } -template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet2f& a) { return vcvt_f64_f32(a); } -template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet2i& a) { return vcvtq_f64_s64(vmovl_s32(a)); } -template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ui& a) { return vcvtq_f64_u64(vmovl_u32(a)); } -template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet2l& a) { return vcvtq_f64_s64(a); } -template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ul& a) { return vcvtq_f64_u64(a); } -template<> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2d& a) { return vcvt_s32_f32(vcvt_f32_f64(a)); } -template<> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2d& a) { return vcvt_u32_f32(vcvt_f32_f64(a)); } -template<> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2d& a) { return vcvtq_s64_f64(a); } -template<> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2d& a) { return vcvtq_u64_f64(a); } - -template<> EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) -{ return vreinterpretq_f64_s64(a); } -template<> EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) -{ return vreinterpretq_f64_u64(a); } -template<> EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) -{ return vreinterpretq_s64_f64(a); } -template<> EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) -{ return vreinterpretq_u64_f64(a); } - -#endif // EIGEN_ARCH_ARM64 - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TYPE_CASTING_NEON_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/Complex.h index 8bf8bfe85..d075043ce 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/Complex.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/Complex.h @@ -45,13 +45,12 @@ template<> struct packet_traits > : default_packet_traits HasMin = 0, HasMax = 0, HasSetLinear = 0, - HasBlend = 1, - HasInsert = 1 + HasBlend = 1 }; }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } @@ -83,13 +82,10 @@ template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, con #endif } -template<> EIGEN_STRONG_INLINE Packet2cf ptrue (const Packet2cf& a) { return Packet2cf(ptrue(Packet4f(a.v))); } -template<> EIGEN_STRONG_INLINE Packet2cf pnot (const Packet2cf& a) { return Packet2cf(pnot(Packet4f(a.v))); } - template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(b.v,a.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(&numext::real_ref(*from))); } template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu(&numext::real_ref(*from))); } @@ -156,11 +152,29 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Packe return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v)))); } +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v))); +} + template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) { return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); } +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset==1) + { + first.v = _mm_movehl_ps(first.v, first.v); + first.v = _mm_movelh_ps(first.v, second.v); + } + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const @@ -266,7 +280,7 @@ template<> struct packet_traits > : default_packet_traits }; #endif -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); } @@ -291,12 +305,10 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con #endif } -template<> EIGEN_STRONG_INLINE Packet1cd ptrue (const Packet1cd& a) { return Packet1cd(ptrue(Packet2d(a.v))); } -template<> EIGEN_STRONG_INLINE Packet1cd pnot (const Packet1cd& a) { return Packet1cd(pnot(Packet2d(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(b.v,a.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); } // FIXME force unaligned load, this is a temporary fix template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) @@ -328,11 +340,26 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack return pfirst(a); } +template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) +{ + return vecs[0]; +} + template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const @@ -412,23 +439,31 @@ ptranspose(PacketBlock& kernel) { kernel.packet[1].v = tmp; } -template<> EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) -{ - __m128 eq = _mm_cmpeq_ps(a.v, b.v); - return Packet2cf(pand(eq, vec4f_swizzle1(eq, 1, 0, 3, 2))); -} - -template<> EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) -{ - __m128d eq = _mm_cmpeq_pd(a.v, b.v); - return Packet1cd(pand(eq, vec2d_swizzle1(eq, 1, 0))); -} - template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { __m128d result = pblend(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v)); return Packet2cf(_mm_castpd_ps(result)); } +template<> EIGEN_STRONG_INLINE Packet2cf pinsertfirst(const Packet2cf& a, std::complex b) +{ + return Packet2cf(_mm_loadl_pi(a.v, reinterpret_cast(&b))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pinsertfirst(const Packet1cd&, std::complex b) +{ + return pset1(b); +} + +template<> EIGEN_STRONG_INLINE Packet2cf pinsertlast(const Packet2cf& a, std::complex b) +{ + return Packet2cf(_mm_loadh_pi(a.v, reinterpret_cast(&b))); +} + +template<> EIGEN_STRONG_INLINE Packet1cd pinsertlast(const Packet1cd&, std::complex b) +{ + return pset1(b); +} + } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/MathFunctions.h index 92c1eecc7..7b5f948e1 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/MathFunctions.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/MathFunctions.h @@ -8,7 +8,7 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -/* The sin and cos and functions of this file come from +/* The sin, cos, exp, and log functions of this file come from * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/ */ @@ -20,42 +20,426 @@ namespace Eigen { namespace internal { template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog(const Packet4f& _x) { - return plog_float(_x); -} +Packet4f plog(const Packet4f& _x) +{ + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f plog1p(const Packet4f& _x) { - return generic_plog1p(_x); -} + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexpm1(const Packet4f& _x) { - return generic_expm1(_x); + /* the smallest non denormalized float number */ + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);//-1.f/0.f); + + /* natural logarithm computed for 4 simultaneous float + return NaN for x <= 0 + */ + _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); + + + Packet4i emm0; + + Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN + Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps()); + + x = pmax(x, p4f_min_norm_pos); /* cut off denormalized stuff */ + emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); + + /* keep only the fractional part */ + x = _mm_and_ps(x, p4f_inv_mant_mask); + x = _mm_or_ps(x, p4f_half); + + emm0 = _mm_sub_epi32(emm0, p4i_0x7f); + Packet4f e = padd(Packet4f(_mm_cvtepi32_ps(emm0)), p4f_1); + + /* part2: + if( x < SQRTHF ) { + e -= 1; + x = x + x - 1.0; + } else { x = x - 1.0; } + */ + Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF); + Packet4f tmp = pand(x, mask); + x = psub(x, p4f_1); + e = psub(e, pand(p4f_1, mask)); + x = padd(x, tmp); + + Packet4f x2 = pmul(x,x); + Packet4f x3 = pmul(x2,x); + + Packet4f y, y1, y2; + y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1); + y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4); + y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7); + y = pmadd(y , x, p4f_cephes_log_p2); + y1 = pmadd(y1, x, p4f_cephes_log_p5); + y2 = pmadd(y2, x, p4f_cephes_log_p8); + y = pmadd(y, x3, y1); + y = pmadd(y, x3, y2); + y = pmul(y, x3); + + y1 = pmul(e, p4f_cephes_log_q1); + tmp = pmul(x2, p4f_half); + y = padd(y, y1); + x = psub(x, tmp); + y2 = pmul(e, p4f_cephes_log_q2); + x = padd(x, y); + x = padd(x, y2); + // negative arg will be NAN, 0 will be -INF + return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)), + _mm_and_ps(iszero_mask, p4f_minus_inf)); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pexp(const Packet4f& _x) { - return pexp_float(_x); + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); + + + _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); + _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); + + _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); + + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); + + Packet4f tmp, fx; + Packet4i emm0; + + // clamp x + x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); + + /* express exp(x) as exp(g + n*log(2)) */ + fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); + +#ifdef EIGEN_VECTORIZE_SSE4_1 + fx = _mm_floor_ps(fx); +#else + emm0 = _mm_cvttps_epi32(fx); + tmp = _mm_cvtepi32_ps(emm0); + /* if greater, substract 1 */ + Packet4f mask = _mm_cmpgt_ps(tmp, fx); + mask = _mm_and_ps(mask, p4f_1); + fx = psub(tmp, mask); +#endif + + tmp = pmul(fx, p4f_cephes_exp_C1); + Packet4f z = pmul(fx, p4f_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + z = pmul(x,x); + + Packet4f y = p4f_cephes_exp_p0; + y = pmadd(y, x, p4f_cephes_exp_p1); + y = pmadd(y, x, p4f_cephes_exp_p2); + y = pmadd(y, x, p4f_cephes_exp_p3); + y = pmadd(y, x, p4f_cephes_exp_p4); + y = pmadd(y, x, p4f_cephes_exp_p5); + y = pmadd(y, z, x); + y = padd(y, p4f_1); + + // build 2^n + emm0 = _mm_cvttps_epi32(fx); + emm0 = _mm_add_epi32(emm0, p4i_0x7f); + emm0 = _mm_slli_epi32(emm0, 23); + return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x); +} +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +Packet2d pexp(const Packet2d& _x) +{ + Packet2d x = _x; + + _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); + _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); + _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); + + _EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437); + _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0); + + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125); + _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6); + static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); + + Packet2d tmp, fx; + Packet4i emm0; + + // clamp x + x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo); + /* express exp(x) as exp(g + n*log(2)) */ + fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half); + +#ifdef EIGEN_VECTORIZE_SSE4_1 + fx = _mm_floor_pd(fx); +#else + emm0 = _mm_cvttpd_epi32(fx); + tmp = _mm_cvtepi32_pd(emm0); + /* if greater, substract 1 */ + Packet2d mask = _mm_cmpgt_pd(tmp, fx); + mask = _mm_and_pd(mask, p2d_1); + fx = psub(tmp, mask); +#endif + + tmp = pmul(fx, p2d_cephes_exp_C1); + Packet2d z = pmul(fx, p2d_cephes_exp_C2); + x = psub(x, tmp); + x = psub(x, z); + + Packet2d x2 = pmul(x,x); + + Packet2d px = p2d_cephes_exp_p0; + px = pmadd(px, x2, p2d_cephes_exp_p1); + px = pmadd(px, x2, p2d_cephes_exp_p2); + px = pmul (px, x); + + Packet2d qx = p2d_cephes_exp_q0; + qx = pmadd(qx, x2, p2d_cephes_exp_q1); + qx = pmadd(qx, x2, p2d_cephes_exp_q2); + qx = pmadd(qx, x2, p2d_cephes_exp_q3); + + x = pdiv(px,psub(qx,px)); + x = pmadd(p2d_2,x,p2d_1); + + // build 2^n + emm0 = _mm_cvttpd_epi32(fx); + emm0 = _mm_add_epi32(emm0, p4i_1023_0); + emm0 = _mm_slli_epi32(emm0, 20); + emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); + return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d pexp(const Packet2d& x) -{ - return pexp_double(x); -} +/* evaluation of 4 sines at onces, using SSE2 intrinsics. + + The code is the exact rewriting of the cephes sinf function. + Precision is excellent as long as x < 8192 (I did not bother to + take into account the special handling they have for greater values + -- it does not return garbage for arguments over 8192, though, but + the extra precision is missing). + + Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the + surprising but correct result. +*/ template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psin(const Packet4f& _x) { - return psin_float(_x); + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + + _EIGEN_DECLARE_CONST_Packet4i(1, 1); + _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); + _EIGEN_DECLARE_CONST_Packet4i(2, 2); + _EIGEN_DECLARE_CONST_Packet4i(4, 4); + + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000); + + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI + + Packet4f xmm1, xmm2, xmm3, sign_bit, y; + + Packet4i emm0, emm2; + sign_bit = x; + /* take the absolute value */ + x = pabs(x); + + /* take the modulo */ + + /* extract the sign bit (upper one) */ + sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask); + + /* scale by 4/Pi */ + y = pmul(x, p4f_cephes_FOPI); + + /* store the integer part of y in mm0 */ + emm2 = _mm_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, p4i_1); + emm2 = _mm_and_si128(emm2, p4i_not1); + y = _mm_cvtepi32_ps(emm2); + /* get the swap sign flag */ + emm0 = _mm_and_si128(emm2, p4i_4); + emm0 = _mm_slli_epi32(emm0, 29); + /* get the polynom selection mask + there is one polynom for 0 <= x <= Pi/4 + and another one for Pi/4 EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f pcos(const Packet4f& _x) { - return pcos_float(_x); + Packet4f x = _x; + _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); + _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); + + _EIGEN_DECLARE_CONST_Packet4i(1, 1); + _EIGEN_DECLARE_CONST_Packet4i(not1, ~1); + _EIGEN_DECLARE_CONST_Packet4i(2, 2); + _EIGEN_DECLARE_CONST_Packet4i(4, 4); + + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f); + _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736E-3f); + _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948E-005f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f); + _EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827E-002f); + _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI + + Packet4f xmm1, xmm2, xmm3, y; + Packet4i emm0, emm2; + + x = pabs(x); + + /* scale by 4/Pi */ + y = pmul(x, p4f_cephes_FOPI); + + /* get the integer part of y */ + emm2 = _mm_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, p4i_1); + emm2 = _mm_and_si128(emm2, p4i_not1); + y = _mm_cvtepi32_ps(emm2); + + emm2 = _mm_sub_epi32(emm2, p4i_2); + + /* get the swap sign flag */ + emm0 = _mm_andnot_si128(emm2, p4i_4); + emm0 = _mm_slli_epi32(emm0, 29); + /* get the polynom selection mask */ + emm2 = _mm_and_si128(emm2, p4i_2); + emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + + Packet4f sign_bit = _mm_castsi128_ps(emm0); + Packet4f poly_mask = _mm_castsi128_ps(emm2); + + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + xmm1 = pmul(y, p4f_minus_cephes_DP1); + xmm2 = pmul(y, p4f_minus_cephes_DP2); + xmm3 = pmul(y, p4f_minus_cephes_DP3); + x = padd(x, xmm1); + x = padd(x, xmm2); + x = padd(x, xmm3); + + /* Evaluate the first polynom (0 <= x <= Pi/4) */ + y = p4f_coscof_p0; + Packet4f z = pmul(x,x); + + y = pmadd(y,z,p4f_coscof_p1); + y = pmadd(y,z,p4f_coscof_p2); + y = pmul(y, z); + y = pmul(y, z); + Packet4f tmp = _mm_mul_ps(z, p4f_half); + y = psub(y, tmp); + y = padd(y, p4f_1); + + /* Evaluate the second polynom (Pi/4 <= x <= 0) */ + Packet4f y2 = p4f_sincof_p0; + y2 = pmadd(y2, z, p4f_sincof_p1); + y2 = pmadd(y2, z, p4f_sincof_p2); + y2 = pmul(y2, z); + y2 = pmadd(y2, x, x); + + /* select the correct result from the two polynoms */ + y2 = _mm_and_ps(poly_mask, y2); + y = _mm_andnot_ps(poly_mask, y); + y = _mm_or_ps(y,y2); + + /* update the sign */ + return _mm_xor_ps(y, sign_bit); } #if EIGEN_FAST_MATH @@ -98,34 +482,30 @@ Packet2d psqrt(const Packet2d& x) { return _mm_sqrt_pd(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& _x) { + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(nan, 0x7fc00000); _EIGEN_DECLARE_CONST_Packet4f(one_point_five, 1.5f); _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inf, 0x7f800000u); - _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000u); + _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(flt_min, 0x00800000); Packet4f neg_half = pmul(_x, p4f_minus_half); - // Identity infinite, zero, negative and denormal arguments. - Packet4f lt_min_mask = _mm_cmplt_ps(_x, p4f_flt_min); - Packet4f inf_mask = _mm_cmpeq_ps(_x, p4f_inf); - Packet4f not_normal_finite_mask = _mm_or_ps(lt_min_mask, inf_mask); + // select only the inverse sqrt of positive normal inputs (denormals are + // flushed to zero and cause infs as well). + Packet4f le_zero_mask = _mm_cmple_ps(_x, p4f_flt_min); + Packet4f x = _mm_andnot_ps(le_zero_mask, _mm_rsqrt_ps(_x)); - // Compute an approximate result using the rsqrt intrinsic. - Packet4f y_approx = _mm_rsqrt_ps(_x); + // Fill in NaNs and Infs for the negative/zero entries. + Packet4f neg_mask = _mm_cmplt_ps(_x, _mm_setzero_ps()); + Packet4f zero_mask = _mm_andnot_ps(neg_mask, le_zero_mask); + Packet4f infs_and_nans = _mm_or_ps(_mm_and_ps(neg_mask, p4f_nan), + _mm_and_ps(zero_mask, p4f_inf)); - // Do a single step of Newton-Raphson iteration to improve the approximation. - // This uses the formula y_{n+1} = y_n * (1.5 - y_n * (0.5 * x) * y_n). - // It is essential to evaluate the inner term like this because forming - // y_n^2 may over- or underflow. - Packet4f y_newton = pmul( - y_approx, pmadd(y_approx, pmul(neg_half, y_approx), p4f_one_point_five)); + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), p4f_one_point_five)); - // Select the result of the Newton-Raphson step for positive normal arguments. - // For other arguments, choose the output of the intrinsic. This will - // return rsqrt(+inf) = 0, rsqrt(x) = NaN if x < 0, and rsqrt(x) = +inf if - // x is zero or a positive denormalized float (equivalent to flushing positive - // denormalized inputs to zero). - return pselect(not_normal_finite_mask, y_approx, y_newton); + // Insert NaNs and Infs in all the right places. + return _mm_or_ps(x, infs_and_nans); } #else @@ -168,7 +548,7 @@ double sqrt(const double &x) { #if EIGEN_COMP_GNUC_STRICT // This works around a GCC bug generating poor code for _mm_sqrt_pd - // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970 + // See https://bitbucket.org/eigen/eigen/commits/14f468dba4d350d7c19c9b93072e19f7b3df563b return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x)))); #else return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x)))); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/PacketMath.h index 645aee0cd..60e2517e4 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/PacketMath.h @@ -18,13 +18,11 @@ namespace internal { #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8 #endif -#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) -// 32 bits => 8 registers -// 64 bits => 16 registers +#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) #endif -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 1 #endif @@ -36,37 +34,47 @@ namespace internal { // One solution is to increase ABI version using -fabi-version=4 (or greater). // Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper // structure: +template +struct eigen_packet_wrapper +{ + EIGEN_ALWAYS_INLINE operator T&() { return m_val; } + EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; } + EIGEN_ALWAYS_INLINE eigen_packet_wrapper() {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T &v) : m_val(v) {} + EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; typedef eigen_packet_wrapper<__m128> Packet4f; +typedef eigen_packet_wrapper<__m128i> Packet4i; typedef eigen_packet_wrapper<__m128d> Packet2d; #else typedef __m128 Packet4f; +typedef __m128i Packet4i; typedef __m128d Packet2d; #endif -typedef eigen_packet_wrapper<__m128i, 0> Packet4i; -typedef eigen_packet_wrapper<__m128i, 1> Packet16b; - template<> struct is_arithmetic<__m128> { enum { value = true }; }; template<> struct is_arithmetic<__m128i> { enum { value = true }; }; template<> struct is_arithmetic<__m128d> { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; - -#define EIGEN_SSE_SHUFFLE_MASK(p,q,r,s) ((s)<<6|(r)<<4|(q)<<2|(p)) #define vec4f_swizzle1(v,p,q,r,s) \ - (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s)))) + (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p))))) #define vec4i_swizzle1(v,p,q,r,s) \ - (_mm_shuffle_epi32( v, EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))) + (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p)))) #define vec2d_swizzle1(v,p,q) \ - (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), EIGEN_SSE_SHUFFLE_MASK(2*p,2*p+1,2*q,2*q+1)))) - + (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2))))) + #define vec4f_swizzle2(a,b,p,q,r,s) \ - (_mm_shuffle_ps( (a), (b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))) + (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p)))) #define vec4i_swizzle2(a,b,p,q,r,s) \ - (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), EIGEN_SSE_SHUFFLE_MASK(p,q,r,s))))) + (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p)))))) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) @@ -75,7 +83,7 @@ template<> struct is_arithmetic { enum { value = true }; }; const Packet2d p2d_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = pset1frombits(X) + const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) @@ -84,43 +92,36 @@ template<> struct is_arithmetic { enum { value = true }; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going // to leverage AVX instructions. #ifndef EIGEN_VECTORIZE_AVX -template <> -struct packet_traits : default_packet_traits { +template<> struct packet_traits : default_packet_traits +{ typedef Packet4f type; typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size = 4, + size=4, HasHalfPacket = 0, - HasDiv = 1, - HasSin = EIGEN_FAST_MATH, - HasCos = EIGEN_FAST_MATH, - HasLog = 1, - HasLog1p = 1, - HasExpm1 = 1, - HasNdtri = 1, - HasExp = 1, - HasBessel = 1, + HasDiv = 1, + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 1, + HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasTanh = EIGEN_FAST_MATH, - HasErf = EIGEN_FAST_MATH, - HasBlend = 1, - HasInsert = 1, - HasFloor = 1 + HasTanh = EIGEN_FAST_MATH, + HasBlend = 1 #ifdef EIGEN_VECTORIZE_SSE4_1 , - HasRint = 1, HasRound = 1, + HasFloor = 1, HasCeil = 1 #endif }; }; -template <> -struct packet_traits : default_packet_traits { +template<> struct packet_traits : default_packet_traits +{ typedef Packet2d type; typedef Packet2d half; enum { @@ -133,13 +134,11 @@ struct packet_traits : default_packet_traits { HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasBlend = 1, - HasInsert = 1 + HasBlend = 1 #ifdef EIGEN_VECTORIZE_SSE4_1 , HasRound = 1, - HasRint = 1, HasFloor = 1, HasCeil = 1 #endif @@ -155,55 +154,13 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size=4, - HasShift = 1, HasBlend = 1 }; }; -template<> struct packet_traits : default_packet_traits -{ - typedef Packet16b type; - typedef Packet16b half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - HasHalfPacket = 0, - size=16, - - HasAdd = 1, - HasSub = 0, - HasShift = 0, - HasMul = 1, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0 - }; -}; - -template<> struct unpacket_traits { - typedef float type; - typedef Packet4f half; - typedef Packet4i integer_packet; - enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; -}; -template<> struct unpacket_traits { - typedef double type; - typedef Packet2d half; - enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; -}; -template<> struct unpacket_traits { - typedef int type; - typedef Packet4i half; - enum {size=4, alignment=Aligned16, vectorizable=false, masked_load_available=false, masked_store_available=false}; -}; -template<> struct unpacket_traits { - typedef bool type; - typedef Packet16b half; - enum {size=16, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; -}; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; #ifndef EIGEN_VECTORIZE_AVX template<> struct scalar_div_cost { enum { value = 7 }; }; @@ -222,13 +179,6 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { re template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { return _mm_set1_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return _mm_set1_epi32(from); } #endif -template<> EIGEN_STRONG_INLINE Packet16b pset1(const bool& from) { return _mm_set1_epi8(static_cast(from)); } - -template<> EIGEN_STRONG_INLINE Packet4f pset1frombits(unsigned int from) { return _mm_castsi128_ps(pset1(from)); } - -template<> EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) { return _mm_setzero_ps(); } -template<> EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) { return _mm_setzero_pd(); } -template<> EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) { return _mm_setzero_si128(); } // GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction. // However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203) @@ -240,7 +190,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pload1(const float *from) { return vec4f_swizzle1(_mm_load_ss(from),0,0,0,0); } #endif - + template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return _mm_add_ps(pset1(a), _mm_set_ps(3,2,1,0)); } template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return _mm_add_pd(pset1(a),_mm_set_pd(1,0)); } template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return _mm_add_epi32(pset1(a),_mm_set_epi32(3,2,1,0)); } @@ -249,8 +199,6 @@ template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); } -template<> EIGEN_STRONG_INLINE Packet16b padd(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); } - template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); } @@ -292,62 +240,18 @@ template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const #endif } -template<> EIGEN_STRONG_INLINE Packet16b pmul(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); } - template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); } template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); } // for some weird raisons, it has to be overloaded for packet of integers template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); } -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return _mm_fmadd_ps(a,b,c); } template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return _mm_fmadd_pd(a,b,c); } #endif -#ifdef EIGEN_VECTORIZE_SSE4_1 -template<> EIGEN_DEVICE_FUNC inline Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { return _mm_blendv_ps(b,a,mask); } - -template<> EIGEN_DEVICE_FUNC inline Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) { return _mm_blendv_pd(b,a,mask); } -#endif - -template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 - // There appears to be a bug in GCC, by which the optimizer may - // flip the argument order in calls to _mm_min_ps, so we have to - // resort to inline ASM here. This is supposed to be fixed in gcc6.3, - // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 - #ifdef EIGEN_VECTORIZE_AVX - Packet4f res; - asm("vminps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); - #else - Packet4f res = b; - asm("minps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); - #endif - return res; -#else - // Arguments are reversed to match NaN propagation behavior of std::min. - return _mm_min_ps(b, a); -#endif -} -template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 - // There appears to be a bug in GCC, by which the optimizer may - // flip the argument order in calls to _mm_min_pd, so we have to - // resort to inline ASM here. This is supposed to be fixed in gcc6.3, - // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 - #ifdef EIGEN_VECTORIZE_AVX - Packet2d res; - asm("vminpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); - #else - Packet2d res = b; - asm("minpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); - #endif - return res; -#else - // Arguments are reversed to match NaN propagation behavior of std::min. - return _mm_min_pd(b, a); -#endif -} +template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -359,44 +263,8 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const #endif } -template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 - // There appears to be a bug in GCC, by which the optimizer may - // flip the argument order in calls to _mm_max_ps, so we have to - // resort to inline ASM here. This is supposed to be fixed in gcc6.3, - // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 - #ifdef EIGEN_VECTORIZE_AVX - Packet4f res; - asm("vmaxps %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); - #else - Packet4f res = b; - asm("maxps %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); - #endif - return res; -#else - // Arguments are reversed to match NaN propagation behavior of std::max. - return _mm_max_ps(b, a); -#endif -} -template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 - // There appears to be a bug in GCC, by which the optimizer may - // flip the argument order in calls to _mm_max_pd, so we have to - // resort to inline ASM here. This is supposed to be fixed in gcc6.3, - // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867 - #ifdef EIGEN_VECTORIZE_AVX - Packet2d res; - asm("vmaxpd %[a], %[b], %[res]" : [res] "=x" (res) : [a] "x" (a), [b] "x" (b)); - #else - Packet2d res = b; - asm("maxpd %[a], %[res]" : [res] "+x" (res) : [a] "x" (a)); - #endif - return res; -#else - // Arguments are reversed to match NaN propagation behavior of std::max. - return _mm_max_pd(b, a); -#endif -} +template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); } template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { #ifdef EIGEN_VECTORIZE_SSE4_1 @@ -408,112 +276,36 @@ template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const #endif } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return _mm_cmple_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return _mm_cmplt_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { return _mm_cmpnge_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return _mm_cmpeq_ps(a,b); } - -template<> EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) { return _mm_cmple_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) { return _mm_cmplt_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) { return _mm_cmpnge_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { return _mm_cmpeq_pd(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return _mm_cmplt_epi32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return _mm_cmpeq_epi32(a,b); } -template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) { return _mm_cmpeq_epi8(a,b); } - - -template<> EIGEN_STRONG_INLINE Packet4i ptrue(const Packet4i& a) { return _mm_cmpeq_epi32(a, a); } -template<> EIGEN_STRONG_INLINE Packet16b ptrue(const Packet16b& a) { return _mm_cmpeq_epi8(a, a); } -template<> EIGEN_STRONG_INLINE Packet4f -ptrue(const Packet4f& a) { - Packet4i b = _mm_castps_si128(a); - return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b)); -} -template<> EIGEN_STRONG_INLINE Packet2d -ptrue(const Packet2d& a) { - Packet4i b = _mm_castpd_si128(a); - return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b)); -} - - -template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } -template<> EIGEN_STRONG_INLINE Packet16b pand(const Packet16b& a, const Packet16b& b) { return _mm_and_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } -template<> EIGEN_STRONG_INLINE Packet16b por(const Packet16b& a, const Packet16b& b) { return _mm_or_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } -template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } -template<> EIGEN_STRONG_INLINE Packet16b pxor(const Packet16b& a, const Packet16b& b) { return _mm_xor_si128(a,b); } - -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(b,a); } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(b,a); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(b,a); } - -template EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) { return _mm_srai_epi32(a,N); } -template EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) { return _mm_srli_epi32(a,N); } -template EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) { return _mm_slli_epi32(a,N); } - #ifdef EIGEN_VECTORIZE_SSE4_1 -template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) -{ - // Unfortunatly _mm_round_ps doesn't have a rounding mode to implement numext::round. - const Packet4f mask = pset1frombits(0x80000000u); - const Packet4f prev0dot5 = pset1frombits(0x3EFFFFFFu); - return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); -} - -template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) -{ - const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull)); - const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull)); - return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO); -} - -template<> EIGEN_STRONG_INLINE Packet4f print(const Packet4f& a) { return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION); } -template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) { return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); } +template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) { return _mm_round_ps(a, 0); } +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return _mm_round_pd(a, 0); } template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { return _mm_ceil_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return _mm_ceil_pd(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { return _mm_floor_ps(a); } template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return _mm_floor_pd(a); } -#else -template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) -{ - const Packet4f cst_1 = pset1(1.0f); - Packet4i emm0 = _mm_cvttps_epi32(a); - Packet4f tmp = _mm_cvtepi32_ps(emm0); - /* if greater, substract 1 */ - Packet4f mask = _mm_cmpgt_ps(tmp, a); - mask = pand(mask, cst_1); - return psub(tmp, mask); -} - -// WARNING: this pfloor implementation makes sense for small inputs only, -// It is currently only used by pexp and not exposed through HasFloor. -template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) -{ - const Packet2d cst_1 = pset1(1.0); - Packet4i emm0 = _mm_cvttpd_epi32(a); - Packet2d tmp = _mm_cvtepi32_pd(emm0); - /* if greater, substract 1 */ - Packet2d mask = _mm_cmpgt_pd(tmp, a); - mask = pand(mask, cst_1); - return psub(tmp, mask); -} #endif +template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); } + +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); } + template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); } template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); } template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } -template<> EIGEN_STRONG_INLINE Packet16b pload(const bool* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast(from)); } #if EIGEN_COMP_MSVC template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { @@ -548,10 +340,6 @@ template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast(from)); } -template<> EIGEN_STRONG_INLINE Packet16b ploadu(const bool* from) { - EIGEN_DEBUG_UNALIGNED_LOAD - return _mm_loadu_si128(reinterpret_cast(from)); -} template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) @@ -570,12 +358,10 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } -template<> EIGEN_STRONG_INLINE void pstore(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from); } template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } -template<> EIGEN_STRONG_INLINE void pstoreu(bool* to, const Packet16b& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { @@ -623,7 +409,7 @@ template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } -#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900 +#if EIGEN_COMP_PGI typedef const void * SsePrefetchPtrType; #else typedef const char * SsePrefetchPtrType; @@ -650,7 +436,6 @@ template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { return _mm_cvtss_f32(a); } template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { return _mm_cvtsd_f64(a); } template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { return _mm_cvtsi128_si32(a); } -template<> EIGEN_STRONG_INLINE bool pfirst(const Packet16b& a) { int x = _mm_cvtsi128_si32(a); return static_cast(x & 1); } #endif template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) @@ -680,23 +465,6 @@ template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) #endif } -template<> EIGEN_STRONG_INLINE Packet4f pfrexp(const Packet4f& a, Packet4f& exponent) { - return pfrexp_float(a,exponent); -} - -template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { - return pldexp_float(a,exponent); -} - -template<> EIGEN_STRONG_INLINE Packet2d pldexp(const Packet2d& a, const Packet2d& exponent) { - const Packet4i cst_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0); - Packet4i emm0 = _mm_cvttpd_epi32(exponent); - emm0 = padd(emm0, cst_1023_0); - emm0 = _mm_slli_epi32(emm0, 20); - emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3)); - return pmul(a, Packet2d(_mm_castsi128_pd(emm0))); -} - // with AVX, the default implementations based on pload1 are faster #ifndef __AVX__ template<> EIGEN_STRONG_INLINE void @@ -737,6 +505,38 @@ EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00)); } +#ifdef EIGEN_VECTORIZE_SSE3 +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3])); +} + +template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) +{ + return _mm_hadd_pd(vecs[0], vecs[1]); +} + +#else +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + Packet4f tmp0, tmp1, tmp2; + tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]); + tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]); + tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]); + tmp0 = _mm_add_ps(tmp0, tmp1); + tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]); + tmp1 = _mm_add_ps(tmp1, tmp2); + tmp2 = _mm_movehl_ps(tmp1, tmp0); + tmp0 = _mm_movelh_ps(tmp0, tmp1); + return _mm_add_ps(tmp0, tmp2); +} + +template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) +{ + return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1])); +} +#endif // SSE3 + template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { // Disable SSE3 _mm_hadd_pd that is extremely slow on all existing Intel's architectures @@ -762,28 +562,38 @@ template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) } #ifdef EIGEN_VECTORIZE_SSSE3 +template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) +{ + return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3])); +} template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp0 = _mm_hadd_epi32(a,a); return pfirst(_mm_hadd_epi32(tmp0,tmp0)); } - #else template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) { Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a)); return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1)); } -#endif -template<> EIGEN_STRONG_INLINE bool predux(const Packet16b& a) { -Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a,a)); - return (pfirst(tmp) != 0) || (pfirst(_mm_shuffle_epi32(tmp, 1)) != 0); +template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) +{ + Packet4i tmp0, tmp1, tmp2; + tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + tmp0 = _mm_add_epi32(tmp0, tmp1); + tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + tmp1 = _mm_add_epi32(tmp1, tmp2); + tmp2 = _mm_unpacklo_epi64(tmp0, tmp1); + tmp0 = _mm_unpackhi_epi64(tmp0, tmp1); + return _mm_add_epi32(tmp0, tmp2); } - +#endif // Other reduction functions: - // mul template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { @@ -801,7 +611,7 @@ template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) // TODO try to call _mm_mul_epu32 directly EIGEN_ALIGN16 int aux[4]; pstore(aux, a); - return (aux[0] * aux[1]) * (aux[2] * aux[3]); + return (aux[0] * aux[1]) * (aux[2] * aux[3]);; } // min @@ -856,16 +666,113 @@ template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) #endif // EIGEN_VECTORIZE_SSE4_1 } -// not needed yet -// template<> EIGEN_STRONG_INLINE bool predux_all(const Packet4f& x) +#if EIGEN_COMP_GNUC +// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) // { -// return _mm_movemask_ps(x) == 0xF; +// Packet4f res = b; +// asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c)); +// return res; // } +// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i& a, const Packet4i& b, const int i) +// { +// Packet4i res = a; +// asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i)); +// return res; +// } +#endif -template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) +#ifdef EIGEN_VECTORIZE_SSSE3 +// SSSE3 versions +template +struct palign_impl { - return _mm_movemask_ps(x) != 0x0; -} + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + if (Offset!=0) + first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4)); + } +}; + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) + { + if (Offset!=0) + first = _mm_alignr_epi8(second,first, Offset*4); + } +}; + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset==1) + first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8)); + } +}; +#else +// SSE2 versions +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + if (Offset==1) + { + first = _mm_move_ss(first,second); + first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39)); + } + else if (Offset==2) + { + first = _mm_movehl_ps(first,first); + first = _mm_movelh_ps(first,second); + } + else if (Offset==3) + { + first = _mm_move_ss(first,second); + first = _mm_shuffle_ps(first,second,0x93); + } + } +}; + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) + { + if (Offset==1) + { + first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); + first = _mm_shuffle_epi32(first,0x39); + } + else if (Offset==2) + { + first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first))); + first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); + } + else if (Offset==3) + { + first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second))); + first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93)); + } + } +}; + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset==1) + { + first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first))); + first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second))); + } + } +}; +#endif EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { @@ -892,19 +799,6 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3] = _mm_unpackhi_epi64(T2, T3); } -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]); - __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]); - __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]); - __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]); - kernel.packet[0] = _mm_unpacklo_epi16(T0, T2); - kernel.packet[1] = _mm_unpackhi_epi16(T0, T2); - kernel.packet[2] = _mm_unpacklo_epi16(T1, T3); - kernel.packet[3] = _mm_unpackhi_epi16(T1, T3); -} - - template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { const __m128i zero = _mm_setzero_si128(); const __m128i select = _mm_set_epi32(ifPacket.select[3], ifPacket.select[2], ifPacket.select[1], ifPacket.select[0]); @@ -936,8 +830,46 @@ template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, cons #endif } +template<> EIGEN_STRONG_INLINE Packet4f pinsertfirst(const Packet4f& a, float b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_ps(a,pset1(b),1); +#else + return _mm_move_ss(a, _mm_load_ss(&b)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pinsertfirst(const Packet2d& a, double b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_pd(a,pset1(b),1); +#else + return _mm_move_sd(a, _mm_load_sd(&b)); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet4f pinsertlast(const Packet4f& a, float b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_ps(a,pset1(b),(1<<3)); +#else + const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x0,0x0,0x0,0xFFFFFFFF)); + return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, pset1(b))); +#endif +} + +template<> EIGEN_STRONG_INLINE Packet2d pinsertlast(const Packet2d& a, double b) +{ +#ifdef EIGEN_VECTORIZE_SSE4_1 + return _mm_blend_pd(a,pset1(b),(1<<1)); +#else + const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x0,0xFFFFFFFF,0xFFFFFFFF)); + return _mm_or_pd(_mm_andnot_pd(mask, a), _mm_and_pd(mask, pset1(b))); +#endif +} + // Scalar path for pmadd with FMA to ensure consistency with vectorized path. -#ifdef EIGEN_VECTORIZE_FMA +#ifdef __FMA__ template<> EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) { return ::fmaf(a,b,c); } @@ -946,219 +878,11 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co } #endif - -// Packet math for Eigen::half -// Disable the following code since it's broken on too many platforms / compilers. -//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#if 0 - -typedef struct { - __m64 x; -} Packet4h; - - -template<> struct is_arithmetic { enum { value = true }; }; - -template <> -struct packet_traits : default_packet_traits { - typedef Packet4h type; - // There is no half-size packet for Packet4h. - typedef Packet4h half; - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = 4, - HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0, - HasSqrt = 0, - HasRsqrt = 0, - HasExp = 0, - HasLog = 0, - HasBlend = 0 - }; -}; - - -template<> struct unpacket_traits { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; }; - -template<> EIGEN_STRONG_INLINE Packet4h pset1(const Eigen::half& from) { - Packet4h result; - result.x = _mm_set1_pi16(from.x); - return result; -} - -template<> EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet4h& from) { - return half_impl::raw_uint16_to_half(static_cast(_mm_cvtsi64_si32(from.x))); -} - -template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet4h padd(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha + hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha + hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h psub(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha - hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha - hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha - hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha - hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pmul(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha * hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha * hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pdiv(const Packet4h& a, const Packet4h& b) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - __int64_t b64 = _mm_cvtm64_si64(b.x); - - Eigen::half h[4]; - - Eigen::half ha = half_impl::raw_uint16_to_half(static_cast(a64)); - Eigen::half hb = half_impl::raw_uint16_to_half(static_cast(b64)); - h[0] = ha / hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 16)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 16)); - h[1] = ha / hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 32)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 32)); - h[2] = ha / hb; - ha = half_impl::raw_uint16_to_half(static_cast(a64 >> 48)); - hb = half_impl::raw_uint16_to_half(static_cast(b64 >> 48)); - h[3] = ha / hb; - Packet4h result; - result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h pload(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE Packet4h ploadu(const Eigen::half* from) { - Packet4h result; - result.x = _mm_cvtsi64_m64(*reinterpret_cast(from)); - return result; -} - -template<> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet4h& from) { - __int64_t r = _mm_cvtm64_si64(from.x); - *(reinterpret_cast<__int64_t*>(to)) = r; -} - -template<> EIGEN_STRONG_INLINE Packet4h -ploadquad(const Eigen::half* from) { - return pset1(*from); -} - -template<> EIGEN_STRONG_INLINE Packet4h pgather(const Eigen::half* from, Index stride) -{ - Packet4h result; - result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x); - return result; -} - -template<> EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const Packet4h& from, Index stride) -{ - __int64_t a = _mm_cvtm64_si64(from.x); - to[stride*0].x = static_cast(a); - to[stride*1].x = static_cast(a >> 16); - to[stride*2].x = static_cast(a >> 32); - to[stride*3].x = static_cast(a >> 48); -} - -EIGEN_STRONG_INLINE void -ptranspose(PacketBlock& kernel) { - __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x); - __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x); - __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x); - - kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1); - kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1); - kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3); - kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3); -} - -#endif - - } // end namespace internal } // end namespace Eigen -#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900 +#if EIGEN_COMP_PGI // PGI++ does not define the following intrinsics in C++ mode. static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); } static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/TypeCasting.h index 1b8e9a550..c6ca8c716 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -69,64 +69,6 @@ template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f return _mm_cvtps_pd(a); } -template<> EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { - return _mm_castps_si128(a); -} - -template<> EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { - return _mm_castsi128_ps(a); -} - - -// Disable the following code since it's broken on too many platforms / compilers. -//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC) -#if 0 - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4h& a) { - __int64_t a64 = _mm_cvtm64_si64(a.x); - Eigen::half h = raw_uint16_to_half(static_cast(a64)); - float f1 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 16)); - float f2 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 32)); - float f3 = static_cast(h); - h = raw_uint16_to_half(static_cast(a64 >> 48)); - float f4 = static_cast(h); - return _mm_set_ps(f4, f3, f2, f1); -} - -template <> -struct type_casting_traits { - enum { - VectorizedCast = 1, - SrcCoeffRatio = 1, - TgtCoeffRatio = 1 - }; -}; - -template<> EIGEN_STRONG_INLINE Packet4h pcast(const Packet4f& a) { - EIGEN_ALIGN16 float aux[4]; - pstore(aux, a); - Eigen::half h0(aux[0]); - Eigen::half h1(aux[1]); - Eigen::half h2(aux[2]); - Eigen::half h3(aux[3]); - - Packet4h result; - result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x); - return result; -} - -#endif } // end namespace internal diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h deleted file mode 100644 index 710059d50..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/InteropHeaders.h +++ /dev/null @@ -1,229 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * InteropHeaders.h - * - * \brief: - * InteropHeaders - * - *****************************************************************/ - -#ifndef EIGEN_INTEROP_HEADERS_SYCL_H -#define EIGEN_INTEROP_HEADERS_SYCL_H - -namespace Eigen { - -#if !defined(EIGEN_DONT_VECTORIZE_SYCL) - -namespace internal { - -template -struct sycl_packet_traits : default_packet_traits { - enum { - Vectorizable = 1, - AlignedOnScalar = 1, - size = lengths, - HasHalfPacket = 0, - HasDiv = 1, - HasLog = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, - HasSin = 1, - HasCos = 1, - HasTan = 1, - HasASin = 1, - HasACos = 1, - HasATan = 1, - HasSinh = 1, - HasCosh = 1, - HasTanh = 1, - HasLGamma = 0, - HasDiGamma = 0, - HasZeta = 0, - HasPolygamma = 0, - HasErf = 0, - HasErfc = 0, - HasNdtri = 0, - HasIGamma = 0, - HasIGammac = 0, - HasBetaInc = 0, - HasBlend = has_blend, - HasMax = 1, - HasMin = 1, - HasMul = 1, - HasAdd = 1, - HasFloor = 1, - HasRound = 1, - HasRint = 1, - HasLog1p = 1, - HasExpm1 = 1, - HasCeil = 1, - }; -}; - -#ifdef SYCL_DEVICE_ONLY -#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths) \ - template <> \ - struct packet_traits \ - : sycl_packet_traits { \ - typedef packet_type type; \ - typedef packet_type half; \ - }; - -SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4) -SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4) -SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2) -SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2) -#undef SYCL_PACKET_TRAITS - -// Make sure this is only available when targeting a GPU: we don't want to -// introduce conflicts between these packet_traits definitions and the ones -// we'll use on the host side (SSE, AVX, ...) -#define SYCL_ARITHMETIC(packet_type) \ - template <> \ - struct is_arithmetic { \ - enum { value = true }; \ - }; -SYCL_ARITHMETIC(cl::sycl::cl_float4) -SYCL_ARITHMETIC(cl::sycl::cl_double2) -#undef SYCL_ARITHMETIC - -#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths) \ - template <> \ - struct unpacket_traits { \ - typedef unpacket_type type; \ - enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \ - typedef packet_type half; \ - }; -SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4) -SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2) - -#undef SYCL_UNPACKET_TRAITS -#endif - -} // end namespace internal - -#endif - -namespace TensorSycl { -namespace internal { - -template -struct PacketWrapper; -// This function should never get called on the device -#ifndef SYCL_DEVICE_ONLY -template -struct PacketWrapper { - typedef typename ::Eigen::internal::unpacket_traits::type - Scalar; - template - EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType &) { - eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR THE CHOSEN TYPE"); - abort(); - } - EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in, - Scalar) { - return ::Eigen::internal::template plset(in); - } - EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar *) { - eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR THE CHOSEN TYPE"); - abort(); - } -}; - -#elif defined(SYCL_DEVICE_ONLY) -template -struct PacketWrapper { - typedef typename ::Eigen::internal::unpacket_traits::type - Scalar; - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) { - switch (index) { - case 0: - return in.x(); - case 1: - return in.y(); - case 2: - return in.z(); - case 3: - return in.w(); - default: - //INDEX MUST BE BETWEEN 0 and 3.There is no abort function in SYCL kernel. so we cannot use abort here. - // The code will never reach here - __builtin_unreachable(); - } - __builtin_unreachable(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type( - Scalar in, Scalar other) { - return PacketReturnType(in, other, other, other); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { - lhs = PacketReturnType(rhs[0], rhs[1], rhs[2], rhs[3]); - } -}; - -template -struct PacketWrapper { - typedef typename ::Eigen::internal::unpacket_traits::type - Scalar; - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType &in) { - return in; - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, - Scalar) { - return PacketReturnType(in); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { - lhs = rhs[0]; - } -}; - -template -struct PacketWrapper { - typedef typename ::Eigen::internal::unpacket_traits::type - Scalar; - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) { - switch (index) { - case 0: - return in.x(); - case 1: - return in.y(); - default: - //INDEX MUST BE BETWEEN 0 and 1.There is no abort function in SYCL kernel. so we cannot use abort here. - // The code will never reach here - __builtin_unreachable(); - } - __builtin_unreachable(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type( - Scalar in, Scalar other) { - return PacketReturnType(in, other); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { - lhs = PacketReturnType(rhs[0], rhs[1]); - } -}; - -#endif - -} // end namespace internal -} // end namespace TensorSycl -} // end namespace Eigen - -#endif // EIGEN_INTEROP_HEADERS_SYCL_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h deleted file mode 100644 index a96625e2c..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/MathFunctions.h +++ /dev/null @@ -1,289 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * MathFunctions.h - * - * \brief: - * MathFunctions - * - *****************************************************************/ - -#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H -#define EIGEN_MATH_FUNCTIONS_SYCL_H - -namespace Eigen { - -namespace internal { - -// Make sure this is only available when targeting a GPU: we don't want to -// introduce conflicts between these packet_traits definitions and the ones -// we'll use on the host side (SSE, AVX, ...) -#if defined(SYCL_DEVICE_ONLY) -#define SYCL_PLOG(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog( \ - const packet_type& a) { \ - return cl::sycl::log(a); \ - } - -SYCL_PLOG(cl::sycl::cl_float4) -SYCL_PLOG(cl::sycl::cl_double2) -#undef SYCL_PLOG - -#define SYCL_PLOG1P(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p( \ - const packet_type& a) { \ - return cl::sycl::log1p(a); \ - } - -SYCL_PLOG1P(cl::sycl::cl_float4) -SYCL_PLOG1P(cl::sycl::cl_double2) -#undef SYCL_PLOG1P - -#define SYCL_PLOG10(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10( \ - const packet_type& a) { \ - return cl::sycl::log10(a); \ - } - -SYCL_PLOG10(cl::sycl::cl_float4) -SYCL_PLOG10(cl::sycl::cl_double2) -#undef SYCL_PLOG10 - -#define SYCL_PEXP(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp( \ - const packet_type& a) { \ - return cl::sycl::exp(a); \ - } - -SYCL_PEXP(cl::sycl::cl_float4) -SYCL_PEXP(cl::sycl::cl_double2) -#undef SYCL_PEXP - -#define SYCL_PEXPM1(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1( \ - const packet_type& a) { \ - return cl::sycl::expm1(a); \ - } - -SYCL_PEXPM1(cl::sycl::cl_float4) -SYCL_PEXPM1(cl::sycl::cl_double2) -#undef SYCL_PEXPM1 - -#define SYCL_PSQRT(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt( \ - const packet_type& a) { \ - return cl::sycl::sqrt(a); \ - } - -SYCL_PSQRT(cl::sycl::cl_float4) -SYCL_PSQRT(cl::sycl::cl_double2) -#undef SYCL_PSQRT - -#define SYCL_PRSQRT(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt( \ - const packet_type& a) { \ - return cl::sycl::rsqrt(a); \ - } - -SYCL_PRSQRT(cl::sycl::cl_float4) -SYCL_PRSQRT(cl::sycl::cl_double2) -#undef SYCL_PRSQRT - -/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ -#define SYCL_PSIN(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin( \ - const packet_type& a) { \ - return cl::sycl::sin(a); \ - } - -SYCL_PSIN(cl::sycl::cl_float4) -SYCL_PSIN(cl::sycl::cl_double2) -#undef SYCL_PSIN - -/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ -#define SYCL_PCOS(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos( \ - const packet_type& a) { \ - return cl::sycl::cos(a); \ - } - -SYCL_PCOS(cl::sycl::cl_float4) -SYCL_PCOS(cl::sycl::cl_double2) -#undef SYCL_PCOS - -/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ -#define SYCL_PTAN(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan( \ - const packet_type& a) { \ - return cl::sycl::tan(a); \ - } - -SYCL_PTAN(cl::sycl::cl_float4) -SYCL_PTAN(cl::sycl::cl_double2) -#undef SYCL_PTAN - -/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ -#define SYCL_PASIN(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin( \ - const packet_type& a) { \ - return cl::sycl::asin(a); \ - } - -SYCL_PASIN(cl::sycl::cl_float4) -SYCL_PASIN(cl::sycl::cl_double2) -#undef SYCL_PASIN - -/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ -#define SYCL_PACOS(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos( \ - const packet_type& a) { \ - return cl::sycl::acos(a); \ - } - -SYCL_PACOS(cl::sycl::cl_float4) -SYCL_PACOS(cl::sycl::cl_double2) -#undef SYCL_PACOS - -/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ -#define SYCL_PATAN(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan( \ - const packet_type& a) { \ - return cl::sycl::atan(a); \ - } - -SYCL_PATAN(cl::sycl::cl_float4) -SYCL_PATAN(cl::sycl::cl_double2) -#undef SYCL_PATAN - -/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */ -#define SYCL_PSINH(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh( \ - const packet_type& a) { \ - return cl::sycl::sinh(a); \ - } - -SYCL_PSINH(cl::sycl::cl_float4) -SYCL_PSINH(cl::sycl::cl_double2) -#undef SYCL_PSINH - -/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */ -#define SYCL_PCOSH(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh( \ - const packet_type& a) { \ - return cl::sycl::cosh(a); \ - } - -SYCL_PCOSH(cl::sycl::cl_float4) -SYCL_PCOSH(cl::sycl::cl_double2) -#undef SYCL_PCOSH - -/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */ -#define SYCL_PTANH(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh( \ - const packet_type& a) { \ - return cl::sycl::tanh(a); \ - } - -SYCL_PTANH(cl::sycl::cl_float4) -SYCL_PTANH(cl::sycl::cl_double2) -#undef SYCL_PTANH - -#define SYCL_PCEIL(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil( \ - const packet_type& a) { \ - return cl::sycl::ceil(a); \ - } - -SYCL_PCEIL(cl::sycl::cl_float4) -SYCL_PCEIL(cl::sycl::cl_double2) -#undef SYCL_PCEIL - -#define SYCL_PROUND(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround( \ - const packet_type& a) { \ - return cl::sycl::round(a); \ - } - -SYCL_PROUND(cl::sycl::cl_float4) -SYCL_PROUND(cl::sycl::cl_double2) -#undef SYCL_PROUND - -#define SYCL_PRINT(packet_type) \ - template<> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print( \ - const packet_type& a) { \ - return cl::sycl::rint(a); \ - } - -SYCL_PRINT(cl::sycl::cl_float4) -SYCL_PRINT(cl::sycl::cl_double2) -#undef SYCL_PRINT - -#define SYCL_FLOOR(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor( \ - const packet_type& a) { \ - return cl::sycl::floor(a); \ - } - -SYCL_FLOOR(cl::sycl::cl_float4) -SYCL_FLOOR(cl::sycl::cl_double2) -#undef SYCL_FLOOR - -#define SYCL_PMIN(packet_type, expr) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin( \ - const packet_type& a, const packet_type& b) { \ - return expr; \ - } - -SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b)) -SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b)) -#undef SYCL_PMIN - -#define SYCL_PMAX(packet_type, expr) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax( \ - const packet_type& a, const packet_type& b) { \ - return expr; \ - } - -SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b)) -SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b)) -#undef SYCL_PMAX - -#endif - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_MATH_FUNCTIONS_SYCL_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/PacketMath.h deleted file mode 100644 index b11b5af9d..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/PacketMath.h +++ /dev/null @@ -1,670 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * PacketMath.h - * - * \brief: - * PacketMath - * - *****************************************************************/ - -#ifndef EIGEN_PACKET_MATH_SYCL_H -#define EIGEN_PACKET_MATH_SYCL_H -#include -namespace Eigen { - -namespace internal { -#ifdef SYCL_DEVICE_ONLY - -#define SYCL_PLOADT_RO(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt_ro( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - typedef typename unpacket_traits::type scalar; \ - typedef cl::sycl::multi_ptr< \ - scalar, cl::sycl::access::address_space::address_space_target> \ - multi_ptr; \ - auto res = packet_type( \ - static_cast::type>(0)); \ - res.load(0, multi_ptr(const_cast(from))); \ - return res; \ - } - -SYCL_PLOADT_RO(global_space) -SYCL_PLOADT_RO(local_space) -#undef SYCL_PLOADT_RO -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type -ploadt_ro(const Eigen::TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, T>& from) { - return ploadt_ro(from.get_pointer()); -} - -#ifdef SYCL_DEVICE_ONLY -#define SYCL_PLOAD(address_space_target, Alignment, AlignedType) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - return ploadt_ro(from); \ - } - -// global space -SYCL_PLOAD(global_space, Unaligned, u) -SYCL_PLOAD(global_space, Aligned, ) -// local space -SYCL_PLOAD(local_space, Unaligned, u) -SYCL_PLOAD(local_space, Aligned, ) - -#undef SYCL_PLOAD -#endif - -#define SYCL_PLOAD(Alignment, AlignedType) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType( \ - const Eigen::TensorSycl::internal::RangeAccess< \ - cl::sycl::access::mode::read_write, \ - typename unpacket_traits::type> \ - from) { \ - return ploadt_ro(from); \ - } -SYCL_PLOAD(Unaligned, u) -SYCL_PLOAD(Aligned, ) -#undef SYCL_PLOAD - -#ifdef SYCL_DEVICE_ONLY -/** \internal \returns a packet version of \a *from. - * The pointer \a from must be aligned on a \a Alignment bytes boundary. */ -#define SYCL_PLOADT(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type ploadt( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - if (Alignment >= unpacket_traits::alignment) \ - return pload(from); \ - else \ - return ploadu(from); \ - } - -// global space -SYCL_PLOADT(global_space) -// local space -SYCL_PLOADT(local_space) -#undef SYCL_PLOADT -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type -ploadt(const Eigen::TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, - typename unpacket_traits::type>& from) { - return ploadt(from.get_pointer()); -} -#ifdef SYCL_DEVICE_ONLY - -// private_space -#define SYCL_PLOADT_RO_SPECIAL(packet_type, Alignment) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type \ - ploadt_ro( \ - const typename unpacket_traits::type* from) { \ - typedef typename unpacket_traits::type scalar; \ - auto res = packet_type(static_cast(0)); \ - res.template load( \ - 0, const_cast(from)); \ - return res; \ - } - -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Aligned) -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Aligned) -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_float4, Unaligned) -SYCL_PLOADT_RO_SPECIAL(cl::sycl::cl_double2, Unaligned) - -#define SYCL_PLOAD_SPECIAL(packet_type, alignment_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##alignment_type( \ - const typename unpacket_traits::type* from) { \ - typedef typename unpacket_traits::type scalar; \ - auto res = packet_type(static_cast(0)); \ - res.template load( \ - 0, const_cast(from)); \ - return res; \ - } -SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, ) -SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, ) -SYCL_PLOAD_SPECIAL(cl::sycl::cl_float4, u) -SYCL_PLOAD_SPECIAL(cl::sycl::cl_double2, u) - -#undef SYCL_PLOAD_SPECIAL - -#define SYCL_PSTORE(scalar, packet_type, address_space_target, alignment) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ - typename cl::sycl::multi_ptr< \ - scalar, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - to, \ - const packet_type& from) { \ - typedef cl::sycl::multi_ptr< \ - scalar, cl::sycl::access::address_space::address_space_target> \ - multi_ptr; \ - from.store(0, multi_ptr(to)); \ - } - -// global space -SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, ) -SYCL_PSTORE(float, cl::sycl::cl_float4, global_space, u) -SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, ) -SYCL_PSTORE(double, cl::sycl::cl_double2, global_space, u) -SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, ) -SYCL_PSTORE(float, cl::sycl::cl_float4, local_space, u) -SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, ) -SYCL_PSTORE(double, cl::sycl::cl_double2, local_space, u) - -SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, ) -SYCL_PSTORE(float, cl::sycl::cl_float4, private_space, u) -SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, ) -SYCL_PSTORE(double, cl::sycl::cl_double2, private_space, u) -#undef SYCL_PSTORE - -#define SYCL_PSTORE_T(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( \ - typename cl::sycl::multi_ptr< \ - scalar, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - to, \ - const packet_type& from) { \ - if (Alignment) \ - pstore(to, from); \ - else \ - pstoreu(to, from); \ - } - -SYCL_PSTORE_T(global_space) - -SYCL_PSTORE_T(local_space) - -#undef SYCL_PSTORE_T - -#define SYCL_PSET1(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1( \ - const typename unpacket_traits::type& from) { \ - return packet_type(from); \ - } - -// global space -SYCL_PSET1(cl::sycl::cl_float4) -SYCL_PSET1(cl::sycl::cl_double2) - -#undef SYCL_PSET1 - -template -struct get_base_packet { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type - get_ploaddup(sycl_multi_pointer) {} - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type - get_pgather(sycl_multi_pointer, Index) {} -}; - -template <> -struct get_base_packet { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup( - sycl_multi_pointer from) { - return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]); - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather( - sycl_multi_pointer from, Index stride) { - return cl::sycl::cl_float4(from[0 * stride], from[1 * stride], - from[2 * stride], from[3 * stride]); - } - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter( - sycl_multi_pointer to, const cl::sycl::cl_float4& from, Index stride) { - auto tmp = stride; - to[0] = from.x(); - to[tmp] = from.y(); - to[tmp += stride] = from.z(); - to[tmp += stride] = from.w(); - } - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset( - const float& a) { - return cl::sycl::cl_float4(static_cast(a), static_cast(a + 1), - static_cast(a + 2), - static_cast(a + 3)); - } -}; - -template <> -struct get_base_packet { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 - get_ploaddup(const sycl_multi_pointer from) { - return cl::sycl::cl_double2(from[0], from[0]); - } - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather( - const sycl_multi_pointer from, Index stride) { - return cl::sycl::cl_double2(from[0 * stride], from[1 * stride]); - } - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter( - sycl_multi_pointer to, const cl::sycl::cl_double2& from, Index stride) { - to[0] = from.x(); - to[stride] = from.y(); - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset( - const double& a) { - return cl::sycl::cl_double2(static_cast(a), - static_cast(a + 1)); - } -}; - -#define SYCL_PLOAD_DUP(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from) { \ - return get_base_packet::get_ploaddup(from); \ - } - -// global space -SYCL_PLOAD_DUP(global_space) -// local_space -SYCL_PLOAD_DUP(local_space) -#undef SYCL_PLOAD_DUP - -#define SYCL_PLOAD_DUP_SPECILIZE(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup( \ - const typename unpacket_traits::type* from) { \ - return get_base_packet::get_ploaddup(from); \ - } - -SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4) -SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2) - -#undef SYCL_PLOAD_DUP_SPECILIZE - -#define SYCL_PLSET(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset( \ - const typename unpacket_traits::type& a) { \ - return get_base_packet::set_plset(a); \ - } - -SYCL_PLSET(cl::sycl::cl_float4) -SYCL_PLSET(cl::sycl::cl_double2) - -#undef SYCL_PLSET - -#define SYCL_PGATHER(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC inline packet_type pgather( \ - typename cl::sycl::multi_ptr< \ - const typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - from, \ - Index stride) { \ - return get_base_packet::get_pgather(from, stride); \ - } - -// global space -SYCL_PGATHER(global_space) -// local space -SYCL_PGATHER(local_space) - -#undef SYCL_PGATHER - -#define SYCL_PGATHER_SPECILIZE(scalar, packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type \ - pgather( \ - const typename unpacket_traits::type* from, Index stride) { \ - return get_base_packet::get_pgather(from, stride); \ - } - -SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4) -SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2) - -#undef SYCL_PGATHER_SPECILIZE - -#define SYCL_PSCATTER(address_space_target) \ - template \ - EIGEN_DEVICE_FUNC inline void pscatter( \ - typename cl::sycl::multi_ptr< \ - typename unpacket_traits::type, \ - cl::sycl::access::address_space::address_space_target>::pointer_t \ - to, \ - const packet_type& from, Index stride) { \ - get_base_packet::set_pscatter(to, from, stride); \ - } - -// global space -SYCL_PSCATTER(global_space) -// local space -SYCL_PSCATTER(local_space) - -#undef SYCL_PSCATTER - -#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( \ - typename unpacket_traits::type * to, \ - const packet_type& from, Index stride) { \ - get_base_packet::set_pscatter(to, from, stride); \ - } - -SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4) -SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2) - -#undef SYCL_PSCATTER_SPECILIZE - -#define SYCL_PMAD(packet_type) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd( \ - const packet_type& a, const packet_type& b, const packet_type& c) { \ - return cl::sycl::mad(a, b, c); \ - } - -SYCL_PMAD(cl::sycl::cl_float4) -SYCL_PMAD(cl::sycl::cl_double2) -#undef SYCL_PMAD - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst( - const cl::sycl::cl_float4& a) { - return a.x(); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst( - const cl::sycl::cl_double2& a) { - return a.x(); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux( - const cl::sycl::cl_float4& a) { - return a.x() + a.y() + a.z() + a.w(); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux( - const cl::sycl::cl_double2& a) { - return a.x() + a.y(); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max( - const cl::sycl::cl_float4& a) { - return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), - cl::sycl::fmax(a.z(), a.w())); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max( - const cl::sycl::cl_double2& a) { - return cl::sycl::fmax(a.x(), a.y()); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min( - const cl::sycl::cl_float4& a) { - return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), - cl::sycl::fmin(a.z(), a.w())); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min( - const cl::sycl::cl_double2& a) { - return cl::sycl::fmin(a.x(), a.y()); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul( - const cl::sycl::cl_float4& a) { - return a.x() * a.y() * a.z() * a.w(); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul( - const cl::sycl::cl_double2& a) { - return a.x() * a.y(); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 -pabs(const cl::sycl::cl_float4& a) { - return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), - cl::sycl::fabs(a.z()), cl::sycl::fabs(a.w())); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 -pabs(const cl::sycl::cl_double2& a) { - return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y())); -} - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet &a, - const Packet &b) { - return ((a <= b) - .template convert::type, - cl::sycl::rounding_mode::automatic>()); -} - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet &a, - const Packet &b) { - return ((a < b) - .template convert::type, - cl::sycl::rounding_mode::automatic>()); -} - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet &a, - const Packet &b) { - return ((a == b) - .template convert::type, - cl::sycl::rounding_mode::automatic>()); -} - -#define SYCL_PCMP(OP, TYPE) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TYPE pcmp_##OP(const TYPE &a, \ - const TYPE &b) { \ - return sycl_pcmp_##OP(a, b); \ - } - -SYCL_PCMP(le, cl::sycl::cl_float4) -SYCL_PCMP(lt, cl::sycl::cl_float4) -SYCL_PCMP(eq, cl::sycl::cl_float4) -SYCL_PCMP(le, cl::sycl::cl_double2) -SYCL_PCMP(lt, cl::sycl::cl_double2) -SYCL_PCMP(eq, cl::sycl::cl_double2) -#undef SYCL_PCMP - -template struct convert_to_integer; - -template <> struct convert_to_integer { - using type = int; - using packet_type = cl::sycl::cl_int4; -}; -template <> struct convert_to_integer { - using type = long; - using packet_type = cl::sycl::cl_long2; -}; - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename convert_to_integer< - typename unpacket_traits::type>::packet_type -vector_as_int(const PacketIn &p) { - return ( - p.template convert::type>::type, - cl::sycl::rounding_mode::automatic>()); -} - -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packetOut -convert_vector(const PacketIn &p) { - return (p.template convert::type, - cl::sycl::rounding_mode::automatic>()); -} - -#define SYCL_PAND(TYPE) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pand(const TYPE &a, \ - const TYPE &b) { \ - return convert_vector(vector_as_int(a) & vector_as_int(b)); \ - } -SYCL_PAND(cl::sycl::cl_float4) -SYCL_PAND(cl::sycl::cl_double2) -#undef SYCL_PAND - -#define SYCL_POR(TYPE) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE por(const TYPE &a, \ - const TYPE &b) { \ - return convert_vector(vector_as_int(a) | vector_as_int(b)); \ - } - -SYCL_POR(cl::sycl::cl_float4) -SYCL_POR(cl::sycl::cl_double2) -#undef SYCL_POR - -#define SYCL_PXOR(TYPE) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pxor(const TYPE &a, \ - const TYPE &b) { \ - return convert_vector(vector_as_int(a) ^ vector_as_int(b)); \ - } - -SYCL_PXOR(cl::sycl::cl_float4) -SYCL_PXOR(cl::sycl::cl_double2) -#undef SYCL_PXOR - -#define SYCL_PANDNOT(TYPE) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TYPE pandnot(const TYPE &a, \ - const TYPE &b) { \ - return convert_vector(vector_as_int(a) & (~vector_as_int(b))); \ - } -SYCL_PANDNOT(cl::sycl::cl_float4) -SYCL_PANDNOT(cl::sycl::cl_double2) -#undef SYCL_PANDNOT - -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose( - PacketBlock& kernel) { - float tmp = kernel.packet[0].y(); - kernel.packet[0].y() = kernel.packet[1].x(); - kernel.packet[1].x() = tmp; - - tmp = kernel.packet[0].z(); - kernel.packet[0].z() = kernel.packet[2].x(); - kernel.packet[2].x() = tmp; - - tmp = kernel.packet[0].w(); - kernel.packet[0].w() = kernel.packet[3].x(); - kernel.packet[3].x() = tmp; - - tmp = kernel.packet[1].z(); - kernel.packet[1].z() = kernel.packet[2].y(); - kernel.packet[2].y() = tmp; - - tmp = kernel.packet[1].w(); - kernel.packet[1].w() = kernel.packet[3].y(); - kernel.packet[3].y() = tmp; - - tmp = kernel.packet[2].w(); - kernel.packet[2].w() = kernel.packet[3].z(); - kernel.packet[3].z() = tmp; -} - -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose( - PacketBlock& kernel) { - double tmp = kernel.packet[0].y(); - kernel.packet[0].y() = kernel.packet[1].x(); - kernel.packet[1].x() = tmp; -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend( - const Selector::size>& ifPacket, - const cl::sycl::cl_float4& thenPacket, - const cl::sycl::cl_float4& elsePacket) { - cl::sycl::cl_int4 condition( - ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1, - ifPacket.select[2] ? 0 : -1, ifPacket.select[3] ? 0 : -1); - return cl::sycl::select(thenPacket, elsePacket, condition); -} - -template <> -inline cl::sycl::cl_double2 pblend( - const Selector::size>& ifPacket, - const cl::sycl::cl_double2& thenPacket, - const cl::sycl::cl_double2& elsePacket) { - cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1, - ifPacket.select[1] ? 0 : -1); - return cl::sycl::select(thenPacket, elsePacket, condition); -} -#endif // SYCL_DEVICE_ONLY - -#define SYCL_PSTORE(alignment) \ - template \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment( \ - const Eigen::TensorSycl::internal::RangeAccess< \ - cl::sycl::access::mode::read_write, \ - typename unpacket_traits::type>& to, \ - const packet_type& from) { \ - pstore##alignment(to.get_pointer(), from); \ - } - -// global space -SYCL_PSTORE() -SYCL_PSTORE(u) - -#undef SYCL_PSTORE - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret( - Eigen::TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, - typename unpacket_traits::type> - to, - const packet_type& from) { - pstoret(to.get_pointer(), from); -} - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_PACKET_MATH_SYCL_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h deleted file mode 100644 index f81e59db5..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/SyclMemoryModel.h +++ /dev/null @@ -1,694 +0,0 @@ -/*************************************************************************** - * Copyright (C) 2017 Codeplay Software Limited - * This Source Code Form is subject to the terms of the Mozilla - * Public License v. 2.0. If a copy of the MPL was not distributed - * with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - * - * - * SyclMemoryModel.h - * - * Description: - * Interface for SYCL buffers to behave as a non-dereferenceable pointer - * Interface for Placeholder accessor to behave as a pointer on both host - * and device - * - * Authors: - * - * Ruyman Reyes Codeplay Software Ltd. - * Mehdi Goli Codeplay Software Ltd. - * Vanya Yaneva Codeplay Software Ltd. - * - **************************************************************************/ - -#if defined(EIGEN_USE_SYCL) && \ - !defined(EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H) -#define EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H - -#include -#ifdef EIGEN_EXCEPTIONS -#include -#endif -#include -#include -#include -#include - -namespace Eigen { -namespace TensorSycl { -namespace internal { - -using sycl_acc_target = cl::sycl::access::target; -using sycl_acc_mode = cl::sycl::access::mode; - -/** - * Default values for template arguments - */ -using buffer_data_type_t = uint8_t; -const sycl_acc_target default_acc_target = sycl_acc_target::global_buffer; -const sycl_acc_mode default_acc_mode = sycl_acc_mode::read_write; - -/** - * PointerMapper - * Associates fake pointers with buffers. - * - */ -class PointerMapper { - public: - using base_ptr_t = std::intptr_t; - - /* Structure of a virtual pointer - * - * |================================================| - * | POINTER ADDRESS | - * |================================================| - */ - struct virtual_pointer_t { - /* Type for the pointers - */ - base_ptr_t m_contents; - - /** Conversions from virtual_pointer_t to - * void * should just reinterpret_cast the integer number - */ - operator void *() const { return reinterpret_cast(m_contents); } - - /** - * Convert back to the integer number. - */ - operator base_ptr_t() const { return m_contents; } - - /** - * Add a certain value to the pointer to create a - * new pointer to that offset - */ - virtual_pointer_t operator+(size_t off) { return m_contents + off; } - - /* Numerical order for sorting pointers in containers. */ - bool operator<(virtual_pointer_t rhs) const { - return (static_cast(m_contents) < - static_cast(rhs.m_contents)); - } - - bool operator>(virtual_pointer_t rhs) const { - return (static_cast(m_contents) > - static_cast(rhs.m_contents)); - } - - /** - * Numerical order for sorting pointers in containers - */ - bool operator==(virtual_pointer_t rhs) const { - return (static_cast(m_contents) == - static_cast(rhs.m_contents)); - } - - /** - * Simple forward to the equality overload. - */ - bool operator!=(virtual_pointer_t rhs) const { - return !(this->operator==(rhs)); - } - - /** - * Converts a void * into a virtual pointer structure. - * Note that this will only work if the void * was - * already a virtual_pointer_t, but we have no way of - * checking - */ - virtual_pointer_t(const void *ptr) - : m_contents(reinterpret_cast(ptr)){}; - - /** - * Creates a virtual_pointer_t from the given integer - * number - */ - virtual_pointer_t(base_ptr_t u) : m_contents(u){}; - }; - - /* Definition of a null pointer - */ - const virtual_pointer_t null_virtual_ptr = nullptr; - - /** - * Whether if a pointer is null or not. - * A pointer is nullptr if the value is of null_virtual_ptr - */ - static inline bool is_nullptr(virtual_pointer_t ptr) { - return (static_cast(ptr) == nullptr); - } - - /* basic type for all buffers - */ - using buffer_t = cl::sycl::buffer_mem; - - /** - * Node that stores information about a device allocation. - * Nodes are sorted by size to organise a free list of nodes - * that can be recovered. - */ - struct pMapNode_t { - buffer_t m_buffer; - size_t m_size; - bool m_free; - - pMapNode_t(buffer_t b, size_t size, bool f) - : m_buffer{b}, m_size{size}, m_free{f} { - m_buffer.set_final_data(nullptr); - } - - bool operator<=(const pMapNode_t &rhs) { return (m_size <= rhs.m_size); } - }; - - /** Storage of the pointer / buffer tree - */ - using pointerMap_t = std::map; - - /** - * Obtain the insertion point in the pointer map for - * a pointer of the given size. - * \param requiredSize Size attemted to reclaim - */ - typename pointerMap_t::iterator get_insertion_point(size_t requiredSize) { - typename pointerMap_t::iterator retVal; - bool reuse = false; - if (!m_freeList.empty()) { - // try to re-use an existing block - for (auto freeElem : m_freeList) { - if (freeElem->second.m_size >= requiredSize) { - retVal = freeElem; - reuse = true; - // Element is not going to be free anymore - m_freeList.erase(freeElem); - break; - } - } - } - if (!reuse) { - retVal = std::prev(m_pointerMap.end()); - } - return retVal; - } - - /** - * Returns an iterator to the node that stores the information - * of the given virtual pointer from the given pointer map structure. - * If pointer is not found, throws std::out_of_range. - * If the pointer map structure is empty, throws std::out_of_range - * - * \param pMap the pointerMap_t structure storing all the pointers - * \param virtual_pointer_ptr The virtual pointer to obtain the node of - * \throws std::out:of_range if the pointer is not found or pMap is empty - */ - typename pointerMap_t::iterator get_node(const virtual_pointer_t ptr) { - if (this->count() == 0) { - m_pointerMap.clear(); - EIGEN_THROW_X(std::out_of_range("There are no pointers allocated\n")); - - } - if (is_nullptr(ptr)) { - m_pointerMap.clear(); - EIGEN_THROW_X(std::out_of_range("Cannot access null pointer\n")); - } - // The previous element to the lower bound is the node that - // holds this memory address - auto node = m_pointerMap.lower_bound(ptr); - // If the value of the pointer is not the one of the node - // then we return the previous one - if (node == std::end(m_pointerMap)) { - --node; - } else if (node->first != ptr) { - if (node == std::begin(m_pointerMap)) { - m_pointerMap.clear(); - EIGEN_THROW_X( - std::out_of_range("The pointer is not registered in the map\n")); - - } - --node; - } - - return node; - } - - /* get_buffer. - * Returns a buffer from the map using the pointer address - */ - template - cl::sycl::buffer get_buffer( - const virtual_pointer_t ptr) { - using sycl_buffer_t = cl::sycl::buffer; - - // get_node() returns a `buffer_mem`, so we need to cast it to a `buffer<>`. - // We can do this without the `buffer_mem` being a pointer, as we - // only declare member variables in the base class (`buffer_mem`) and not in - // the child class (`buffer<>). - auto node = get_node(ptr); - eigen_assert(node->first == ptr || node->first < ptr); - eigen_assert(ptr < static_cast(node->second.m_size + - node->first)); - return *(static_cast(&node->second.m_buffer)); - } - - /** - * @brief Returns an accessor to the buffer of the given virtual pointer - * @param accessMode - * @param accessTarget - * @param ptr The virtual pointer - */ - template - cl::sycl::accessor - get_access(const virtual_pointer_t ptr) { - auto buf = get_buffer(ptr); - return buf.template get_access(); - } - - /** - * @brief Returns an accessor to the buffer of the given virtual pointer - * in the given command group scope - * @param accessMode - * @param accessTarget - * @param ptr The virtual pointer - * @param cgh Reference to the command group scope - */ - template - cl::sycl::accessor - get_access(const virtual_pointer_t ptr, cl::sycl::handler &cgh) { - auto buf = get_buffer(ptr); - return buf.template get_access(cgh); - } - - /* - * Returns the offset from the base address of this pointer. - */ - inline std::ptrdiff_t get_offset(const virtual_pointer_t ptr) { - // The previous element to the lower bound is the node that - // holds this memory address - auto node = get_node(ptr); - auto start = node->first; - eigen_assert(start == ptr || start < ptr); - eigen_assert(ptr < start + node->second.m_size); - return (ptr - start); - } - - /* - * Returns the number of elements by which the given pointer is offset from - * the base address. - */ - template - inline size_t get_element_offset(const virtual_pointer_t ptr) { - return get_offset(ptr) / sizeof(buffer_data_type); - } - - /** - * Constructs the PointerMapper structure. - */ - PointerMapper(base_ptr_t baseAddress = 4096) - : m_pointerMap{}, m_freeList{}, m_baseAddress{baseAddress} { - if (m_baseAddress == 0) { - EIGEN_THROW_X(std::invalid_argument("Base address cannot be zero\n")); - } - }; - - /** - * PointerMapper cannot be copied or moved - */ - PointerMapper(const PointerMapper &) = delete; - - /** - * Empty the pointer list - */ - inline void clear() { - m_freeList.clear(); - m_pointerMap.clear(); - } - - /* add_pointer. - * Adds an existing pointer to the map and returns the virtual pointer id. - */ - inline virtual_pointer_t add_pointer(const buffer_t &b) { - return add_pointer_impl(b); - } - - /* add_pointer. - * Adds a pointer to the map and returns the virtual pointer id. - */ - inline virtual_pointer_t add_pointer(buffer_t &&b) { - return add_pointer_impl(b); - } - - /** - * @brief Fuses the given node with the previous nodes in the - * pointer map if they are free - * - * @param node A reference to the free node to be fused - */ - void fuse_forward(typename pointerMap_t::iterator &node) { - while (node != std::prev(m_pointerMap.end())) { - // if following node is free - // remove it and extend the current node with its size - auto fwd_node = std::next(node); - if (!fwd_node->second.m_free) { - break; - } - auto fwd_size = fwd_node->second.m_size; - m_freeList.erase(fwd_node); - m_pointerMap.erase(fwd_node); - - node->second.m_size += fwd_size; - } - } - - /** - * @brief Fuses the given node with the following nodes in the - * pointer map if they are free - * - * @param node A reference to the free node to be fused - */ - void fuse_backward(typename pointerMap_t::iterator &node) { - while (node != m_pointerMap.begin()) { - // if previous node is free, extend it - // with the size of the current one - auto prev_node = std::prev(node); - if (!prev_node->second.m_free) { - break; - } - prev_node->second.m_size += node->second.m_size; - - // remove the current node - m_freeList.erase(node); - m_pointerMap.erase(node); - - // point to the previous node - node = prev_node; - } - } - - /* remove_pointer. - * Removes the given pointer from the map. - * The pointer is allowed to be reused only if ReUse if true. - */ - template - void remove_pointer(const virtual_pointer_t ptr) { - if (is_nullptr(ptr)) { - return; - } - auto node = this->get_node(ptr); - - node->second.m_free = true; - m_freeList.emplace(node); - - // Fuse the node - // with free nodes before and after it - fuse_forward(node); - fuse_backward(node); - - // If after fusing the node is the last one - // simply remove it (since it is free) - if (node == std::prev(m_pointerMap.end())) { - m_freeList.erase(node); - m_pointerMap.erase(node); - } - } - - /* count. - * Return the number of active pointers (i.e, pointers that - * have been malloc but not freed). - */ - size_t count() const { return (m_pointerMap.size() - m_freeList.size()); } - - private: - /* add_pointer_impl. - * Adds a pointer to the map and returns the virtual pointer id. - * BufferT is either a const buffer_t& or a buffer_t&&. - */ - template - virtual_pointer_t add_pointer_impl(BufferT b) { - virtual_pointer_t retVal = nullptr; - size_t bufSize = b.get_count(); - pMapNode_t p{b, bufSize, false}; - // If this is the first pointer: - if (m_pointerMap.empty()) { - virtual_pointer_t initialVal{m_baseAddress}; - m_pointerMap.emplace(initialVal, p); - return initialVal; - } - - auto lastElemIter = get_insertion_point(bufSize); - // We are recovering an existing free node - if (lastElemIter->second.m_free) { - lastElemIter->second.m_buffer = b; - lastElemIter->second.m_free = false; - - // If the recovered node is bigger than the inserted one - // add a new free node with the remaining space - if (lastElemIter->second.m_size > bufSize) { - // create a new node with the remaining space - auto remainingSize = lastElemIter->second.m_size - bufSize; - pMapNode_t p2{b, remainingSize, true}; - - // update size of the current node - lastElemIter->second.m_size = bufSize; - - // add the new free node - auto newFreePtr = lastElemIter->first + bufSize; - auto freeNode = m_pointerMap.emplace(newFreePtr, p2).first; - m_freeList.emplace(freeNode); - } - - retVal = lastElemIter->first; - } else { - size_t lastSize = lastElemIter->second.m_size; - retVal = lastElemIter->first + lastSize; - m_pointerMap.emplace(retVal, p); - } - return retVal; - } - - /** - * Compare two iterators to pointer map entries according to - * the size of the allocation on the device. - */ - struct SortBySize { - bool operator()(typename pointerMap_t::iterator a, - typename pointerMap_t::iterator b) const { - return ((a->first < b->first) && (a->second <= b->second)) || - ((a->first < b->first) && (b->second <= a->second)); - } - }; - - /* Maps the pointer addresses to buffer and size pairs. - */ - pointerMap_t m_pointerMap; - - /* List of free nodes available for re-using - */ - std::set m_freeList; - - /* Base address used when issuing the first virtual pointer, allows users - * to specify alignment. Cannot be zero. */ - std::intptr_t m_baseAddress; -}; - -/* remove_pointer. - * Removes the given pointer from the map. - * The pointer is allowed to be reused only if ReUse if true. - */ -template <> -inline void PointerMapper::remove_pointer(const virtual_pointer_t ptr) { - if (is_nullptr(ptr)) { - return; - } - m_pointerMap.erase(this->get_node(ptr)); -} - -/** - * Malloc-like interface to the pointer-mapper. - * Given a size, creates a byte-typed buffer and returns a - * fake pointer to keep track of it. - * \param size Size in bytes of the desired allocation - * \throw cl::sycl::exception if error while creating the buffer - */ -inline void *SYCLmalloc(size_t size, PointerMapper &pMap) { - if (size == 0) { - return nullptr; - } - // Create a generic buffer of the given size - using buffer_t = cl::sycl::buffer; - auto thePointer = pMap.add_pointer(buffer_t(cl::sycl::range<1>{size})); - // Store the buffer on the global list - return static_cast(thePointer); -} - -/** - * Free-like interface to the pointer mapper. - * Given a fake-pointer created with the virtual-pointer malloc, - * destroys the buffer and remove it from the list. - * If ReUse is false, the pointer is not added to the freeList, - * it should be false only for sub-buffers. - */ -template -inline void SYCLfree(void *ptr, PointerMapper &pMap) { - pMap.template remove_pointer(ptr); -} - -/** - * Clear all the memory allocated by SYCL. - */ -template -inline void SYCLfreeAll(PointerMapper &pMap) { - pMap.clear(); -} - -template -struct RangeAccess { - static const auto global_access = cl::sycl::access::target::global_buffer; - static const auto is_place_holder = cl::sycl::access::placeholder::true_t; - typedef T scalar_t; - typedef scalar_t &ref_t; - typedef typename cl::sycl::global_ptr::pointer_t ptr_t; - - // the accessor type does not necessarily the same as T - typedef cl::sycl::accessor - accessor; - - typedef RangeAccess self_t; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RangeAccess(accessor access, - size_t offset, - std::intptr_t virtual_ptr) - : access_(access), offset_(offset), virtual_ptr_(virtual_ptr) {} - - RangeAccess(cl::sycl::buffer buff = - cl::sycl::buffer(cl::sycl::range<1>(1))) - : access_{accessor{buff}}, offset_(0), virtual_ptr_(-1) {} - - // This should be only used for null constructor on the host side - RangeAccess(std::nullptr_t) : RangeAccess() {} - // This template parameter must be removed and scalar_t should be replaced - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t get_pointer() const { - return (access_.get_pointer().get() + offset_); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator+=(Index offset) { - offset_ += (offset); - return *this; - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator+(Index offset) const { - return self_t(access_, offset_ + offset, virtual_ptr_); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator-(Index offset) const { - return self_t(access_, offset_ - offset, virtual_ptr_); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator-=(Index offset) { - offset_ -= offset; - return *this; - } - - // THIS IS FOR NULL COMPARISON ONLY - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==( - const RangeAccess &lhs, std::nullptr_t) { - return ((lhs.virtual_ptr_ == -1)); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=( - const RangeAccess &lhs, std::nullptr_t i) { - return !(lhs == i); - } - - // THIS IS FOR NULL COMPARISON ONLY - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator==( - std::nullptr_t, const RangeAccess &rhs) { - return ((rhs.virtual_ptr_ == -1)); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend bool operator!=( - std::nullptr_t i, const RangeAccess &rhs) { - return !(i == rhs); - } - // Prefix operator (Increment and return value) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t &operator++() { - offset_++; - return (*this); - } - - // Postfix operator (Return value and increment) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE self_t operator++(int i) { - EIGEN_UNUSED_VARIABLE(i); - self_t temp_iterator(*this); - offset_++; - return temp_iterator; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_size() const { - return (access_.get_count() - offset_); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t get_offset() const { - return offset_; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_offset(std::ptrdiff_t offset) { - offset_ = offset; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() const { - return *get_pointer(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator*() { - return *get_pointer(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptr_t operator->() = delete; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) { - return *(get_pointer() + x); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ref_t operator[](int x) const { - return *(get_pointer() + x); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_t *get_virtual_pointer() const { - return reinterpret_cast(virtual_ptr_ + - (offset_ * sizeof(scalar_t))); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit operator bool() const { - return (virtual_ptr_ != -1); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator RangeAccess() { - return RangeAccess(access_, offset_, virtual_ptr_); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - operator RangeAccess() const { - return RangeAccess(access_, offset_, virtual_ptr_); - } - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind( - cl::sycl::handler &cgh) const { - cgh.require(access_); - } - - private: - accessor access_; - size_t offset_; - std::intptr_t virtual_ptr_; // the location of the buffer in the map -}; - -template -struct RangeAccess : RangeAccess { - typedef RangeAccess Base; - using Base::Base; -}; - -} // namespace internal -} // namespace TensorSycl -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_SYCL_STORAGE_MEMORY_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h deleted file mode 100644 index 9208ab21d..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/SYCL/TypeCasting.h +++ /dev/null @@ -1,85 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TypeCasting.h - * - * \brief: - * TypeCasting - * - *****************************************************************/ - -#ifndef EIGEN_TYPE_CASTING_SYCL_H -#define EIGEN_TYPE_CASTING_SYCL_H - -namespace Eigen { - -namespace internal { -#ifdef SYCL_DEVICE_ONLY -template <> -struct type_casting_traits { - enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; -}; - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4 -pcast(const cl::sycl::cl_float4& a) { - return a - .template convert(); -} - -template <> -struct type_casting_traits { - enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; -}; - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 -pcast(const cl::sycl::cl_int4& a) { - return a.template convert(); -} - -template <> -struct type_casting_traits { - enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 }; -}; - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 -pcast( - const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) { - auto a1 = a.template convert(); - auto b1 = b.template convert(); - return cl::sycl::float4(a1.x(), a1.y(), b1.x(), b1.y()); -} - -template <> -struct type_casting_traits { - enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 }; -}; - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 -pcast(const cl::sycl::cl_float4& a) { - // Simply discard the second half of the input - return cl::sycl::cl_double2(a.x(), a.y()); -} - -#endif -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_TYPE_CASTING_SYCL_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/Complex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/Complex.h index d3e41b43e..1bfb73397 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/Complex.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/Complex.h @@ -15,10 +15,6 @@ namespace Eigen { namespace internal { -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) -static Packet4ui p4ui_CONJ_XOR = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; //vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO); -#endif - static Packet2ul p2ul_CONJ_XOR1 = (Packet2ul) vec_sld((Packet4ui) p2d_ZERO_, (Packet4ui) p2l_ZERO, 8);//{ 0x8000000000000000, 0x0000000000000000 }; static Packet2ul p2ul_CONJ_XOR2 = (Packet2ul) vec_sld((Packet4ui) p2l_ZERO, (Packet4ui) p2d_ZERO_, 8);//{ 0x8000000000000000, 0x0000000000000000 }; @@ -33,14 +29,10 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {} -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) union { Packet4f v; Packet1cd cd[2]; }; -#else - Packet4f v; -#endif }; template<> struct packet_traits > : default_packet_traits @@ -91,33 +83,69 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16}; typedef Packet2cf half; }; +template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16}; typedef Packet1cd half; }; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); -/* complex first */ +template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload((const double*)from)); } +template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } template<> EIGEN_STRONG_INLINE Packet1cd ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu((const double*)from)); } +template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } +template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } template<> EIGEN_STRONG_INLINE Packet1cd pset1(const std::complex& from) { /* here we really have to use unaligned loads :( */ return ploadu(&from); } +template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) +{ + Packet2cf res; + res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); + res.cd[1] = res.cd[0]; + return res; +} +template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride EIGEN_UNUSED) { return pload(from); } +template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) +{ + std::complex EIGEN_ALIGN16 af[2]; + pstore >((std::complex *) af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::complex* to, const Packet1cd& from, Index stride EIGEN_UNUSED) { pstore >(to, from); } + +template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd padd(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v + b.v); } +template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } template<> EIGEN_STRONG_INLINE Packet1cd psub(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(a.v - b.v); } template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(Packet2d(a.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) { return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2)); } +template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) +{ + Packet2cf res; + res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; + res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; + return res; +} + template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) { Packet2d a_re, a_im, v1, v2; @@ -135,12 +163,27 @@ template<> EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, con return Packet1cd(v1 + v2); } -template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet1cd pandnot (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } -template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) +{ + Packet2cf res; + res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; + res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; + return res; +} +template<> EIGEN_STRONG_INLINE Packet1cd pand (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd por (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_or(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pxor (const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_xor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } +template<> EIGEN_STRONG_INLINE Packet1cd pandnot(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(vec_and(a.v, vec_nor(b.v,b.v))); } +template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } + +template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* from) { return pset1(*from); } +template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } + +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) @@ -150,16 +193,83 @@ template<> EIGEN_STRONG_INLINE std::complex pfirst(const Pac return res; } +template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) +{ + std::complex EIGEN_ALIGN16 res[2]; + pstore >(res, a); + + return res[0]; +} template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) +{ + Packet2cf res; + res.cd[0] = a.cd[1]; + res.cd[1] = a.cd[0]; + return res; +} + template<> EIGEN_STRONG_INLINE std::complex predux(const Packet1cd& a) { return pfirst(a); } +template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = padd(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} + +template<> EIGEN_STRONG_INLINE Packet1cd preduxp(const Packet1cd* vecs) +{ + return vecs[0]; +} +template<> EIGEN_STRONG_INLINE Packet2cf preduxp(const Packet2cf* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + ptranspose(transpose); + + return padd(transpose.packet[0], transpose.packet[1]); +} + template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } +template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) +{ + std::complex res; + Packet1cd b = pmul(a.cd[0], a.cd[1]); + vec_st2f(b.v, (float*)&res); + return res; +} + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/) + { + // FIXME is it sure we never have to align a Packet1cd? + // Even though a std::complex has 16 bytes, it is not necessarily aligned on a 16 bytes boundary... + } +}; + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second) + { + if (Offset == 1) { + first.cd[0] = first.cd[1]; + first.cd[1] = second.cd[0]; + } + } +}; + template<> struct conj_helper { EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const @@ -193,134 +303,6 @@ template<> struct conj_helper } }; -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) - -template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) -{ - // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); - Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); - return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); -} - -EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) -{ - return Packet1cd(preverse(Packet2d(x.v))); -} - -EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); - kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); - kernel.packet[0].v = tmp; -} - -/* complex follows */ -template<> EIGEN_STRONG_INLINE Packet2cf pload (const std::complex* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload((const float*)from)); } -template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu((const float*)from)); } -template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); } -template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } - -template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) -{ - std::complex EIGEN_ALIGN16 res[2]; - pstore >(res, a); - - return res[0]; -} - - -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) -{ - Packet2cf res; - res.cd[0] = Packet1cd(vec_ld2f((const float *)&from)); - res.cd[1] = res.cd[0]; - return res; -} -#else -template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) -{ - Packet2cf res; - if((std::ptrdiff_t(&from) % 16) == 0) - res.v = pload((const float *)&from); - else - res.v = ploadu((const float *)&from); - res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI); - return res; -} -#endif - -template<> EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); -} -template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) -{ - std::complex EIGEN_ALIGN16 af[2]; - pstore >((std::complex *) af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; -} - -template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(Packet4f(a.v))); } - -template<> EIGEN_STRONG_INLINE Packet2cf pand (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pand(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf por (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(por(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor (const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pxor(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(pandnot(a.v,b.v)); } - -template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } - -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ZVECTOR_PREFETCH(addr); } - - -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) -{ - Packet2cf res; - res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[0]))).v; - res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast(a.v.v4f[1]))).v; - return res; -} - -template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) -{ - Packet2cf res; - res.v.v4f[0] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[0])), Packet1cd(reinterpret_cast(b.v.v4f[0]))).v; - res.v.v4f[1] = pmul(Packet1cd(reinterpret_cast(a.v.v4f[1])), Packet1cd(reinterpret_cast(b.v.v4f[1]))).v; - return res; -} - -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) -{ - Packet2cf res; - res.cd[0] = a.cd[1]; - res.cd[1] = a.cd[0]; - return res; -} - -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) -{ - std::complex res; - Packet1cd b = padd(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); - return res; -} - -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) -{ - std::complex res; - Packet1cd b = pmul(a.cd[0], a.cd[1]); - vec_st2f(b.v, (float*)&res); - return res; -} - template<> struct conj_helper { EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const @@ -355,6 +337,15 @@ template<> struct conj_helper }; EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + +template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) +{ + // TODO optimize it for AltiVec + Packet1cd res = conj_helper().pmul(a,b); + Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); + return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); +} template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -365,6 +356,11 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con return res; } +EIGEN_STRONG_INLINE Packet1cd pcplxflip/**/(const Packet1cd& x) +{ + return Packet1cd(preverse(Packet2d(x.v))); +} + EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) { Packet2cf res; @@ -373,6 +369,13 @@ EIGEN_STRONG_INLINE Packet2cf pcplxflip/**/(const Packet2cf& x) return res; } +EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +{ + Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); + kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); + kernel.packet[0].v = tmp; +} + EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { Packet1cd tmp = kernel.packet[0].cd[1]; @@ -386,116 +389,6 @@ template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, con result.v = pblend(ifPacket4, thenPacket.v, elsePacket.v); return result; } -#else -template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf(pxor(a.v, reinterpret_cast(p4ui_CONJ_XOR))); } -template<> EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) -{ - Packet4f a_re, a_im, prod, prod_im; - - // Permute and multiply the real parts of a and b - a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD); - - // Get the imaginary parts of a - a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN); - - // multiply a_im * b and get the conjugate result - prod_im = a_im * b.v; - prod_im = pxor(prod_im, reinterpret_cast(p4ui_CONJ_XOR)); - // permute back to a proper order - prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV); - - // multiply a_re * b, add prod_im - prod = pmadd(a_re, b.v, prod_im); - - return Packet2cf(prod); -} - -template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) -{ - Packet4f rev_a; - rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2); - return Packet2cf(rev_a); -} - -template<> EIGEN_STRONG_INLINE std::complex predux(const Packet2cf& a) -{ - Packet4f b; - b = vec_sld(a.v, a.v, 8); - b = padd(a.v, b); - return pfirst(Packet2cf(b)); -} - -template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a) -{ - Packet4f b; - Packet2cf prod; - b = vec_sld(a.v, a.v, 8); - prod = pmul(a, Packet2cf(b)); - - return pfirst(prod); -} - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - -EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) - -template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) -{ - // TODO optimize it for AltiVec - Packet2cf res = conj_helper().pmul(a, b); - Packet4f s = pmul(b.v, b.v); - return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); -} - -template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip(const Packet2cf& x) -{ - return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV)); -} - -EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI); - kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO); - kernel.packet[0].v = tmp; -} - -template<> EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket, const Packet2cf& elsePacket) { - Packet2cf result; - result.v = reinterpret_cast(pblend(ifPacket, reinterpret_cast(thenPacket.v), reinterpret_cast(elsePacket.v))); - return result; -} -#endif } // end namespace internal diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h index 689ecc702..5c7aa7256 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/MathFunctions.h @@ -20,50 +20,6 @@ namespace Eigen { namespace internal { -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) -static _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f); -static _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f); -static _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f); -static _EIGEN_DECLARE_CONST_Packet4i(23, 23); - -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000); - -/* the smallest non denormalized float number */ -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000); -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000); // -1.f/0.f -static _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff); - -/* natural logarithm computed for 4 simultaneous float - return NaN for x <= 0 -*/ -static _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f); - -static _EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f); -static _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f); - -static _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f); - -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f); -static _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f); -#endif - static _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0); static _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0); static _EIGEN_DECLARE_CONST_Packet2d(half, 0.5); @@ -137,101 +93,43 @@ Packet2d pexp(const Packet2d& _x) } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f pexp(const Packet4f& _x) +Packet4f pexp(const Packet4f& x) { -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) -/* - Packet4f x = _x; - - Packet4f tmp, fx; - Packet4i emm0; - - // clamp x - x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo); - - // express exp(x) as exp(g + n*log(2)) - fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half); - - fx = pfloor(fx); - - tmp = pmul(fx, p4f_cephes_exp_C1); - Packet4f z = pmul(fx, p4f_cephes_exp_C2); - x = psub(x, tmp); - x = psub(x, z); - - z = pmul(x,x); - - Packet4f y = p4f_cephes_exp_p0; - y = pmadd(y, x, p4f_cephes_exp_p1); - y = pmadd(y, x, p4f_cephes_exp_p2); - y = pmadd(y, x, p4f_cephes_exp_p3); - y = pmadd(y, x, p4f_cephes_exp_p4); - y = pmadd(y, x, p4f_cephes_exp_p5); - y = pmadd(y, z, x); - y = padd(y, p4f_1); - - // build 2^n - emm0 = vec_cts(fx, 0); - emm0 = emm0 + p4i_0x7f; - emm0 = emm0 << reinterpret_cast(p4i_23); - - // Altivec's max & min operators just drop silent NaNs. Check NaNs in - // inputs and return them unmodified. - Packet4ui isnumber_mask = reinterpret_cast(vec_cmpeq(_x, _x)); - return vec_sel(_x, pmax(pmul(y, reinterpret_cast(emm0)), _x), - isnumber_mask);*/ - return _x; -#else Packet4f res; - res.v4f[0] = pexp(_x.v4f[0]); - res.v4f[1] = pexp(_x.v4f[1]); + res.v4f[0] = pexp(x.v4f[0]); + res.v4f[1] = pexp(x.v4f[1]); return res; -#endif } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d psqrt(const Packet2d& x) { - return vec_sqrt(x); + return __builtin_s390_vfsqdb(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f psqrt(const Packet4f& x) { Packet4f res; -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) - res = vec_sqrt(x); -#else res.v4f[0] = psqrt(x.v4f[0]); res.v4f[1] = psqrt(x.v4f[1]); -#endif return res; } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet2d prsqrt(const Packet2d& x) { + // Unfortunately we can't use the much faster mm_rqsrt_pd since it only provides an approximation. return pset1(1.0) / psqrt(x); } template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt(const Packet4f& x) { Packet4f res; -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) - res = pset1(1.0) / psqrt(x); -#else res.v4f[0] = prsqrt(x.v4f[0]); res.v4f[1] = prsqrt(x.v4f[1]); -#endif return res; } -// Hyperbolic Tangent function. -template <> -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f -ptanh(const Packet4f& x) { - return internal::generic_fast_tanh_float(x); -} - } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/PacketMath.h b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/PacketMath.h index 3fb642a38..57b01fc63 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -17,7 +17,7 @@ namespace Eigen { namespace internal { #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD -#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16 +#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 #endif #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD @@ -29,7 +29,7 @@ namespace internal { #endif #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16 #endif typedef __vector int Packet4i; @@ -41,14 +41,9 @@ typedef __vector double Packet2d; typedef __vector unsigned long long Packet2ul; typedef __vector long long Packet2l; -// Z14 has builtin support for float vectors -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) -typedef __vector float Packet4f; -#else typedef struct { Packet2d v4f[2]; } Packet4f; -#endif typedef union { int32_t i[4]; @@ -56,15 +51,11 @@ typedef union { int64_t l[2]; uint64_t ul[2]; double d[2]; - float f[4]; Packet4i v4i; Packet4ui v4ui; Packet2l v2l; Packet2ul v2ul; Packet2d v2d; -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) - Packet4f v4f; -#endif } Packet; // We don't want to write the same code all the time, but we need to reuse the constants @@ -89,7 +80,7 @@ typedef union { Packet2l p2l_##NAME = pset1(X) // These constants are endian-agnostic -static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} +//static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1); //{ 1, 1, 1, 1} static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); @@ -99,21 +90,6 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) -#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = reinterpret_cast(vec_splat_s32(X)) - -#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ - Packet4f p4f_##NAME = pset1(X) - -#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = reinterpret_cast(pset1(X)) - -static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} -static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} -static Packet4f p4f_MZERO = { 0x80000000, 0x80000000, 0x80000000, 0x80000000}; -#endif - static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); @@ -144,9 +120,9 @@ static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0 static Packet16uc p16uc_TRANSPOSE64_HI = { 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = { 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; -static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; +//static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; -static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; +//static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC @@ -173,35 +149,29 @@ template<> struct packet_traits : default_packet_traits }; }; -template <> -struct packet_traits : default_packet_traits { +template<> struct packet_traits : default_packet_traits +{ typedef Packet4f type; typedef Packet4f half; enum { Vectorizable = 1, AlignedOnScalar = 1, - size = 4, + size=4, HasHalfPacket = 0, - HasAdd = 1, - HasSub = 1, - HasMul = 1, - HasDiv = 1, - HasMin = 1, - HasMax = 1, - HasAbs = 1, - HasSin = 0, - HasCos = 0, - HasLog = 0, -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) - HasExp = 0, -#else - HasExp = 1, -#endif + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasDiv = 1, + HasMin = 1, + HasMax = 1, + HasAbs = 1, + HasSin = 0, + HasCos = 0, + HasLog = 0, + HasExp = 1, HasSqrt = 1, HasRsqrt = 1, - HasTanh = 1, - HasErf = 1, HasRound = 1, HasFloor = 1, HasCeil = 1, @@ -241,9 +211,9 @@ template<> struct packet_traits : default_packet_traits }; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4i half; }; -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2d half; }; +template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; /* Forward declaration */ EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel); @@ -288,301 +258,6 @@ inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) return s; } -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) -inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) -{ - Packet vt; - vt.v4f = v; - s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3]; - return s; -} -#endif - -template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_LOAD - Packet *vfrom; - vfrom = (Packet *) from; - return vfrom->v4i; -} - -template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_LOAD - Packet *vfrom; - vfrom = (Packet *) from; - return vfrom->v2d; -} - -template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_STORE - Packet *vto; - vto = (Packet *) to; - vto->v4i = from; -} - -template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_STORE - Packet *vto; - vto = (Packet *) to; - vto->v2d = from; -} - -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) -{ - return vec_splats(from); -} -template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { - return vec_splats(from); -} - -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const int *a, - Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) -{ - a3 = pload(a); - a0 = vec_splat(a3, 0); - a1 = vec_splat(a3, 1); - a2 = vec_splat(a3, 2); - a3 = vec_splat(a3, 3); -} - -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const double *a, - Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) -{ - a1 = pload(a); - a0 = vec_splat(a1, 0); - a1 = vec_splat(a1, 1); - a3 = pload(a+2); - a2 = vec_splat(a3, 0); - a3 = vec_splat(a3, 1); -} - -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) -{ - int EIGEN_ALIGN16 ai[4]; - ai[0] = from[0*stride]; - ai[1] = from[1*stride]; - ai[2] = from[2*stride]; - ai[3] = from[3*stride]; - return pload(ai); -} - -template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) -{ - double EIGEN_ALIGN16 af[2]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - return pload(af); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) -{ - int EIGEN_ALIGN16 ai[4]; - pstore((int *)ai, from); - to[0*stride] = ai[0]; - to[1*stride] = ai[1]; - to[2*stride] = ai[2]; - to[3*stride] = ai[3]; -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) -{ - double EIGEN_ALIGN16 af[2]; - pstore(af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; -} - -template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return (a + b); } -template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return (a + b); } - -template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return (a - b); } -template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return (a - b); } - -template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return (a * b); } -template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return (a * b); } - -template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { return (a / b); } -template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return (a / b); } - -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } -template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); } - -template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } - -template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); } -template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } - -template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } -template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } - -template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } - -template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } - -template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } -template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } - -template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } -template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } - -template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } -template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } - -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return pand(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } - -template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } -template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } -template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } - -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { return pload(from); } -template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { return pload(from); } - - -template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) -{ - Packet4i p = pload(from); - return vec_perm(p, p, p16uc_DUPLICATE32_HI); -} - -template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) -{ - Packet2d p = pload(from); - return vec_perm(p, p, p16uc_PSET64_HI); -} - -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { pstore(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { pstore(to, from); } - -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } - -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } - -template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) -{ - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); -} - -template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) -{ - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); -} - -template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } - -template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) -{ - Packet4i b, sum; - b = vec_sld(a, a, 8); - sum = padd(a, b); - b = vec_sld(sum, sum, 4); - sum = padd(sum, b); - return pfirst(sum); -} - -template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) -{ - Packet2d b, sum; - b = reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)); - sum = padd(a, b); - return pfirst(sum); -} - -// Other reduction functions: -// mul -template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) -{ - EIGEN_ALIGN16 int aux[4]; - pstore(aux, a); - return aux[0] * aux[1] * aux[2] * aux[3]; -} - -template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) -{ - return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); -} - -// min -template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) -{ - Packet4i b, res; - b = pmin(a, vec_sld(a, a, 8)); - res = pmin(b, vec_sld(b, b, 4)); - return pfirst(res); -} - -template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) -{ - return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); -} - -// max -template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) -{ - Packet4i b, res; - b = pmax(a, vec_sld(a, a, 8)); - res = pmax(b, vec_sld(b, b, 4)); - return pfirst(res); -} - -// max -template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) -{ - return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); - Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); - Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); - Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); - kernel.packet[0] = vec_mergeh(t0, t2); - kernel.packet[1] = vec_mergel(t0, t2); - kernel.packet[2] = vec_mergeh(t1, t3); - kernel.packet[3] = vec_mergel(t1, t3); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); - Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); - kernel.packet[0] = t0; - kernel.packet[1] = t1; -} - -template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); - return vec_sel(elsePacket, thenPacket, mask); -} - - -template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { - Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); - return vec_sel(elsePacket, thenPacket, mask); -} - -/* z13 has no vector float support so we emulate that with double - z14 has proper vector float support. -*/ -#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12) /* Helper function to simulate a vec_splat_packet4f */ template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) @@ -609,6 +284,66 @@ template EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Pack return splat; } +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) + { + switch (Offset % 4) { + case 1: + first = vec_sld(first, second, 4); break; + case 2: + first = vec_sld(first, second, 8); break; + case 3: + first = vec_sld(first, second, 12); break; + } + } +}; + +/* This is a tricky one, we have to translate float alignment to vector elements of sizeof double + */ +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) + { + switch (Offset % 4) { + case 1: + first.v4f[0] = vec_sld(first.v4f[0], first.v4f[1], 8); + first.v4f[1] = vec_sld(first.v4f[1], second.v4f[0], 8); + break; + case 2: + first.v4f[0] = first.v4f[1]; + first.v4f[1] = second.v4f[0]; + break; + case 3: + first.v4f[0] = vec_sld(first.v4f[1], second.v4f[0], 8); + first.v4f[1] = vec_sld(second.v4f[0], second.v4f[1], 8); + break; + } + } +}; + + +template +struct palign_impl +{ + static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) + { + if (Offset == 1) + first = reinterpret_cast(vec_sld(reinterpret_cast(first), reinterpret_cast(second), 8)); + } +}; + +template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v4i; +} + template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { // FIXME: No intrinsic yet @@ -619,6 +354,24 @@ template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) return vfrom; } +template<> EIGEN_STRONG_INLINE Packet2d pload(const double* from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_LOAD + Packet *vfrom; + vfrom = (Packet *) from; + return vfrom->v2d; +} + +template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v4i = from; +} + template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { // FIXME: No intrinsic yet @@ -627,6 +380,23 @@ template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& f vec_st2f(from.v4f[1], &to[2]); } + +template<> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) +{ + // FIXME: No intrinsic yet + EIGEN_DEBUG_ALIGNED_STORE + Packet *vto; + vto = (Packet *) to; + vto->v2d = from; +} + +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) +{ + return vec_splats(from); +} +template<> EIGEN_STRONG_INLINE Packet2d pset1(const double& from) { + return vec_splats(from); +} template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { Packet4f to; @@ -635,6 +405,17 @@ template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) return to; } +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const int *a, + Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) +{ + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} + template<> EIGEN_STRONG_INLINE void pbroadcast4(const float *a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) @@ -646,6 +427,28 @@ pbroadcast4(const float *a, a3 = vec_splat_packet4f<3>(a3); } +template<> EIGEN_STRONG_INLINE void +pbroadcast4(const double *a, + Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +{ + a1 = pload(a); + a0 = vec_splat(a1, 0); + a1 = vec_splat(a1, 1); + a3 = pload(a+2); + a2 = vec_splat(a3, 0); + a3 = vec_splat(a3, 1); +} + +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) +{ + int EIGEN_ALIGN16 ai[4]; + ai[0] = from[0*stride]; + ai[1] = from[1*stride]; + ai[2] = from[2*stride]; + ai[3] = from[3*stride]; + return pload(ai); +} + template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { float EIGEN_ALIGN16 ai[4]; @@ -656,6 +459,24 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa return pload(ai); } +template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + af[0] = from[0*stride]; + af[1] = from[1*stride]; + return pload(af); +} + +template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +{ + int EIGEN_ALIGN16 ai[4]; + pstore((int *)ai, from); + to[0*stride] = ai[0]; + to[1*stride] = ai[1]; + to[2*stride] = ai[2]; + to[3*stride] = ai[3]; +} + template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { float EIGEN_ALIGN16 ai[4]; @@ -666,6 +487,15 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, co to[3*stride] = ai[3]; } +template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) +{ + double EIGEN_ALIGN16 af[2]; + pstore(af, from); + to[0*stride] = af[0]; + to[1*stride] = af[1]; +} + +template<> EIGEN_STRONG_INLINE Packet4i padd(const Packet4i& a, const Packet4i& b) { return (a + b); } template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { Packet4f c; @@ -673,7 +503,9 @@ template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const c.v4f[1] = a.v4f[1] + b.v4f[1]; return c; } +template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const Packet2d& b) { return (a + b); } +template<> EIGEN_STRONG_INLINE Packet4i psub(const Packet4i& a, const Packet4i& b) { return (a - b); } template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { Packet4f c; @@ -681,7 +513,9 @@ template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const c.v4f[1] = a.v4f[1] - b.v4f[1]; return c; } +template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return (a - b); } +template<> EIGEN_STRONG_INLINE Packet4i pmul(const Packet4i& a, const Packet4i& b) { return (a * b); } template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { Packet4f c; @@ -689,7 +523,9 @@ template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const c.v4f[1] = a.v4f[1] * b.v4f[1]; return c; } +template<> EIGEN_STRONG_INLINE Packet2d pmul(const Packet2d& a, const Packet2d& b) { return (a * b); } +template<> EIGEN_STRONG_INLINE Packet4i pdiv(const Packet4i& a, const Packet4i& b) { return (a / b); } template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { Packet4f c; @@ -697,7 +533,9 @@ template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const c.v4f[1] = a.v4f[1] / b.v4f[1]; return c; } +template<> EIGEN_STRONG_INLINE Packet2d pdiv(const Packet2d& a, const Packet2d& b) { return (a / b); } +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return (-a); } template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { Packet4f c; @@ -705,7 +543,13 @@ template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) c.v4f[1] = -a.v4f[1]; return c; } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return (-a); } +template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } +template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } + +template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a, b), c); } template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { Packet4f res; @@ -713,7 +557,14 @@ template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]); return res; } +template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } +template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) { return padd(pset1(a), p4i_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } +template<> EIGEN_STRONG_INLINE Packet2d plset(const double& a) { return padd(pset1(a), p2d_COUNTDOWN); } + +template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { Packet4f res; @@ -722,6 +573,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const return res; } +template<> EIGEN_STRONG_INLINE Packet4i pmax(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { Packet4f res; @@ -730,6 +583,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const return res; } +template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { Packet4f res; @@ -738,6 +593,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const return res; } +template<> EIGEN_STRONG_INLINE Packet4i por(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d por(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const Packet4f& b) { Packet4f res; @@ -746,6 +603,8 @@ template<> EIGEN_STRONG_INLINE Packet4f por(const Packet4f& a, const P return res; } +template<> EIGEN_STRONG_INLINE Packet4i pxor(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b) { Packet4f res; @@ -754,6 +613,8 @@ template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const return res; } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return pand(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet2d pandnot(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { Packet4f res; @@ -769,7 +630,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) res.v4f[1] = vec_round(a.v4f[1]); return res; } - +template<> EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) { return vec_round(a); } template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) { Packet4f res; @@ -777,7 +638,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) res.v4f[1] = vec_ceil(a.v4f[1]); return res; } - +template<> EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) { Packet4f res; @@ -785,6 +646,18 @@ template<> EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) res.v4f[1] = vec_floor(a.v4f[1]); return res; } +template<> EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) { return vec_floor(a); } + +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { return pload(from); } +template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { return pload(from); } + + +template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) +{ + Packet4i p = pload(from); + return vec_perm(p, p, p16uc_DUPLICATE32_HI); +} template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { @@ -794,7 +667,33 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) return p; } +template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) +{ + Packet2d p = pload(from); + return vec_perm(p, p, p16uc_PSET64_HI); +} + +template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& from) { pstore(to, from); } + +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } + +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } + +template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); +} + +template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) +{ + return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE64)); +} template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -804,6 +703,8 @@ template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) return rev; } +template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } +template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { Packet4f res; @@ -812,6 +713,23 @@ template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) return res; } +template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +{ + Packet4i b, sum; + b = vec_sld(a, a, 8); + sum = padd(a, b); + b = vec_sld(sum, sum, 4); + sum = padd(sum, b); + return pfirst(sum); +} + +template<> EIGEN_STRONG_INLINE double predux(const Packet2d& a) +{ + Packet2d b, sum; + b = reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)); + sum = padd(a, b); + return pfirst(sum); +} template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) { Packet2d sum; @@ -820,12 +738,94 @@ template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) return static_cast(first); } +template<> EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs) +{ + Packet4i v[4], sum[4]; + + // It's easier and faster to transpose then add as columns + // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation + // Do the transpose, first set of moves + v[0] = vec_mergeh(vecs[0], vecs[2]); + v[1] = vec_mergel(vecs[0], vecs[2]); + v[2] = vec_mergeh(vecs[1], vecs[3]); + v[3] = vec_mergel(vecs[1], vecs[3]); + // Get the resulting vectors + sum[0] = vec_mergeh(v[0], v[2]); + sum[1] = vec_mergel(v[0], v[2]); + sum[2] = vec_mergeh(v[1], v[3]); + sum[3] = vec_mergel(v[1], v[3]); + + // Now do the summation: + // Lines 0+1 + sum[0] = padd(sum[0], sum[1]); + // Lines 2+3 + sum[1] = padd(sum[2], sum[3]); + // Add the results + sum[0] = padd(sum[0], sum[1]); + + return sum[0]; +} + +template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) +{ + Packet2d v[2], sum; + v[0] = padd(vecs[0], reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8))); + v[1] = padd(vecs[1], reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8))); + + sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); + + return sum; +} + +template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) +{ + PacketBlock transpose; + transpose.packet[0] = vecs[0]; + transpose.packet[1] = vecs[1]; + transpose.packet[2] = vecs[2]; + transpose.packet[3] = vecs[3]; + ptranspose(transpose); + + Packet4f sum = padd(transpose.packet[0], transpose.packet[1]); + sum = padd(sum, transpose.packet[2]); + sum = padd(sum, transpose.packet[3]); + return sum; +} + +// Other reduction functions: +// mul +template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) +{ + EIGEN_ALIGN16 int aux[4]; + pstore(aux, a); + return aux[0] * aux[1] * aux[2] * aux[3]; +} + +template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) +{ + return pfirst(pmul(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} + template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) { // Return predux_mul of the subvectors product return static_cast(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1])))); } +// min +template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) +{ + Packet4i b, res; + b = pmin(a, vec_sld(a, a, 8)); + res = pmin(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +template<> EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) +{ + return pfirst(pmin(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} + template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) { Packet2d b, res; @@ -834,6 +834,21 @@ template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) return static_cast(pfirst(res)); } +// max +template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) +{ + Packet4i b, res; + b = pmax(a, vec_sld(a, a, 8)); + res = pmax(b, vec_sld(b, b, 4)); + return pfirst(res); +} + +// max +template<> EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) +{ + return pfirst(pmax(a, reinterpret_cast(vec_sld(reinterpret_cast(a), reinterpret_cast(a), 8)))); +} + template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) { Packet2d b, res; @@ -842,6 +857,26 @@ template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) return static_cast(pfirst(res)); } +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); + Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); + Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); + Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); + kernel.packet[0] = vec_mergeh(t0, t2); + kernel.packet[1] = vec_mergel(t0, t2); + kernel.packet[2] = vec_mergeh(t1, t3); + kernel.packet[3] = vec_mergel(t1, t3); +} + +EIGEN_DEVICE_FUNC inline void +ptranspose(PacketBlock& kernel) { + Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); + Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); + kernel.packet[0] = t0; + kernel.packet[1] = t1; +} + /* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one */ EIGEN_DEVICE_FUNC inline void @@ -880,6 +915,12 @@ ptranspose(PacketBlock& kernel) { kernel.packet[3].v4f[1] = t3.packet[1]; } +template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { + Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; + Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); + return vec_sel(elsePacket, thenPacket, mask); +} + template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { Packet2ul select_hi = { ifPacket.select[0], ifPacket.select[1] }; Packet2ul select_lo = { ifPacket.select[2], ifPacket.select[3] }; @@ -890,153 +931,13 @@ template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, cons result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo); return result; } -#else -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_LOAD - Packet *vfrom; - vfrom = (Packet *) from; - return vfrom->v4f; -} -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) -{ - // FIXME: No intrinsic yet - EIGEN_DEBUG_ALIGNED_STORE - Packet *vto; - vto = (Packet *) to; - vto->v4f = from; -} - -template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) -{ - return vec_splats(from); -} - -template<> EIGEN_STRONG_INLINE void -pbroadcast4(const float *a, - Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) -{ - a3 = pload(a); - a0 = vec_splat(a3, 0); - a1 = vec_splat(a3, 1); - a2 = vec_splat(a3, 2); - a3 = vec_splat(a3, 3); -} - -template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) -{ - float EIGEN_ALIGN16 af[4]; - af[0] = from[0*stride]; - af[1] = from[1*stride]; - af[2] = from[2*stride]; - af[3] = from[3*stride]; - return pload(af); -} - -template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) -{ - float EIGEN_ALIGN16 af[4]; - pstore((float*)af, from); - to[0*stride] = af[0]; - to[1*stride] = af[1]; - to[2*stride] = af[2]; - to[3*stride] = af[3]; -} - -template<> EIGEN_STRONG_INLINE Packet4f padd(const Packet4f& a, const Packet4f& b) { return (a + b); } -template<> EIGEN_STRONG_INLINE Packet4f psub(const Packet4f& a, const Packet4f& b) { return (a - b); } -template<> EIGEN_STRONG_INLINE Packet4f pmul(const Packet4f& a, const Packet4f& b) { return (a * b); } -template<> EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) { return (a / b); } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return (-a); } -template<> EIGEN_STRONG_INLINE Packet4f pconj (const Packet4f& a) { return a; } -template<> EIGEN_STRONG_INLINE Packet4f pmadd (const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet4f pmin (const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmax (const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pand (const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f por (const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pxor (const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { return vec_round(a); } -template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } -template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } -template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } - -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) -{ - Packet4f p = pload(from); - return vec_perm(p, p, p16uc_DUPLICATE32_HI); -} - -template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) -{ - return reinterpret_cast(vec_perm(reinterpret_cast(a), reinterpret_cast(a), p16uc_REVERSE32)); -} - -template<> EIGEN_STRONG_INLINE float predux(const Packet4f& a) -{ - Packet4f b, sum; - b = vec_sld(a, a, 8); - sum = padd(a, b); - b = vec_sld(sum, sum, 4); - sum = padd(sum, b); - return pfirst(sum); -} - -// Other reduction functions: -// mul -template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ - Packet4f prod; - prod = pmul(a, vec_sld(a, a, 8)); - return pfirst(pmul(prod, vec_sld(prod, prod, 4))); -} - -// min -template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) -{ - Packet4f b, res; - b = pmin(a, vec_sld(a, a, 8)); - res = pmin(b, vec_sld(b, b, 4)); - return pfirst(res); -} - -// max -template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) -{ - Packet4f b, res; - b = pmax(a, vec_sld(a, a, 8)); - res = pmax(b, vec_sld(b, b, 4)); - return pfirst(res); -} - -EIGEN_DEVICE_FUNC inline void -ptranspose(PacketBlock& kernel) { - Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); - Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); - Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); - Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); - kernel.packet[0] = vec_mergeh(t0, t2); - kernel.packet[1] = vec_mergel(t0, t2); - kernel.packet[2] = vec_mergeh(t1, t3); - kernel.packet[3] = vec_mergel(t1, t3); -} - -template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { - Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; - Packet4ui mask = vec_cmpeq(select, reinterpret_cast(p4i_ONE)); +template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { + Packet2ul select = { ifPacket.select[0], ifPacket.select[1] }; + Packet2ul mask = vec_cmpeq(select, reinterpret_cast(p2l_ONE)); return vec_sel(elsePacket, thenPacket, mask); } -#endif - -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu (const float* from) { return pload(from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { pstore(to, from); } -template<> EIGEN_STRONG_INLINE Packet4f plset (const float& a) { return padd(pset1(a), p4f_COUNTDOWN); } - } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/AssignmentFunctors.h b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/AssignmentFunctors.h index bf64ef4ed..4153b877c 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/AssignmentFunctors.h @@ -144,7 +144,7 @@ template struct swap_assign_op { EIGEN_EMPTY_STRUCT_CTOR(swap_assign_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const { -#ifdef EIGEN_GPUCC +#ifdef __CUDACC__ // FIXME is there some kind of cuda::swap? Scalar t=b; const_cast(b)=a; a=t; #else @@ -157,16 +157,7 @@ template struct functor_traits > { enum { Cost = 3 * NumTraits::ReadCost, - PacketAccess = - #if defined(EIGEN_VECTORIZE_AVX) && EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<800 || defined(__apple_build_version__)) - // This is a partial workaround for a bug in clang generating bad code - // when mixing 256/512 bits loads and 128 bits moves. - // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684 - // https://bugs.llvm.org/show_bug.cgi?id=40815 - 0 - #else - packet_traits::Vectorizable - #endif + PacketAccess = packet_traits::Vectorizable }; }; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/BinaryFunctors.h b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/BinaryFunctors.h index 697816663..3eae6b8ca 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/BinaryFunctors.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/BinaryFunctors.h @@ -39,12 +39,12 @@ struct scalar_sum_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return internal::padd(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const { return internal::predux(a); } }; template @@ -56,9 +56,15 @@ struct functor_traits > { }; }; - -template<> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op::operator() (const bool& a, const bool& b) const { return a || b; } +/** \internal + * \brief Template specialization to deprecate the summation of boolean expressions. + * This is required to solve Bug 426. + * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast() + */ +template<> struct scalar_sum_op : scalar_sum_op { + EIGEN_DEPRECATED + scalar_sum_op() {} +}; /** \internal @@ -77,12 +83,12 @@ struct scalar_product_op : binary_op_base EIGEN_SCALAR_BINARY_OP_PLUGIN } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmul(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const { return internal::predux_mul(a); } }; template @@ -94,10 +100,6 @@ struct functor_traits > { }; }; -template<> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op::operator() (const bool& a, const bool& b) const { return a && b; } - - /** \internal * \brief Template functor to compute the conjugate product of two scalars * @@ -114,11 +116,11 @@ struct scalar_conj_product_op : binary_op_base typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return conj_helper().pmul(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return conj_helper().pmul(a,b); } }; template @@ -139,12 +141,12 @@ struct scalar_min_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmin(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const { return internal::predux_min(a); } }; template @@ -165,12 +167,12 @@ struct scalar_max_op : binary_op_base { typedef typename ScalarBinaryOpTraits::ReturnType result_type; EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const { return internal::pmax(a,b); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const { return internal::predux_max(a); } }; template @@ -380,14 +382,11 @@ struct functor_traits > { struct scalar_boolean_and_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pand(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = true + PacketAccess = false }; }; @@ -399,14 +398,11 @@ template<> struct functor_traits { struct scalar_boolean_or_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::por(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = true + PacketAccess = false }; }; @@ -418,44 +414,11 @@ template<> struct functor_traits { struct scalar_boolean_xor_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pxor(a,b); } }; template<> struct functor_traits { enum { Cost = NumTraits::AddCost, - PacketAccess = true - }; -}; - -/** \internal - * \brief Template functor to compute the absolute difference of two scalars - * - * \sa class CwiseBinaryOp, MatrixBase::absolute_difference - */ -template -struct scalar_absolute_difference_op : binary_op_base -{ - typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op) -#else - scalar_absolute_difference_op() { - EIGEN_SCALAR_BINARY_OP_PLUGIN - } -#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const - { return numext::absdiff(a,b); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pabsdiff(a,b); } -}; -template -struct functor_traits > { - enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, - PacketAccess = is_same::value && packet_traits::HasAbsDiff + PacketAccess = false }; }; @@ -473,7 +436,7 @@ template struct bind1st_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {} + bind1st_op(const first_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } @@ -492,7 +455,7 @@ template struct bind2nd_op : BinaryOp { typedef typename BinaryOp::second_argument_type second_argument_type; typedef typename BinaryOp::result_type result_type; - EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {} + bind2nd_op(const second_argument_type &val) : m_value(val) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/NullaryFunctors.h b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/NullaryFunctors.h index 4aa33a19f..b03be0269 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/NullaryFunctors.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/NullaryFunctors.h @@ -37,27 +37,26 @@ template struct functor_traits > { enum { Cost = NumTraits::AddCost, PacketAccess = false, IsRepeatable = true }; }; -template struct linspaced_op_impl; +template struct linspaced_op_impl; -template -struct linspaced_op_impl +template +struct linspaced_op_impl { - typedef typename NumTraits::Real RealScalar; - linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : - m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : Scalar((high-low)/RealScalar(num_steps-1))), + m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), m_flip(numext::abs(high) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { + typedef typename NumTraits::Real RealScalar; if(m_flip) - return (i==0)? m_low : Scalar(m_high - RealScalar(m_size1-i)*m_step); + return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step); else - return (i==m_size1)? m_high : Scalar(m_low + RealScalar(i)*m_step); + return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step); } - template + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { // Principle: @@ -66,17 +65,17 @@ struct linspaced_op_impl { Packet pi = plset(Scalar(i-m_size1)); Packet res = padd(pset1(m_high), pmul(pset1(m_step), pi)); - if (EIGEN_PREDICT_TRUE(i != 0)) return res; - Packet mask = pcmp_lt(pset1(0), plset(0)); - return pselect(mask, res, pset1(m_low)); + if(i==0) + res = pinsertfirst(res, m_low); + return res; } else { Packet pi = plset(Scalar(i)); Packet res = padd(pset1(m_low), pmul(pset1(m_step), pi)); - if(EIGEN_PREDICT_TRUE(i != m_size1-unpacket_traits::size+1)) return res; - Packet mask = pcmp_lt(plset(0), pset1(unpacket_traits::size-1)); - return pselect(mask, res, pset1(m_high)); + if(i==m_size1-unpacket_traits::size+1) + res = pinsertlast(res, m_high); + return res; } } @@ -87,8 +86,8 @@ struct linspaced_op_impl const bool m_flip; }; -template -struct linspaced_op_impl +template +struct linspaced_op_impl { linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : m_low(low), @@ -116,8 +115,8 @@ struct linspaced_op_impl // Forward declaration (we default to random access which does not really give // us a speed gain when using packet access but it allows to use the functor in // nested expressions). -template struct linspaced_op; -template struct functor_traits< linspaced_op > +template struct linspaced_op; +template struct functor_traits< linspaced_op > { enum { @@ -127,7 +126,7 @@ template struct functor_traits< linspaced_op > IsRepeatable = true }; }; -template struct linspaced_op +template struct linspaced_op { linspaced_op(const Scalar& low, const Scalar& high, Index num_steps) : impl((num_steps==1 ? high : low),high,num_steps) @@ -137,11 +136,11 @@ template struct linspaced_op EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { return impl(i); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.template packetOp(i); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const { return impl.packetOp(i); } // This proxy object handles the actual required temporaries and the different // implementations (integer vs. floating point). - const linspaced_op_impl::IsInteger> impl; + const linspaced_op_impl::IsInteger> impl; }; // Linear access is automatically determined from the operator() prototypes available for the given functor. @@ -167,12 +166,12 @@ struct has_unary_operator,IndexType> { enum { value = template struct has_binary_operator,IndexType> { enum { value = 1}; }; -template -struct has_nullary_operator,IndexType> { enum { value = 0}; }; -template -struct has_unary_operator,IndexType> { enum { value = 1}; }; -template -struct has_binary_operator,IndexType> { enum { value = 0}; }; +template +struct has_nullary_operator,IndexType> { enum { value = 0}; }; +template +struct has_unary_operator,IndexType> { enum { value = 1}; }; +template +struct has_binary_operator,IndexType> { enum { value = 0}; }; template struct has_nullary_operator,IndexType> { enum { value = 1}; }; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/UnaryFunctors.h b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/UnaryFunctors.h index a55d7b74e..b56e7afd2 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/functors/UnaryFunctors.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/functors/UnaryFunctors.h @@ -117,15 +117,7 @@ template struct functor_traits > { enum { - Cost = 0, - // Yes the cost is zero even for complexes because in most cases for which - // the cost is used, conjugation turns to be a no-op. Some examples: - // cost(a*conj(b)) == cost(a*b) - // cost(a+conj(b)) == cost(a+b) - // ::IsComplex ? NumTraits::AddCost : 0, PacketAccess = packet_traits::HasConj }; }; @@ -166,44 +158,6 @@ template struct functor_traits > { enum { Cost = is_same::value ? 0 : NumTraits::AddCost, PacketAccess = false }; }; -/** \internal - * \brief Template functor to arithmetically shift a scalar right by a number of bits - * - * \sa class CwiseUnaryOp, MatrixBase::shift_right() - */ -template -struct scalar_shift_right_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_right_op) - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const - { return a >> N; } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const - { return internal::parithmetic_shift_right(a); } -}; -template -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = packet_traits::HasShift }; }; - -/** \internal - * \brief Template functor to logically shift a scalar left by a number of bits - * - * \sa class CwiseUnaryOp, MatrixBase::shift_left() - */ -template -struct scalar_shift_left_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_shift_left_op) - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const - { return a << N; } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const - { return internal::plogical_shift_left(a); } -}; -template -struct functor_traits > -{ enum { Cost = NumTraits::AddCost, PacketAccess = packet_traits::HasShift }; }; - /** \internal * \brief Template functor to extract the real part of a complex * @@ -308,26 +262,6 @@ struct functor_traits > { }; }; -/** \internal - * - * \brief Template functor to compute the exponential of a scalar - 1. - * - * \sa class CwiseUnaryOp, ArrayBase::expm1() - */ -template struct scalar_expm1_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_expm1_op) - EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { return numext::expm1(a); } - template - EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pexpm1(a); } -}; -template -struct functor_traits > { - enum { - PacketAccess = packet_traits::HasExpm1, - Cost = functor_traits >::Cost // TODO measure cost of expm1 - }; -}; - /** \internal * * \brief Template functor to compute the logarithm of a scalar @@ -594,23 +528,6 @@ struct functor_traits > { }; }; -#if EIGEN_HAS_CXX11_MATH -/** \internal - * \brief Template functor to compute the atanh of a scalar - * \sa class CwiseUnaryOp, ArrayBase::atanh() - */ -template -struct scalar_atanh_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_atanh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); } -}; - -template -struct functor_traits > { - enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; -}; -#endif - /** \internal * \brief Template functor to compute the sinh of a scalar * \sa class CwiseUnaryOp, ArrayBase::sinh() @@ -630,23 +547,6 @@ struct functor_traits > }; }; -#if EIGEN_HAS_CXX11_MATH -/** \internal - * \brief Template functor to compute the asinh of a scalar - * \sa class CwiseUnaryOp, ArrayBase::asinh() - */ -template -struct scalar_asinh_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_asinh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); } -}; - -template -struct functor_traits > { - enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; -}; -#endif - /** \internal * \brief Template functor to compute the cosh of a scalar * \sa class CwiseUnaryOp, ArrayBase::cosh() @@ -666,23 +566,6 @@ struct functor_traits > }; }; -#if EIGEN_HAS_CXX11_MATH -/** \internal - * \brief Template functor to compute the acosh of a scalar - * \sa class CwiseUnaryOp, ArrayBase::acosh() - */ -template -struct scalar_acosh_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_acosh_op) - EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); } -}; - -template -struct functor_traits > { - enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; -}; -#endif - /** \internal * \brief Template functor to compute the inverse of a scalar * \sa class CwiseUnaryOp, Cwise::inverse() @@ -695,13 +578,9 @@ struct scalar_inverse_op { EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const { return internal::pdiv(pset1(Scalar(1)),a); } }; -template -struct functor_traits > { - enum { - PacketAccess = packet_traits::HasDiv, - Cost = scalar_div_cost::value - }; -}; +template +struct functor_traits > +{ enum { Cost = NumTraits::MulCost, PacketAccess = packet_traits::HasDiv }; }; /** \internal * \brief Template functor to compute the square of a scalar @@ -773,25 +652,6 @@ struct functor_traits > }; }; -/** \internal - * \brief Template functor to compute the rounded (with current rounding mode) value of a scalar - * \sa class CwiseUnaryOp, ArrayBase::rint() - */ -template struct scalar_rint_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_rint_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::rint(a); } - template - EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::print(a); } -}; -template -struct functor_traits > -{ - enum { - Cost = NumTraits::MulCost, - PacketAccess = packet_traits::HasRint - }; -}; - /** \internal * \brief Template functor to compute the ceil of a scalar * \sa class CwiseUnaryOp, ArrayBase::ceil() @@ -818,13 +678,7 @@ struct functor_traits > template struct scalar_isnan_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isnan_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { -#if defined(SYCL_DEVICE_ONLY) - return numext::isnan(a); -#else - return (numext::isnan)(a); -#endif - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isnan)(a); } }; template struct functor_traits > @@ -842,13 +696,7 @@ struct functor_traits > template struct scalar_isinf_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isinf_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { -#if defined(SYCL_DEVICE_ONLY) - return numext::isinf(a); -#else - return (numext::isinf)(a); -#endif - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isinf)(a); } }; template struct functor_traits > @@ -866,13 +714,7 @@ struct functor_traits > template struct scalar_isfinite_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_isfinite_op) typedef bool result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { -#if defined(SYCL_DEVICE_ONLY) - return numext::isfinite(a); -#else - return (numext::isfinite)(a); -#endif - } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return (numext::isfinite)(a); } }; template struct functor_traits > @@ -935,7 +777,7 @@ struct scalar_sign_op { template struct functor_traits > { enum { - Cost = + Cost = NumTraits::IsComplex ? ( 8*NumTraits::MulCost ) // roughly : ( 3*NumTraits::AddCost), @@ -943,130 +785,6 @@ struct functor_traits > }; }; -/** \internal - * \brief Template functor to compute the logistic function of a scalar - * \sa class CwiseUnaryOp, ArrayBase::logistic() - */ -template -struct scalar_logistic_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { - const T one = T(1); - return one / (one + numext::exp(-x)); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(const Packet& x) const { - const Packet one = pset1(T(1)); - return pdiv(one, padd(one, pexp(pnegate(x)))); - } -}; - -#ifndef EIGEN_GPU_COMPILE_PHASE -/** \internal - * \brief Template specialization of the logistic function for float. - * - * Uses just a 9/10-degree rational interpolant which - * interpolates 1/(1+exp(-x)) - 0.5 up to a couple of ulps in the range - * [-9, 18]. Below -9 we use the more accurate approximation - * 1/(1+exp(-x)) ~= exp(x), and above 18 the logistic function is 1 withing - * one ulp. The shifted logistic is interpolated because it was easier to - * make the fit converge. - * - */ -template <> -struct scalar_logistic_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_logistic_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const { - // The upper cut-off is the smallest x for which the rational approximation evaluates to 1. - // Choosing this value saves us a few instructions clamping the results at the end. -#ifdef EIGEN_VECTORIZE_FMA - const float cutoff_upper = 15.7243833541870117f; -#else - const float cutoff_upper = 15.6437711715698242f; -#endif - const float cutoff_lower = -9.f; - if (x > cutoff_upper) return 1.0f; - else if (x < cutoff_lower) return numext::exp(x); - else return 1.0f / (1.0f + numext::exp(-x)); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Packet packetOp(const Packet& _x) const { - const Packet cutoff_lower = pset1(-9.f); - const Packet lt_mask = pcmp_lt(_x, cutoff_lower); - const bool any_small = predux(lt_mask); - - // Clamp the input to be at most 'cutoff_upper'. -#ifdef EIGEN_VECTORIZE_FMA - const Packet cutoff_upper = pset1(15.7243833541870117f); -#else - const Packet cutoff_upper = pset1(15.6437711715698242f); -#endif - const Packet x = pmin(_x, cutoff_upper); - - // The monomial coefficients of the numerator polynomial (odd). - const Packet alpha_1 = pset1(2.48287947061529e-01f); - const Packet alpha_3 = pset1(8.51377133304701e-03f); - const Packet alpha_5 = pset1(6.08574864600143e-05f); - const Packet alpha_7 = pset1(1.15627324459942e-07f); - const Packet alpha_9 = pset1(4.37031012579801e-11f); - - // The monomial coefficients of the denominator polynomial (even). - const Packet beta_0 = pset1(9.93151921023180e-01f); - const Packet beta_2 = pset1(1.16817656904453e-01f); - const Packet beta_4 = pset1(1.70198817374094e-03f); - const Packet beta_6 = pset1(6.29106785017040e-06f); - const Packet beta_8 = pset1(5.76102136993427e-09f); - const Packet beta_10 = pset1(6.10247389755681e-13f); - - // Since the polynomials are odd/even, we need x^2. - const Packet x2 = pmul(x, x); - - // Evaluate the numerator polynomial p. - Packet p = pmadd(x2, alpha_9, alpha_7); - p = pmadd(x2, p, alpha_5); - p = pmadd(x2, p, alpha_3); - p = pmadd(x2, p, alpha_1); - p = pmul(x, p); - - // Evaluate the denominator polynomial q. - Packet q = pmadd(x2, beta_10, beta_8); - q = pmadd(x2, q, beta_6); - q = pmadd(x2, q, beta_4); - q = pmadd(x2, q, beta_2); - q = pmadd(x2, q, beta_0); - // Divide the numerator by the denominator and shift it up. - const Packet logistic = padd(pdiv(p, q), pset1(0.5f)); - if (EIGEN_PREDICT_FALSE(any_small)) { - const Packet exponential = pexp(_x); - return pselect(lt_mask, exponential, logistic); - } else { - return logistic; - } - } -}; -#endif // #ifndef EIGEN_GPU_COMPILE_PHASE - -template -struct functor_traits > { - enum { - // The cost estimate for float here here is for the common(?) case where - // all arguments are greater than -9. - Cost = scalar_div_cost::HasDiv>::value + - (internal::is_same::value - ? NumTraits::AddCost * 15 + NumTraits::MulCost * 11 - : NumTraits::AddCost * 2 + - functor_traits >::Cost), - PacketAccess = - packet_traits::HasAdd && packet_traits::HasDiv && - (internal::is_same::value - ? packet_traits::HasMul && packet_traits::HasMax && - packet_traits::HasMin - : packet_traits::HasNegate && packet_traits::HasExp) - }; -}; - } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 64e7f79cf..681451cc3 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -15,13 +15,7 @@ namespace Eigen { namespace internal { -enum GEBPPacketSizeType { - GEBPPacketFull = 0, - GEBPPacketHalf, - GEBPPacketQuarter -}; - -template +template class gebp_traits; @@ -31,42 +25,16 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff return a<=0 ? b : a; } -#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE) -#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE -#else -#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val -#endif // defined(EIGEN_DEFAULT_L1_CACHE_SIZE) - -#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE) -#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE -#else -#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val -#endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE) - -#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE) -#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_SET_DEFAULT_L3_CACHE_SIZE -#else -#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val -#endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE) - #if EIGEN_ARCH_i386_OR_x86_64 -const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32*1024); -const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256*1024); -const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2*1024*1024); -#elif EIGEN_ARCH_PPC -const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64*1024); -const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); -const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4*1024*1024); +const std::ptrdiff_t defaultL1CacheSize = 32*1024; +const std::ptrdiff_t defaultL2CacheSize = 256*1024; +const std::ptrdiff_t defaultL3CacheSize = 2*1024*1024; #else -const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16*1024); -const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512*1024); -const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512*1024); +const std::ptrdiff_t defaultL1CacheSize = 16*1024; +const std::ptrdiff_t defaultL2CacheSize = 512*1024; +const std::ptrdiff_t defaultL3CacheSize = 512*1024; #endif -#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE -#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE -#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE - /** \internal */ struct CacheSizes { CacheSizes(): m_l1(-1),m_l2(-1),m_l3(-1) { @@ -82,6 +50,7 @@ struct CacheSizes { std::ptrdiff_t m_l3; }; + /** \internal */ inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) { @@ -132,16 +101,6 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // at the register level. This small horizontal panel has to stay within L1 cache. std::ptrdiff_t l1, l2, l3; manage_caching_sizes(GetAction, &l1, &l2, &l3); - #ifdef EIGEN_VECTORIZE_AVX512 - // We need to find a rationale for that, but without this adjustment, - // performance with AVX512 is pretty bad, like -20% slower. - // One reason is that with increasing packet-size, the blocking size k - // has to become pretty small if we want that 1 lhs panel fit within L1. - // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are: - // k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144. - // This is quite small for a good reuse of the accumulation registers. - l1 *= 4; - #endif if (num_threads > 1) { typedef typename Traits::ResScalar ResScalar; @@ -156,7 +115,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // registers. However once the latency is hidden there is no point in // increasing the value of k, so we'll cap it at 320 (value determined // experimentally). - const Index k_cache = (numext::mini)((l1-ksub)/kdiv, 320); + // To avoid that k vanishes, we make k_cache at least as big as kr + const Index k_cache = numext::maxi(kr, (numext::mini)((l1-ksub)/kdiv, 320)); if (k_cache < k) { k = k_cache - (k_cache % kr); eigen_internal_assert(k > 0); @@ -378,61 +338,6 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ // #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); #endif -template -struct RhsPanelHelper { - private: - static const int remaining_registers = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS - registers_taken; - public: - typedef typename conditional=4, RhsPacketx4, RhsPacket>::type type; -}; - -template -struct QuadPacket -{ - Packet B_0, B1, B2, B3; - const Packet& get(const FixedInt<0>&) const { return B_0; } - const Packet& get(const FixedInt<1>&) const { return B1; } - const Packet& get(const FixedInt<2>&) const { return B2; } - const Packet& get(const FixedInt<3>&) const { return B3; } -}; - -template -struct packet_conditional { typedef T3 type; }; - -template -struct packet_conditional { typedef T1 type; }; - -template -struct packet_conditional { typedef T2 type; }; - -#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ - typedef typename packet_conditional::type, \ - typename packet_traits::half, \ - typename unpacket_traits::half>::half>::type \ - prefix ## name ## Packet - -#define PACKET_DECL_COND(name, packet_size) \ - typedef typename packet_conditional::type, \ - typename packet_traits::half, \ - typename unpacket_traits::half>::half>::type \ - name ## Packet - -#define PACKET_DECL_COND_SCALAR_PREFIX(prefix, packet_size) \ - typedef typename packet_conditional::type, \ - typename packet_traits::half, \ - typename unpacket_traits::half>::half>::type \ - prefix ## ScalarPacket - -#define PACKET_DECL_COND_SCALAR(packet_size) \ - typedef typename packet_conditional::type, \ - typename packet_traits::half, \ - typename unpacket_traits::half>::half>::type \ - ScalarPacket - /* Vectorization logic * real*real: unpack rhs to constant packets, ... * @@ -443,7 +348,7 @@ struct packet_conditional { typedef T2 type; }; * cplx*real : unpack rhs to constant packets, ... * real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual */ -template +template class gebp_traits { public: @@ -451,17 +356,13 @@ public: typedef _RhsScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); - enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, - LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, - ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, @@ -470,12 +371,10 @@ public: // register block size along the M direction (currently, this one cannot be modified) default_mr = (EIGEN_PLAIN_ENUM_MIN(16,NumberOfRegisters)/2/nr)*LhsPacketSize, -#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) \ - && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC>=1914)) - // we assume 16 registers or more +#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX) + // we assume 16 registers // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined, // then using 3*LhsPacketSize triggers non-implemented paths in syrk. - // Bug 1515: MSVC prior to v19.14 yields to register spilling. mr = Vectorizable ? 3*LhsPacketSize : default_mr, #else mr = default_mr, @@ -485,41 +384,37 @@ public: RhsProgress = 1 }; + typedef typename packet_traits::type _LhsPacket; + typedef typename packet_traits::type _RhsPacket; + typedef typename packet_traits::type _ResPacket; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - typedef LhsPacket LhsPacket4Packing; - typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1(ResScalar(0)); } - + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } + template EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const { dest = pset1(*b); } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const - { - pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); - } - - template - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const - { - loadRhs(b, dest); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const - { - } - + EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad(b); @@ -537,8 +432,8 @@ public: dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const + template + EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, AccPacketType& tmp) const { conj_helper cj; // It would be a lot cleaner to call pmadd all the time. Unfortunately if we @@ -553,12 +448,6 @@ public: #endif } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const - { - madd(a, b.get(lane), c, tmp, lane); - } - EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { r = pmadd(c,alpha,r); @@ -572,25 +461,21 @@ public: }; -template -class gebp_traits, RealScalar, _ConjLhs, false, Arch, _PacketSize> +template +class gebp_traits, RealScalar, _ConjLhs, false> { public: typedef std::complex LhsScalar; typedef RealScalar RhsScalar; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); - enum { ConjLhs = _ConjLhs, ConjRhs = false, - Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && unpacket_traits<_RhsPacket>::vectorizable, - LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, - ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, nr = 4, @@ -605,12 +490,13 @@ public: RhsProgress = 1 }; + typedef typename packet_traits::type _LhsPacket; + typedef typename packet_traits::type _RhsPacket; + typedef typename packet_traits::type _ResPacket; + typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - typedef LhsPacket LhsPacket4Packing; - - typedef QuadPacket RhsPacketx4; typedef ResPacket AccPacket; @@ -619,42 +505,13 @@ public: p = pset1(ResScalar(0)); } - template - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pset1(*b); + dest = pset1(*b); } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const - { - pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); - } - - template - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const - { - loadRhs(b, dest); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const - {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - loadRhsQuad_impl(b,dest, typename conditional::type()); - } - - EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const - { - // FIXME we can do better! - // what we want here is a ploadheight - RhsScalar tmp[4] = {b[0],b[0],b[1],b[1]}; - dest = ploadquad(tmp); - } - - EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const - { - eigen_internal_assert(RhsPacketSize<=8); dest = pset1(*b); } @@ -663,20 +520,27 @@ public: dest = pload(a); } - template - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const { - dest = ploadu(a); + dest = ploadu(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + pbroadcast4(b, b0, b1, b2, b3); + } + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// pbroadcast2(b, b0, b1); +// } + + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - template - EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const + EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -691,20 +555,13 @@ public: c += a * b; } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { - madd(a, b.get(lane), c, tmp, lane); - } - - template - EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const - { - conj_helper cj; r = cj.pmadd(c,alpha,r); } protected: + conj_helper cj; }; template @@ -723,57 +580,13 @@ DoublePacket padd(const DoublePacket &a, const DoublePacket the "4" in "downto4" -// corresponds to the number of complexes, so it means "8" -// it terms of real coefficients. - template -const DoublePacket& -predux_half_dowto4(const DoublePacket &a, - typename enable_if::size<=8>::type* = 0) +const DoublePacket& predux_downto4(const DoublePacket &a) { return a; } -template -DoublePacket::half> -predux_half_dowto4(const DoublePacket &a, - typename enable_if::size==16>::type* = 0) -{ - // yes, that's pretty hackish :( - DoublePacket::half> res; - typedef std::complex::type> Cplx; - typedef typename packet_traits::type CplxPacket; - res.first = predux_half_dowto4(CplxPacket(a.first)).v; - res.second = predux_half_dowto4(CplxPacket(a.second)).v; - return res; -} - -// same here, "quad" actually means "8" in terms of real coefficients -template -void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, - typename enable_if::size<=8>::type* = 0) -{ - dest.first = pset1(numext::real(*b)); - dest.second = pset1(numext::imag(*b)); -} - -template -void loadQuadToDoublePacket(const Scalar* b, DoublePacket& dest, - typename enable_if::size==16>::type* = 0) -{ - // yes, that's pretty hackish too :( - typedef typename NumTraits::Real RealScalar; - RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])}; - RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])}; - dest.first = ploadquad(r); - dest.second = ploadquad(i); -} - - -template struct unpacket_traits > { - typedef DoublePacket::half> half; -}; +template struct unpacket_traits > { typedef DoublePacket half; }; // template // DoublePacket pmadd(const DoublePacket &a, const DoublePacket &b) // { @@ -783,8 +596,8 @@ template struct unpacket_traits > { // return res; // } -template -class gebp_traits, std::complex, _ConjLhs, _ConjRhs, Arch, _PacketSize > +template +class gebp_traits, std::complex, _ConjLhs, _ConjRhs > { public: typedef std::complex Scalar; @@ -792,21 +605,15 @@ public: typedef std::complex RhsScalar; typedef std::complex ResScalar; - PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); - PACKET_DECL_COND(Real, _PacketSize); - PACKET_DECL_COND_SCALAR(_PacketSize); - enum { ConjLhs = _ConjLhs, ConjRhs = _ConjRhs, - Vectorizable = unpacket_traits::vectorizable - && unpacket_traits::vectorizable, - ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, - LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits::size : 1, - RealPacketSize = Vectorizable ? unpacket_traits::size : 1, + Vectorizable = packet_traits::Vectorizable + && packet_traits::Vectorizable, + RealPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, // FIXME: should depend on NumberOfRegisters nr = 4, @@ -816,16 +623,14 @@ public: RhsProgress = 1 }; - typedef DoublePacket DoublePacketType; + typedef typename packet_traits::type RealPacket; + typedef typename packet_traits::type ScalarPacket; + typedef DoublePacket DoublePacketType; - typedef typename conditional::type LhsPacket4Packing; typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; typedef typename conditional::type AccPacket; - - // this actualy holds 8 packets! - typedef QuadPacket RhsPacketx4; EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); } @@ -836,41 +641,17 @@ public: } // Scalar path - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { - dest = pset1(*b); + dest = pset1(*b); } // Vectorized path - template - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const { - dest.first = pset1(numext::real(*b)); - dest.second = pset1(numext::imag(*b)); + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const - { - loadRhs(b, dest.B_0); - loadRhs(b + 1, dest.B1); - loadRhs(b + 2, dest.B2); - loadRhs(b + 3, dest.B3); - } - - // Scalar path - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const - { - loadRhs(b, dest); - } - - // Vectorized path - template - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket& dest) const - { - loadRhs(b, dest); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {} EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { @@ -878,7 +659,33 @@ public: } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const { - loadQuadToDoublePacket(b,dest); + eigen_internal_assert(unpacket_traits::size<=4); + loadRhs(b,dest); + } + + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + loadRhs(b+2, b2); + loadRhs(b+3, b3); + } + + // Vectorized path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, DoublePacketType& b0, DoublePacketType& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); + } + + // Scalar path + EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsScalar& b0, RhsScalar& b1) + { + // FIXME not sure that's the best way to implement it! + loadRhs(b+0, b0); + loadRhs(b+1, b1); } // nothing special here @@ -887,59 +694,47 @@ public: dest = pload((const typename unpacket_traits::type*)(a)); } - template - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const { - dest = ploadu((const typename unpacket_traits::type*)(a)); + dest = ploadu((const typename unpacket_traits::type*)(a)); } - template - EIGEN_STRONG_INLINE - typename enable_if::value>::type - madd(const LhsPacketType& a, const RhsPacketType& b, DoublePacket& c, TmpType& /*tmp*/, const LaneIdType&) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacketType& c, RhsPacket& /*tmp*/) const { c.first = padd(pmul(a,b.first), c.first); c.second = padd(pmul(a,b.second),c.second); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/, const LaneIdType&) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const { c = cj.pmadd(a,b,c); } - - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const - { - madd(a, b.get(lane), c, tmp, lane); - } EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; } - template - EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacketType& alpha, ResPacketType& r) const + EIGEN_STRONG_INLINE void acc(const DoublePacketType& c, const ResPacket& alpha, ResPacket& r) const { // assemble c - ResPacketType tmp; + ResPacket tmp; if((!ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(pconj(ResPacketType(c.second))); - tmp = padd(ResPacketType(c.first),tmp); + tmp = pcplxflip(pconj(ResPacket(c.second))); + tmp = padd(ResPacket(c.first),tmp); } else if((!ConjLhs)&&(ConjRhs)) { - tmp = pconj(pcplxflip(ResPacketType(c.second))); - tmp = padd(ResPacketType(c.first),tmp); + tmp = pconj(pcplxflip(ResPacket(c.second))); + tmp = padd(ResPacket(c.first),tmp); } else if((ConjLhs)&&(!ConjRhs)) { - tmp = pcplxflip(ResPacketType(c.second)); - tmp = padd(pconj(ResPacketType(c.first)),tmp); + tmp = pcplxflip(ResPacket(c.second)); + tmp = padd(pconj(ResPacket(c.first)),tmp); } else if((ConjLhs)&&(ConjRhs)) { - tmp = pcplxflip(ResPacketType(c.second)); - tmp = psub(pconj(ResPacketType(c.first)),tmp); + tmp = pcplxflip(ResPacket(c.second)); + tmp = psub(pconj(ResPacket(c.first)),tmp); } r = pmadd(tmp,alpha,r); @@ -949,8 +744,8 @@ protected: conj_helper cj; }; -template -class gebp_traits, false, _ConjRhs, Arch, _PacketSize > +template +class gebp_traits, false, _ConjRhs > { public: typedef std::complex Scalar; @@ -958,25 +753,14 @@ public: typedef Scalar RhsScalar; typedef Scalar ResScalar; - PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Real, _PacketSize); - PACKET_DECL_COND_SCALAR_PREFIX(_, _PacketSize); - -#undef PACKET_DECL_COND_SCALAR_PREFIX -#undef PACKET_DECL_COND_PREFIX -#undef PACKET_DECL_COND_SCALAR -#undef PACKET_DECL_COND - enum { ConjLhs = false, ConjRhs = _ConjRhs, - Vectorizable = unpacket_traits<_RealPacket>::vectorizable - && unpacket_traits<_ScalarPacket>::vectorizable, - LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, - ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1, + Vectorizable = packet_traits::Vectorizable + && packet_traits::Vectorizable, + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1, NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, // FIXME: should depend on NumberOfRegisters @@ -987,11 +771,14 @@ public: RhsProgress = 1 }; + typedef typename packet_traits::type _LhsPacket; + typedef typename packet_traits::type _RhsPacket; + typedef typename packet_traits::type _ResPacket; + typedef typename conditional::type LhsPacket; typedef typename conditional::type RhsPacket; typedef typename conditional::type ResPacket; - typedef LhsPacket LhsPacket4Packing; - typedef QuadPacket RhsPacketx4; + typedef ResPacket AccPacket; EIGEN_STRONG_INLINE void initAcc(AccPacket& p) @@ -999,25 +786,22 @@ public: p = pset1(ResScalar(0)); } - template - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const + EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { - dest = pset1(*b); + dest = pset1(*b); } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const + + void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1, RhsPacket& b2, RhsPacket& b3) { - pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3); + pbroadcast4(b, b0, b1, b2, b3); } - - template - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const - { - loadRhs(b, dest); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const - {} + +// EIGEN_STRONG_INLINE void broadcastRhs(const RhsScalar* b, RhsPacket& b0, RhsPacket& b1) +// { +// // FIXME not sure that's the best way to implement it! +// b0 = pload1(b+0); +// b1 = pload1(b+1); +// } EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { @@ -1026,23 +810,21 @@ public: EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { - dest = ploadquad(b); + eigen_internal_assert(unpacket_traits::size<=4); + loadRhs(b,dest); } - template - EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const + EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacket& dest) const { - dest = ploaddup(a); + dest = ploaddup(a); } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const LaneIdType&) const + EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const { madd_impl(a, b, c, tmp, typename conditional::type()); } - template - EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp, const true_type&) const + EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const { #ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD EIGEN_UNUSED_VARIABLE(tmp); @@ -1058,166 +840,16 @@ public: c += a * b; } - template - EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp, const LaneIdType& lane) const + EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const { - madd(a, b.get(lane), c, tmp, lane); - } - - template - EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const - { - conj_helper cj; r = cj.pmadd(alpha,c,r); } protected: - + conj_helper cj; }; - -#if EIGEN_ARCH_ARM64 && defined EIGEN_VECTORIZE_NEON - -template<> -struct gebp_traits - : gebp_traits -{ - typedef float RhsPacket; - - typedef float32x4_t RhsPacketx4; - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const - { - dest = *b; - } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const - { - dest = vld1q_f32(b); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const - { - dest = *b; - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const - {} - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const - { - loadRhs(b,dest); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const - { - c = vfmaq_n_f32(c, a, b); - } - - // NOTE: Template parameter inference failed when compiled with Android NDK: - // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const - { madd_helper<0>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const - { madd_helper<1>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const - { madd_helper<2>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const - { madd_helper<3>(a, b, c); } - - private: - template - EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const - { - #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) - // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 - // vfmaq_laneq_f32 is implemented through a costly dup - if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : ); - else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : ); - else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : ); - else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : ); - #else - c = vfmaq_laneq_f32(c, a, b, LaneID); - #endif - } -}; - - -template<> -struct gebp_traits - : gebp_traits -{ - typedef double RhsPacket; - - struct RhsPacketx4 { - float64x2_t B_0, B_1; - }; - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const - { - dest = *b; - } - - EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const - { - dest.B_0 = vld1q_f64(b); - dest.B_1 = vld1q_f64(b+2); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const - { - loadRhs(b,dest); - } - - EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketx4& dest) const - {} - - EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const - { - loadRhs(b,dest); - } - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const - { - c = vfmaq_n_f64(c, a, b); - } - - // NOTE: Template parameter inference failed when compiled with Android NDK: - // "candidate template ignored: could not match 'FixedInt' against 'Eigen::internal::FixedInt<0>". - - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const - { madd_helper<0>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const - { madd_helper<1>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const - { madd_helper<2>(a, b, c); } - EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const - { madd_helper<3>(a, b, c); } - - private: - template - EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const - { - #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0)) - // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101 - // vfmaq_laneq_f64 is implemented through a costly dup - if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); - else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : ); - else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); - else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : ); - #else - if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0); - else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1); - else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0); - else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1); - #endif - } -}; - -#endif - -/* optimized General packed Block * packed Panel product kernel +/* optimized GEneral packed Block * packed Panel product kernel * * Mixing type logic: C += A * B * | A | B | comments @@ -1227,47 +859,26 @@ struct gebp_traits template struct gebp_kernel { - typedef gebp_traits Traits; - typedef gebp_traits HalfTraits; - typedef gebp_traits QuarterTraits; - + typedef gebp_traits Traits; typedef typename Traits::ResScalar ResScalar; typedef typename Traits::LhsPacket LhsPacket; typedef typename Traits::RhsPacket RhsPacket; typedef typename Traits::ResPacket ResPacket; typedef typename Traits::AccPacket AccPacket; - typedef typename Traits::RhsPacketx4 RhsPacketx4; - - typedef typename RhsPanelHelper::type RhsPanel15; - - typedef gebp_traits SwappedTraits; + typedef gebp_traits SwappedTraits; typedef typename SwappedTraits::ResScalar SResScalar; typedef typename SwappedTraits::LhsPacket SLhsPacket; typedef typename SwappedTraits::RhsPacket SRhsPacket; typedef typename SwappedTraits::ResPacket SResPacket; typedef typename SwappedTraits::AccPacket SAccPacket; - typedef typename HalfTraits::LhsPacket LhsPacketHalf; - typedef typename HalfTraits::RhsPacket RhsPacketHalf; - typedef typename HalfTraits::ResPacket ResPacketHalf; - typedef typename HalfTraits::AccPacket AccPacketHalf; - - typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; - typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; - typedef typename QuarterTraits::ResPacket ResPacketQuarter; - typedef typename QuarterTraits::AccPacket AccPacketQuarter; - typedef typename DataMapper::LinearMapper LinearMapper; enum { Vectorizable = Traits::Vectorizable, LhsProgress = Traits::LhsProgress, - LhsProgressHalf = HalfTraits::LhsProgress, - LhsProgressQuarter = QuarterTraits::LhsProgress, RhsProgress = Traits::RhsProgress, - RhsProgressHalf = HalfTraits::RhsProgress, - RhsProgressQuarter = QuarterTraits::RhsProgress, ResPacketSize = Traits::ResPacketSize }; @@ -1277,299 +888,6 @@ struct gebp_kernel Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0); }; -template::LhsProgress> -struct last_row_process_16_packets -{ - typedef gebp_traits Traits; - typedef gebp_traits SwappedTraits; - - typedef typename Traits::ResScalar ResScalar; - typedef typename SwappedTraits::LhsPacket SLhsPacket; - typedef typename SwappedTraits::RhsPacket SRhsPacket; - typedef typename SwappedTraits::ResPacket SResPacket; - typedef typename SwappedTraits::AccPacket SAccPacket; - - EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, - const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, - ResScalar alpha, SAccPacket &C0) - { - EIGEN_UNUSED_VARIABLE(res); - EIGEN_UNUSED_VARIABLE(straits); - EIGEN_UNUSED_VARIABLE(blA); - EIGEN_UNUSED_VARIABLE(blB); - EIGEN_UNUSED_VARIABLE(depth); - EIGEN_UNUSED_VARIABLE(endk); - EIGEN_UNUSED_VARIABLE(i); - EIGEN_UNUSED_VARIABLE(j2); - EIGEN_UNUSED_VARIABLE(alpha); - EIGEN_UNUSED_VARIABLE(C0); - } -}; - - -template -struct last_row_process_16_packets { - typedef gebp_traits Traits; - typedef gebp_traits SwappedTraits; - - typedef typename Traits::ResScalar ResScalar; - typedef typename SwappedTraits::LhsPacket SLhsPacket; - typedef typename SwappedTraits::RhsPacket SRhsPacket; - typedef typename SwappedTraits::ResPacket SResPacket; - typedef typename SwappedTraits::AccPacket SAccPacket; - - EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits &straits, const LhsScalar* blA, - const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2, - ResScalar alpha, SAccPacket &C0) - { - typedef typename unpacket_traits::half>::half SResPacketQuarter; - typedef typename unpacket_traits::half>::half SLhsPacketQuarter; - typedef typename unpacket_traits::half>::half SRhsPacketQuarter; - typedef typename unpacket_traits::half>::half SAccPacketQuarter; - - SResPacketQuarter R = res.template gatherPacket(i, j2); - SResPacketQuarter alphav = pset1(alpha); - - if (depth - endk > 0) - { - // We have to handle the last row(s) of the rhs, which - // correspond to a half-packet - SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0)); - - for (Index kk = endk; kk < depth; kk++) - { - SLhsPacketQuarter a0; - SRhsPacketQuarter b0; - straits.loadLhsUnaligned(blB, a0); - straits.loadRhs(blA, b0); - straits.madd(a0,b0,c0,b0, fix<0>); - blB += SwappedTraits::LhsProgress/4; - blA += 1; - } - straits.acc(c0, alphav, R); - } - else - { - straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R); - } - res.scatterPacket(i, j2, R); - } -}; - -template -struct lhs_process_one_packet -{ - typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4; - - EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacketx4 *rhs_panel, RhsPacket *T0, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) - { - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); - traits.loadLhs(&blA[(0+1*K)*LhsProgress], *A0); - traits.loadRhs(&blB[(0+4*K)*RhsProgress], *rhs_panel); - traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>); - traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>); - traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>); - traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>); - #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) - __asm__ ("" : "+x,m" (*A0)); - #endif - EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); - } - - EIGEN_STRONG_INLINE void operator()( - const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, ResScalar alpha, - Index peelStart, Index peelEnd, Index strideA, Index strideB, Index offsetA, Index offsetB, - int prefetch_res_offset, Index peeled_kc, Index pk, Index cols, Index depth, Index packet_cols4) - { - GEBPTraits traits; - - // loops on each largest micro horizontal panel of lhs - // (LhsProgress x depth) - for(Index i=peelStart; i(alpha); - - R0 = r0.template loadPacket(0); - R1 = r1.template loadPacket(0); - traits.acc(C0, alphav, R0); - traits.acc(C1, alphav, R1); - r0.storePacket(0, R0); - r1.storePacket(0, R1); - - R0 = r2.template loadPacket(0); - R1 = r3.template loadPacket(0); - traits.acc(C2, alphav, R0); - traits.acc(C3, alphav, R1); - r2.storePacket(0, R0); - r3.storePacket(0, R1); - } - - // Deal with remaining columns of the rhs - for(Index j2=packet_cols4; j2); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1"); \ - } while(false); - - EIGEN_GEBGP_ONESTEP(0); - EIGEN_GEBGP_ONESTEP(1); - EIGEN_GEBGP_ONESTEP(2); - EIGEN_GEBGP_ONESTEP(3); - EIGEN_GEBGP_ONESTEP(4); - EIGEN_GEBGP_ONESTEP(5); - EIGEN_GEBGP_ONESTEP(6); - EIGEN_GEBGP_ONESTEP(7); - - blB += pk*RhsProgress; - blA += pk*LhsProgress; - - EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1"); - } - - // process remaining peeled loop - for(Index k=peeled_kc; k(alpha); - R0 = r0.template loadPacket(0); - traits.acc(C0, alphav, R0); - r0.storePacket(0, R0); - } - } - } -}; - -template -struct lhs_process_fraction_of_packet : lhs_process_one_packet -{ - -EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits, LhsPacket *A0, RhsPacket *B_0, RhsPacket *B1, RhsPacket *B2, RhsPacket *B3, AccPacket *C0, AccPacket *C1, AccPacket *C2, AccPacket *C3) - { - EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4"); - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); - traits.loadLhsUnaligned(&blA[(0+1*K)*(LhsProgress)], *A0); - traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], *B_0, *B1, *B2, *B3); - traits.madd(*A0, *B_0, *C0, *B_0); - traits.madd(*A0, *B1, *C1, *B1); - traits.madd(*A0, *B2, *C2, *B2); - traits.madd(*A0, *B3, *C3, *B3); - EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4"); - } -}; - template EIGEN_DONT_INLINE void gebp_kernel @@ -1586,12 +904,10 @@ void gebp_kernel=4 ? (cols/4) * 4 : 0; const Index peeled_mc3 = mr>=3*Traits::LhsProgress ? (rows/(3*LhsProgress))*(3*LhsProgress) : 0; const Index peeled_mc2 = mr>=2*Traits::LhsProgress ? peeled_mc3+((rows-peeled_mc3)/(2*LhsProgress))*(2*LhsProgress) : 0; - const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? peeled_mc2+((rows-peeled_mc2)/(1*LhsProgress))*(1*LhsProgress) : 0; - const Index peeled_mc_half = mr>=LhsProgressHalf ? peeled_mc1+((rows-peeled_mc1)/(LhsProgressHalf))*(LhsProgressHalf) : 0; - const Index peeled_mc_quarter = mr>=LhsProgressQuarter ? peeled_mc_half+((rows-peeled_mc_half)/(LhsProgressQuarter))*(LhsProgressQuarter) : 0; + const Index peeled_mc1 = mr>=1*Traits::LhsProgress ? (rows/(1*LhsProgress))*(1*LhsProgress) : 0; enum { pk = 8 }; // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell) const Index peeled_kc = depth & ~(pk-1); - const int prefetch_res_offset = 32/sizeof(ResScalar); + const Index prefetch_res_offset = 32/sizeof(ResScalar); // const Index depth2 = depth & ~1; //---------- Process 3 * LhsProgress rows at once ---------- @@ -1649,48 +965,36 @@ void gebp_kernel); \ - traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ - traits.madd(A2, rhs_panel, C8, T0, fix<0>); \ - traits.updateRhs(blB + (1+4*K) * Traits::RhsProgress, rhs_panel); \ - traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ - traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ - traits.madd(A2, rhs_panel, C9, T0, fix<1>); \ - traits.updateRhs(blB + (2+4*K) * Traits::RhsProgress, rhs_panel); \ - traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ - traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ - traits.madd(A2, rhs_panel, C10, T0, fix<2>); \ - traits.updateRhs(blB + (3+4*K) * Traits::RhsProgress, rhs_panel); \ - traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ - traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ - traits.madd(A2, rhs_panel, C11, T0, fix<3>); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ - } while (false) + internal::prefetch(blA+(3*K+16)*LhsProgress); \ + if (EIGEN_ARCH_ARM) { internal::prefetch(blB+(4*K+16)*RhsProgress); } /* Bug 953 */ \ + traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ + traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ + traits.loadRhs(blB + (0+4*K)*Traits::RhsProgress, B_0); \ + traits.madd(A0, B_0, C0, T0); \ + traits.madd(A1, B_0, C4, T0); \ + traits.madd(A2, B_0, C8, B_0); \ + traits.loadRhs(blB + (1+4*K)*Traits::RhsProgress, B_0); \ + traits.madd(A0, B_0, C1, T0); \ + traits.madd(A1, B_0, C5, T0); \ + traits.madd(A2, B_0, C9, B_0); \ + traits.loadRhs(blB + (2+4*K)*Traits::RhsProgress, B_0); \ + traits.madd(A0, B_0, C2, T0); \ + traits.madd(A1, B_0, C6, T0); \ + traits.madd(A2, B_0, C10, B_0); \ + traits.loadRhs(blB + (3+4*K)*Traits::RhsProgress, B_0); \ + traits.madd(A0, B_0, C3 , T0); \ + traits.madd(A1, B_0, C7, T0); \ + traits.madd(A2, B_0, C11, B_0); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4"); \ + } while(false) internal::prefetch(blB); EIGEN_GEBP_ONESTEP(0); @@ -1710,8 +1014,7 @@ void gebp_kernel(alpha); - R0 = r0.template loadPacket(0 * Traits::ResPacketSize); - R1 = r0.template loadPacket(1 * Traits::ResPacketSize); - R2 = r0.template loadPacket(2 * Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1733,9 +1036,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); - R1 = r1.template loadPacket(1 * Traits::ResPacketSize); - R2 = r1.template loadPacket(2 * Traits::ResPacketSize); + R0 = r1.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(2 * Traits::ResPacketSize); traits.acc(C1, alphav, R0); traits.acc(C5, alphav, R1); traits.acc(C9, alphav, R2); @@ -1743,9 +1046,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); - R1 = r2.template loadPacket(1 * Traits::ResPacketSize); - R2 = r2.template loadPacket(2 * Traits::ResPacketSize); + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r2.loadPacket(2 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C10, alphav, R2); @@ -1753,9 +1056,9 @@ void gebp_kernel(0 * Traits::ResPacketSize); - R1 = r3.template loadPacket(1 * Traits::ResPacketSize); - R2 = r3.template loadPacket(2 * Traits::ResPacketSize); + R0 = r3.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(2 * Traits::ResPacketSize); traits.acc(C3, alphav, R0); traits.acc(C7, alphav, R1); traits.acc(C11, alphav, R2); @@ -1791,20 +1094,20 @@ void gebp_kernel); \ - traits.madd(A1, B_0, C4, B_0, fix<0>); \ - traits.madd(A2, B_0, C8, B_0, fix<0>); \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ - } while (false) - + traits.loadLhs(&blA[(0+3*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+3*K)*LhsProgress], A1); \ + traits.loadLhs(&blA[(2+3*K)*LhsProgress], A2); \ + traits.loadRhs(&blB[(0+K)*RhsProgress], B_0); \ + traits.madd(A0, B_0, C0, B_0); \ + traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A2, B_0, C8, B_0); \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1"); \ + } while(false) + EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); EIGEN_GEBGP_ONESTEP(2); @@ -1832,9 +1135,9 @@ void gebp_kernel(alpha); - R0 = r0.template loadPacket(0 * Traits::ResPacketSize); - R1 = r0.template loadPacket(1 * Traits::ResPacketSize); - R2 = r0.template loadPacket(2 * Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r0.loadPacket(2 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C8, alphav, R2); @@ -1893,8 +1196,7 @@ void gebp_kernel=6 without FMA (bug 1637) @@ -1903,24 +1205,24 @@ void gebp_kernel); \ - traits.madd(A1, rhs_panel, C4, T0, fix<0>); \ - traits.madd(A0, rhs_panel, C1, T0, fix<1>); \ - traits.madd(A1, rhs_panel, C5, T0, fix<1>); \ - traits.madd(A0, rhs_panel, C2, T0, fix<2>); \ - traits.madd(A1, rhs_panel, C6, T0, fix<2>); \ - traits.madd(A0, rhs_panel, C3, T0, fix<3>); \ - traits.madd(A1, rhs_panel, C7, T0, fix<3>); \ - EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ - EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ - } while (false) - + #define EIGEN_GEBGP_ONESTEP(K) \ + do { \ + EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ + traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ + traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ + traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ + traits.madd(A0, B_0, C0, T0); \ + traits.madd(A1, B_0, C4, B_0); \ + traits.madd(A0, B1, C1, T0); \ + traits.madd(A1, B1, C5, B1); \ + traits.madd(A0, B2, C2, T0); \ + traits.madd(A1, B2, C6, B2); \ + traits.madd(A0, B3, C3, T0); \ + traits.madd(A1, B3, C7, B3); \ + EIGEN_GEBP_2PX4_SPILLING_WORKAROUND \ + EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4"); \ + } while(false) + internal::prefetch(blB+(48+0)); EIGEN_GEBGP_ONESTEP(0); EIGEN_GEBGP_ONESTEP(1); @@ -1940,8 +1242,7 @@ void gebp_kernel(alpha); - R0 = r0.template loadPacket(0 * Traits::ResPacketSize); - R1 = r0.template loadPacket(1 * Traits::ResPacketSize); - R2 = r1.template loadPacket(0 * Traits::ResPacketSize); - R3 = r1.template loadPacket(1 * Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); + R2 = r1.loadPacket(0 * Traits::ResPacketSize); + R3 = r1.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); traits.acc(C1, alphav, R2); @@ -1964,10 +1265,10 @@ void gebp_kernel(0 * Traits::ResPacketSize); - R1 = r2.template loadPacket(1 * Traits::ResPacketSize); - R2 = r3.template loadPacket(0 * Traits::ResPacketSize); - R3 = r3.template loadPacket(1 * Traits::ResPacketSize); + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r2.loadPacket(1 * Traits::ResPacketSize); + R2 = r3.loadPacket(0 * Traits::ResPacketSize); + R3 = r3.loadPacket(1 * Traits::ResPacketSize); traits.acc(C2, alphav, R0); traits.acc(C6, alphav, R1); traits.acc(C3, alphav, R2); @@ -2012,8 +1313,8 @@ void gebp_kernel); \ - traits.madd(A1, B_0, C4, B_0, fix<0>); \ + traits.madd(A0, B_0, C0, B1); \ + traits.madd(A1, B_0, C4, B_0); \ EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1"); \ } while(false) @@ -2044,8 +1345,8 @@ void gebp_kernel(alpha); - R0 = r0.template loadPacket(0 * Traits::ResPacketSize); - R1 = r0.template loadPacket(1 * Traits::ResPacketSize); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r0.loadPacket(1 * Traits::ResPacketSize); traits.acc(C0, alphav, R0); traits.acc(C4, alphav, R1); r0.storePacket(0 * Traits::ResPacketSize, R0); @@ -2057,43 +1358,186 @@ void gebp_kernel=1*Traits::LhsProgress) { - lhs_process_one_packet p; - p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); - } - //---------- Process LhsProgressHalf rows at once ---------- - if((LhsProgressHalf < LhsProgress) && mr>=LhsProgressHalf) - { - lhs_process_fraction_of_packet p; - p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); - } - //---------- Process LhsProgressQuarter rows at once ---------- - if((LhsProgressQuarter < LhsProgressHalf) && mr>=LhsProgressQuarter) - { - lhs_process_fraction_of_packet p; - p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB, prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4); + // loops on each largest micro horizontal panel of lhs (1*LhsProgress x depth) + for(Index i=peeled_mc2; i(alpha); + + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + R1 = r1.loadPacket(0 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + traits.acc(C1, alphav, R1); + r0.storePacket(0 * Traits::ResPacketSize, R0); + r1.storePacket(0 * Traits::ResPacketSize, R1); + + R0 = r2.loadPacket(0 * Traits::ResPacketSize); + R1 = r3.loadPacket(0 * Traits::ResPacketSize); + traits.acc(C2, alphav, R0); + traits.acc(C3, alphav, R1); + r2.storePacket(0 * Traits::ResPacketSize, R0); + r3.storePacket(0 * Traits::ResPacketSize, R1); + } + + // Deal with remaining columns of the rhs + for(Index j2=packet_cols4; j2(alpha); + R0 = r0.loadPacket(0 * Traits::ResPacketSize); + traits.acc(C0, alphav, R0); + r0.storePacket(0 * Traits::ResPacketSize, R0); + } + } } //---------- Process remaining rows, 1 at once ---------- - if(peeled_mc_quarter::half>::size; - const int SResPacketQuarterSize = unpacket_traits::half>::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && - (SwappedTraits::LhsProgress<=16) && - (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr) && - (SwappedTraits::LhsProgress!=16 || SResPacketQuarterSize==nr)) + (SwappedTraits::LhsProgress <= 8) && + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr)) { SAccPacket C0, C1, C2, C3; straits.initAcc(C0); @@ -2116,15 +1560,15 @@ void gebp_kernel); - straits.madd(A1,B_1,C1,B_1, fix<0>); + straits.madd(A0,B_0,C0,B_0); + straits.madd(A1,B_1,C1,B_1); straits.loadLhsUnaligned(blB+2*SwappedTraits::LhsProgress, A0); straits.loadLhsUnaligned(blB+3*SwappedTraits::LhsProgress, A1); straits.loadRhsQuad(blA+2*spk, B_0); straits.loadRhsQuad(blA+3*spk, B_1); - straits.madd(A0,B_0,C2,B_0, fix<0>); - straits.madd(A1,B_1,C3,B_1, fix<0>); + straits.madd(A0,B_0,C2,B_0); + straits.madd(A1,B_1,C3,B_1); blB += 4*SwappedTraits::LhsProgress; blA += 4*spk; @@ -2137,7 +1581,7 @@ void gebp_kernel); + straits.madd(A0,B_0,C0,B_0); blB += SwappedTraits::LhsProgress; blA += spk; @@ -2147,7 +1591,7 @@ void gebp_kernel=8,typename unpacket_traits::half,SResPacket>::type SResPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SLhsPacket>::type SLhsPacketHalf; - typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; + typedef typename conditional=8,typename unpacket_traits::half,SRhsPacket>::type SRhsPacketHalf; typedef typename conditional=8,typename unpacket_traits::half,SAccPacket>::type SAccPacketHalf; SResPacketHalf R = res.template gatherPacket(i, j2); @@ -2160,25 +1604,16 @@ void gebp_kernel); + SAccPacketHalf c0 = predux_downto4(C0); + straits.madd(a0,b0,c0,b0); straits.acc(c0, alphav, R); } else { - straits.acc(predux_half_dowto4(C0), alphav, R); + straits.acc(predux_downto4(C0), alphav, R); } res.scatterPacket(i, j2, R); } - else if (SwappedTraits::LhsProgress==16) - { - // Special case where we have to first reduce the - // accumulation register C0. We specialize the block in - // template form, so that LhsProgress < 16 paths don't - // fail to compile - last_row_process_16_packets p; - p(res, straits, blA, blB, depth, endk, i, j2,alpha, C0); - } else { SResPacket R = res.template gatherPacket(i, j2); @@ -2222,7 +1657,7 @@ void gebp_kernel -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename unpacket_traits::half HalfPacket; - typedef typename unpacket_traits::half>::half QuarterPacket; - enum { PacketSize = unpacket_traits::size, - HalfPacketSize = unpacket_traits::size, - QuarterPacketSize = unpacket_traits::size, - HasHalf = (int)HalfPacketSize < (int)PacketSize, - HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + typedef typename packet_traits::type Packet; + enum { PacketSize = packet_traits::size }; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -2287,12 +1717,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; - const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; - const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? (rows/(QuarterPacketSize))*(QuarterPacketSize) : 0; - const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0; - const Index peeled_mc0 = Pack2>=PacketSize ? peeled_mc_quarter - : Pack2>1 && last_lhs_progress ? (rows/last_lhs_progress)*last_lhs_progress : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + const Index peeled_mc0 = Pack2>=1*PacketSize ? peeled_mc1 + : Pack2>1 ? (rows/Pack2)*Pack2 : 0; Index i=0; @@ -2306,9 +1733,9 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); - B = lhs.template loadPacket(i+1*PacketSize, k); - C = lhs.template loadPacket(i+2*PacketSize, k); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); + C = lhs.loadPacket(i+2*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; pstore(blockA+count, cj.pconj(C)); count+=PacketSize; @@ -2326,8 +1753,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); - B = lhs.template loadPacket(i+1*PacketSize, k); + A = lhs.loadPacket(i+0*PacketSize, k); + B = lhs.loadPacket(i+1*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; pstore(blockA+count, cj.pconj(B)); count+=PacketSize; } @@ -2344,67 +1771,27 @@ EIGEN_DONT_INLINE void gemm_pack_lhs(i+0*PacketSize, k); + A = lhs.loadPacket(i+0*PacketSize, k); pstore(blockA+count, cj.pconj(A)); count+=PacketSize; } if(PanelMode) count += (1*PacketSize) * (stride-offset-depth); } } - // Pack half packets - if(HasHalf && Pack1>=HalfPacketSize) - { - for(; i(i+0*(HalfPacketSize), k); - pstoreu(blockA+count, cj.pconj(A)); - count+=HalfPacketSize; - } - if(PanelMode) count += (HalfPacketSize) * (stride-offset-depth); - } - } - // Pack quarter packets - if(HasQuarter && Pack1>=QuarterPacketSize) - { - for(; i(i+0*(QuarterPacketSize), k); - pstoreu(blockA+count, cj.pconj(A)); - count+=QuarterPacketSize; - } - if(PanelMode) count += (QuarterPacketSize) * (stride-offset-depth); - } - } - // Pack2 may be *smaller* than PacketSize—that happens for - // products like real * complex, where we have to go half the - // progress on the lhs in order to duplicate those operands to - // address both real & imaginary parts on the rhs. This portion will - // pack those half ones until they match the number expected on the - // last peeling loop at this point (for the rhs). + // Pack scalars if(Pack21) { - for(; i -struct gemm_pack_lhs +template +struct gemm_pack_lhs { typedef typename DataMapper::LinearMapper LinearMapper; EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride=0, Index offset=0); }; -template -EIGEN_DONT_INLINE void gemm_pack_lhs +template +EIGEN_DONT_INLINE void gemm_pack_lhs ::operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) { - typedef typename unpacket_traits::half HalfPacket; - typedef typename unpacket_traits::half>::half QuarterPacket; - enum { PacketSize = unpacket_traits::size, - HalfPacketSize = unpacket_traits::size, - QuarterPacketSize = unpacket_traits::size, - HasHalf = (int)HalfPacketSize < (int)PacketSize, - HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; + typedef typename packet_traits::type Packet; + enum { PacketSize = packet_traits::size }; EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS"); EIGEN_UNUSED_VARIABLE(stride); @@ -2439,51 +1821,37 @@ EIGEN_DONT_INLINE void gemm_pack_lhs=depth && offset<=stride)); conj_if::IsComplex && Conjugate> cj; Index count = 0; - bool gone_half = false, gone_quarter = false, gone_last = false; - Index i = 0; +// const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; +// const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; +// const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; + int pack = Pack1; - int psize = PacketSize; + Index i = 0; while(pack>0) { Index remaining_rows = rows-i; - Index peeled_mc = gone_last ? Pack2>1 ? (rows/pack)*pack : 0 : i+(remaining_rows/pack)*pack; - Index starting_pos = i; + Index peeled_mc = i+(remaining_rows/pack)*pack; for(; i=psize && psize >= QuarterPacketSize) + if(pack>=PacketSize) { - const Index peeled_k = (depth/psize)*psize; - for(; k kernel; - for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); - ptranspose(kernel); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); - } else if (HasHalf && psize == HalfPacketSize) { - gone_half = true; - PacketBlock kernel_half; - for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); - ptranspose(kernel_half); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); - } else if (HasQuarter && psize == QuarterPacketSize) { - gone_quarter = true; - PacketBlock kernel_quarter; - for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); - ptranspose(kernel_quarter); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); - } + PacketBlock kernel; + for (int p = 0; p < PacketSize; ++p) kernel.packet[p] = lhs.loadPacket(i+p+m, k); + ptranspose(kernel); + for (int p = 0; p < PacketSize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } - count += psize*pack; + count += PacketSize*pack; } } - for(; k= psize/2 || left >= psize/4) && - ((psize/2 == HalfPacketSize && HasHalf && !gone_half) || - (psize/2 == QuarterPacketSize && HasQuarter && !gone_quarter))) { - psize /= 2; - pack = psize; - continue; - } - // Pack2 may be *smaller* than PacketSize—that happens for - // products like real * complex, where we have to go half the - // progress on the lhs in order to duplicate those operands to - // address both real & imaginary parts on the rhs. This portion will - // pack those half ones until they match the number expected on the - // last peeling loop at this point (for the rhs). - if (Pack2 < PacketSize && !gone_last) { - gone_last = true; - psize = pack = left & ~1; - } - } + pack -= PacketSize; + if(pack kernel; @@ -2630,10 +1979,10 @@ EIGEN_DONT_INLINE void gemm_pack_rhs kernel; - kernel.packet[0 ] = dm0.template loadPacket(k); - kernel.packet[1%PacketSize] = dm1.template loadPacket(k); - kernel.packet[2%PacketSize] = dm2.template loadPacket(k); - kernel.packet[3%PacketSize] = dm3.template loadPacket(k); + kernel.packet[0] = dm0.loadPacket(k); + kernel.packet[1%PacketSize] = dm1.loadPacket(k); + kernel.packet[2%PacketSize] = dm2.loadPacket(k); + kernel.packet[3%PacketSize] = dm3.loadPacket(k); ptranspose(kernel); pstoreu(blockB+count+0*PacketSize, cj.pconj(kernel.packet[0])); pstoreu(blockB+count+1*PacketSize, cj.pconj(kernel.packet[1%PacketSize])); @@ -2674,14 +2023,8 @@ template { typedef typename packet_traits::type Packet; - typedef typename unpacket_traits::half HalfPacket; - typedef typename unpacket_traits::half>::half QuarterPacket; typedef typename DataMapper::LinearMapper LinearMapper; - enum { PacketSize = packet_traits::size, - HalfPacketSize = unpacket_traits::size, - QuarterPacketSize = unpacket_traits::size, - HasHalf = (int)HalfPacketSize < (int)PacketSize, - HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize }; + enum { PacketSize = packet_traits::size }; EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride=0, Index offset=0); }; @@ -2740,17 +2083,9 @@ EIGEN_DONT_INLINE void gemm_pack_rhs(k, j2); + Packet A = rhs.loadPacket(k, j2); pstoreu(blockB+count, cj.pconj(A)); count += PacketSize; - } else if (HasHalf && HalfPacketSize==4) { - HalfPacket A = rhs.template loadPacket(k, j2); - pstoreu(blockB+count, cj.pconj(A)); - count += HalfPacketSize; - } else if (HasQuarter && QuarterPacketSize==4) { - QuarterPacket A = rhs.template loadPacket(k, j2); - pstoreu(blockB+count, cj.pconj(A)); - count += QuarterPacketSize; } else { const LinearMapper dm0 = rhs.getLinearMapper(k, j2); blockB[count+0] = cj(dm0(0)); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h index 508c05c97..ed6234c37 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -77,7 +77,7 @@ static void run(Index rows, Index cols, Index depth, Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction Index nc = (std::min)(cols,blocking.nc()); // cache block size along the N direction - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; @@ -110,7 +110,7 @@ static void run(Index rows, Index cols, Index depth, // i.e., we test that info[tid].users equals 0. // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it. while(info[tid].users!=0) {} - info[tid].users = threads; + info[tid].users += threads; pack_lhs(blockA+info[tid].lhs_start*actual_kc, lhs.getSubMapper(info[tid].lhs_start,k), actual_kc, info[tid].lhs_length); @@ -148,9 +148,7 @@ static void run(Index rows, Index cols, Index depth, // Release all the sub blocks A'_i of A' for the current thread, // i.e., we simply decrement the number of users by 1 for(Index i=0; i template static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program - // to determine the following heuristic. - // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h, - // unless it has been specialized by the user or for a given architecture. - // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs. - // I'm not sure it is still required. - if((rhs.rows()+dst.rows()+dst.cols())0) + if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op()); else { @@ -449,7 +441,7 @@ struct generic_product_impl template static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())0) + if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else scaleAndAddTo(dst,lhs, rhs, Scalar(1)); @@ -458,7 +450,7 @@ struct generic_product_impl template static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { - if((rhs.rows()+dst.rows()+dst.cols())0) + if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op()); else scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); @@ -471,20 +463,6 @@ struct generic_product_impl if(a_lhs.cols()==0 || a_lhs.rows()==0 || a_rhs.cols()==0) return; - // Fallback to GEMV if either the lhs or rhs is a runtime vector - if (dst.cols() == 1) - { - typename Dest::ColXpr dst_vec(dst.col(0)); - return internal::generic_product_impl - ::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha); - } - else if (dst.rows() == 1) - { - typename Dest::RowXpr dst_vec(dst.row(0)); - return internal::generic_product_impl - ::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha); - } - typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 6ba0d9bdb..d68d2f965 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -87,7 +87,7 @@ struct general_matrix_matrix_triangular_product pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; tribb_kernel sybb; @@ -302,13 +302,13 @@ struct general_product_to_triangular_selector template template -EIGEN_DEVICE_FUNC TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) +TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) { EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED); eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); - + general_product_to_triangular_selector::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); - + return derived(); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 9a650ec23..691f95d69 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -37,7 +37,7 @@ namespace Eigen { namespace internal { -template +template struct general_matrix_matrix_rankupdate : general_matrix_matrix_triangular_product< Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {}; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixVector.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixVector.h index dfb6aebce..a597c1f4e 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/GeneralMatrixVector.h @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 2008-2016 Gael Guennebaud +// Copyright (C) 2008-2009 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -14,57 +14,11 @@ namespace Eigen { namespace internal { -enum GEMVPacketSizeType { - GEMVPacketFull = 0, - GEMVPacketHalf, - GEMVPacketQuarter -}; - -template -struct gemv_packet_cond { typedef T3 type; }; - -template -struct gemv_packet_cond { typedef T1 type; }; - -template -struct gemv_packet_cond { typedef T2 type; }; - -template -class gemv_traits -{ - typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - -#define PACKET_DECL_COND_PREFIX(prefix, name, packet_size) \ - typedef typename gemv_packet_cond::type, \ - typename packet_traits::half, \ - typename unpacket_traits::half>::half>::type \ - prefix ## name ## Packet - - PACKET_DECL_COND_PREFIX(_, Lhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Rhs, _PacketSize); - PACKET_DECL_COND_PREFIX(_, Res, _PacketSize); -#undef PACKET_DECL_COND_PREFIX - -public: - enum { - Vectorizable = unpacket_traits<_LhsPacket>::vectorizable && - unpacket_traits<_RhsPacket>::vectorizable && - int(unpacket_traits<_LhsPacket>::size)==int(unpacket_traits<_RhsPacket>::size), - LhsPacketSize = Vectorizable ? unpacket_traits<_LhsPacket>::size : 1, - RhsPacketSize = Vectorizable ? unpacket_traits<_RhsPacket>::size : 1, - ResPacketSize = Vectorizable ? unpacket_traits<_ResPacket>::size : 1 - }; - - typedef typename conditional::type LhsPacket; - typedef typename conditional::type RhsPacket; - typedef typename conditional::type ResPacket; -}; - - /* Optimized col-major matrix * vector product: - * This algorithm processes the matrix per vertical panels, - * which are then processed horizontaly per chunck of 8*PacketSize x 1 vertical segments. + * This algorithm processes 4 columns at onces that allows to both reduce + * the number of load/stores of the result by a factor 4 and to reduce + * the instruction dependency. Moreover, we know that all bands have the + * same alignment pattern. * * Mixing type logic: C += alpha * A * B * | A | B |alpha| comments @@ -73,30 +27,56 @@ public: * |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp * |cplx |real |real | optimal case, vectorization possible via real-cplx mul * + * Accesses to the matrix coefficients follow the following logic: + * + * - if all columns have the same alignment then + * - if the columns have the same alignment as the result vector, then easy! (-> AllAligned case) + * - otherwise perform unaligned loads only (-> NoneAligned case) + * - otherwise + * - if even columns have the same alignment then + * // odd columns are guaranteed to have the same alignment too + * - if even or odd columns have the same alignment as the result, then + * // for a register size of 2 scalars, this is guarantee to be the case (e.g., SSE with double) + * - perform half aligned and half unaligned loads (-> EvenAligned case) + * - otherwise perform unaligned loads only (-> NoneAligned case) + * - otherwise, if the register size is 4 scalars (e.g., SSE with float) then + * - one over 4 consecutive columns is guaranteed to be aligned with the result vector, + * perform simple aligned loads for this column and aligned loads plus re-alignment for the other. (-> FirstAligned case) + * // this re-alignment is done by the palign function implemented for SSE in Eigen/src/Core/arch/SSE/PacketMath.h + * - otherwise, + * // if we get here, this means the register size is greater than 4 (e.g., AVX with floats), + * // we currently fall back to the NoneAligned case + * * The same reasoning apply for the transposed case. + * + * The last case (PacketSize>4) could probably be improved by generalizing the FirstAligned case, but since we do not support AVX yet... + * One might also wonder why in the EvenAligned case we perform unaligned loads instead of using the aligned-loads plus re-alignment + * strategy as in the FirstAligned case. The reason is that we observed that unaligned loads on a 8 byte boundary are not too slow + * compared to unaligned loads on a 4 byte boundary. + * */ template struct general_matrix_vector_product { - typedef gemv_traits Traits; - typedef gemv_traits HalfTraits; - typedef gemv_traits QuarterTraits; - typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - typedef typename Traits::LhsPacket LhsPacket; - typedef typename Traits::RhsPacket RhsPacket; - typedef typename Traits::ResPacket ResPacket; +enum { + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable + && int(packet_traits::size)==int(packet_traits::size), + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1 +}; - typedef typename HalfTraits::LhsPacket LhsPacketHalf; - typedef typename HalfTraits::RhsPacket RhsPacketHalf; - typedef typename HalfTraits::ResPacket ResPacketHalf; +typedef typename packet_traits::type _LhsPacket; +typedef typename packet_traits::type _RhsPacket; +typedef typename packet_traits::type _ResPacket; - typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; - typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; - typedef typename QuarterTraits::ResPacket ResPacketQuarter; +typedef typename conditional::type LhsPacket; +typedef typename conditional::type RhsPacket; +typedef typename conditional::type ResPacket; -EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( +EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -105,187 +85,244 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( }; template -EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::run( +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsMapper& alhs, + const LhsMapper& lhs, const RhsMapper& rhs, ResScalar* res, Index resIncr, RhsScalar alpha) { EIGEN_UNUSED_VARIABLE(resIncr); eigen_internal_assert(resIncr==1); + #ifdef _EIGEN_ACCUMULATE_PACKETS + #error _EIGEN_ACCUMULATE_PACKETS has already been defined + #endif + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) \ + pstore(&res[j], \ + padd(pload(&res[j]), \ + padd( \ + padd(pcj.pmul(lhs0.template load(j), ptmp0), \ + pcj.pmul(lhs1.template load(j), ptmp1)), \ + padd(pcj.pmul(lhs2.template load(j), ptmp2), \ + pcj.pmul(lhs3.template load(j), ptmp3)) ))) - // The following copy tells the compiler that lhs's attributes are not modified outside this function - // This helps GCC to generate propoer code. - LhsMapper lhs(alhs); + typedef typename LhsMapper::VectorMapper LhsScalars; conj_helper cj; conj_helper pcj; - conj_helper pcj_half; - conj_helper pcj_quarter; + if(ConjugateRhs) + alpha = numext::conj(alpha); + + enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned }; + const Index columnsAtOnce = 4; + const Index peels = 2; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index ResPacketAlignedMask = ResPacketSize-1; +// const Index PeelAlignedMask = ResPacketSize*peels-1; + const Index size = rows; const Index lhsStride = lhs.stride(); - // TODO: for padded aligned inputs, we could enable aligned reads - enum { LhsAlignment = Unaligned, - ResPacketSize = Traits::ResPacketSize, - ResPacketSizeHalf = HalfTraits::ResPacketSize, - ResPacketSizeQuarter = QuarterTraits::ResPacketSize, - LhsPacketSize = Traits::LhsPacketSize, - HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize, - HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf - }; - const Index n8 = rows-8*ResPacketSize+1; - const Index n4 = rows-4*ResPacketSize+1; - const Index n3 = rows-3*ResPacketSize+1; - const Index n2 = rows-2*ResPacketSize+1; - const Index n1 = rows-1*ResPacketSize+1; - const Index n_half = rows-1*ResPacketSizeHalf+1; - const Index n_quarter = rows-1*ResPacketSizeQuarter+1; + // How many coeffs of the result do we have to skip to be aligned. + // Here we assume data are at least aligned on the base scalar type. + Index alignedStart = internal::first_default_aligned(res,size); + Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0; + const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; - // TODO: improve the following heuristic: - const Index block_cols = cols<128 ? cols : (lhsStride*sizeof(LhsScalar)<32000?16:4); - ResPacket palpha = pset1(alpha); - ResPacketHalf palpha_half = pset1(alpha); - ResPacketQuarter palpha_quarter = pset1(alpha); + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; + Index alignmentPattern = alignmentStep==0 ? AllAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; - for(Index j2=0; j2(ResScalar(0)), - c1 = pset1(ResScalar(0)), - c2 = pset1(ResScalar(0)), - c3 = pset1(ResScalar(0)), - c4 = pset1(ResScalar(0)), - c5 = pset1(ResScalar(0)), - c6 = pset1(ResScalar(0)), - c7 = pset1(ResScalar(0)); + alignedSize = 0; + alignedStart = 0; + alignmentPattern = NoneAligned; + } + else if(LhsPacketSize > 4) + { + // TODO: extend the code to support aligned loads whenever possible when LhsPacketSize > 4. + // Currently, it seems to be better to perform unaligned loads anyway + alignmentPattern = NoneAligned; + } + else if (LhsPacketSize>1) + { + // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size(rhs(j,0)); - c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); - c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); - c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); - c3 = pcj.pmadd(lhs.template load(i+LhsPacketSize*3,j),b0,c3); - c4 = pcj.pmadd(lhs.template load(i+LhsPacketSize*4,j),b0,c4); - c5 = pcj.pmadd(lhs.template load(i+LhsPacketSize*5,j),b0,c5); - c6 = pcj.pmadd(lhs.template load(i+LhsPacketSize*6,j),b0,c6); - c7 = pcj.pmadd(lhs.template load(i+LhsPacketSize*7,j),b0,c7); - } - pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); - pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); - pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); - pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu(res+i+ResPacketSize*3))); - pstoreu(res+i+ResPacketSize*4, pmadd(c4,palpha,ploadu(res+i+ResPacketSize*4))); - pstoreu(res+i+ResPacketSize*5, pmadd(c5,palpha,ploadu(res+i+ResPacketSize*5))); - pstoreu(res+i+ResPacketSize*6, pmadd(c6,palpha,ploadu(res+i+ResPacketSize*6))); - pstoreu(res+i+ResPacketSize*7, pmadd(c7,palpha,ploadu(res+i+ResPacketSize*7))); - } - if(i(ResScalar(0)), - c1 = pset1(ResScalar(0)), - c2 = pset1(ResScalar(0)), - c3 = pset1(ResScalar(0)); + // nothing can be aligned, no need to skip any column + alignmentPattern = NoneAligned; + skipColumns = 0; + } + else + { + skipColumns = (std::min)(skipColumns,cols); + // note that the skiped columns are processed later. + } - for(Index j=j2; j(rhs(j,0)); - c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); - c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); - c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); - c3 = pcj.pmadd(lhs.template load(i+LhsPacketSize*3,j),b0,c3); - } - pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); - pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); - pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); - pstoreu(res+i+ResPacketSize*3, pmadd(c3,palpha,ploadu(res+i+ResPacketSize*3))); + /* eigen_internal_assert( (alignmentPattern==NoneAligned) + || (skipColumns + columnsAtOnce >= cols) + || LhsPacketSize > size + || (size_t(firstLhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);*/ + } + else if(Vectorizable) + { + alignedStart = 0; + alignedSize = size; + alignmentPattern = AllAligned; + } - i+=ResPacketSize*4; - } - if(i(ResScalar(0)), - c1 = pset1(ResScalar(0)), - c2 = pset1(ResScalar(0)); + const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1; + const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3; - for(Index j=j2; j(rhs(j,0)); - c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); - c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); - c2 = pcj.pmadd(lhs.template load(i+LhsPacketSize*2,j),b0,c2); - } - pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); - pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); - pstoreu(res+i+ResPacketSize*2, pmadd(c2,palpha,ploadu(res+i+ResPacketSize*2))); + Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns; + for (Index i=skipColumns; i(alpha*rhs(i, 0)), + ptmp1 = pset1(alpha*rhs(i+offset1, 0)), + ptmp2 = pset1(alpha*rhs(i+2, 0)), + ptmp3 = pset1(alpha*rhs(i+offset3, 0)); - i+=ResPacketSize*3; - } - if(i(ResScalar(0)), - c1 = pset1(ResScalar(0)); + // this helps a lot generating better binary code + const LhsScalars lhs0 = lhs.getVectorMapper(0, i+0), lhs1 = lhs.getVectorMapper(0, i+offset1), + lhs2 = lhs.getVectorMapper(0, i+2), lhs3 = lhs.getVectorMapper(0, i+offset3); - for(Index j=j2; j(rhs(j,0)); - c0 = pcj.pmadd(lhs.template load(i+LhsPacketSize*0,j),b0,c0); - c1 = pcj.pmadd(lhs.template load(i+LhsPacketSize*1,j),b0,c1); - } - pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); - pstoreu(res+i+ResPacketSize*1, pmadd(c1,palpha,ploadu(res+i+ResPacketSize*1))); - i+=ResPacketSize*2; - } - if(i(ResScalar(0)); - for(Index j=j2; j(rhs(j,0)); - c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); + res[j] = cj.pmadd(lhs0(j), pfirst(ptmp0), res[j]); + res[j] = cj.pmadd(lhs1(j), pfirst(ptmp1), res[j]); + res[j] = cj.pmadd(lhs2(j), pfirst(ptmp2), res[j]); + res[j] = cj.pmadd(lhs3(j), pfirst(ptmp3), res[j]); } - pstoreu(res+i+ResPacketSize*0, pmadd(c0,palpha,ploadu(res+i+ResPacketSize*0))); - i+=ResPacketSize; - } - if(HasHalf && i(ResScalar(0)); - for(Index j=j2; jalignedStart) { - RhsPacketHalf b0 = pset1(rhs(j,0)); - c0 = pcj_half.pmadd(lhs.template load(i+0,j),b0,c0); + switch(alignmentPattern) + { + case AllAligned: + for (Index j = alignedStart; j1) + { + LhsPacket A00, A01, A02, A03, A10, A11, A12, A13; + ResPacket T0, T1; + + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); + + for (; j(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); + + A00 = lhs0.template load(j); + A10 = lhs0.template load(j+LhsPacketSize); + T0 = pcj.pmadd(A00, ptmp0, pload(&res[j])); + T1 = pcj.pmadd(A10, ptmp0, pload(&res[j+ResPacketSize])); + + T0 = pcj.pmadd(A01, ptmp1, T0); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); + T0 = pcj.pmadd(A02, ptmp2, T0); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); + T0 = pcj.pmadd(A03, ptmp3, T0); + pstore(&res[j],T0); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); + T1 = pcj.pmadd(A11, ptmp1, T1); + T1 = pcj.pmadd(A12, ptmp2, T1); + T1 = pcj.pmadd(A13, ptmp3, T1); + pstore(&res[j+ResPacketSize],T1); + } + } + for (; j(res+i+ResPacketSizeHalf*0))); - i+=ResPacketSizeHalf; - } - if(HasQuarter && i(ResScalar(0)); - for(Index j=j2; j(rhs(j,0)); - c0 = pcj_quarter.pmadd(lhs.template load(i+0,j),b0,c0); - } - pstoreu(res+i+ResPacketSizeQuarter*0, pmadd(c0,palpha_quarter,ploadu(res+i+ResPacketSizeQuarter*0))); - i+=ResPacketSizeQuarter; - } - for(;i(alpha*rhs(k, 0)); + const LhsScalars lhs0 = lhs.getVectorMapper(0, k); + + if (Vectorizable) + { + /* explicit vectorization */ + // process first unaligned result's coeffs + for (Index j=0; j(alignedStart)) + for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); + else + for (Index i = alignedStart;i(i), ptmp0, pload(&res[i]))); + } + + // process remaining scalars (or all if no explicit vectorization) + for (Index i=alignedSize; i struct general_matrix_vector_product { - typedef gemv_traits Traits; - typedef gemv_traits HalfTraits; - typedef gemv_traits QuarterTraits; +typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; +enum { + Vectorizable = packet_traits::Vectorizable && packet_traits::Vectorizable + && int(packet_traits::size)==int(packet_traits::size), + LhsPacketSize = Vectorizable ? packet_traits::size : 1, + RhsPacketSize = Vectorizable ? packet_traits::size : 1, + ResPacketSize = Vectorizable ? packet_traits::size : 1 +}; - typedef typename Traits::LhsPacket LhsPacket; - typedef typename Traits::RhsPacket RhsPacket; - typedef typename Traits::ResPacket ResPacket; +typedef typename packet_traits::type _LhsPacket; +typedef typename packet_traits::type _RhsPacket; +typedef typename packet_traits::type _ResPacket; - typedef typename HalfTraits::LhsPacket LhsPacketHalf; - typedef typename HalfTraits::RhsPacket RhsPacketHalf; - typedef typename HalfTraits::ResPacket ResPacketHalf; +typedef typename conditional::type LhsPacket; +typedef typename conditional::type RhsPacket; +typedef typename conditional::type ResPacket; - typedef typename QuarterTraits::LhsPacket LhsPacketQuarter; - typedef typename QuarterTraits::RhsPacket RhsPacketQuarter; - typedef typename QuarterTraits::ResPacket ResPacketQuarter; - -EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( +EIGEN_DONT_INLINE static void run( Index rows, Index cols, const LhsMapper& lhs, const RhsMapper& rhs, @@ -324,191 +361,255 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run( }; template -EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::run( +EIGEN_DONT_INLINE void general_matrix_vector_product::run( Index rows, Index cols, - const LhsMapper& alhs, + const LhsMapper& lhs, const RhsMapper& rhs, ResScalar* res, Index resIncr, ResScalar alpha) { - // The following copy tells the compiler that lhs's attributes are not modified outside this function - // This helps GCC to generate propoer code. - LhsMapper lhs(alhs); - eigen_internal_assert(rhs.stride()==1); + + #ifdef _EIGEN_ACCUMULATE_PACKETS + #error _EIGEN_ACCUMULATE_PACKETS has already been defined + #endif + + #define _EIGEN_ACCUMULATE_PACKETS(Alignment0,Alignment13,Alignment2) {\ + RhsPacket b = rhs.getVectorMapper(j, 0).template load(0); \ + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); \ + ptmp1 = pcj.pmadd(lhs1.template load(j), b, ptmp1); \ + ptmp2 = pcj.pmadd(lhs2.template load(j), b, ptmp2); \ + ptmp3 = pcj.pmadd(lhs3.template load(j), b, ptmp3); } + conj_helper cj; conj_helper pcj; - conj_helper pcj_half; - conj_helper pcj_quarter; - // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large, - // processing 8 rows at once might be counter productive wrt cache. - const Index n8 = lhs.stride()*sizeof(LhsScalar)>32000 ? 0 : rows-7; - const Index n4 = rows-3; - const Index n2 = rows-1; + typedef typename LhsMapper::VectorMapper LhsScalars; - // TODO: for padded aligned inputs, we could enable aligned reads - enum { LhsAlignment = Unaligned, - ResPacketSize = Traits::ResPacketSize, - ResPacketSizeHalf = HalfTraits::ResPacketSize, - ResPacketSizeQuarter = QuarterTraits::ResPacketSize, - LhsPacketSize = Traits::LhsPacketSize, - LhsPacketSizeHalf = HalfTraits::LhsPacketSize, - LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize, - HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize, - HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf - }; + enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 }; + const Index rowsAtOnce = 4; + const Index peels = 2; + const Index RhsPacketAlignedMask = RhsPacketSize-1; + const Index LhsPacketAlignedMask = LhsPacketSize-1; + const Index depth = cols; + const Index lhsStride = lhs.stride(); - Index i=0; - for(; i1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0; + const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1; + + const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0; + Index alignmentPattern = alignmentStep==0 ? AllAligned + : alignmentStep==(LhsPacketSize/2) ? EvenAligned + : FirstAligned; + + // we cannot assume the first element is aligned because of sub-matrices + const Index lhsAlignmentOffset = lhs.firstAligned(depth); + const Index rhsAlignmentOffset = rhs.firstAligned(rows); + + // find how many rows do we have to skip to be aligned with rhs (if possible) + Index skipRows = 0; + // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats) + if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || + (lhsAlignmentOffset < 0) || (lhsAlignmentOffset == depth) || + (rhsAlignmentOffset < 0) || (rhsAlignmentOffset == rows) ) { - ResPacket c0 = pset1(ResScalar(0)), - c1 = pset1(ResScalar(0)), - c2 = pset1(ResScalar(0)), - c3 = pset1(ResScalar(0)), - c4 = pset1(ResScalar(0)), - c5 = pset1(ResScalar(0)), - c6 = pset1(ResScalar(0)), - c7 = pset1(ResScalar(0)); - - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) - { - RhsPacket b0 = rhs.template load(j,0); - - c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); - c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); - c2 = pcj.pmadd(lhs.template load(i+2,j),b0,c2); - c3 = pcj.pmadd(lhs.template load(i+3,j),b0,c3); - c4 = pcj.pmadd(lhs.template load(i+4,j),b0,c4); - c5 = pcj.pmadd(lhs.template load(i+5,j),b0,c5); - c6 = pcj.pmadd(lhs.template load(i+6,j),b0,c6); - c7 = pcj.pmadd(lhs.template load(i+7,j),b0,c7); - } - ResScalar cc0 = predux(c0); - ResScalar cc1 = predux(c1); - ResScalar cc2 = predux(c2); - ResScalar cc3 = predux(c3); - ResScalar cc4 = predux(c4); - ResScalar cc5 = predux(c5); - ResScalar cc6 = predux(c6); - ResScalar cc7 = predux(c7); - for(; j 4) { - ResPacket c0 = pset1(ResScalar(0)), - c1 = pset1(ResScalar(0)), - c2 = pset1(ResScalar(0)), - c3 = pset1(ResScalar(0)); - - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) - { - RhsPacket b0 = rhs.template load(j,0); - - c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); - c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); - c2 = pcj.pmadd(lhs.template load(i+2,j),b0,c2); - c3 = pcj.pmadd(lhs.template load(i+3,j),b0,c3); - } - ResScalar cc0 = predux(c0); - ResScalar cc1 = predux(c1); - ResScalar cc2 = predux(c2); - ResScalar cc3 = predux(c3); - for(; j 4. + alignmentPattern = NoneAligned; } - for(; i1) { - ResPacket c0 = pset1(ResScalar(0)), - c1 = pset1(ResScalar(0)); + // eigen_internal_assert(size_t(firstLhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || depth(j,0); - - c0 = pcj.pmadd(lhs.template load(i+0,j),b0,c0); - c1 = pcj.pmadd(lhs.template load(i+1,j),b0,c1); + // nothing can be aligned, no need to skip any column + alignmentPattern = NoneAligned; + skipRows = 0; } - ResScalar cc0 = predux(c0); - ResScalar cc1 = predux(c1); - for(; j= rows) + || LhsPacketSize > depth + || (size_t(firstLhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);*/ } - for(; i(ResScalar(0)); - ResPacketHalf c0_h = pset1(ResScalar(0)); - ResPacketQuarter c0_q = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + alignedStart = 0; + alignedSize = depth; + alignmentPattern = AllAligned; + } + + const Index offset1 = (alignmentPattern==FirstAligned && alignmentStep==1)?3:1; + const Index offset3 = (alignmentPattern==FirstAligned && alignmentStep==1)?1:3; + + Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows; + for (Index i=skipRows; i(j,0); - c0 = pcj.pmadd(lhs.template load(i,j),b0,c0); - } - ResScalar cc0 = predux(c0); - if (HasHalf) { - for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf) + /* explicit vectorization */ + ResPacket ptmp0 = pset1(ResScalar(0)), ptmp1 = pset1(ResScalar(0)), + ptmp2 = pset1(ResScalar(0)), ptmp3 = pset1(ResScalar(0)); + + // process initial unaligned coeffs + // FIXME this loop get vectorized by the compiler ! + for (Index j=0; jalignedStart) + { + switch(alignmentPattern) { - RhsPacketHalf b0 = rhs.template load(j,0); - c0_h = pcj_half.pmadd(lhs.template load(i,j),b0,c0_h); + case AllAligned: + for (Index j = alignedStart; j1) + { + /* Here we proccess 4 rows with with two peeled iterations to hide + * the overhead of unaligned loads. Moreover unaligned loads are handled + * using special shift/move operations between the two aligned packets + * overlaping the desired unaligned packet. This is *much* more efficient + * than basic unaligned loads. + */ + LhsPacket A01, A02, A03, A11, A12, A13; + A01 = lhs1.template load(alignedStart-1); + A02 = lhs2.template load(alignedStart-2); + A03 = lhs3.template load(alignedStart-3); + + for (; j(0); + A11 = lhs1.template load(j-1+LhsPacketSize); palign<1>(A01,A11); + A12 = lhs2.template load(j-2+LhsPacketSize); palign<2>(A02,A12); + A13 = lhs3.template load(j-3+LhsPacketSize); palign<3>(A03,A13); + + ptmp0 = pcj.pmadd(lhs0.template load(j), b, ptmp0); + ptmp1 = pcj.pmadd(A01, b, ptmp1); + A01 = lhs1.template load(j-1+2*LhsPacketSize); palign<1>(A11,A01); + ptmp2 = pcj.pmadd(A02, b, ptmp2); + A02 = lhs2.template load(j-2+2*LhsPacketSize); palign<2>(A12,A02); + ptmp3 = pcj.pmadd(A03, b, ptmp3); + A03 = lhs3.template load(j-3+2*LhsPacketSize); palign<3>(A13,A03); + + b = rhs.getVectorMapper(j+RhsPacketSize, 0).template load(0); + ptmp0 = pcj.pmadd(lhs0.template load(j+LhsPacketSize), b, ptmp0); + ptmp1 = pcj.pmadd(A11, b, ptmp1); + ptmp2 = pcj.pmadd(A12, b, ptmp2); + ptmp3 = pcj.pmadd(A13, b, ptmp3); + } + } + for (; j(j,0); - c0_q = pcj_quarter.pmadd(lhs.template load(i,j),b0,c0_q); - } - cc0 += predux(c0_q); - } - for(; j(tmp0); + const LhsScalars lhs0 = lhs.getVectorMapper(i, 0); + // process first unaligned result's coeffs + // FIXME this loop get vectorized by the compiler ! + for (Index j=0; jalignedStart) + { + // process aligned rhs coeffs + if (lhs0.template aligned(alignedStart)) + for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); + else + for (Index j = alignedStart;j(j), rhs.getVectorMapper(j, 0).template load(0), ptmp0); + tmp0 += predux(ptmp0); + } + + // process remaining scalars + // FIXME this loop get vectorized by the compiler ! + for (Index j=alignedSize; j -#endif - namespace Eigen { namespace internal { @@ -80,17 +76,8 @@ template struct GemmParallelInfo { GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} - // volatile is not enough on all architectures (see bug 1572) - // to guarantee that when thread A says to thread B that it is - // done with packing a block, then all writes have been really - // carried out... C++11 memory model+atomic guarantees this. -#if EIGEN_HAS_CXX11_ATOMIC - std::atomic sync; - std::atomic users; -#else Index volatile sync; int volatile users; -#endif Index lhs_start; Index lhs_length; @@ -101,14 +88,11 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, { // TODO when EIGEN_USE_BLAS is defined, // we should still enable OMP for other scalar types - // Without C++11, we have to disable GEMM's parallelization on - // non x86 architectures because there volatile is not enough for our purpose. - // See bug 1572. -#if (! defined(EIGEN_HAS_OPENMP)) || defined(EIGEN_USE_BLAS) || ((!EIGEN_HAS_CXX11_ATOMIC) && !(EIGEN_ARCH_i386_OR_x86_64)) +#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS) // FIXME the transpose variable is only needed to properly split // the matrix product when multithreading is enabled. This is a temporary // fix to support row-major destination matrices. This whole - // parallelizer mechanism has to be redesigned anyway. + // parallelizer mechanism has to be redisigned anyway. EIGEN_UNUSED_VARIABLE(depth); EIGEN_UNUSED_VARIABLE(transpose); func(0,rows, 0,cols); @@ -129,12 +113,12 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, double work = static_cast(rows) * static_cast(cols) * static_cast(depth); double kMinTaskSize = 50000; // FIXME improve this heuristic. - pb_max_threads = std::max(1, std::min(pb_max_threads, static_cast( work / kMinTaskSize ) )); + pb_max_threads = std::max(1, std::min(pb_max_threads, work / kMinTaskSize)); // compute the number of threads we are going to use Index threads = std::min(nbThreads(), pb_max_threads); - // if multi-threading is explicitly disabled, not useful, or if we already are in a parallel session, + // if multi-threading is explicitely disabled, not useful, or if we already are in a parallel session, // then abort multi-threading // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp? if((!Condition) || (threads==1) || (omp_get_num_threads()>1)) @@ -148,7 +132,8 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, ei_declare_aligned_stack_constructed_variable(GemmParallelInfo,info,threads,0); - #pragma omp parallel num_threads(threads) + int errorCount = 0; + #pragma omp parallel num_threads(threads) reduction(+: errorCount) { Index i = omp_get_thread_num(); // Note that the actual number of threads might be lower than the number of request ones. @@ -167,9 +152,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, info[i].lhs_start = r0; info[i].lhs_length = actualBlockRows; - if(transpose) func(c0, actualBlockCols, 0, rows, info); - else func(0, rows, c0, actualBlockCols, info); + EIGEN_TRY { + if(transpose) func(c0, actualBlockCols, 0, rows, info); + else func(0, rows, c0, actualBlockCols, info); + } EIGEN_CATCH(...) { + ++errorCount; + } } + if (errorCount) EIGEN_THROW_X(Eigen::eigen_assert_exception()); #endif } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 33ecf10f6..04c933480 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -45,23 +45,14 @@ struct symm_pack_lhs } void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows) { - typedef typename unpacket_traits::type>::half HalfPacket; - typedef typename unpacket_traits::type>::half>::half QuarterPacket; - enum { PacketSize = packet_traits::size, - HalfPacketSize = unpacket_traits::size, - QuarterPacketSize = unpacket_traits::size, - HasHalf = (int)HalfPacketSize < (int)PacketSize, - HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; - + enum { PacketSize = packet_traits::size }; const_blas_data_mapper lhs(_lhs,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; const Index peeled_mc3 = Pack1>=3*PacketSize ? (rows/(3*PacketSize))*(3*PacketSize) : 0; const Index peeled_mc2 = Pack1>=2*PacketSize ? peeled_mc3+((rows-peeled_mc3)/(2*PacketSize))*(2*PacketSize) : 0; - const Index peeled_mc1 = Pack1>=1*PacketSize ? peeled_mc2+((rows-peeled_mc2)/(1*PacketSize))*(1*PacketSize) : 0; - const Index peeled_mc_half = Pack1>=HalfPacketSize ? peeled_mc1+((rows-peeled_mc1)/(HalfPacketSize))*(HalfPacketSize) : 0; - const Index peeled_mc_quarter = Pack1>=QuarterPacketSize ? peeled_mc_half+((rows-peeled_mc_half)/(QuarterPacketSize))*(QuarterPacketSize) : 0; + const Index peeled_mc1 = Pack1>=1*PacketSize ? (rows/(1*PacketSize))*(1*PacketSize) : 0; if(Pack1>=3*PacketSize) for(Index i=0; i(blockA, lhs, cols, i, count); - if(HasHalf && Pack1>=HalfPacketSize) - for(Index i=peeled_mc1; i(blockA, lhs, cols, i, count); - - if(HasQuarter && Pack1>=QuarterPacketSize) - for(Index i=peeled_mc_half; i(blockA, lhs, cols, i, count); - // do the same with mr==1 - for(Index i=peeled_mc_quarter; i gebp_kernel; symm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; - gemm_pack_lhs pack_lhs_transposed; + gemm_pack_lhs pack_lhs_transposed; for(Index k2=0; k2() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha); @@ -459,7 +442,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; symm_pack_rhs pack_rhs; for(Index k2=0; k2 -EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC -void selfadjoint_matrix_vector_product::run( +EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product::run( Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs, @@ -64,7 +62,8 @@ void selfadjoint_matrix_vector_product enum { LhsUpLo = LhsMode&(Upper|Lower) }; template - static EIGEN_DEVICE_FUNC - void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) + static void run(Dest& dest, const Lhs &a_lhs, const Rhs &a_rhs, const Scalar& alpha) { typedef typename Dest::Scalar ResScalar; typedef typename Rhs::Scalar RhsScalar; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointProduct.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointProduct.h index 61e8894e7..ef12c98f6 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointProduct.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointProduct.h @@ -120,7 +120,7 @@ struct selfadjoint_product_selector template template -EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView +SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const Scalar& alpha) { selfadjoint_product_selector::run(_expression().const_cast_derived(), u.derived(), alpha); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointRank2Update.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointRank2Update.h index 09209f733..2ae364111 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -24,8 +24,7 @@ struct selfadjoint_rank2_update_selector; template struct selfadjoint_rank2_update_selector { - static EIGEN_DEVICE_FUNC - void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) + static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) { const Index size = u.size(); for (Index i=0; i struct conj_expr_if template template -EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView +SelfAdjointView& SelfAdjointView ::rankUpdate(const MatrixBase& u, const MatrixBase& v, const Scalar& alpha) { typedef internal::blas_traits UBlasTraits; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h index f0c60507a..2fb408d1d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -155,7 +155,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; for(Index k2=IsLower ? depth : 0; @@ -226,7 +226,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix() + gemm_pack_lhs() (blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc); gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, @@ -305,7 +305,7 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverMatrix.h index 0dcf3bb52..e3ed2cd19 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -76,7 +76,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix conj; gebp_kernel gebp_kernel; - gemm_pack_lhs pack_lhs; + gemm_pack_lhs pack_lhs; gemm_pack_rhs pack_rhs; // the goal here is to subdivise the Rhs panels such that we keep some cache @@ -229,7 +229,7 @@ EIGEN_DONT_INLINE void triangular_solve_matrix gebp_kernel; gemm_pack_rhs pack_rhs; gemm_pack_rhs pack_rhs_panel; - gemm_pack_lhs pack_lhs_panel; + gemm_pack_lhs pack_lhs_panel; for(Index k2=IsLower ? size : 0; IsLower ? k2>0 : k20) rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map >(rhs+s,k))).sum(); - if((!(Mode & UnitDiag)) && numext::not_equal_strict(rhs[i],RhsScalar(0))) + if(!(Mode & UnitDiag)) rhs[i] /= cjLhs(i,i); } } @@ -114,23 +114,20 @@ struct triangular_solve_vector0) - Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); - } + Index r = actualPanelWidth - k - 1; // remaining size + Index s = IsLower ? i+1 : i-r; + if (r>0) + Map >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r); } Index r = IsLower ? size - endBlock : startBlock; // remaining size if (r > 0) { // let's directly call the low level product function because: // 1 - it is faster to compile - // 2 - it is slightly faster at runtime + // 2 - it is slighlty faster at runtime general_matrix_vector_product::run( r, actualPanelWidth, LhsMapper(&lhs.coeffRef(endBlock,startBlock), lhsStride), diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/BlasUtil.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/BlasUtil.h index 643558cba..3dff9bc9b 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/BlasUtil.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/BlasUtil.h @@ -24,7 +24,7 @@ struct gebp_kernel; template struct gemm_pack_rhs; -template +template struct gemm_pack_lhs; template< @@ -159,9 +159,11 @@ template class BlasLinearMapper; template -class BlasLinearMapper -{ -public: +class BlasLinearMapper { + public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1) : m_data(data) { @@ -177,17 +179,19 @@ public: return m_data[i]; } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const { - return ploadt(m_data + i); + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return ploadt(m_data + i); } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { - pstoret(m_data + i, p); + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { + return ploadt(m_data + i); } -protected: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const Packet &p) const { + pstoret(m_data + i, p); + } + + protected: Scalar *m_data; }; @@ -199,6 +203,9 @@ template class blas_data_mapper { public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + typedef BlasLinearMapper LinearMapper; typedef BlasVectorMapper VectorMapper; @@ -228,14 +235,12 @@ public: return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const { - return ploadt(&operator()(i, j)); + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { - return ploadt(&operator()(i, j)); + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + return ploadt(&operator()(i, j)); } template @@ -258,7 +263,7 @@ public: return internal::first_default_aligned(m_data, size); } -protected: + protected: Scalar* EIGEN_RESTRICT m_data; const Index m_stride; }; @@ -270,6 +275,9 @@ template class BlasLinearMapper { public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {} EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { @@ -280,9 +288,8 @@ public: return m_data[i*m_incr.value()]; } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const { - return pgather(m_data + i*m_incr.value(), m_incr.value()); + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return pgather(m_data + i*m_incr.value(), m_incr.value()); } template @@ -299,6 +306,9 @@ template::type Packet; + typedef typename packet_traits::half HalfPacket; + typedef BlasLinearMapper LinearMapper; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {} @@ -317,9 +327,8 @@ public: return m_data[StorageOrder==RowMajor ? j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride]; } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const { - return pgather(&operator()(i, j),m_incr.value()); + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return pgather(&operator()(i, j),m_incr.value()); } template @@ -370,15 +379,14 @@ template struct blas_traits HasUsableDirectAccess = ( (int(XprType::Flags)&DirectAccessBit) && ( bool(XprType::IsVectorAtCompileTime) || int(inner_stride_at_compile_time::ret) == 1) - ) ? 1 : 0, - HasScalarFactor = false + ) ? 1 : 0 }; typedef typename conditional::type DirectLinearAccessType; - static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; } - static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } + static inline ExtractType extract(const XprType& x) { return x; } + static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); } }; // pop conjugate @@ -403,23 +411,17 @@ template struct blas_traits, const CwiseNullaryOp,Plain>, NestedXpr> > : blas_traits { - enum { - HasScalarFactor = true - }; typedef blas_traits Base; typedef CwiseBinaryOp, const CwiseNullaryOp,Plain>, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; - static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } - static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x) + static inline ExtractType extract(const XprType& x) { return Base::extract(x.rhs()); } + static inline Scalar extractScalarFactor(const XprType& x) { return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs()); } }; template struct blas_traits, NestedXpr, const CwiseNullaryOp,Plain> > > : blas_traits { - enum { - HasScalarFactor = true - }; typedef blas_traits Base; typedef CwiseBinaryOp, NestedXpr, const CwiseNullaryOp,Plain> > XprType; typedef typename Base::ExtractType ExtractType; @@ -438,9 +440,6 @@ template struct blas_traits, NestedXpr> > : blas_traits { - enum { - HasScalarFactor = true - }; typedef blas_traits Base; typedef CwiseUnaryOp, NestedXpr> XprType; typedef typename Base::ExtractType ExtractType; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ConfigureVectorization.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/ConfigureVectorization.h deleted file mode 100644 index 952abc306..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ConfigureVectorization.h +++ /dev/null @@ -1,483 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2018 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CONFIGURE_VECTORIZATION_H -#define EIGEN_CONFIGURE_VECTORIZATION_H - -//------------------------------------------------------------------------------------------ -// Static and dynamic alignment control -// -// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES -// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. -// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, -// a default value is automatically computed based on architecture, compiler, and OS. -// -// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} -// to be used to declare statically aligned buffers. -//------------------------------------------------------------------------------------------ - - -/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. - * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, - * so that vectorization doesn't affect binary compatibility. - * - * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link - * vectorized and non-vectorized code. - * - * FIXME: this code can be cleaned up once we switch to proper C++11 only. - */ -#if (defined EIGEN_CUDACC) - #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) - #define EIGEN_ALIGNOF(x) __alignof(x) -#elif EIGEN_HAS_ALIGNAS - #define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n) - #define EIGEN_ALIGNOF(x) alignof(x) -#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) - #define EIGEN_ALIGNOF(x) __alignof(x) -#elif EIGEN_COMP_MSVC - #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) - #define EIGEN_ALIGNOF(x) __alignof(x) -#elif EIGEN_COMP_SUNCC - // FIXME not sure about this one: - #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) - #define EIGEN_ALIGNOF(x) __alignof(x) -#else - #error Please tell me what is the equivalent of alignas(n) and alignof(x) for your compiler -#endif - -// If the user explicitly disable vectorization, then we also disable alignment -#if defined(EIGEN_DONT_VECTORIZE) - #if defined(EIGEN_GPUCC) - // GPU code is always vectorized and requires memory alignment for - // statically allocated buffers. - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 - #else - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 - #endif -#elif defined(__AVX512F__) - // 64 bytes static alignment is preferred only if really required - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 -#elif defined(__AVX__) - // 32 bytes static alignment is preferred only if really required - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 -#else - #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 -#endif - - -// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense -#define EIGEN_MIN_ALIGN_BYTES 16 - -// Defined the boundary (in bytes) on which the data needs to be aligned. Note -// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be -// aligned at all regardless of the value of this #define. - -#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 -#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. -#endif - -// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated -// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0 -#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) - #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES - #undef EIGEN_MAX_STATIC_ALIGN_BYTES - #endif - #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 -#endif - -#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES - - // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES - - // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable - // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always - // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in - // certain common platform (compiler+architecture combinations) to avoid these problems. - // Only static alignment is really problematic (relies on nonstandard compiler extensions), - // try to keep heap alignment even when we have to disable static alignment. - #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || EIGEN_ARCH_MIPS) - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 - #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) - // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. - // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. - // 4.8 and newer seem definitely unaffected. - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 - #else - #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 - #endif - - // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX - #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ - && !EIGEN_GCC3_OR_OLDER \ - && !EIGEN_COMP_SUNCC \ - && !EIGEN_OS_QNX - #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 - #else - #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 - #endif - - #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT - #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES - #else - #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 - #endif - -#endif - -// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES -#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES0 is the true test whether we want to align arrays on the stack or not. -// It takes into account both the user choice to explicitly enable/disable alignment (by setting EIGEN_MAX_STATIC_ALIGN_BYTES) -// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). -// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. - - -// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY -#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8) -#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) -#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) -#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64) -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 -#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES) -#else -#define EIGEN_ALIGN_MAX -#endif - - -// Dynamic alignment control - -#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0 -#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN. -#endif - -#ifdef EIGEN_DONT_ALIGN - #ifdef EIGEN_MAX_ALIGN_BYTES - #undef EIGEN_MAX_ALIGN_BYTES - #endif - #define EIGEN_MAX_ALIGN_BYTES 0 -#elif !defined(EIGEN_MAX_ALIGN_BYTES) - #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES -#endif - -#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES -#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES -#else -#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES -#endif - - -#ifndef EIGEN_UNALIGNED_VECTORIZE -#define EIGEN_UNALIGNED_VECTORIZE 1 -#endif - -//---------------------------------------------------------------------- - -// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into -// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks -#if EIGEN_MAX_ALIGN_BYTES==0 - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif -#endif - - -// The following (except #include and _M_IX86_FP ??) can likely be -// removed as gcc 4.1 and msvc 2008 are not supported anyways. -#if EIGEN_COMP_MSVC - #include // for _aligned_malloc -- need it regardless of whether vectorization is enabled - #if (EIGEN_COMP_MSVC >= 1500) // 2008 or later - // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP. - #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64 - #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER - #endif - #endif -#else - #if (defined __SSE2__) && ( (!EIGEN_COMP_GNUC) || EIGEN_COMP_ICC || EIGEN_GNUC_AT_LEAST(4,2) ) - #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC - #endif -#endif - -#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC)) - - #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER) - - // Defines symbols for compile-time detection of which instructions are - // used. - // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_SSE - #define EIGEN_VECTORIZE_SSE2 - - // Detect sse3/ssse3/sse4: - // gcc and icc defines __SSE3__, ... - // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you - // want to force the use of those instructions with msvc. - #ifdef __SSE3__ - #define EIGEN_VECTORIZE_SSE3 - #endif - #ifdef __SSSE3__ - #define EIGEN_VECTORIZE_SSSE3 - #endif - #ifdef __SSE4_1__ - #define EIGEN_VECTORIZE_SSE4_1 - #endif - #ifdef __SSE4_2__ - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #ifdef __AVX__ - #ifndef EIGEN_USE_SYCL - #define EIGEN_VECTORIZE_AVX - #endif - #define EIGEN_VECTORIZE_SSE3 - #define EIGEN_VECTORIZE_SSSE3 - #define EIGEN_VECTORIZE_SSE4_1 - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #ifdef __AVX2__ - #ifndef EIGEN_USE_SYCL - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX - #endif - #define EIGEN_VECTORIZE_SSE3 - #define EIGEN_VECTORIZE_SSSE3 - #define EIGEN_VECTORIZE_SSE4_1 - #define EIGEN_VECTORIZE_SSE4_2 - #endif - #if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__)) - // MSVC does not expose a switch dedicated for FMA - // For MSVC, AVX2 => FMA - #define EIGEN_VECTORIZE_FMA - #endif - #if defined(__AVX512F__) - #ifndef EIGEN_VECTORIZE_FMA - #if EIGEN_COMP_GNUC - #error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638). - #else - #error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638). - #endif - #endif - #ifndef EIGEN_USE_SYCL - #define EIGEN_VECTORIZE_AVX512 - #define EIGEN_VECTORIZE_AVX2 - #define EIGEN_VECTORIZE_AVX - #endif - #define EIGEN_VECTORIZE_FMA - #define EIGEN_VECTORIZE_SSE3 - #define EIGEN_VECTORIZE_SSSE3 - #define EIGEN_VECTORIZE_SSE4_1 - #define EIGEN_VECTORIZE_SSE4_2 - #ifndef EIGEN_USE_SYCL - #ifdef __AVX512DQ__ - #define EIGEN_VECTORIZE_AVX512DQ - #endif - #ifdef __AVX512ER__ - #define EIGEN_VECTORIZE_AVX512ER - #endif - #endif - #endif - - // Disable AVX support on broken xcode versions - #if defined(__apple_build_version__) && (__apple_build_version__ == 11000033 ) && ( __MAC_OS_X_VERSION_MIN_REQUIRED == 101500 ) - // A nasty bug in the clang compiler shipped with xcode in a common compilation situation - // when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1 - #ifdef EIGEN_VECTORIZE_AVX - #undef EIGEN_VECTORIZE_AVX - #warning "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. " - #ifdef EIGEN_VECTORIZE_AVX2 - #undef EIGEN_VECTORIZE_AVX2 - #endif - #ifdef EIGEN_VECTORIZE_FMA - #undef EIGEN_VECTORIZE_FMA - #endif - #ifdef EIGEN_VECTORIZE_AVX512 - #undef EIGEN_VECTORIZE_AVX512 - #endif - #ifdef EIGEN_VECTORIZE_AVX512DQ - #undef EIGEN_VECTORIZE_AVX512DQ - #endif - #ifdef EIGEN_VECTORIZE_AVX512ER - #undef EIGEN_VECTORIZE_AVX512ER - #endif - #endif - // NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with -macosx-version-min=10.15 and AVX - // NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2 produce core dumps in 3 tests - // NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all cases - // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)" XCode 11.0 <- Produces many segfault and core dumping tests - // with -macosx-version-min=10.15 and AVX - // NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with - // -macosx-version-min=10.15 and AVX - #endif - - // include files - - // This extern "C" works around a MINGW-w64 compilation issue - // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354 - // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do). - // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations - // with conflicting linkage. The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know; - // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too. - // notice that since these are C headers, the extern "C" is theoretically needed anyways. - extern "C" { - // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. - // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus: - #if EIGEN_COMP_ICC >= 1110 - #include - #else - #include - #include - #include - #ifdef EIGEN_VECTORIZE_SSE3 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSSE3 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSE4_1 - #include - #endif - #ifdef EIGEN_VECTORIZE_SSE4_2 - #include - #endif - #if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512) - #include - #endif - #endif - } // end extern "C" - - #elif defined __VSX__ - - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_VSX - #include - // We need to #undef all these ugly tokens defined in - // => use __vector instead of vector - #undef bool - #undef vector - #undef pixel - - #elif defined __ALTIVEC__ - - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_ALTIVEC - #include - // We need to #undef all these ugly tokens defined in - // => use __vector instead of vector - #undef bool - #undef vector - #undef pixel - - #elif (defined __ARM_NEON) || (defined __ARM_NEON__) - - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_NEON - #include - - #elif (defined __s390x__ && defined __VEC__) - - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_ZVECTOR - #include - - #elif defined __mips_msa - - // Limit MSA optimizations to little-endian CPUs for now. - // TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs? - #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) - #if defined(__LP64__) - #define EIGEN_MIPS_64 - #else - #define EIGEN_MIPS_32 - #endif - #define EIGEN_VECTORIZE - #define EIGEN_VECTORIZE_MSA - #include - #endif - - #endif -#endif - -#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380)) - // We can use the optimized fp16 to float and float to fp16 conversion routines - #define EIGEN_HAS_FP16_C - - #if defined(EIGEN_COMP_CLANG) - // Workaround for clang: The FP16C intrinsics for clang are included by - // immintrin.h, as opposed to emmintrin.h as suggested by Intel: - // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711 - #include - #endif -#endif - -#if defined EIGEN_CUDACC - #define EIGEN_VECTORIZE_GPU - #include - #if EIGEN_CUDA_SDK_VER >= 70500 - #define EIGEN_HAS_CUDA_FP16 - #endif -#endif - -#if defined(EIGEN_HAS_CUDA_FP16) - #include - #include -#endif - -#if defined(EIGEN_HIPCC) - #define EIGEN_VECTORIZE_GPU - #include - #define EIGEN_HAS_HIP_FP16 - #include -#endif - - -/** \brief Namespace containing all symbols from the %Eigen library. */ -namespace Eigen { - -inline static const char *SimdInstructionSetsInUse(void) { -#if defined(EIGEN_VECTORIZE_AVX512) - return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_AVX) - return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_SSE4_2) - return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2"; -#elif defined(EIGEN_VECTORIZE_SSE4_1) - return "SSE, SSE2, SSE3, SSSE3, SSE4.1"; -#elif defined(EIGEN_VECTORIZE_SSSE3) - return "SSE, SSE2, SSE3, SSSE3"; -#elif defined(EIGEN_VECTORIZE_SSE3) - return "SSE, SSE2, SSE3"; -#elif defined(EIGEN_VECTORIZE_SSE2) - return "SSE, SSE2"; -#elif defined(EIGEN_VECTORIZE_ALTIVEC) - return "AltiVec"; -#elif defined(EIGEN_VECTORIZE_VSX) - return "VSX"; -#elif defined(EIGEN_VECTORIZE_NEON) - return "ARM NEON"; -#elif defined(EIGEN_VECTORIZE_ZVECTOR) - return "S390X ZVECTOR"; -#elif defined(EIGEN_VECTORIZE_MSA) - return "MIPS MSA"; -#else - return "None"; -#endif -} - -} // end namespace Eigen - - -#endif // EIGEN_CONFIGURE_VECTORIZATION_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Constants.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Constants.h index 7ada82195..7587d6842 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Constants.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Constants.h @@ -25,10 +25,6 @@ const int Dynamic = -1; */ const int DynamicIndex = 0xffffff; -/** This value means that the increment to go from one value to another in a sequence is not constant for each step. - */ -const int UndefinedIncr = 0xfffffe; - /** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm(). * The value Infinity there means the L-infinity norm. */ @@ -254,6 +250,12 @@ enum AlignmentType { #endif }; +/** \ingroup enums + * Enum used by DenseBase::corner() in Eigen2 compatibility mode. */ +// FIXME after the corner() API change, this was not needed anymore, except by AlignedBox +// TODO: find out what to do with that. Adapt the AlignedBox API ? +enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight }; + /** \ingroup enums * Enum containing possible values for the \p Direction parameter of * Reverse, PartialReduxExpr and VectorwiseOp. */ @@ -333,8 +335,6 @@ enum SideType { OnTheRight = 2 }; - - /* the following used to be written as: * * struct NoChange_t {}; @@ -464,7 +464,6 @@ namespace Architecture AltiVec = 0x2, VSX = 0x3, NEON = 0x4, - MSA = 0x5, #if defined EIGEN_VECTORIZE_SSE Target = SSE #elif defined EIGEN_VECTORIZE_ALTIVEC @@ -473,8 +472,6 @@ namespace Architecture Target = VSX #elif defined EIGEN_VECTORIZE_NEON Target = NEON -#elif defined EIGEN_VECTORIZE_MSA - Target = MSA #else Target = Generic #endif diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/DisableStupidWarnings.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/DisableStupidWarnings.h index 4501d3248..74f74cc42 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/DisableStupidWarnings.h @@ -4,6 +4,7 @@ #ifdef _MSC_VER // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p)) // 4101 - unreferenced local variable + // 4127 - conditional expression is constant // 4181 - qualifier applied to reference type ignored // 4211 - nonstandard extension used : redefined extern to static // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data @@ -19,7 +20,7 @@ #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning( push ) #endif - #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) + #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) @@ -41,14 +42,6 @@ #pragma clang diagnostic push #endif #pragma clang diagnostic ignored "-Wconstant-logical-operand" - #if __clang_major__ >= 3 && __clang_minor__ >= 5 - #pragma clang diagnostic ignored "-Wabsolute-value" - #endif - #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L - // warning: generic selections are a C11-specific feature - // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h - #pragma clang diagnostic ignored "-Wc11-extensions" - #endif #elif defined __GNUC__ @@ -71,7 +64,6 @@ #endif #if defined __NVCC__ - #pragma diag_suppress boolean_controlling_expr_is_constant // Disable the "statement is unreachable" message #pragma diag_suppress code_is_unreachable // Disable the "dynamic initialization in unreachable code" message @@ -89,7 +81,6 @@ #pragma diag_suppress 2671 #pragma diag_suppress 2735 #pragma diag_suppress 2737 - #pragma diag_suppress 2739 #endif #else diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ForwardDeclarations.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/ForwardDeclarations.h index cd0bdb5a7..134544f96 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ForwardDeclarations.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/ForwardDeclarations.h @@ -79,8 +79,6 @@ template class ForceAlignedAccess; template class SwapWrapper; template class Block; -template class IndexedView; -template class Reshaped; template class VectorBlock; template class Transpose; @@ -110,7 +108,7 @@ template class TranspositionsWrapper; template::has_write_access ? WriteAccessors : ReadOnlyAccessors > class MapBase; -template class Stride; +template class Stride; template class InnerStride; template class OuterStride; template > class Map; @@ -131,9 +129,6 @@ template class SolverBase; template class InnerIterator; namespace internal { -template class generic_randaccess_stl_iterator; -template class pointer_based_stl_iterator; -template class subvector_stl_iterator; template struct kernel_retval_base; template struct kernel_retval; template struct image_retval_base; @@ -187,7 +182,6 @@ template struct scalar_real_op; template struct scalar_imag_op; template struct scalar_abs_op; template struct scalar_abs2_op; -template struct scalar_absolute_difference_op; template struct scalar_sqrt_op; template struct scalar_rsqrt_op; template struct scalar_exp_op; @@ -215,27 +209,11 @@ template struct scalar_lgamma_op; template struct scalar_digamma_op; template struct scalar_erf_op; template struct scalar_erfc_op; -template struct scalar_ndtri_op; template struct scalar_igamma_op; template struct scalar_igammac_op; template struct scalar_zeta_op; template struct scalar_betainc_op; -// Bessel functions in SpecialFunctions module -template struct scalar_bessel_i0_op; -template struct scalar_bessel_i0e_op; -template struct scalar_bessel_i1_op; -template struct scalar_bessel_i1e_op; -template struct scalar_bessel_j0_op; -template struct scalar_bessel_y0_op; -template struct scalar_bessel_j1_op; -template struct scalar_bessel_y1_op; -template struct scalar_bessel_k0_op; -template struct scalar_bessel_k0e_op; -template struct scalar_bessel_k1_op; -template struct scalar_bessel_k1e_op; - - } // end namespace internal struct IOFormat; @@ -273,7 +251,6 @@ template class HouseholderQR; template class ColPivHouseholderQR; template class FullPivHouseholderQR; template class CompleteOrthogonalDecomposition; -template class SVDBase; template class JacobiSVD; template class BDCSVD; template class LLT; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/IndexedViewHelper.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/IndexedViewHelper.h deleted file mode 100644 index 1cda85060..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/IndexedViewHelper.h +++ /dev/null @@ -1,186 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2017 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - - -#ifndef EIGEN_INDEXED_VIEW_HELPER_H -#define EIGEN_INDEXED_VIEW_HELPER_H - -namespace Eigen { - -namespace internal { -struct symbolic_last_tag {}; -} - -/** \var last - * \ingroup Core_Module - * - * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last element/row/columns - * of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const ColIndices&). - * - * This symbolic placeholder supports standard arithmetic operations. - * - * A typical usage example would be: - * \code - * using namespace Eigen; - * using Eigen::last; - * VectorXd v(n); - * v(seq(2,last-2)).setOnes(); - * \endcode - * - * \sa end - */ -static const symbolic::SymbolExpr last; // PLEASE use Eigen::last instead of Eigen::placeholders::last - -/** \var lastp1 - * \ingroup Core_Module - * - * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically - * reference the last+1 element/row/columns of the underlying vector or matrix once - * passed to DenseBase::operator()(const RowIndices&, const ColIndices&). - * - * This symbolic placeholder supports standard arithmetic operations. - * It is essentially an alias to last+fix<1>. - * - * \sa last - */ -#ifdef EIGEN_PARSED_BY_DOXYGEN -static const auto lastp1 = last+fix<1>; -#else -// Using a FixedExpr<1> expression is important here to make sure the compiler -// can fully optimize the computation starting indices with zero overhead. -static const symbolic::AddExpr,symbolic::ValueExpr > > lastp1(last+fix<1>()); -#endif - -namespace internal { - - // Replace symbolic last/end "keywords" by their true runtime value -inline Index eval_expr_given_size(Index x, Index /* size */) { return x; } - -template -FixedInt eval_expr_given_size(FixedInt x, Index /*size*/) { return x; } - -template -Index eval_expr_given_size(const symbolic::BaseExpr &x, Index size) -{ - return x.derived().eval(last=size-1); -} - -// Extract increment/step at compile time -template struct get_compile_time_incr { - enum { value = UndefinedIncr }; -}; - -// Analogue of std::get<0>(x), but tailored for our needs. -template -Index first(const T& x) { return x.first(); } - -// IndexedViewCompatibleType/makeIndexedViewCompatible turn an arbitrary object of type T into something usable by MatrixSlice -// The generic implementation is a no-op -template -struct IndexedViewCompatibleType { - typedef T type; -}; - -template -const T& makeIndexedViewCompatible(const T& x, Index /*size*/, Q) { return x; } - -//-------------------------------------------------------------------------------- -// Handling of a single Index -//-------------------------------------------------------------------------------- - -struct SingleRange { - enum { - SizeAtCompileTime = 1 - }; - SingleRange(Index val) : m_value(val) {} - Index operator[](Index) const { return m_value; } - Index size() const { return 1; } - Index first() const { return m_value; } - Index m_value; -}; - -template<> struct get_compile_time_incr { - enum { value = 1 }; // 1 or 0 ?? -}; - -// Turn a single index into something that looks like an array (i.e., that exposes a .size(), and operator[](int) methods) -template -struct IndexedViewCompatibleType::value>::type> { - // Here we could simply use Array, but maybe it's less work for the compiler to use - // a simpler wrapper as SingleRange - //typedef Eigen::Array type; - typedef SingleRange type; -}; - -template -struct IndexedViewCompatibleType::value>::type> { - typedef SingleRange type; -}; - - -template -typename enable_if::value,SingleRange>::type -makeIndexedViewCompatible(const T& id, Index size, SpecializedType) { - return eval_expr_given_size(id,size); -} - -//-------------------------------------------------------------------------------- -// Handling of all -//-------------------------------------------------------------------------------- - -struct all_t { all_t() {} }; - -// Convert a symbolic 'all' into a usable range type -template -struct AllRange { - enum { SizeAtCompileTime = XprSize }; - AllRange(Index size = XprSize) : m_size(size) {} - Index operator[](Index i) const { return i; } - Index size() const { return m_size.value(); } - Index first() const { return 0; } - variable_if_dynamic m_size; -}; - -template -struct IndexedViewCompatibleType { - typedef AllRange type; -}; - -template -inline AllRange::value> makeIndexedViewCompatible(all_t , XprSizeType size, SpecializedType) { - return AllRange::value>(size); -} - -template struct get_compile_time_incr > { - enum { value = 1 }; -}; - -} // end namespace internal - - -/** \var all - * \ingroup Core_Module - * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or columns - */ -static const Eigen::internal::all_t all; // PLEASE use Eigen::all instead of Eigen::placeholders::all - - -namespace placeholders { - typedef symbolic::SymbolExpr last_t; - typedef symbolic::AddExpr,symbolic::ValueExpr > > end_t; - typedef Eigen::internal::all_t all_t; - - EIGEN_DEPRECATED static const all_t all = Eigen::all; // PLEASE use Eigen::all instead of Eigen::placeholders::all - EIGEN_DEPRECATED static const last_t last = Eigen::last; // PLEASE use Eigen::last instead of Eigen::placeholders::last - EIGEN_DEPRECATED static const end_t end = Eigen::lastp1; // PLEASE use Eigen::lastp1 instead of Eigen::placeholders::end -} - -} // end namespace Eigen - -#endif // EIGEN_INDEXED_VIEW_HELPER_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/IntegralConstant.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/IntegralConstant.h deleted file mode 100644 index caeea232d..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/IntegralConstant.h +++ /dev/null @@ -1,272 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2017 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - - -#ifndef EIGEN_INTEGRAL_CONSTANT_H -#define EIGEN_INTEGRAL_CONSTANT_H - -namespace Eigen { - -namespace internal { - -template class FixedInt; -template class VariableAndFixedInt; - -/** \internal - * \class FixedInt - * - * This class embeds a compile-time integer \c N. - * - * It is similar to c++11 std::integral_constant but with some additional features - * such as: - * - implicit conversion to int - * - arithmetic and some bitwise operators: -, +, *, /, %, &, | - * - c++98/14 compatibility with fix and fix() syntax to define integral constants. - * - * It is strongly discouraged to directly deal with this class FixedInt. Instances are expcected to - * be created by the user using Eigen::fix or Eigen::fix(). In C++98-11, the former syntax does - * not create a FixedInt instance but rather a point to function that needs to be \em cleaned-up - * using the generic helper: - * \code - * internal::cleanup_index_type::type - * internal::cleanup_index_type::type - * \endcode - * where T can a FixedInt, a pointer to function FixedInt (*)(), or numerous other integer-like representations. - * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values. - * - * For convenience, you can extract the compile-time value \c N in a generic way using the following helper: - * \code - * internal::get_fixed_value::value - * \endcode - * that will give you \c N if T equals FixedInt or FixedInt (*)(), and \c DefaultVal if T does not embed any compile-time value (e.g., T==int). - * - * \sa fix, class VariableAndFixedInt - */ -template class FixedInt -{ -public: - static const int value = N; - operator int() const { return value; } - FixedInt() {} - FixedInt( VariableAndFixedInt other) { - #ifndef EIGEN_INTERNAL_DEBUGGING - EIGEN_UNUSED_VARIABLE(other); - #endif - eigen_internal_assert(int(other)==N); - } - - FixedInt<-N> operator-() const { return FixedInt<-N>(); } - template - FixedInt operator+( FixedInt) const { return FixedInt(); } - template - FixedInt operator-( FixedInt) const { return FixedInt(); } - template - FixedInt operator*( FixedInt) const { return FixedInt(); } - template - FixedInt operator/( FixedInt) const { return FixedInt(); } - template - FixedInt operator%( FixedInt) const { return FixedInt(); } - template - FixedInt operator|( FixedInt) const { return FixedInt(); } - template - FixedInt operator&( FixedInt) const { return FixedInt(); } - -#if EIGEN_HAS_CXX14 - // Needed in C++14 to allow fix(): - FixedInt operator() () const { return *this; } - - VariableAndFixedInt operator() (int val) const { return VariableAndFixedInt(val); } -#else - FixedInt ( FixedInt (*)() ) {} -#endif - -#if EIGEN_HAS_CXX11 - FixedInt(std::integral_constant) {} -#endif -}; - -/** \internal - * \class VariableAndFixedInt - * - * This class embeds both a compile-time integer \c N and a runtime integer. - * Both values are supposed to be equal unless the compile-time value \c N has a special - * value meaning that the runtime-value should be used. Depending on the context, this special - * value can be either Eigen::Dynamic (for positive quantities) or Eigen::DynamicIndex (for - * quantities that can be negative). - * - * It is the return-type of the function Eigen::fix(int), and most of the time this is the only - * way it is used. It is strongly discouraged to directly deal with instances of VariableAndFixedInt. - * Indeed, in order to write generic code, it is the responsibility of the callee to properly convert - * it to either a true compile-time quantity (i.e. a FixedInt), or to a runtime quantity (e.g., an Index) - * using the following generic helper: - * \code - * internal::cleanup_index_type::type - * internal::cleanup_index_type::type - * \endcode - * where T can be a template instantiation of VariableAndFixedInt or numerous other integer-like representations. - * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values. - * - * For convenience, you can also extract the compile-time value \c N using the following helper: - * \code - * internal::get_fixed_value::value - * \endcode - * that will give you \c N if T equals VariableAndFixedInt, and \c DefaultVal if T does not embed any compile-time value (e.g., T==int). - * - * \sa fix(int), class FixedInt - */ -template class VariableAndFixedInt -{ -public: - static const int value = N; - operator int() const { return m_value; } - VariableAndFixedInt(int val) { m_value = val; } -protected: - int m_value; -}; - -template struct get_fixed_value { - static const int value = Default; -}; - -template struct get_fixed_value,Default> { - static const int value = N; -}; - -#if !EIGEN_HAS_CXX14 -template struct get_fixed_value (*)(),Default> { - static const int value = N; -}; -#endif - -template struct get_fixed_value,Default> { - static const int value = N ; -}; - -template -struct get_fixed_value,Default> { - static const int value = N; -}; - -template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } -#if !EIGEN_HAS_CXX14 -template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } -#endif - -// Cleanup integer/FixedInt/VariableAndFixedInt/etc types: - -// By default, no cleanup: -template struct cleanup_index_type { typedef T type; }; - -// Convert any integral type (e.g., short, int, unsigned int, etc.) to Eigen::Index -template struct cleanup_index_type::value>::type> { typedef Index type; }; - -#if !EIGEN_HAS_CXX14 -// In c++98/c++11, fix is a pointer to function that we better cleanup to a true FixedInt: -template struct cleanup_index_type (*)(), DynamicKey> { typedef FixedInt type; }; -#endif - -// If VariableAndFixedInt does not match DynamicKey, then we turn it to a pure compile-time value: -template struct cleanup_index_type, DynamicKey> { typedef FixedInt type; }; -// If VariableAndFixedInt matches DynamicKey, then we turn it to a pure runtime-value (aka Index): -template struct cleanup_index_type, DynamicKey> { typedef Index type; }; - -#if EIGEN_HAS_CXX11 -template struct cleanup_index_type, DynamicKey> { typedef FixedInt type; }; -#endif - -} // end namespace internal - -#ifndef EIGEN_PARSED_BY_DOXYGEN - -#if EIGEN_HAS_CXX14 -template -static const internal::FixedInt fix{}; -#else -template -inline internal::FixedInt fix() { return internal::FixedInt(); } - -// The generic typename T is mandatory. Otherwise, a code like fix could refer to either the function above or this next overload. -// This way a code like fix can only refer to the previous function. -template -inline internal::VariableAndFixedInt fix(T val) { return internal::VariableAndFixedInt(internal::convert_index(val)); } -#endif - -#else // EIGEN_PARSED_BY_DOXYGEN - -/** \var fix() - * \ingroup Core_Module - * - * This \em identifier permits to construct an object embedding a compile-time integer \c N. - * - * \tparam N the compile-time integer value - * - * It is typically used in conjunction with the Eigen::seq and Eigen::seqN functions to pass compile-time values to them: - * \code - * seqN(10,fix<4>,fix<-3>) // <=> [10 7 4 1] - * \endcode - * - * See also the function fix(int) to pass both a compile-time and runtime value. - * - * In c++14, it is implemented as: - * \code - * template static const internal::FixedInt fix{}; - * \endcode - * where internal::FixedInt is an internal template class similar to - * \c std::integral_constant - * Here, \c fix is thus an object of type \c internal::FixedInt. - * - * In c++98/11, it is implemented as a function: - * \code - * template inline internal::FixedInt fix(); - * \endcode - * Here internal::FixedInt is thus a pointer to function. - * - * If for some reason you want a true object in c++98 then you can write: \code fix() \endcode which is also valid in c++14. - * - * \sa fix(int), seq, seqN - */ -template -static const auto fix(); - -/** \fn fix(int) - * \ingroup Core_Module - * - * This function returns an object embedding both a compile-time integer \c N, and a fallback runtime value \a val. - * - * \tparam N the compile-time integer value - * \param val the fallback runtime integer value - * - * This function is a more general version of the \ref fix identifier/function that can be used in template code - * where the compile-time value could turn out to actually mean "undefined at compile-time". For positive integers - * such as a size or a dimension, this case is identified by Eigen::Dynamic, whereas runtime signed integers - * (e.g., an increment/stride) are identified as Eigen::DynamicIndex. In such a case, the runtime value \a val - * will be used as a fallback. - * - * A typical use case would be: - * \code - * template void foo(const MatrixBase &mat) { - * const int N = Derived::RowsAtCompileTime==Dynamic ? Dynamic : Derived::RowsAtCompileTime/2; - * const int n = mat.rows()/2; - * ... mat( seqN(0,fix(n) ) ...; - * } - * \endcode - * In this example, the function Eigen::seqN knows that the second argument is expected to be a size. - * If the passed compile-time value N equals Eigen::Dynamic, then the proxy object returned by fix will be dissmissed, and converted to an Eigen::Index of value \c n. - * Otherwise, the runtime-value \c n will be dissmissed, and the returned ArithmeticSequence will be of the exact same type as seqN(0,fix) . - * - * \sa fix, seqN, class ArithmeticSequence - */ -template -static const auto fix(int val); - -#endif // EIGEN_PARSED_BY_DOXYGEN - -} // end namespace Eigen - -#endif // EIGEN_INTEGRAL_CONSTANT_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/MKL_support.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/MKL_support.h index 17963fad4..b7d6ecc76 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/MKL_support.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/MKL_support.h @@ -55,11 +55,7 @@ #if defined EIGEN_USE_MKL -# if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL) -# define MKL_DIRECT_CALL -# define MKL_DIRECT_CALL_JUST_SET -# endif -# include +# include /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ # ifndef INTEL_MKL_VERSION # undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */ @@ -73,9 +69,6 @@ # undef EIGEN_USE_MKL_VML # undef EIGEN_USE_LAPACKE_STRICT # undef EIGEN_USE_LAPACKE -# ifdef MKL_DIRECT_CALL_JUST_SET -# undef MKL_DIRECT_CALL -# endif # endif #endif diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Macros.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Macros.h index d0499a1c9..87233eadf 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Macros.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Macros.h @@ -11,56 +11,19 @@ #ifndef EIGEN_MACROS_H #define EIGEN_MACROS_H -//------------------------------------------------------------------------------------------ -// Eigen version and basic defaults -//------------------------------------------------------------------------------------------ - #define EIGEN_WORLD_VERSION 3 #define EIGEN_MAJOR_VERSION 3 -#define EIGEN_MINOR_VERSION 90 +#define EIGEN_MINOR_VERSION 8 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ EIGEN_MINOR_VERSION>=z)))) -#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR -#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor -#else -#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor -#endif - -#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE -#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t -#endif - -// Upperbound on the C++ version to use. -// Expected values are 03, 11, 14, 17, etc. -// By default, let's use an arbitrarily large C++ version. -#ifndef EIGEN_MAX_CPP_VER -#define EIGEN_MAX_CPP_VER 99 -#endif - -/** Allows to disable some optimizations which might affect the accuracy of the result. - * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. - * They currently include: - * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization. - */ -#ifndef EIGEN_FAST_MATH -#define EIGEN_FAST_MATH 1 -#endif - -#ifndef EIGEN_STACK_ALLOCATION_LIMIT -// 131072 == 128 KB -#define EIGEN_STACK_ALLOCATION_LIMIT 131072 -#endif - -//------------------------------------------------------------------------------------------ // Compiler identification, EIGEN_COMP_* -//------------------------------------------------------------------------------------------ /// \internal EIGEN_COMP_GNUC set to 1 for all compilers compatible with GCC #ifdef __GNUC__ - #define EIGEN_COMP_GNUC (__GNUC__*10+__GNUC_MINOR__) + #define EIGEN_COMP_GNUC 1 #else #define EIGEN_COMP_GNUC 0 #endif @@ -108,44 +71,14 @@ #define EIGEN_COMP_MSVC 0 #endif -#if defined(__NVCC__) -#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9) - #define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100)) -#elif defined(__CUDACC_VER__) - #define EIGEN_COMP_NVCC __CUDACC_VER__ -#else - #error "NVCC did not define compiler version." -#endif -#else - #define EIGEN_COMP_NVCC 0 -#endif - // For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC: -// name ver MSC_VER -// 2008 9 1500 -// 2010 10 1600 -// 2012 11 1700 -// 2013 12 1800 -// 2015 14 1900 -// "15" 15 1900 -// 2017-14.1 15.0 1910 -// 2017-14.11 15.3 1911 -// 2017-14.12 15.5 1912 -// 2017-14.13 15.6 1913 -// 2017-14.14 15.7 1914 - -/// \internal EIGEN_COMP_MSVC_LANG set to _MSVC_LANG if the compiler is Microsoft Visual C++, 0 otherwise. -#if defined(_MSVC_LANG) - #define EIGEN_COMP_MSVC_LANG _MSVC_LANG -#else - #define EIGEN_COMP_MSVC_LANG 0 -#endif - -// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC_LANG: -// MSVC option Standard MSVC_LANG -// /std:c++14 (default as of VS 2019) C++14 201402L -// /std:c++17 C++17 201703L -// /std:c++latest >C++17 >201703L +// name ver MSC_VER +// 2008 9 1500 +// 2010 10 1600 +// 2012 11 1700 +// 2013 12 1800 +// 2015 14 1900 +// "15" 15 1900 /// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl #if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG) @@ -154,21 +87,16 @@ #define EIGEN_COMP_MSVC_STRICT 0 #endif -/// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++ -// XLC version -// 3.1 0x0301 -// 4.5 0x0405 -// 5.0 0x0500 -// 12.1 0x0C01 -#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__) - #define EIGEN_COMP_IBM __xlC__ +/// \internal EIGEN_COMP_IBM set to 1 if the compiler is IBM XL C++ +#if defined(__IBMCPP__) || defined(__xlc__) + #define EIGEN_COMP_IBM 1 #else #define EIGEN_COMP_IBM 0 #endif -/// \internal EIGEN_COMP_PGI set to PGI version if the compiler is Portland Group Compiler +/// \internal EIGEN_COMP_PGI set to 1 if the compiler is Portland Group Compiler #if defined(__PGI) - #define EIGEN_COMP_PGI (__PGIC__*100+__PGIC_MINOR__) + #define EIGEN_COMP_PGI 1 #else #define EIGEN_COMP_PGI 0 #endif @@ -180,7 +108,7 @@ #define EIGEN_COMP_ARM 0 #endif -/// \internal EIGEN_COMP_EMSCRIPTEN set to 1 if the compiler is Emscripten Compiler +/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler #if defined(__EMSCRIPTEN__) #define EIGEN_COMP_EMSCRIPTEN 1 #else @@ -214,11 +142,7 @@ #endif - -//------------------------------------------------------------------------------------------ // Architecture identification, EIGEN_ARCH_* -//------------------------------------------------------------------------------------------ - #if defined(__x86_64__) || defined(_M_X64) || defined(__amd64) #define EIGEN_ARCH_x86_64 1 @@ -288,9 +212,7 @@ -//------------------------------------------------------------------------------------------ // Operating system identification, EIGEN_OS_* -//------------------------------------------------------------------------------------------ /// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant #if defined(__unix__) || defined(__unix) @@ -377,17 +299,9 @@ #define EIGEN_OS_WIN_STRICT 0 #endif -/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN -// compiler solaris __SUNPRO_C -// version studio -// 5.7 10 0x570 -// 5.8 11 0x580 -// 5.9 12 0x590 -// 5.10 12.1 0x5100 -// 5.11 12.2 0x5110 -// 5.12 12.3 0x5120 +/// \internal EIGEN_OS_SUN set to 1 if the OS is SUN #if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__)) - #define EIGEN_OS_SUN __SUNPRO_C + #define EIGEN_OS_SUN 1 #else #define EIGEN_OS_SUN 0 #endif @@ -400,112 +314,6 @@ #endif -//------------------------------------------------------------------------------------------ -// Detect GPU compilers and architectures -//------------------------------------------------------------------------------------------ - -// NVCC is not supported as the target platform for HIPCC -// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive -#if defined(__NVCC__) && defined(__HIPCC__) - #error "NVCC as the target platform for HIPCC is currently not supported." -#endif - -#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) - // Means the compiler is either nvcc or clang with CUDA enabled - #define EIGEN_CUDACC __CUDACC__ -#endif - -#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) - // Means we are generating code for the device - #define EIGEN_CUDA_ARCH __CUDA_ARCH__ -#endif - -#if defined(EIGEN_CUDACC) -#include - #define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10) -#else - #define EIGEN_CUDA_SDK_VER 0 -#endif - -#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) - // Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP) - #define EIGEN_HIPCC __HIPCC__ - - // We need to include hip_runtime.h here because it pulls in - // ++ hip_common.h which contains the define for __HIP_DEVICE_COMPILE__ - // ++ host_defines.h which contains the defines for the __host__ and __device__ macros - #include - - #if defined(__HIP_DEVICE_COMPILE__) - // analogous to EIGEN_CUDA_ARCH, but for HIP - #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__ - #endif -#endif - -// Unify CUDA/HIPCC - -#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) -// -// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC -// -#define EIGEN_GPUCC -// -// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels -// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels -// -// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels. -// For those cases, the corresponding code should be guarded with -// #if defined(EIGEN_GPUCC) -// instead of -// #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) -// -// For cases where the tweak is specific to HIP, the code should be guarded with -// #if defined(EIGEN_HIPCC) -// -// For cases where the tweak is specific to CUDA, the code should be guarded with -// #if defined(EIGEN_CUDACC) -// -#endif - -#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) -// -// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE -// -#define EIGEN_GPU_COMPILE_PHASE -// -// GPU compilers (HIPCC, NVCC) typically do two passes over the source code, -// + one to compile the source for the "host" (ie CPU) -// + another to compile the source for the "device" (ie. GPU) -// -// Code that needs to enabled only during the either the "host" or "device" compilation phase -// needs to be guarded with a macro that indicates the current compilation phase -// -// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP -// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA -// -// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA -// For those cases, the code should be guarded with -// #if defined(EIGEN_GPU_COMPILE_PHASE) -// instead of -// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE) -// -// For cases where the tweak is specific to HIP, the code should be guarded with -// #if defined(EIGEN_HIP_DEVICE_COMPILE) -// -// For cases where the tweak is specific to CUDA, the code should be guarded with -// #if defined(EIGEN_CUDA_ARCH) -// -#endif - -#if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__) -// EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro. -// In most cases we want to check if both macros are defined which can be done using the define below. -#define SYCL_DEVICE_ONLY -#endif - -//------------------------------------------------------------------------------------------ -// Detect Compiler/Architecture/OS specific features -//------------------------------------------------------------------------------------------ #if EIGEN_GNUC_AT_MOST(4,3) && !EIGEN_COMP_CLANG // see bug 89 @@ -514,6 +322,20 @@ #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1 #endif +// This macro can be used to prevent from macro expansion, e.g.: +// std::max EIGEN_NOT_A_MACRO(a,b) +#define EIGEN_NOT_A_MACRO + +#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR +#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor +#else +#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor +#endif + +#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t +#endif + // Cross compiler wrapper around LLVM's __has_builtin #ifdef __has_builtin # define EIGEN_HAS_BUILTIN(x) __has_builtin(x) @@ -527,47 +349,19 @@ # define __has_feature(x) 0 #endif -// Some old compilers do not support template specializations like: -// template void foo(const T x[N]); -#if !( EIGEN_COMP_CLANG && ( (EIGEN_COMP_CLANG<309) \ - || (defined(__apple_build_version__) && (__apple_build_version__ < 9000000))) \ - || EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<49) -#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 1 -#else -#define EIGEN_HAS_STATIC_ARRAY_TEMPLATE 0 +// Upperbound on the C++ version to use. +// Expected values are 03, 11, 14, 17, etc. +// By default, let's use an arbitrarily large C++ version. +#ifndef EIGEN_MAX_CPP_VER +#define EIGEN_MAX_CPP_VER 99 #endif - -// The macro EIGEN_COMP_CXXVER defines the c++ verson expected by the compiler. -// For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER -// is defined to 17. -#if (defined(__cplusplus) && (__cplusplus > 201402L) || EIGEN_COMP_MSVC_LANG > 201402L) -#define EIGEN_COMP_CXXVER 17 -#elif (defined(__cplusplus) && (__cplusplus > 201103L) || EIGEN_COMP_MSVC >= 1910) -#define EIGEN_COMP_CXXVER 14 -#elif (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900) -#define EIGEN_COMP_CXXVER 11 -#else -#define EIGEN_COMP_CXXVER 03 -#endif - - -// The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features -// but in practice we should not rely on them but rather on the availabilty of -// individual features as defined later. -// This is why there is no EIGEN_HAS_CXX17. -// FIXME: get rid of EIGEN_HAS_CXX14 and maybe even EIGEN_HAS_CXX11. -#if EIGEN_MAX_CPP_VER>=11 && EIGEN_COMP_CXXVER>=11 +#if EIGEN_MAX_CPP_VER>=11 && (defined(__cplusplus) && (__cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900) #define EIGEN_HAS_CXX11 1 #else #define EIGEN_HAS_CXX11 0 #endif -#if EIGEN_MAX_CPP_VER>=14 && EIGEN_COMP_CXXVER>=14 -#define EIGEN_HAS_CXX14 1 -#else -#define EIGEN_HAS_CXX14 0 -#endif // Do we support r-value references? #ifndef EIGEN_HAS_RVALUE_REFERENCES @@ -582,14 +376,12 @@ #endif // Does the compiler support C99? -// Need to include to make sure _GLIBCXX_USE_C99 gets defined -#include #ifndef EIGEN_HAS_C99_MATH #if EIGEN_MAX_CPP_VER>=11 && \ ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \ - || (EIGEN_COMP_MSVC >= 1900) || defined(SYCL_DEVICE_ONLY)) + || (EIGEN_COMP_MSVC >= 1900) ) #define EIGEN_HAS_C99_MATH 1 #else #define EIGEN_HAS_C99_MATH 0 @@ -597,33 +389,14 @@ #endif // Does the compiler support result_of? -// It's likely that MSVC 2013 supports result_of but I couldn't not find a good source for that, -// so let's be conservative. #ifndef EIGEN_HAS_STD_RESULT_OF -#if EIGEN_MAX_CPP_VER>=11 && \ - (__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L) || EIGEN_COMP_MSVC >= 1900) +#if EIGEN_MAX_CPP_VER>=11 && ((__has_feature(cxx_lambdas) || (defined(__cplusplus) && __cplusplus >= 201103L))) #define EIGEN_HAS_STD_RESULT_OF 1 #else #define EIGEN_HAS_STD_RESULT_OF 0 #endif #endif -#ifndef EIGEN_HAS_ALIGNAS -#if EIGEN_MAX_CPP_VER>=11 && EIGEN_HAS_CXX11 && \ - ( __has_feature(cxx_alignas) \ - || EIGEN_HAS_CXX14 \ - || (EIGEN_COMP_MSVC >= 1800) \ - || (EIGEN_GNUC_AT_LEAST(4,8)) \ - || (EIGEN_COMP_CLANG>=305) \ - || (EIGEN_COMP_ICC>=1500) \ - || (EIGEN_COMP_PGI>=1500) \ - || (EIGEN_COMP_SUNCC>=0x5130)) -#define EIGEN_HAS_ALIGNAS 1 -#else -#define EIGEN_HAS_ALIGNAS 0 -#endif -#endif - // Does the compiler support type_traits? // - full support of type traits was added only to GCC 5.1.0. // - 20150626 corresponds to the last release of 4.x libstdc++ @@ -641,12 +414,10 @@ // Does the compiler support variadic templates? #ifndef EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ - && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_COMP_NVCC >= 80000) ) + && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) ) // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 -#elif EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) && defined(SYCL_DEVICE_ONLY) -#define EIGEN_HAS_VARIADIC_TEMPLATES 1 #else #define EIGEN_HAS_VARIADIC_TEMPLATES 0 #endif @@ -654,22 +425,22 @@ // Does the compiler fully support const expressions? (as in c++14) #ifndef EIGEN_HAS_CONSTEXPR - #if defined(EIGEN_CUDACC) - // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above - #if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_COMP_NVCC >= 70500)) - #define EIGEN_HAS_CONSTEXPR 1 - #endif - #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ - (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L)) || \ - (EIGEN_COMP_CLANG >= 306 && (__cplusplus > 199711L))) - #define EIGEN_HAS_CONSTEXPR 1 - #endif - #ifndef EIGEN_HAS_CONSTEXPR - #define EIGEN_HAS_CONSTEXPR 0 - #endif +#ifdef __CUDACC__ +// Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above +#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500)) + #define EIGEN_HAS_CONSTEXPR 1 +#endif +#elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ + (EIGEN_GNUC_AT_LEAST(4,8) && (__cplusplus > 199711L))) +#define EIGEN_HAS_CONSTEXPR 1 +#endif -#endif // EIGEN_HAS_CONSTEXPR +#ifndef EIGEN_HAS_CONSTEXPR +#define EIGEN_HAS_CONSTEXPR 0 +#endif + +#endif // Does the compiler support C++11 math? // Let's be conservative and enable the default C++11 implementation only if we are sure it exists @@ -707,80 +478,15 @@ #endif #endif -#ifndef EIGEN_HAS_CXX11_ATOMIC - #if EIGEN_MAX_CPP_VER>=11 && \ - (__has_feature(cxx_atomic) \ - || (__cplusplus > 201103L) \ - || ((__cplusplus >= 201103L) && (EIGEN_COMP_MSVC==0 || EIGEN_COMP_MSVC >= 1700))) - #define EIGEN_HAS_CXX11_ATOMIC 1 - #else - #define EIGEN_HAS_CXX11_ATOMIC 0 - #endif +/** Allows to disable some optimizations which might affect the accuracy of the result. + * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them. + * They currently include: + * - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization. + */ +#ifndef EIGEN_FAST_MATH +#define EIGEN_FAST_MATH 1 #endif -#ifndef EIGEN_HAS_CXX11_OVERRIDE_FINAL - #if EIGEN_MAX_CPP_VER>=11 && \ - (__cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1700) - #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 1 - #else - #define EIGEN_HAS_CXX11_OVERRIDE_FINAL 0 - #endif -#endif - -// NOTE: the required Apple's clang version is very conservative -// and it could be that XCode 9 works just fine. -// NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support -// and not tested. -#ifndef EIGEN_HAS_CXX17_OVERALIGN -#if EIGEN_MAX_CPP_VER>=17 && EIGEN_COMP_CXXVER>=17 && ( \ - (EIGEN_COMP_MSVC >= 1912) \ - || (EIGEN_GNUC_AT_LEAST(7,0)) \ - || ((!defined(__apple_build_version__)) && (EIGEN_COMP_CLANG>=500)) \ - || (( defined(__apple_build_version__)) && (__apple_build_version__>=10000000)) \ - ) -#define EIGEN_HAS_CXX17_OVERALIGN 1 -#else -#define EIGEN_HAS_CXX17_OVERALIGN 0 -#endif -#endif - -#if defined(EIGEN_CUDACC) && EIGEN_HAS_CONSTEXPR - // While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules - #if defined(__NVCC__) - // nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr - #ifdef __CUDACC_RELAXED_CONSTEXPR__ - #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC - #endif - #elif defined(__clang__) && defined(__CUDA__) && __has_feature(cxx_relaxed_constexpr) - // clang++ always considers constexpr functions as implicitly __host__ __device__ - #define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC - #endif -#endif - -// Does the compiler support the __int128 and __uint128_t extensions for 128-bit -// integer arithmetic? -// -// Clang and GCC define __SIZEOF_INT128__ when these extensions are supported, -// but we avoid using them in certain cases: -// -// * Building using Clang for Windows, where the Clang runtime library has -// 128-bit support only on LP64 architectures, but Windows is LLP64. -#ifndef EIGEN_HAS_BUILTIN_INT128 -#if defined(__SIZEOF_INT128__) && !(EIGEN_OS_WIN && EIGEN_COMP_CLANG) -#define EIGEN_HAS_BUILTIN_INT128 1 -#else -#define EIGEN_HAS_BUILTIN_INT128 0 -#endif -#endif - -//------------------------------------------------------------------------------------------ -// Preprocessor programming helpers -//------------------------------------------------------------------------------------------ - -// This macro can be used to prevent from macro expansion, e.g.: -// std::max EIGEN_NOT_A_MACRO(a,b) -#define EIGEN_NOT_A_MACRO - #define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl; // concatenate two tokens @@ -812,7 +518,7 @@ // Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval Eigen::MatrixBase::eval() const' // : function body not available // See also bug 1367 -#if EIGEN_GNUC_AT_LEAST(4,2) && !defined(SYCL_DEVICE_ONLY) +#if EIGEN_GNUC_AT_LEAST(4,2) #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline #else #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE @@ -832,43 +538,12 @@ #define EIGEN_PERMISSIVE_EXPR #endif -// GPU stuff - -// Disable some features when compiling with GPU compilers (NVCC/clang-cuda/SYCL/HIPCC) -#if defined(EIGEN_CUDACC) || defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIPCC) - // Do not try asserts on device code - #ifndef EIGEN_NO_DEBUG - #define EIGEN_NO_DEBUG - #endif - - #ifdef EIGEN_INTERNAL_DEBUGGING - #undef EIGEN_INTERNAL_DEBUGGING - #endif - - #ifdef EIGEN_EXCEPTIONS - #undef EIGEN_EXCEPTIONS - #endif -#endif - -#if defined(SYCL_DEVICE_ONLY) - #ifndef EIGEN_DONT_VECTORIZE - #define EIGEN_DONT_VECTORIZE - #endif - #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline)) -// All functions callable from CUDA/HIP code must be qualified with __device__ -#elif defined(EIGEN_GPUCC) - #define EIGEN_DEVICE_FUNC __host__ __device__ -#else - #define EIGEN_DEVICE_FUNC -#endif - - // this macro allows to get rid of linking errors about multiply defined functions. // - static is not very good because it prevents definitions from different object files to be merged. // So static causes the resulting linked executable to be bloated with multiple copies of the same function. // - inline is not perfect either as it unwantedly hints the compiler toward inlining the function. -#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC -#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline +#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline #ifdef NDEBUG # ifndef EIGEN_NO_DEBUG @@ -878,12 +553,8 @@ // eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89 #ifdef EIGEN_NO_DEBUG - #ifdef SYCL_DEVICE_ONLY // used to silence the warning on SYCL device - #define eigen_plain_assert(x) EIGEN_UNUSED_VARIABLE(x) - #else - #define eigen_plain_assert(x) - #endif -#else + #define eigen_plain_assert(x) +#else #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO namespace Eigen { namespace internal { @@ -956,7 +627,7 @@ // Suppresses 'unused variable' warnings. namespace Eigen { namespace internal { - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ignore_unused_variable(const T&) {} + template EIGEN_DEVICE_FUNC void ignore_unused_variable(const T&) {} } } #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); @@ -970,14 +641,169 @@ namespace Eigen { #endif -#if EIGEN_COMP_MSVC - // NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362. - // This workaround is ugly, but it does the job. -# define EIGEN_CONST_CONDITIONAL(cond) (void)0, cond +//------------------------------------------------------------------------------------------ +// Static and dynamic alignment control +// +// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES +// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively. +// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not, +// a default value is automatically computed based on architecture, compiler, and OS. +// +// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX} +// to be used to declare statically aligned buffers. +//------------------------------------------------------------------------------------------ + + +/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements. + * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled, + * so that vectorization doesn't affect binary compatibility. + * + * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link + * vectorized and non-vectorized code. + */ +#if (defined __CUDACC__) + #define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n) +#elif EIGEN_COMP_GNUC || EIGEN_COMP_PGI || EIGEN_COMP_IBM || EIGEN_COMP_ARM + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) +#elif EIGEN_COMP_MSVC + #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n)) +#elif EIGEN_COMP_SUNCC + // FIXME not sure about this one: + #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n))) #else -# define EIGEN_CONST_CONDITIONAL(cond) cond + #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler #endif +// If the user explicitly disable vectorization, then we also disable alignment +#if defined(EIGEN_DONT_VECTORIZE) + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 0 +#elif defined(EIGEN_VECTORIZE_AVX512) + // 64 bytes static alignmeent is preferred only if really required + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 +#elif defined(__AVX__) + // 32 bytes static alignmeent is preferred only if really required + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 +#else + #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 +#endif + + +// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense +#define EIGEN_MIN_ALIGN_BYTES 16 + +// Defined the boundary (in bytes) on which the data needs to be aligned. Note +// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be +// aligned at all regardless of the value of this #define. + +#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 +#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY. +#endif + +// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprectated +// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0 +#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN) + #ifdef EIGEN_MAX_STATIC_ALIGN_BYTES + #undef EIGEN_MAX_STATIC_ALIGN_BYTES + #endif + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 +#endif + +#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES + + // Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES + + // 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable + // 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always + // enable alignment, but it can be a cause of problems on some platforms, so we just disable it in + // certain common platform (compiler+architecture combinations) to avoid these problems. + // Only static alignment is really problematic (relies on nonstandard compiler extensions), + // try to keep heap alignment even when we have to disable static alignment. + #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64) + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #elif EIGEN_ARCH_ARM_OR_ARM64 && EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(4, 6) + // Old versions of GCC on ARM, at least 4.4, were once seen to have buggy static alignment support. + // Not sure which version fixed it, hopefully it doesn't affect 4.7, which is still somewhat in use. + // 4.8 and newer seem definitely unaffected. + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 + #else + #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 + #endif + + // static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX + #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \ + && !EIGEN_GCC3_OR_OLDER \ + && !EIGEN_COMP_SUNCC \ + && !EIGEN_OS_QNX + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1 + #else + #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0 + #endif + + #if EIGEN_ARCH_WANTS_STACK_ALIGNMENT + #define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES + #else + #define EIGEN_MAX_STATIC_ALIGN_BYTES 0 + #endif + +#endif + +// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_ALIGN_BYTES +#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES0 is the true test whether we want to align arrays on the stack or not. +// It takes into account both the user choice to explicitly enable/disable alignment (by settting EIGEN_MAX_STATIC_ALIGN_BYTES) +// and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). +// Henceforth, only EIGEN_MAX_STATIC_ALIGN_BYTES should be used. + + +// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY +#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8) +#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16) +#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32) +#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64) +#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 +#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES) +#else +#define EIGEN_ALIGN_MAX +#endif + + +// Dynamic alignment control + +#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES>0 +#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN. +#endif + +#ifdef EIGEN_DONT_ALIGN + #ifdef EIGEN_MAX_ALIGN_BYTES + #undef EIGEN_MAX_ALIGN_BYTES + #endif + #define EIGEN_MAX_ALIGN_BYTES 0 +#elif !defined(EIGEN_MAX_ALIGN_BYTES) + #define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#endif + +#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES +#else +#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES +#endif + + +#ifndef EIGEN_UNALIGNED_VECTORIZE +#define EIGEN_UNALIGNED_VECTORIZE 1 +#endif + +//---------------------------------------------------------------------- + + #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD #define EIGEN_RESTRICT #endif @@ -985,6 +811,10 @@ namespace Eigen { #define EIGEN_RESTRICT __restrict #endif +#ifndef EIGEN_STACK_ALLOCATION_LIMIT +// 131072 == 128 KB +#define EIGEN_STACK_ALLOCATION_LIMIT 131072 +#endif #ifndef EIGEN_DEFAULT_IO_FORMAT #ifdef EIGEN_MAKING_DOCS @@ -999,32 +829,7 @@ namespace Eigen { // just an empty macro ! #define EIGEN_EMPTY - -// When compiling CUDA/HIP device code with NVCC or HIPCC -// pull in math functions from the global namespace. -// In host mode, and when device code is compiled with clang, -// use the std versions. -#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE) - #define EIGEN_USING_STD_MATH(FUNC) using ::FUNC; -#else - #define EIGEN_USING_STD_MATH(FUNC) using std::FUNC; -#endif - - -// When compiling HIP device code with HIPCC, certain functions -// from the stdlib need to be pulled in from the global namespace -// (as opposed to from the std:: namespace). This is because HIPCC -// does not natively support all the std:: routines in device code. -// Instead it contains header files that declare the corresponding -// routines in the global namespace such they can be used in device code. -#if defined(EIGEN_HIP_DEVICE_COMPILE) - #define EIGEN_USING_STD(FUNC) using ::FUNC; -#else - #define EIGEN_USING_STD(FUNC) using std::FUNC; -#endif - - -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_COMP_NVCC) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; @@ -1103,8 +908,7 @@ namespace Eigen { typedef typename Eigen::internal::ref_selector::type Nested; \ typedef typename Eigen::internal::traits::StorageKind StorageKind; \ typedef typename Eigen::internal::traits::StorageIndex StorageIndex; \ - enum CompileTimeTraits \ - { RowsAtCompileTime = Eigen::internal::traits::RowsAtCompileTime, \ + enum { RowsAtCompileTime = Eigen::internal::traits::RowsAtCompileTime, \ ColsAtCompileTime = Eigen::internal::traits::ColsAtCompileTime, \ Flags = Eigen::internal::traits::Flags, \ SizeAtCompileTime = Base::SizeAtCompileTime, \ @@ -1149,14 +953,6 @@ namespace Eigen { #define EIGEN_IMPLIES(a,b) (!(a) || (b)) -#if EIGEN_HAS_BUILTIN(__builtin_expect) || EIGEN_COMP_GNUC -#define EIGEN_PREDICT_FALSE(x) (__builtin_expect(x, false)) -#define EIGEN_PREDICT_TRUE(x) (__builtin_expect(false || (x), true)) -#else -#define EIGEN_PREDICT_FALSE(x) (x) -#define EIGEN_PREDICT_TRUE(x) (x) -#endif - // the expression type of a standard coefficient wise binary operation #define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS,RHS,OPNAME) \ CwiseBinaryOp< \ @@ -1188,14 +984,14 @@ namespace Eigen { const typename internal::plain_constant_type::type, const EXPR> // Workaround for MSVC 2010 (see ML thread "patch with compile for for MSVC 2010") -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC_STRICT<=1600) +#if EIGEN_COMP_MSVC_STRICT<=1600 #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) typename internal::enable_if::type #else #define EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(X) X #endif #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) \ - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE \ + template EIGEN_DEVICE_FUNC inline \ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,typename internal::promote_scalar_arg::type,OPNAME))\ (METHOD)(const T& scalar) const { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ @@ -1204,7 +1000,7 @@ namespace Eigen { } #define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD,OPNAME) \ - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend \ + template EIGEN_DEVICE_FUNC inline friend \ EIGEN_MSVC10_WORKAROUND_BINARYOP_RETURN_TYPE(const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(typename internal::promote_scalar_arg::type,Derived,OPNAME)) \ (METHOD)(const T& scalar, const StorageBaseType& matrix) { \ typedef typename internal::promote_scalar_arg::type PromotedT; \ @@ -1217,23 +1013,15 @@ namespace Eigen { EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD,OPNAME) -#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE) - #define EIGEN_EXCEPTIONS -#endif - - #ifdef EIGEN_EXCEPTIONS # define EIGEN_THROW_X(X) throw X # define EIGEN_THROW throw # define EIGEN_TRY try # define EIGEN_CATCH(X) catch (X) #else -# if defined(EIGEN_CUDA_ARCH) +# ifdef __CUDA_ARCH__ # define EIGEN_THROW_X(X) asm("trap;") # define EIGEN_THROW asm("trap;") -# elif defined(EIGEN_HIP_DEVICE_COMPILE) -# define EIGEN_THROW_X(X) asm("s_trap 0") -# define EIGEN_THROW asm("s_trap 0") # else # define EIGEN_THROW_X(X) std::abort() # define EIGEN_THROW std::abort() @@ -1253,47 +1041,13 @@ namespace Eigen { # define EIGEN_NOEXCEPT # define EIGEN_NOEXCEPT_IF(x) # define EIGEN_NO_THROW throw() -# if EIGEN_COMP_MSVC || EIGEN_COMP_CXXVER>=17 +# if EIGEN_COMP_MSVC // MSVC does not support exception specifications (warning C4290), - // and they are deprecated in c++11 anyway. This is even an error in c++17. + // and they are deprecated in c++11 anyway. # define EIGEN_EXCEPTION_SPEC(X) throw() # else # define EIGEN_EXCEPTION_SPEC(X) throw(X) # endif #endif -#if EIGEN_HAS_VARIADIC_TEMPLATES -// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input. -namespace Eigen { -namespace internal { - -inline bool all(){ return true; } - -template -bool all(T t, Ts ... ts){ return t && all(ts...); } - -} -} -#endif - -#if EIGEN_HAS_CXX11_OVERRIDE_FINAL -// provide override and final specifiers if they are available: -# define EIGEN_OVERRIDE override -# define EIGEN_FINAL final -#else -# define EIGEN_OVERRIDE -# define EIGEN_FINAL -#endif - -// Wrapping #pragma unroll in a macro since it is required for SYCL -#if defined(SYCL_DEVICE_ONLY) - #if defined(_MSC_VER) - #define EIGEN_UNROLL_LOOP __pragma(unroll) - #else - #define EIGEN_UNROLL_LOOP _Pragma("unroll") - #endif -#else - #define EIGEN_UNROLL_LOOP -#endif - #endif // EIGEN_MACROS_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Memory.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Memory.h index 1b12544d2..291383c58 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Memory.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Memory.h @@ -63,27 +63,14 @@ namespace Eigen { namespace internal { -EIGEN_DEVICE_FUNC +EIGEN_DEVICE_FUNC inline void throw_std_bad_alloc() { #ifdef EIGEN_EXCEPTIONS throw std::bad_alloc(); #else std::size_t huge = static_cast(-1); - #if defined(EIGEN_HIPCC) - // - // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining), - // and as a consequence the code in the #else block triggers the hipcc warning : - // "no overloaded function has restriction specifiers that are compatible with the ambient context" - // - // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects - // the same on "operator new" - // Reverting code back to the old version in this #if block for the hipcc compiler - // - new int[huge]; - #else ::operator new(huge); - #endif #endif } @@ -96,26 +83,19 @@ inline void throw_std_bad_alloc() /** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned. * Fast, but wastes 16 additional bytes of memory. Does not throw any exception. */ -EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) +inline void* handmade_aligned_malloc(std::size_t size) { - eigen_assert(alignment >= sizeof(void*) && (alignment & (alignment-1)) == 0 && "Alignment must be at least sizeof(void*) and a power of 2"); - - EIGEN_USING_STD(malloc) - void *original = malloc(size+alignment); - + void *original = std::malloc(size+EIGEN_DEFAULT_ALIGN_BYTES); if (original == 0) return 0; - void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(alignment-1))) + alignment); + void *aligned = reinterpret_cast((reinterpret_cast(original) & ~(std::size_t(EIGEN_DEFAULT_ALIGN_BYTES-1))) + EIGEN_DEFAULT_ALIGN_BYTES); *(reinterpret_cast(aligned) - 1) = original; return aligned; } /** \internal Frees memory allocated with handmade_aligned_malloc */ -EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void *ptr) +inline void handmade_aligned_free(void *ptr) { - if (ptr) { - EIGEN_USING_STD(free) - free(*(reinterpret_cast(ptr) - 1)); - } + if (ptr) std::free(*(reinterpret_cast(ptr) - 1)); } /** \internal @@ -134,7 +114,7 @@ inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = void *previous_aligned = static_cast(original)+previous_offset; if(aligned!=previous_aligned) std::memmove(aligned, previous_aligned, size); - + *(reinterpret_cast(aligned) - 1) = original; return aligned; } @@ -162,7 +142,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() { eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)"); } -#else +#else EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {} #endif @@ -176,12 +156,9 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) void *result; #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED - - EIGEN_USING_STD(malloc) - result = malloc(size); - + result = std::malloc(size); #if EIGEN_DEFAULT_ALIGN_BYTES==16 - eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade aligned memory allocator."); + eigen_assert((size<16 || (std::size_t(result)%16)==0) && "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback to handmade alignd memory allocator."); #endif #else result = handmade_aligned_malloc(size); @@ -197,10 +174,7 @@ EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) { #if (EIGEN_DEFAULT_ALIGN_BYTES==0) || EIGEN_MALLOC_ALREADY_ALIGNED - - EIGEN_USING_STD(free) - free(ptr); - + std::free(ptr); #else handmade_aligned_free(ptr); #endif @@ -244,9 +218,7 @@ template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std: { check_that_malloc_is_allowed(); - EIGEN_USING_STD(malloc) - void *result = malloc(size); - + void *result = std::malloc(size); if(!result && size) throw_std_bad_alloc(); return result; @@ -260,8 +232,7 @@ template EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *ptr) { - EIGEN_USING_STD(free) - free(ptr); + std::free(ptr); } template inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) @@ -360,7 +331,7 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size) { destruct_elements_of_array(ptr, size); - Eigen::internal::aligned_free(ptr); + aligned_free(ptr); } /** \internal Deletes objects constructed with conditional_aligned_new @@ -500,8 +471,8 @@ EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index } /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size - */ -template + */ +template inline Index first_multiple(Index size, Index base) { return ((size+base-1)/base)*base; @@ -522,8 +493,7 @@ template struct smart_copy_helper { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); - EIGEN_USING_STD(memcpy) - memcpy(target, start, size); + std::memcpy(target, start, size); } }; @@ -532,7 +502,7 @@ template struct smart_copy_helper { { std::copy(start, end, target); } }; -// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. +// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise. template struct smart_memmove_helper; template void smart_memmove(const T* start, const T* end, T* target) @@ -552,15 +522,15 @@ template struct smart_memmove_helper { template struct smart_memmove_helper { static inline void run(const T* start, const T* end, T* target) - { + { if (UIntPtr(target) < UIntPtr(start)) { std::copy(start, end, target); } - else + else { std::ptrdiff_t count = (std::ptrdiff_t(end)-std::ptrdiff_t(start)) / sizeof(T); - std::copy_backward(start, end, target + count); + std::copy_backward(start, end, target + count); } } }; @@ -572,7 +542,7 @@ template struct smart_memmove_helper { // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA // to the appropriate stack allocation function -#if ! defined EIGEN_ALLOCA && ! defined EIGEN_GPU_COMPILE_PHASE +#ifndef EIGEN_ALLOCA #if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca) #define EIGEN_ALLOCA alloca #elif EIGEN_COMP_MSVC @@ -580,15 +550,6 @@ template struct smart_memmove_helper { #endif #endif -// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is -// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because -// the compiler still emits bad code because stack allocation checks use "<=". -// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772 -// is fixed. -#if defined(__clang__) && defined(__thumb__) - #undef EIGEN_ALLOCA -#endif - // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data // at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions. template class aligned_stack_memory_handler : noncopyable @@ -600,14 +561,12 @@ template class aligned_stack_memory_handler : noncopyable * In this case, the buffer elements will also be destructed when this handler will be destructed. * Finally, if \a dealloc is true, then the pointer \a ptr is freed. **/ - EIGEN_DEVICE_FUNC aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc) : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) Eigen::internal::construct_elements_of_array(m_ptr, size); } - EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() { if(NumTraits::RequireInitialization && m_ptr) @@ -621,60 +580,6 @@ template class aligned_stack_memory_handler : noncopyable bool m_deallocate; }; -#ifdef EIGEN_ALLOCA - -template::Evaluate && Xpr::MaxSizeAtCompileTime==Dynamic - > -struct local_nested_eval_wrapper -{ - static const bool NeedExternalBuffer = false; - typedef typename Xpr::Scalar Scalar; - typedef typename nested_eval::type ObjectType; - ObjectType object; - - EIGEN_DEVICE_FUNC - local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr) - { - EIGEN_UNUSED_VARIABLE(ptr); - eigen_internal_assert(ptr==0); - } -}; - -template -struct local_nested_eval_wrapper -{ - static const bool NeedExternalBuffer = true; - typedef typename Xpr::Scalar Scalar; - typedef typename plain_object_eval::type PlainObject; - typedef Map ObjectType; - ObjectType object; - - EIGEN_DEVICE_FUNC - local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) - : object(ptr==0 ? reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(Scalar)*xpr.size())) : ptr, xpr.rows(), xpr.cols()), - m_deallocate(ptr==0) - { - if(NumTraits::RequireInitialization && object.data()) - Eigen::internal::construct_elements_of_array(object.data(), object.size()); - object = xpr; - } - - EIGEN_DEVICE_FUNC - ~local_nested_eval_wrapper() - { - if(NumTraits::RequireInitialization && object.data()) - Eigen::internal::destruct_elements_of_array(object.data(), object.size()); - if(m_deallocate) - Eigen::internal::aligned_free(object.data()); - } - -private: - bool m_deallocate; -}; - -#endif // EIGEN_ALLOCA - template class scoped_array : noncopyable { T* m_ptr; @@ -698,15 +603,13 @@ template void swap(scoped_array &a,scoped_array &b) { std::swap(a.ptr(),b.ptr()); } - + } // end namespace internal /** \internal - * - * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates, - * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack - * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform - * (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap. + * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack + * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform + * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap. * The allocated buffer is automatically deleted when exiting the scope of this declaration. * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs. * Here is an example: @@ -717,17 +620,9 @@ template void swap(scoped_array &a,scoped_array &b) * } * \endcode * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token. - * - * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to - * \code - * typename internal::nested_eval::type NAME(XPR); - * \endcode - * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown. - * This is accomplished through alloca if this later is supported and if the required number of bytes - * is below EIGEN_STACK_ALLOCATION_LIMIT. */ #ifdef EIGEN_ALLOCA - + #if EIGEN_DEFAULT_ALIGN_BYTES>0 // We always manually re-align the result of EIGEN_ALLOCA. // If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment. @@ -744,23 +639,13 @@ template void swap(scoped_array &a,scoped_array &b) : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) ); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT) - - #define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) \ - Eigen::internal::local_nested_eval_wrapper EIGEN_CAT(NAME,_wrapper)(XPR, reinterpret_cast( \ - ( (Eigen::internal::local_nested_eval_wrapper::NeedExternalBuffer) && ((sizeof(typename XPR_T::Scalar)*XPR.size())<=EIGEN_STACK_ALLOCATION_LIMIT) ) \ - ? EIGEN_ALIGNED_ALLOCA( sizeof(typename XPR_T::Scalar)*XPR.size() ) : 0 ) ) ; \ - typename Eigen::internal::local_nested_eval_wrapper::ObjectType NAME(EIGEN_CAT(NAME,_wrapper).object) - #else #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \ Eigen::internal::check_size_for_overflow(SIZE); \ TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE)); \ Eigen::internal::aligned_stack_memory_handler EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true) - - -#define ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) typename Eigen::internal::nested_eval::type NAME(XPR) - + #endif @@ -768,17 +653,6 @@ template void swap(scoped_array &a,scoped_array &b) *** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF] *** *****************************************************************************/ -#if EIGEN_HAS_CXX17_OVERALIGN - -// C++17 -> no need to bother about alignment anymore :) - -#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) -#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) -#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW -#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) - -#else - #if EIGEN_MAX_ALIGN_BYTES!=0 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \ @@ -814,14 +688,8 @@ template void swap(scoped_array &a,scoped_array &b) #endif #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true) -#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool( \ - ((Size)!=Eigen::Dynamic) && \ - (((EIGEN_MAX_ALIGN_BYTES>=16) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES )==0)) || \ - ((EIGEN_MAX_ALIGN_BYTES>=32) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/2)==0)) || \ - ((EIGEN_MAX_ALIGN_BYTES>=64) && ((sizeof(Scalar)*(Size))%(EIGEN_MAX_ALIGN_BYTES/4)==0)) ))) - -#endif +#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \ + EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%EIGEN_MAX_ALIGN_BYTES==0))) /****************************************************************************/ @@ -835,13 +703,13 @@ template void swap(scoped_array &a,scoped_array &b) * - 32 bytes alignment if AVX is enabled. * - 64 bytes alignment if AVX512 is enabled. * -* This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented +* This can be controled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented * \link TopicPreprocessorDirectivesPerformance there \endlink. * * Example: * \code * // Matrix4f requires 16 bytes alignment: -* std::map< int, Matrix4f, std::less, +* std::map< int, Matrix4f, std::less, * aligned_allocator > > my_map_mat4; * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator: * std::map< int, Vector3f > my_map_vec3; @@ -876,19 +744,18 @@ public: ~aligned_allocator() {} - #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0) - // In gcc std::allocator::max_size() is bugged making gcc triggers a warning: - // eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807 - // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544 - size_type max_size() const { - return (std::numeric_limits::max)()/sizeof(T); - } - #endif - pointer allocate(size_type num, const void* /*hint*/ = 0) { internal::check_size_for_overflow(num); - return static_cast( internal::aligned_malloc(num * sizeof(T)) ); + size_type size = num * sizeof(T); +#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0) + // workaround gcc bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544 + // It triggered eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807 + if(size>=std::size_t((std::numeric_limits::max)())) + return 0; + else +#endif + return static_cast( internal::aligned_malloc(size) ); } void deallocate(pointer p, size_type /*num*/) diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Meta.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Meta.h index e9e2f1873..9b61ff037 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/Meta.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/Meta.h @@ -11,18 +11,9 @@ #ifndef EIGEN_META_H #define EIGEN_META_H -#if defined(EIGEN_GPU_COMPILE_PHASE) - - #include - - #if defined(EIGEN_CUDA_ARCH) - #include - #endif - - #if defined(EIGEN_HIP_DEVICE_COMPILE) - #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" - #endif - +#if defined(__CUDA_ARCH__) +#include +#include #endif #if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L @@ -63,21 +54,15 @@ typedef std::size_t UIntPtr; struct true_type { enum { value = 1 }; }; struct false_type { enum { value = 0 }; }; -template -struct bool_constant; - -template<> -struct bool_constant : true_type {}; - -template<> -struct bool_constant : false_type {}; - template struct conditional { typedef Then type; }; template struct conditional { typedef Else type; }; +template struct is_same { enum { value = 0 }; }; +template struct is_same { enum { value = 1 }; }; + template struct remove_reference { typedef T type; }; template struct remove_reference { typedef T type; }; @@ -112,31 +97,23 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; -template struct is_same { enum { value = 0 }; }; -template struct is_same { enum { value = 1 }; }; - -template< class T > -struct is_void : is_same::type> {}; - #if EIGEN_HAS_CXX11 -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; using std::is_integral; #else -template struct is_integral { enum { value = false }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; +template struct is_integral { enum { value = false }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; #if EIGEN_COMP_MSVC -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral{ enum { value = true }; }; #endif #endif @@ -174,11 +151,6 @@ template struct add_const_on_value_type { typedef T const template struct add_const_on_value_type { typedef T const* const type; }; template struct add_const_on_value_type { typedef T const* const type; }; -#if EIGEN_HAS_CXX11 - -using std::is_convertible; - -#else template struct is_convertible_impl @@ -192,19 +164,16 @@ private: struct yes {int a[1];}; struct no {int a[2];}; - template - static yes test(T, int); - - template + static yes test(const To&, int); static no test(any_conversion, ...); public: - static typename internal::remove_reference::type* ms_from; + static From ms_from; #ifdef __INTEL_COMPILER #pragma warning push #pragma warning ( disable : 2259 ) #endif - enum { value = sizeof(test(*ms_from, 0))==sizeof(yes) }; + enum { value = sizeof(test(ms_from, 0))==sizeof(yes) }; #ifdef __INTEL_COMPILER #pragma warning pop #endif @@ -213,17 +182,10 @@ public: template struct is_convertible { - enum { value = is_convertible_impl::value }; + enum { value = is_convertible_impl::type, + typename remove_all::type>::value }; }; -template -struct is_convertible { enum { value = false }; }; - -template -struct is_convertible { enum { value = true }; }; - -#endif - /** \internal Allows to enable/disable an overload * according to a compile time condition. */ @@ -232,7 +194,7 @@ template struct enable_if; template struct enable_if { typedef T type; }; -#if defined(EIGEN_GPU_COMPILE_PHASE) +#if defined(__CUDA_ARCH__) #if !defined(__FLT_EPSILON__) #define __FLT_EPSILON__ FLT_EPSILON #define __DBL_EPSILON__ DBL_EPSILON @@ -254,31 +216,13 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static float epsilon() { return __FLT_EPSILON__; } EIGEN_DEVICE_FUNC - static float (max)() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_MAX_NORMAL_F; - #else - return HIPRT_MAX_NORMAL_F; - #endif - } + static float (max)() { return CUDART_MAX_NORMAL_F; } EIGEN_DEVICE_FUNC static float (min)() { return FLT_MIN; } EIGEN_DEVICE_FUNC - static float infinity() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_INF_F; - #else - return HIPRT_INF_F; - #endif - } + static float infinity() { return CUDART_INF_F; } EIGEN_DEVICE_FUNC - static float quiet_NaN() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_NAN_F; - #else - return HIPRT_NAN_F; - #endif - } + static float quiet_NaN() { return CUDART_NAN_F; } }; template<> struct numeric_limits { @@ -289,21 +233,9 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static double (min)() { return DBL_MIN; } EIGEN_DEVICE_FUNC - static double infinity() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_INF; - #else - return HIPRT_INF; - #endif - } + static double infinity() { return CUDART_INF; } EIGEN_DEVICE_FUNC - static double quiet_NaN() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_NAN; - #else - return HIPRT_NAN; - #endif - } + static double quiet_NaN() { return CUDART_NAN; } }; template<> struct numeric_limits { @@ -359,22 +291,13 @@ template<> struct numeric_limits EIGEN_DEVICE_FUNC static unsigned long long (min)() { return 0; } }; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC - static bool epsilon() { return false; } - EIGEN_DEVICE_FUNC - static bool (max)() { return true; } - EIGEN_DEVICE_FUNC - static bool (min)() { return false; } -}; } #endif /** \internal - * A base class do disable default copy ctor and copy assignment operator. + * A base class do disable default copy ctor and copy assignement operator. */ class noncopyable { @@ -385,59 +308,6 @@ protected: EIGEN_DEVICE_FUNC ~noncopyable() {} }; -/** \internal - * Provides access to the number of elements in the object of as a compile-time constant expression. - * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default). - * - * Similar to std::tuple_size, but more general. - * - * It currently supports: - * - any types T defining T::SizeAtCompileTime - * - plain C arrays as T[N] - * - std::array (c++11) - * - some internal types such as SingleRange and AllRange - * - * The second template parameter eases SFINAE-based specializations. - */ -template struct array_size { - enum { value = Dynamic }; -}; - -template struct array_size::type> { - enum { value = T::SizeAtCompileTime }; -}; - -template struct array_size { - enum { value = N }; -}; -template struct array_size { - enum { value = N }; -}; - -#if EIGEN_HAS_CXX11 -template struct array_size > { - enum { value = N }; -}; -template struct array_size > { - enum { value = N }; -}; -#endif - -/** \internal - * Analogue of the std::size free function. - * It returns the size of the container or view \a x of type \c T - * - * It currently supports: - * - any types T defining a member T::size() const - * - plain C arrays as T[N] - * - */ -template -Index size(const T& x) { return x.size(); } - -template -Index size(const T (&) [N]) { return N; } - /** \internal * Convenient struct to get the result type of a unary or binary functor. * @@ -535,10 +405,10 @@ struct meta_no { char a[2]; }; template struct has_ReturnType { - template static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0); - template static meta_no testFunctor(...); + template static meta_yes testFunctor(typename C::ReturnType const *); + template static meta_no testFunctor(...); - enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; + enum { value = sizeof(testFunctor(0)) == sizeof(meta_yes) }; }; template const T* return_ptr(); @@ -621,27 +491,17 @@ template struct scalar_product_traits // typedef typename scalar_product_traits::type, typename remove_all::type>::ReturnType type; // }; -/** \internal Obtains a POD type suitable to use as storage for an object of a size - * of at most Len bytes, aligned as specified by \c Align. - */ -template -struct aligned_storage { - struct type { - EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len]; - }; -}; - } // end namespace internal namespace numext { - -#if defined(EIGEN_GPU_COMPILE_PHASE) + +#if defined(__CUDA_ARCH__) template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } #else template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } #endif -#if defined(EIGEN_GPU_COMPILE_PHASE) +#if defined(__CUDA_ARCH__) using internal::device::numeric_limits; #else using std::numeric_limits; @@ -650,7 +510,6 @@ using std::numeric_limits; // Integer division with rounding up. // T is assumed to be an integer type with a>=0, and b>0 template -EIGEN_DEVICE_FUNC T div_ceil(const T &a, const T &b) { return (a+b-1) / b; @@ -658,35 +517,23 @@ T div_ceil(const T &a, const T &b) // The aim of the following functions is to bypass -Wfloat-equal warnings // when we really want a strict equality comparison on floating points. -template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +template EIGEN_STRONG_INLINE bool equal_strict(const X& x,const Y& y) { return x == y; } -#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) -template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +template<> EIGEN_STRONG_INLINE bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } -template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +template<> EIGEN_STRONG_INLINE bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } -#endif -template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +template EIGEN_STRONG_INLINE bool not_equal_strict(const X& x,const Y& y) { return x != y; } -#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) -template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +template<> EIGEN_STRONG_INLINE bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } -template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +template<> EIGEN_STRONG_INLINE bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } -#endif - -/** \internal extract the bits of the float \a x */ -inline unsigned int as_uint(float x) -{ - unsigned int ret; - std::memcpy(&ret, &x, sizeof(float)); - return ret; -} } // end namespace numext @@ -697,10 +544,6 @@ inline unsigned int as_uint(float x) #include namespace Eigen { namespace numext { -typedef std::uint8_t uint8_t; -typedef std::int8_t int8_t; -typedef std::uint16_t uint16_t; -typedef std::int16_t int16_t; typedef std::uint32_t uint32_t; typedef std::int32_t int32_t; typedef std::uint64_t uint64_t; @@ -713,10 +556,6 @@ typedef std::int64_t int64_t; #include namespace Eigen { namespace numext { -typedef ::uint8_t uint8_t; -typedef ::int8_t int8_t; -typedef ::uint16_t uint16_t; -typedef ::int16_t int16_t; typedef ::uint32_t uint32_t; typedef ::int32_t int32_t; typedef ::uint64_t uint64_t; @@ -725,4 +564,5 @@ typedef ::int64_t int64_t; } #endif + #endif // EIGEN_META_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ReshapedHelper.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/ReshapedHelper.h deleted file mode 100644 index 412432132..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/ReshapedHelper.h +++ /dev/null @@ -1,51 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2017 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - - -#ifndef EIGEN_RESHAPED_HELPER_H -#define EIGEN_RESHAPED_HELPER_H - -namespace Eigen { - -enum AutoSize_t { AutoSize }; -const int AutoOrder = 2; - -namespace internal { - -template -struct get_compiletime_reshape_size { - enum { value = get_fixed_value::value }; -}; - -template -Index get_runtime_reshape_size(SizeType size, Index /*other*/, Index /*total*/) { - return internal::get_runtime_value(size); -} - -template -struct get_compiletime_reshape_size { - enum { - other_size = get_fixed_value::value, - value = (TotalSize==Dynamic || other_size==Dynamic) ? Dynamic : TotalSize / other_size }; -}; - -inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) { - return total/other; -} - -template -struct get_compiletime_reshape_order { - enum { value = Order == AutoOrder ? Flags & RowMajorBit : Order }; -}; - -} - -} // end namespace Eigen - -#endif // EIGEN_RESHAPED_HELPER_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/StaticAssert.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/StaticAssert.h index 95107ff36..500e47792 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/StaticAssert.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/StaticAssert.h @@ -103,9 +103,7 @@ STORAGE_KIND_MUST_MATCH=1, STORAGE_INDEX_MUST_MATCH=1, CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, - SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1, - INVALID_TEMPLATE_PARAMETER=1, - GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS=1 + SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1 }; }; @@ -184,7 +182,7 @@ ) #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \ - EIGEN_STATIC_ASSERT(!Eigen::NumTraits::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) + EIGEN_STATIC_ASSERT(!NumTraits::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) // static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes @@ -194,8 +192,8 @@ YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES) #define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) \ - EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Eigen::Dynamic) && \ - (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Eigen::Dynamic), \ + EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Dynamic) && \ + (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Dynamic), \ THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS) #define EIGEN_STATIC_ASSERT_LVALUE(Derived) \ diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/SymbolicIndex.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/SymbolicIndex.h deleted file mode 100644 index 17cf46f05..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/SymbolicIndex.h +++ /dev/null @@ -1,293 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2017 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_SYMBOLIC_INDEX_H -#define EIGEN_SYMBOLIC_INDEX_H - -namespace Eigen { - -/** \namespace Eigen::symbolic - * \ingroup Core_Module - * - * This namespace defines a set of classes and functions to build and evaluate symbolic expressions of scalar type Index. - * Here is a simple example: - * - * \code - * // First step, defines symbols: - * struct x_tag {}; static const symbolic::SymbolExpr x; - * struct y_tag {}; static const symbolic::SymbolExpr y; - * struct z_tag {}; static const symbolic::SymbolExpr z; - * - * // Defines an expression: - * auto expr = (x+3)/y+z; - * - * // And evaluate it: (c++14) - * std::cout << expr.eval(x=6,y=3,z=-13) << "\n"; - * - * // In c++98/11, only one symbol per expression is supported for now: - * auto expr98 = (3-x)/2; - * std::cout << expr98.eval(x=6) << "\n"; - * \endcode - * - * It is currently only used internally to define and manipulate the Eigen::last and Eigen::lastp1 symbols in Eigen::seq and Eigen::seqN. - * - */ -namespace symbolic { - -template class Symbol; -template class NegateExpr; -template class AddExpr; -template class ProductExpr; -template class QuotientExpr; - -// A simple wrapper around an integral value to provide the eval method. -// We could also use a free-function symbolic_eval... -template -class ValueExpr { -public: - ValueExpr(IndexType val) : m_value(val) {} - template - IndexType eval_impl(const T&) const { return m_value; } -protected: - IndexType m_value; -}; - -// Specialization for compile-time value, -// It is similar to ValueExpr(N) but this version helps the compiler to generate better code. -template -class ValueExpr > { -public: - ValueExpr() {} - template - Index eval_impl(const T&) const { return N; } -}; - - -/** \class BaseExpr - * \ingroup Core_Module - * Common base class of any symbolic expressions - */ -template -class BaseExpr -{ -public: - const Derived& derived() const { return *static_cast(this); } - - /** Evaluate the expression given the \a values of the symbols. - * - * \param values defines the values of the symbols, it can either be a SymbolValue or a std::tuple of SymbolValue - * as constructed by SymbolExpr::operator= operator. - * - */ - template - Index eval(const T& values) const { return derived().eval_impl(values); } - -#if EIGEN_HAS_CXX14 - template - Index eval(Types&&... values) const { return derived().eval_impl(std::make_tuple(values...)); } -#endif - - NegateExpr operator-() const { return NegateExpr(derived()); } - - AddExpr > operator+(Index b) const - { return AddExpr >(derived(), b); } - AddExpr > operator-(Index a) const - { return AddExpr >(derived(), -a); } - ProductExpr > operator*(Index a) const - { return ProductExpr >(derived(),a); } - QuotientExpr > operator/(Index a) const - { return QuotientExpr >(derived(),a); } - - friend AddExpr > operator+(Index a, const BaseExpr& b) - { return AddExpr >(b.derived(), a); } - friend AddExpr,ValueExpr<> > operator-(Index a, const BaseExpr& b) - { return AddExpr,ValueExpr<> >(-b.derived(), a); } - friend ProductExpr,Derived> operator*(Index a, const BaseExpr& b) - { return ProductExpr,Derived>(a,b.derived()); } - friend QuotientExpr,Derived> operator/(Index a, const BaseExpr& b) - { return QuotientExpr,Derived>(a,b.derived()); } - - template - AddExpr > > operator+(internal::FixedInt) const - { return AddExpr > >(derived(), ValueExpr >()); } - template - AddExpr > > operator-(internal::FixedInt) const - { return AddExpr > >(derived(), ValueExpr >()); } - template - ProductExpr > > operator*(internal::FixedInt) const - { return ProductExpr > >(derived(),ValueExpr >()); } - template - QuotientExpr > > operator/(internal::FixedInt) const - { return QuotientExpr > >(derived(),ValueExpr >()); } - - template - friend AddExpr > > operator+(internal::FixedInt, const BaseExpr& b) - { return AddExpr > >(b.derived(), ValueExpr >()); } - template - friend AddExpr,ValueExpr > > operator-(internal::FixedInt, const BaseExpr& b) - { return AddExpr,ValueExpr > >(-b.derived(), ValueExpr >()); } - template - friend ProductExpr >,Derived> operator*(internal::FixedInt, const BaseExpr& b) - { return ProductExpr >,Derived>(ValueExpr >(),b.derived()); } - template - friend QuotientExpr >,Derived> operator/(internal::FixedInt, const BaseExpr& b) - { return QuotientExpr > ,Derived>(ValueExpr >(),b.derived()); } - -#if (!EIGEN_HAS_CXX14) - template - AddExpr > > operator+(internal::FixedInt (*)()) const - { return AddExpr > >(derived(), ValueExpr >()); } - template - AddExpr > > operator-(internal::FixedInt (*)()) const - { return AddExpr > >(derived(), ValueExpr >()); } - template - ProductExpr > > operator*(internal::FixedInt (*)()) const - { return ProductExpr > >(derived(),ValueExpr >()); } - template - QuotientExpr > > operator/(internal::FixedInt (*)()) const - { return QuotientExpr > >(derived(),ValueExpr >()); } - - template - friend AddExpr > > operator+(internal::FixedInt (*)(), const BaseExpr& b) - { return AddExpr > >(b.derived(), ValueExpr >()); } - template - friend AddExpr,ValueExpr > > operator-(internal::FixedInt (*)(), const BaseExpr& b) - { return AddExpr,ValueExpr > >(-b.derived(), ValueExpr >()); } - template - friend ProductExpr >,Derived> operator*(internal::FixedInt (*)(), const BaseExpr& b) - { return ProductExpr >,Derived>(ValueExpr >(),b.derived()); } - template - friend QuotientExpr >,Derived> operator/(internal::FixedInt (*)(), const BaseExpr& b) - { return QuotientExpr > ,Derived>(ValueExpr >(),b.derived()); } -#endif - - - template - AddExpr operator+(const BaseExpr &b) const - { return AddExpr(derived(), b.derived()); } - - template - AddExpr > operator-(const BaseExpr &b) const - { return AddExpr >(derived(), -b.derived()); } - - template - ProductExpr operator*(const BaseExpr &b) const - { return ProductExpr(derived(), b.derived()); } - - template - QuotientExpr operator/(const BaseExpr &b) const - { return QuotientExpr(derived(), b.derived()); } -}; - -template -struct is_symbolic { - // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class BaseExpr. - enum { value = internal::is_convertible >::value }; -}; - -/** Represents the actual value of a symbol identified by its tag - * - * It is the return type of SymbolValue::operator=, and most of the time this is only way it is used. - */ -template -class SymbolValue -{ -public: - /** Default constructor from the value \a val */ - SymbolValue(Index val) : m_value(val) {} - - /** \returns the stored value of the symbol */ - Index value() const { return m_value; } -protected: - Index m_value; -}; - -/** Expression of a symbol uniquely identified by the template parameter type \c tag */ -template -class SymbolExpr : public BaseExpr > -{ -public: - /** Alias to the template parameter \c tag */ - typedef tag Tag; - - SymbolExpr() {} - - /** Associate the value \a val to the given symbol \c *this, uniquely identified by its \c Tag. - * - * The returned object should be passed to ExprBase::eval() to evaluate a given expression with this specified runtime-time value. - */ - SymbolValue operator=(Index val) const { - return SymbolValue(val); - } - - Index eval_impl(const SymbolValue &values) const { return values.value(); } - -#if EIGEN_HAS_CXX14 - // C++14 versions suitable for multiple symbols - template - Index eval_impl(const std::tuple& values) const { return std::get >(values).value(); } -#endif -}; - -template -class NegateExpr : public BaseExpr > -{ -public: - NegateExpr(const Arg0& arg0) : m_arg0(arg0) {} - - template - Index eval_impl(const T& values) const { return -m_arg0.eval_impl(values); } -protected: - Arg0 m_arg0; -}; - -template -class AddExpr : public BaseExpr > -{ -public: - AddExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} - - template - Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) + m_arg1.eval_impl(values); } -protected: - Arg0 m_arg0; - Arg1 m_arg1; -}; - -template -class ProductExpr : public BaseExpr > -{ -public: - ProductExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} - - template - Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) * m_arg1.eval_impl(values); } -protected: - Arg0 m_arg0; - Arg1 m_arg1; -}; - -template -class QuotientExpr : public BaseExpr > -{ -public: - QuotientExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {} - - template - Index eval_impl(const T& values) const { return m_arg0.eval_impl(values) / m_arg1.eval_impl(values); } -protected: - Arg0 m_arg0; - Arg1 m_arg1; -}; - -} // end namespace symbolic - -} // end namespace Eigen - -#endif // EIGEN_SYMBOLIC_INDEX_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/Core/util/XprHelper.h b/uppsrc/plugin/Eigen/Eigen/src/Core/util/XprHelper.h index fd2db56a4..6bb497082 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Core/util/XprHelper.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Core/util/XprHelper.h @@ -49,12 +49,6 @@ template struct is_valid_index_type }; }; -// true if both types are not valid index types -template -struct valid_indexed_view_overload { - enum { value = !(internal::is_valid_index_type::value && internal::is_valid_index_type::value) }; -}; - // promote_scalar_arg is an helper used in operation between an expression and a scalar, like: // expression * scalar // Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of the given expression. @@ -132,7 +126,6 @@ template class variable_if_dynamic EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); eigen_assert(v == T(Value)); } EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T value() { return T(Value); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return T(Value); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {} }; @@ -143,7 +136,6 @@ template class variable_if_dynamic public: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value) : m_value(value) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; } }; @@ -187,10 +179,7 @@ template struct unpacket_traits enum { size = 1, - alignment = 1, - vectorizable = false, - masked_load_available=false, - masked_store_available=false + alignment = 1 }; }; @@ -411,7 +400,7 @@ template struct plain_matrix_type_row_major typedef Matrix::Scalar, Rows, Cols, - (MaxCols==1&&MaxRows!=1) ? ColMajor : RowMajor, + (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor, MaxRows, MaxCols > type; @@ -466,7 +455,7 @@ template { enum { ScalarReadCost = NumTraits::Scalar>::ReadCost, - CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a temporary? + CoeffReadCost = evaluator::CoeffReadCost, // NOTE What if an evaluator evaluate itself into a tempory? // Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate, especially if n>1. // This situation is already taken care by the EvalBeforeNestingBit flag, which is turned ON // for all evaluator creating a temporary. This flag is then propagated by the parent evaluators. @@ -682,39 +671,24 @@ template struct is_diagonal > template struct is_diagonal > { enum { ret = true }; }; - -template struct is_identity -{ enum { value = false }; }; - -template struct is_identity, T> > -{ enum { value = true }; }; - - template struct glue_shapes; template<> struct glue_shapes { typedef TriangularShape type; }; template -struct possibly_same_dense { - enum { value = has_direct_access::ret && has_direct_access::ret && is_same::value }; -}; - -template -EIGEN_DEVICE_FUNC -bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::value>::type * = 0) +bool is_same_dense(const T1 &mat1, const T2 &mat2, typename enable_if::ret&&has_direct_access::ret, T1>::type * = 0) { return (mat1.data()==mat2.data()) && (mat1.innerStride()==mat2.innerStride()) && (mat1.outerStride()==mat2.outerStride()); } template -EIGEN_DEVICE_FUNC -bool is_same_dense(const T1 &, const T2 &, typename enable_if::value>::type * = 0) +bool is_same_dense(const T1 &, const T2 &, typename enable_if::ret&&has_direct_access::ret), T1>::type * = 0) { return false; } // Internal helper defining the cost of a scalar division for the type T. // The default heuristic can be specialized for each scalar type and architecture. -template +template struct scalar_div_cost { enum { value = 8*NumTraits::MulCost }; }; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h index 081e918f1..dc5fae06a 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -214,7 +214,7 @@ template class ComplexEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, \c NoConvergence otherwise. + * \returns \c Success if computation was succesful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexSchur.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexSchur.h index fc71468f8..4354e4018 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/ComplexSchur.h @@ -212,7 +212,7 @@ template class ComplexSchur /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, \c NoConvergence otherwise. + * \returns \c Success if computation was succesful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/EigenSolver.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/EigenSolver.h index 572b29e4e..f205b185d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/EigenSolver.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/EigenSolver.h @@ -110,7 +110,7 @@ template class EigenSolver * * \sa compute() for an example. */ - EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_eigenvectorsOk(false), m_realSchur(), m_matT(), m_tmp() {} + EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {} /** \brief Default constructor with memory preallocation * @@ -277,7 +277,7 @@ template class EigenSolver template EigenSolver& compute(const EigenBase& matrix, bool computeEigenvectors = true); - /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise. */ + /** \returns NumericalIssue if the input contains INF or NaN values or overflow occured. Returns Success otherwise. */ ComputationInfo info() const { eigen_assert(m_isInitialized && "EigenSolver is not initialized."); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h index d0f9091be..5f6bb8289 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h @@ -121,7 +121,7 @@ class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixT * * \returns Reference to \c *this * - * According to \p options, this function computes eigenvalues and (if requested) + * Accoring to \p options, this function computes eigenvalues and (if requested) * the eigenvectors of one of the following three generalized eigenproblems: * - \c Ax_lBx: \f$ Ax = \lambda B x \f$ * - \c ABx_lx: \f$ ABx = \lambda x \f$ diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h index d947dac4e..f647f69b0 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/HessenbergDecomposition.h @@ -315,7 +315,7 @@ void HessenbergDecomposition::_compute(MatrixType& matA, CoeffVector // A = A H' matA.rightCols(remainingSize) - .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1), numext::conj(h), &temp.coeffRef(0)); + .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0)); } } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h index 66e5a3dbb..e4e426071 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h @@ -84,7 +84,7 @@ MatrixBase::eigenvalues() const * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues() */ template -EIGEN_DEVICE_FUNC inline typename SelfAdjointView::EigenvaluesReturnType +inline typename SelfAdjointView::EigenvaluesReturnType SelfAdjointView::eigenvalues() const { PlainObject thisAsMatrix(*this); @@ -147,7 +147,7 @@ MatrixBase::operatorNorm() const * \sa eigenvalues(), MatrixBase::operatorNorm() */ template -EIGEN_DEVICE_FUNC inline typename SelfAdjointView::RealScalar +inline typename SelfAdjointView::RealScalar SelfAdjointView::operatorNorm() const { return eigenvalues().cwiseAbs().maxCoeff(); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealQZ.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealQZ.h index 509130184..b3a910dd9 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealQZ.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealQZ.h @@ -90,9 +90,8 @@ namespace Eigen { m_Z(size, size), m_workspace(size*2), m_maxIters(400), - m_isInitialized(false), - m_computeQZ(true) - {} + m_isInitialized(false) + { } /** \brief Constructor; computes real QZ decomposition of given matrices * @@ -109,11 +108,9 @@ namespace Eigen { m_Z(A.rows(),A.cols()), m_workspace(A.rows()*2), m_maxIters(400), - m_isInitialized(false), - m_computeQZ(true) - { - compute(A, B, computeQZ); - } + m_isInitialized(false) { + compute(A, B, computeQZ); + } /** \brief Returns matrix Q in the QZ decomposition. * @@ -164,7 +161,7 @@ namespace Eigen { /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, \c NoConvergence otherwise. + * \returns \c Success if computation was succesful, \c NoConvergence otherwise. */ ComputationInfo info() const { diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealSchur.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealSchur.h index 7304ef344..9191519ab 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealSchur.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/RealSchur.h @@ -190,7 +190,7 @@ template class RealSchur RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU); /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, \c NoConvergence otherwise. + * \returns \c Success if computation was succesful, \c NoConvergence otherwise. */ ComputationInfo info() const { @@ -270,13 +270,8 @@ RealSchur& RealSchur::compute(const EigenBase // Step 1. Reduce to Hessenberg form m_hess.compute(matrix.derived()/scale); - // Step 2. Reduce to real Schur form - // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg - // to be able to pass our working-space buffer for the Householder to Dense evaluation. - m_workspaceVector.resize(matrix.cols()); - if(computeU) - m_hess.matrixQ().evalTo(m_matU, m_workspaceVector); - computeFromHessenberg(m_hess.matrixH(), m_matU, computeU); + // Step 2. Reduce to real Schur form + computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU); m_matT *= scale; @@ -289,13 +284,13 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa using std::abs; m_matT = matrixH; - m_workspaceVector.resize(m_matT.cols()); - if(computeU && !internal::is_same_dense(m_matU,matrixQ)) + if(computeU) m_matU = matrixQ; Index maxIters = m_maxIters; if (maxIters == -1) maxIters = m_maxIterationsPerRow * matrixH.rows(); + m_workspaceVector.resize(m_matT.cols()); Scalar* workspace = &m_workspaceVector.coeffRef(0); // The matrix m_matT is divided in three parts. diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 9bbce652f..d37656fa2 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -20,9 +20,7 @@ class GeneralizedSelfAdjointEigenSolver; namespace internal { template struct direct_selfadjoint_eigenvalues; - template -EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec); } @@ -121,9 +119,7 @@ template class SelfAdjointEigenSolver : m_eivec(), m_eivalues(), m_subdiag(), - m_info(InvalidInput), - m_isInitialized(false), - m_eigenvectorsOk(false) + m_isInitialized(false) { } /** \brief Constructor, pre-allocates memory for dynamic-size matrices. @@ -143,8 +139,7 @@ template class SelfAdjointEigenSolver : m_eivec(size, size), m_eivalues(size), m_subdiag(size > 1 ? size - 1 : 1), - m_isInitialized(false), - m_eigenvectorsOk(false) + m_isInitialized(false) {} /** \brief Constructor; computes eigendecomposition of given matrix. @@ -168,8 +163,7 @@ template class SelfAdjointEigenSolver : m_eivec(matrix.rows(), matrix.cols()), m_eivalues(matrix.cols()), m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1), - m_isInitialized(false), - m_eigenvectorsOk(false) + m_isInitialized(false) { compute(matrix.derived(), options); } @@ -343,7 +337,7 @@ template class SelfAdjointEigenSolver /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, \c NoConvergence otherwise. + * \returns \c Success if computation was succesful, \c NoConvergence otherwise. */ EIGEN_DEVICE_FUNC ComputationInfo info() const @@ -360,8 +354,7 @@ template class SelfAdjointEigenSolver static const int m_maxIterations = 30; protected: - static EIGEN_DEVICE_FUNC - void check_template_parameters() + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } @@ -410,7 +403,7 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver const InputType &matrix(a_matrix.derived()); - EIGEN_USING_STD_MATH(abs); + using std::abs; eigen_assert(matrix.cols() == matrix.rows()); eigen_assert((options&~(EigVecMask|GenEigMask))==0 && (options&EigVecMask)!=EigVecMask @@ -486,10 +479,9 @@ namespace internal { * \returns \c Success or \c NoConvergence */ template -EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag, const Index maxIterations, bool computeEigenvectors, MatrixType& eivec) { - EIGEN_USING_STD_MATH(abs); + using std::abs; ComputationInfo info; typedef typename MatrixType::Scalar Scalar; @@ -543,7 +535,7 @@ ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag diag.segment(i,n-i).minCoeff(&k); if (k > 0) { - numext::swap(diag[i], diag[k+i]); + std::swap(diag[i], diag[k+i]); if(computeEigenvectors) eivec.col(i).swap(eivec.col(k+i)); } @@ -613,8 +605,8 @@ template struct direct_selfadjoint_eigenvalues res, Ref representative) { - EIGEN_USING_STD_MATH(abs); - EIGEN_USING_STD_MATH(sqrt); + EIGEN_USING_STD_MATH(sqrt) + EIGEN_USING_STD_MATH(abs) Index i0; // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -728,7 +720,7 @@ struct direct_selfadjoint_eigenvalues EIGEN_DEVICE_FUNC static inline void computeRoots(const MatrixType& m, VectorType& roots) { - EIGEN_USING_STD_MATH(sqrt); + using std::sqrt; const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0))); const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1)); roots(0) = t1 - t0; @@ -816,7 +808,7 @@ template EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n) { - EIGEN_USING_STD_MATH(abs); + using std::abs; RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5); RealScalar e = subdiag[end-1]; // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still diff --git a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/Tridiagonalization.h b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/Tridiagonalization.h index c5c1acf46..1d102c17b 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -25,7 +25,6 @@ struct traits > }; template -EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs); } @@ -345,7 +344,6 @@ namespace internal { * \sa Tridiagonalization::packedMatrix() */ template -EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) { using numext::conj; @@ -426,7 +424,6 @@ struct tridiagonalization_inplace_selector; * \sa class Tridiagonalization */ template -EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); @@ -442,8 +439,7 @@ struct tridiagonalization_inplace_selector typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; template - static EIGEN_DEVICE_FUNC - void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) { CoeffVectorType hCoeffs(mat.cols()-1); tridiagonalization_inplace(mat,hCoeffs); @@ -512,8 +508,7 @@ struct tridiagonalization_inplace_selector typedef typename MatrixType::Scalar Scalar; template - static EIGEN_DEVICE_FUNC - void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/AlignedBox.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/AlignedBox.h index c902d8f0a..066eae4f9 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/AlignedBox.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/AlignedBox.h @@ -63,7 +63,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim) /** Default constructor initializing a null box. */ EIGEN_DEVICE_FUNC inline AlignedBox() - { if (EIGEN_CONST_CONDITIONAL(AmbientDimAtCompileTime!=Dynamic)) setEmpty(); } + { if (AmbientDimAtCompileTime!=Dynamic) setEmpty(); } /** Constructs a null box with \a _dim the dimension of the ambient space. */ EIGEN_DEVICE_FUNC inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim) diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Hyperplane.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Hyperplane.h index cebe03557..05929b299 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Hyperplane.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Hyperplane.h @@ -119,7 +119,7 @@ public: * If the dimension of the ambient space is greater than 2, then there isn't uniqueness, * so an arbitrary choice is made. */ - // FIXME to be consistent with the rest this could be implemented as a static Through function ?? + // FIXME to be consitent with the rest this could be implemented as a static Through function ?? EIGEN_DEVICE_FUNC explicit Hyperplane(const ParametrizedLine& parametrized) { normal() = parametrized.direction().unitOrthogonal(); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/OrthoMethods.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/OrthoMethods.h index 524aebe1b..a035e6310 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/OrthoMethods.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/OrthoMethods.h @@ -27,10 +27,9 @@ namespace Eigen { template template #ifndef EIGEN_PARSED_BY_DOXYGEN -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename MatrixBase::template cross_product_return_type::type +EIGEN_DEVICE_FUNC inline typename MatrixBase::template cross_product_return_type::type #else -typename MatrixBase::PlainObject +inline typename MatrixBase::PlainObject #endif MatrixBase::cross(const MatrixBase& other) const { diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/ParametrizedLine.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/ParametrizedLine.h index 3929ca87f..1e985d8cd 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/ParametrizedLine.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/ParametrizedLine.h @@ -104,44 +104,7 @@ public: template EIGEN_DEVICE_FUNC VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const; - /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this. - * - * \param mat the Dim x Dim transformation matrix - * \param traits specifies whether the matrix \a mat represents an #Isometry - * or a more generic #Affine transformation. The default is #Affine. - */ - template - EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const MatrixBase& mat, TransformTraits traits = Affine) - { - if (traits==Affine) - direction() = (mat * direction()).normalized(); - else if (traits==Isometry) - direction() = mat * direction(); - else - { - eigen_assert(0 && "invalid traits value in ParametrizedLine::transform()"); - } - origin() = mat * origin(); - return *this; - } - - /** Applies the transformation \a t to \c *this and returns a reference to \c *this. - * - * \param t the transformation of dimension Dim - * \param traits specifies whether the transformation \a t represents an #Isometry - * or a more generic #Affine transformation. The default is #Affine. - * Other kind of transformations are not supported. - */ - template - EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const Transform& t, - TransformTraits traits = Affine) - { - transform(t.linear(), traits); - origin() += t.translation(); - return *this; - } - -/** \returns \c *this with scalar type casted to \a NewScalarType + /** \returns \c *this with scalar type casted to \a NewScalarType * * Note that if \a NewScalarType is equal to the current scalar type of \c *this * then this function smartly returns a const reference to \c *this. diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Quaternion.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Quaternion.h index 7b2c4d89d..b81820656 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Quaternion.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Quaternion.h @@ -294,21 +294,6 @@ public: EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion& other) { m_coeffs = other.coeffs().template cast(); } -#if EIGEN_HAS_RVALUE_REFERENCES - // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator. - /** Default move constructor */ - EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_constructible::value) - : m_coeffs(std::move(other.coeffs())) - {} - - /** Default move assignment operator */ - EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) - { - m_coeffs = std::move(other.coeffs()); - return *this; - } -#endif - EIGEN_DEVICE_FUNC static Quaternion UnitRandom(); template @@ -661,7 +646,7 @@ EIGEN_DEVICE_FUNC Quaternion Quaternion::UnitRan const Scalar u1 = internal::random(0, 1), u2 = internal::random(0, 2*EIGEN_PI), u3 = internal::random(0, 2*EIGEN_PI); - const Scalar a = sqrt(Scalar(1) - u1), + const Scalar a = sqrt(1 - u1), b = sqrt(u1); return Quaternion (a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3)); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Scaling.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Scaling.h index df650fda6..f58ca03d9 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Scaling.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Scaling.h @@ -29,22 +29,6 @@ namespace Eigen { * * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform */ - -namespace internal -{ - // This helper helps nvcc+MSVC to properly parse this file. - // See bug 1412. - template - struct uniformscaling_times_affine_returntype - { - enum - { - NewMode = int(Mode) == int(Isometry) ? Affine : Mode - }; - typedef Transform type; - }; -} - template class UniformScaling { @@ -76,11 +60,9 @@ public: /** Concatenates a uniform scaling and an affine transformation */ template - inline typename - internal::uniformscaling_times_affine_returntype::type - operator* (const Transform& t) const + inline Transform operator* (const Transform& t) const { - typename internal::uniformscaling_times_affine_returntype::type res = t; + Transform res = t; res.prescale(factor()); return res; } @@ -88,7 +70,7 @@ public: /** Concatenates a uniform scaling and a linear transformation matrix */ // TODO returns an expression template - inline typename Eigen::internal::plain_matrix_type::type operator* (const MatrixBase& other) const + inline typename internal::plain_matrix_type::type operator* (const MatrixBase& other) const { return other * m_factor; } template @@ -128,7 +110,7 @@ public: /** Concatenates a linear transformation matrix and a uniform scaling * \relates UniformScaling */ -// NOTE this operator is defined in MatrixBase and not as a friend function +// NOTE this operator is defiend in MatrixBase and not as a friend function // of UniformScaling to fix an internal crash of Intel's ICC template EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived,Scalar,product) diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Transform.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Transform.h index c87b5fedf..c21d9e550 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Transform.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Transform.h @@ -97,9 +97,6 @@ template struct transform_make_affine; * - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix. * - #Projective: the transformation is stored as a (Dim+1)^2 matrix * without any assumption. - * - #Isometry: same as #Affine with the additional assumption that - * the linear part represents a rotation. This assumption is exploited - * to speed up some functions such as inverse() and rotation(). * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor. * These Options are passed directly to the underlying matrix type. * @@ -118,7 +115,7 @@ template struct transform_make_affine; * \end{array} \right) \f$ * * Note that for a projective transformation the last row can be anything, - * and then the interpretation of different parts might be slightly different. + * and then the interpretation of different parts might be sightly different. * * However, unlike a plain matrix, the Transform class provides many features * simplifying both its assembly and usage. In particular, it can be composed @@ -338,7 +335,7 @@ public: OtherModeIsAffineCompact = OtherMode == int(AffineCompact) }; - if(EIGEN_CONST_CONDITIONAL(ModeIsAffineCompact == OtherModeIsAffineCompact)) + if(ModeIsAffineCompact == OtherModeIsAffineCompact) { // We need the block expression because the code is compiled for all // combinations of transformations and will trigger a compile time error @@ -346,7 +343,7 @@ public: m_matrix.template block(0,0) = other.matrix().template block(0,0); makeAffine(); } - else if(EIGEN_CONST_CONDITIONAL(OtherModeIsAffineCompact)) + else if(OtherModeIsAffineCompact) { typedef typename Transform::MatrixType OtherMatrixType; internal::transform_construct_from_matrix::run(this, other.matrix()); @@ -484,7 +481,7 @@ public: TransformTimeDiagonalReturnType res; res.linear().noalias() = a*b.linear(); res.translation().noalias() = a*b.translation(); - if (EIGEN_CONST_CONDITIONAL(Mode!=int(AffineCompact))) + if (Mode!=int(AffineCompact)) res.matrix().row(Dim) = b.matrix().row(Dim); return res; } @@ -605,9 +602,7 @@ public: template EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase& r) const; - typedef typename internal::conditional::type RotationReturnType; - EIGEN_DEVICE_FUNC RotationReturnType rotation() const; - + EIGEN_DEVICE_FUNC const LinearMatrixType rotation() const; template EIGEN_DEVICE_FUNC void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const; @@ -760,7 +755,7 @@ template Transform& Transform::operator=(const QMatrix& other) { EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact))) + if (Mode == int(AffineCompact)) m_matrix << other.m11(), other.m21(), other.dx(), other.m12(), other.m22(), other.dy(); else @@ -806,7 +801,7 @@ Transform& Transform::operator { check_template_params(); EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact))) + if (Mode == int(AffineCompact)) m_matrix << other.m11(), other.m21(), other.dx(), other.m12(), other.m22(), other.dy(); else @@ -824,7 +819,7 @@ template QTransform Transform::toQTransform(void) const { EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE) - if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact))) + if (Mode == int(AffineCompact)) return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0), m_matrix.coeff(0,1), m_matrix.coeff(1,1), m_matrix.coeff(0,2), m_matrix.coeff(1,2)); @@ -917,7 +912,7 @@ EIGEN_DEVICE_FUNC Transform& Transform::pretranslate(const MatrixBase &other) { EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim)) - if(EIGEN_CONST_CONDITIONAL(int(Mode)==int(Projective))) + if(int(Mode)==int(Projective)) affine() += other * m_matrix.row(Dim); else translation() += other; @@ -1051,43 +1046,20 @@ EIGEN_DEVICE_FUNC inline Transform Transform struct transform_rotation_impl { - template - EIGEN_DEVICE_FUNC static inline - const typename TransformType::LinearMatrixType run(const TransformType& t) - { - typedef typename TransformType::LinearMatrixType LinearMatrixType; - LinearMatrixType result; - t.computeRotationScaling(&result, (LinearMatrixType*)0); - return result; - } -}; -template<> struct transform_rotation_impl { - template - EIGEN_DEVICE_FUNC static inline - typename TransformType::ConstLinearPart run(const TransformType& t) - { - return t.linear(); - } -}; -} /** \returns the rotation part of the transformation * - * If Mode==Isometry, then this method is an alias for linear(), - * otherwise it calls computeRotationScaling() to extract the rotation - * through a SVD decomposition. * * \svd_module * * \sa computeRotationScaling(), computeScalingRotation(), class SVD */ template -EIGEN_DEVICE_FUNC -typename Transform::RotationReturnType +EIGEN_DEVICE_FUNC const typename Transform::LinearMatrixType Transform::rotation() const { - return internal::transform_rotation_impl::run(*this); + LinearMatrixType result; + computeRotationScaling(&result, (LinearMatrixType*)0); + return result; } @@ -1111,12 +1083,12 @@ EIGEN_DEVICE_FUNC void Transform::computeRotationScalin Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1 VectorType sv(svd.singularValues()); sv.coeffRef(0) *= x; - if(scaling) *scaling = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint(); + if(scaling) scaling->lazyAssign(svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint()); if(rotation) { LinearMatrixType m(svd.matrixU()); m.col(0) /= x; - *rotation = m * svd.matrixV().adjoint(); + rotation->lazyAssign(m * svd.matrixV().adjoint()); } } @@ -1140,12 +1112,12 @@ EIGEN_DEVICE_FUNC void Transform::computeScalingRotatio Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1 VectorType sv(svd.singularValues()); sv.coeffRef(0) *= x; - if(scaling) *scaling = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint(); + if(scaling) scaling->lazyAssign(svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint()); if(rotation) { LinearMatrixType m(svd.matrixU()); m.col(0) /= x; - *rotation = m * svd.matrixV().adjoint(); + rotation->lazyAssign(m * svd.matrixV().adjoint()); } } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Translation.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Translation.h index 8c2290121..0e99ce68e 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/Translation.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/Translation.h @@ -70,18 +70,18 @@ public: /** Constructs and initialize the translation transformation from a vector of translation coefficients */ EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {} - /** \brief Returns the x-translation by value. **/ + /** \brief Retruns the x-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar x() const { return m_coeffs.x(); } - /** \brief Returns the y-translation by value. **/ + /** \brief Retruns the y-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar y() const { return m_coeffs.y(); } - /** \brief Returns the z-translation by value. **/ + /** \brief Retruns the z-translation by value. **/ EIGEN_DEVICE_FUNC inline Scalar z() const { return m_coeffs.z(); } - /** \brief Returns the x-translation as a reference. **/ + /** \brief Retruns the x-translation as a reference. **/ EIGEN_DEVICE_FUNC inline Scalar& x() { return m_coeffs.x(); } - /** \brief Returns the y-translation as a reference. **/ + /** \brief Retruns the y-translation as a reference. **/ EIGEN_DEVICE_FUNC inline Scalar& y() { return m_coeffs.y(); } - /** \brief Returns the z-translation as a reference. **/ + /** \brief Retruns the z-translation as a reference. **/ EIGEN_DEVICE_FUNC inline Scalar& z() { return m_coeffs.z(); } EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Geometry/arch/Geometry_SSE.h b/uppsrc/plugin/Eigen/Eigen/src/Geometry/arch/Geometry_SSE.h index 108cc9f8e..f68cab583 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Geometry/arch/Geometry_SSE.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Geometry/arch/Geometry_SSE.h @@ -25,20 +25,18 @@ struct quat_product }; static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) { - evaluator ae(_a.coeffs()); - evaluator be(_b.coeffs()); Quaternion res; - const Packet4f mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); - Packet4f a = ae.template packet(0); - Packet4f b = be.template packet(0); - Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); - Packet4f s2 = pmul(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); + const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); + __m128 a = _a.coeffs().template packet(0); + __m128 b = _b.coeffs().template packet(0); + __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); + __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); pstoret( &res.x(), - padd(psub(pmul(a,vec4f_swizzle1(b,3,3,3,3)), - pmul(vec4f_swizzle1(a,2,0,1,0), + _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)), + _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0), vec4f_swizzle1(b,1,2,0,0))), - pxor(mask,padd(s1,s2)))); + _mm_xor_ps(mask,_mm_add_ps(s1,s2)))); return res; } @@ -52,10 +50,9 @@ struct quat_conj }; static inline Quaternion run(const QuaternionBase& q) { - evaluator qe(q.coeffs()); Quaternion res; - const Packet4f mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); - pstoret(&res.x(), pxor(mask, qe.template packet::Alignment,Packet4f>(0))); + const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); + pstoret(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet::Alignment>(0))); return res; } }; @@ -70,14 +67,12 @@ struct cross3_impl static inline typename plain_matrix_type::type run(const VectorLhs& lhs, const VectorRhs& rhs) { - evaluator lhs_eval(lhs); - evaluator rhs_eval(rhs); - Packet4f a = lhs_eval.template packet::Alignment,Packet4f>(0); - Packet4f b = rhs_eval.template packet::Alignment,Packet4f>(0); - Packet4f mul1 = pmul(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); - Packet4f mul2 = pmul(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); + __m128 a = lhs.template packet::Alignment>(0); + __m128 b = rhs.template packet::Alignment>(0); + __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); + __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); typename plain_matrix_type::type res; - pstoret(&res.x(),psub(mul1,mul2)); + pstoret(&res.x(),_mm_sub_ps(mul1,mul2)); return res; } }; @@ -99,12 +94,9 @@ struct quat_product Quaternion res; - evaluator ae(_a.coeffs()); - evaluator be(_b.coeffs()); - const double* a = _a.coeffs().data(); - Packet2d b_xy = be.template packet(0); - Packet2d b_zw = be.template packet(2); + Packet2d b_xy = _b.coeffs().template packet(0); + Packet2d b_zw = _b.coeffs().template packet(2); Packet2d a_xx = pset1(a[0]); Packet2d a_yy = pset1(a[1]); Packet2d a_zz = pset1(a[2]); @@ -153,12 +145,11 @@ struct quat_conj }; static inline Quaternion run(const QuaternionBase& q) { - evaluator qe(q.coeffs()); Quaternion res; - const Packet2d mask0 = _mm_setr_pd(-0.,-0.); - const Packet2d mask2 = _mm_setr_pd(-0.,0.); - pstoret(&res.x(), pxor(mask0, qe.template packet::Alignment,Packet2d>(0))); - pstoret(&res.z(), pxor(mask2, qe.template packet::Alignment,Packet2d>(2))); + const __m128d mask0 = _mm_setr_pd(-0.,-0.); + const __m128d mask2 = _mm_setr_pd(-0.,0.); + pstoret(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet::Alignment>(0))); + pstoret(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet::Alignment>(2))); return res; } }; diff --git a/uppsrc/plugin/Eigen/Eigen/src/Householder/BlockHouseholder.h b/uppsrc/plugin/Eigen/Eigen/src/Householder/BlockHouseholder.h index 39ce1c2a0..01a7ed188 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Householder/BlockHouseholder.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Householder/BlockHouseholder.h @@ -63,15 +63,8 @@ void make_block_householder_triangular_factor(TriangularFactorType& triFactor, c triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint() * vectors.bottomRightCorner(rs, rt).template triangularView(); - // FIXME use the following line with .noalias() once the triangular product can work inplace - // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView(); - for(Index j=nbVecs-1; j>i; --j) - { - typename TriangularFactorType::Scalar z = triFactor(i,j); - triFactor(i,j) = z * triFactor(j,j); - if(nbVecs-j-1>0) - triFactor.row(i).tail(nbVecs-j-1) += z * triFactor.row(j).tail(nbVecs-j-1); - } + // FIXME add .noalias() once the triangular product can work inplace + triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template triangularView(); } triFactor(i,i) = hCoeffs(i); diff --git a/uppsrc/plugin/Eigen/Eigen/src/Householder/Householder.h b/uppsrc/plugin/Eigen/Eigen/src/Householder/Householder.h index 5bc037f00..80de2c305 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Householder/Householder.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Householder/Householder.h @@ -39,7 +39,6 @@ template struct decrement_size * MatrixBase::applyHouseholderOnTheRight() */ template -EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) { VectorBlock::ret> essentialPart(derived(), 1, size()-1); @@ -63,7 +62,6 @@ void MatrixBase::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) */ template template -EIGEN_DEVICE_FUNC void MatrixBase::makeHouseholder( EssentialPart& essential, Scalar& tau, @@ -105,14 +103,13 @@ void MatrixBase::makeHouseholder( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->cols() entries + * this->cols() * essential.size() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheRight() */ template template -EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheLeft( const EssentialPart& essential, const Scalar& tau, @@ -143,14 +140,13 @@ void MatrixBase::applyHouseholderOnTheLeft( * \param essential the essential part of the vector \c v * \param tau the scaling factor of the Householder transformation * \param workspace a pointer to working space with at least - * this->rows() entries + * this->cols() * essential.size() entries * * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), * MatrixBase::applyHouseholderOnTheLeft() */ template template -EIGEN_DEVICE_FUNC void MatrixBase::applyHouseholderOnTheRight( const EssentialPart& essential, const Scalar& tau, @@ -164,10 +160,10 @@ void MatrixBase::applyHouseholderOnTheRight( { Map::type> tmp(workspace,rows()); Block right(derived(), 0, 1, rows(), cols()-1); - tmp.noalias() = right * essential; + tmp.noalias() = right * essential.conjugate(); tmp += this->col(0); this->col(0) -= tau * tmp; - right.noalias() -= tau * tmp * essential.adjoint(); + right.noalias() -= tau * tmp * essential.transpose(); } } diff --git a/uppsrc/plugin/Eigen/Eigen/src/Householder/HouseholderSequence.h b/uppsrc/plugin/Eigen/Eigen/src/Householder/HouseholderSequence.h index 9318c281f..3ce0a693d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Householder/HouseholderSequence.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Householder/HouseholderSequence.h @@ -87,7 +87,7 @@ struct hseq_side_dependent_impl { typedef Block EssentialVectorType; typedef HouseholderSequence HouseholderSequenceType; - static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) + static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) { Index start = k+1+h.m_shift; return Block(h.m_vectors, start, k, h.rows()-start, 1); @@ -140,28 +140,6 @@ template class HouseholderS Side > ConjugateReturnType; - typedef HouseholderSequence< - VectorsType, - typename internal::conditional::IsComplex, - typename internal::remove_all::type, - CoeffsType>::type, - Side - > AdjointReturnType; - - typedef HouseholderSequence< - typename internal::conditional::IsComplex, - typename internal::remove_all::type, - VectorsType>::type, - CoeffsType, - Side - > TransposeReturnType; - - typedef HouseholderSequence< - typename internal::add_const::type, - typename internal::add_const::type, - Side - > ConstHouseholderSequence; - /** \brief Constructor. * \param[in] v %Matrix containing the essential parts of the Householder vectors * \param[in] h Vector containing the Householder coefficients @@ -179,19 +157,17 @@ template class HouseholderS * * \sa setLength(), setShift() */ - EIGEN_DEVICE_FUNC HouseholderSequence(const VectorsType& v, const CoeffsType& h) - : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()), + : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()), m_shift(0) { } /** \brief Copy constructor. */ - EIGEN_DEVICE_FUNC HouseholderSequence(const HouseholderSequence& other) : m_vectors(other.m_vectors), m_coeffs(other.m_coeffs), - m_reverse(other.m_reverse), + m_trans(other.m_trans), m_length(other.m_length), m_shift(other.m_shift) { @@ -201,14 +177,12 @@ template class HouseholderS * \returns Number of rows * \details This equals the dimension of the space that the transformation acts on. */ - EIGEN_DEVICE_FUNC Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); } /** \brief Number of columns of transformation viewed as a matrix. * \returns Number of columns * \details This equals the dimension of the space that the transformation acts on. */ - EIGEN_DEVICE_FUNC Index cols() const { return rows(); } /** \brief Essential part of a Householder vector. @@ -225,7 +199,6 @@ template class HouseholderS * * \sa setShift(), shift() */ - EIGEN_DEVICE_FUNC const EssentialVectorType essentialVector(Index k) const { eigen_assert(k >= 0 && k < m_length); @@ -233,51 +206,31 @@ template class HouseholderS } /** \brief %Transpose of the Householder sequence. */ - TransposeReturnType transpose() const + HouseholderSequence transpose() const { - return TransposeReturnType(m_vectors.conjugate(), m_coeffs) - .setReverseFlag(!m_reverse) - .setLength(m_length) - .setShift(m_shift); + return HouseholderSequence(*this).setTrans(!m_trans); } /** \brief Complex conjugate of the Householder sequence. */ ConjugateReturnType conjugate() const { return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate()) - .setReverseFlag(m_reverse) + .setTrans(m_trans) .setLength(m_length) .setShift(m_shift); } - /** \returns an expression of the complex conjugate of \c *this if Cond==true, - * returns \c *this otherwise. - */ - template - EIGEN_DEVICE_FUNC - inline typename internal::conditional::type - conjugateIf() const - { - typedef typename internal::conditional::type ReturnType; - return ReturnType(m_vectors.template conjugateIf(), m_coeffs.template conjugateIf()); - } - /** \brief Adjoint (conjugate transpose) of the Householder sequence. */ - AdjointReturnType adjoint() const + ConjugateReturnType adjoint() const { - return AdjointReturnType(m_vectors, m_coeffs.conjugate()) - .setReverseFlag(!m_reverse) - .setLength(m_length) - .setShift(m_shift); + return conjugate().setTrans(!m_trans); } /** \brief Inverse of the Householder sequence (equals the adjoint). */ - AdjointReturnType inverse() const { return adjoint(); } + ConjugateReturnType inverse() const { return adjoint(); } /** \internal */ - template - inline EIGEN_DEVICE_FUNC - void evalTo(DestType& dst) const + template inline void evalTo(DestType& dst) const { Matrix workspace(rows()); @@ -286,7 +239,6 @@ template class HouseholderS /** \internal */ template - EIGEN_DEVICE_FUNC void evalTo(Dest& dst, Workspace& workspace) const { workspace.resize(rows()); @@ -299,7 +251,7 @@ template class HouseholderS for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_reverse) + if(m_trans) dst.bottomRightCorner(cornerSize, cornerSize) .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); else @@ -313,26 +265,18 @@ template class HouseholderS for(Index k = 0; kBlockSize) - { - dst.setIdentity(rows(), rows()); - if(m_reverse) - applyThisOnTheLeft(dst,workspace,true); - else - applyThisOnTheLeft(dst,workspace,true); - } else { dst.setIdentity(rows(), rows()); for(Index k = vecs-1; k >= 0; --k) { Index cornerSize = rows() - k - m_shift; - if(m_reverse) + if(m_trans) dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data()); + .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); else dst.bottomRightCorner(cornerSize, cornerSize) - .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data()); + .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0)); } } } @@ -351,34 +295,31 @@ template class HouseholderS workspace.resize(dst.rows()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_reverse ? m_length-k-1 : k; + Index actual_k = m_trans ? m_length-k-1 : k; dst.rightCols(rows()-m_shift-actual_k) .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } } /** \internal */ - template inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const + template inline void applyThisOnTheLeft(Dest& dst) const { Matrix workspace; - applyThisOnTheLeft(dst, workspace, inputIsIdentity); + applyThisOnTheLeft(dst, workspace); } /** \internal */ template - inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace, bool inputIsIdentity = false) const + inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const { - if(inputIsIdentity && m_reverse) - inputIsIdentity = false; + const Index BlockSize = 48; // if the entries are large enough, then apply the reflectors by block if(m_length>=BlockSize && dst.cols()>1) { - // Make sure we have at least 2 useful blocks, otherwise it is point-less: - Index blockSize = m_length class HouseholderS Side==OnTheRight ? bs : m_vectors.rows()-start, Side==OnTheRight ? m_vectors.cols()-start : bs); typename internal::conditional, SubVectorsType&>::type sub_vecs(sub_vecs1); - - Index dstStart = dst.rows()-rows()+m_shift+k; - Index dstRows = rows()-m_shift-k; - Block sub_dst(dst, - dstStart, - inputIsIdentity ? dstStart : 0, - dstRows, - inputIsIdentity ? dstRows : dst.cols()); - apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse); + Block sub_dst(dst,dst.rows()-rows()+m_shift+k,0, rows()-m_shift-k,dst.cols()); + apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_trans); } } else @@ -404,9 +338,8 @@ template class HouseholderS workspace.resize(dst.cols()); for(Index k = 0; k < m_length; ++k) { - Index actual_k = m_reverse ? k : m_length-k-1; - Index dstStart = rows()-m_shift-actual_k; - dst.bottomRightCorner(dstStart, inputIsIdentity ? dstStart : dst.cols()) + Index actual_k = m_trans ? k : m_length-k-1; + dst.bottomRows(rows()-m_shift-actual_k) .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data()); } } @@ -424,7 +357,7 @@ template class HouseholderS { typename internal::matrix_type_times_scalar_type::Type res(other.template cast::ResultScalar>()); - applyThisOnTheLeft(res, internal::is_identity::value && res.rows()==res.cols()); + applyThisOnTheLeft(res); return res; } @@ -439,7 +372,6 @@ template class HouseholderS * * \sa length() */ - EIGEN_DEVICE_FUNC HouseholderSequence& setLength(Index length) { m_length = length; @@ -457,17 +389,13 @@ template class HouseholderS * * \sa shift() */ - EIGEN_DEVICE_FUNC HouseholderSequence& setShift(Index shift) { m_shift = shift; return *this; } - EIGEN_DEVICE_FUNC Index length() const { return m_length; } /**< \brief Returns the length of the Householder sequence. */ - - EIGEN_DEVICE_FUNC Index shift() const { return m_shift; } /**< \brief Returns the shift of the Householder sequence. */ /* Necessary for .adjoint() and .conjugate() */ @@ -475,30 +403,27 @@ template class HouseholderS protected: - /** \internal - * \brief Sets the reverse flag. - * \param [in] reverse New value of the reverse flag. + /** \brief Sets the transpose flag. + * \param [in] trans New value of the transpose flag. * - * By default, the reverse flag is not set. If the reverse flag is set, then this object represents - * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. - * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$. + * By default, the transpose flag is not set. If the transpose flag is set, then this object represents + * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$. * - * \sa reverseFlag(), transpose(), adjoint() + * \sa trans() */ - HouseholderSequence& setReverseFlag(bool reverse) + HouseholderSequence& setTrans(bool trans) { - m_reverse = reverse; + m_trans = trans; return *this; } - bool reverseFlag() const { return m_reverse; } /**< \internal \brief Returns the reverse flag. */ + bool trans() const { return m_trans; } /**< \brief Returns the transpose flag. */ typename VectorsType::Nested m_vectors; typename CoeffsType::Nested m_coeffs; - bool m_reverse; + bool m_trans; Index m_length; Index m_shift; - enum { BlockSize = 48 }; }; /** \brief Computes the product of a matrix with a Householder sequence. diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 153acef65..454f46814 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -191,16 +191,32 @@ public: /** \internal */ template - void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const + void _solve_with_guess_impl(const Rhs& b, Dest& x) const { - m_iterations = Base::maxIterations(); - m_error = Base::m_tolerance; - - bool ret = internal::bicgstab(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error); - - m_info = (!ret) ? NumericalIssue + bool failed = false; + for(Index j=0; j + void _solve_impl(const MatrixBase& b, Dest& x) const + { + x.resize(this->rows(),b.cols()); + x.setZero(); + _solve_with_guess_impl(b,x); } protected: diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 5d8c6b433..f7ce47134 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -51,7 +51,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, return; } const RealScalar considerAsZero = (std::numeric_limits::min)(); - RealScalar threshold = numext::maxi(RealScalar(tol*tol*rhsNorm2),considerAsZero); + RealScalar threshold = numext::maxi(tol*tol*rhsNorm2,considerAsZero); RealScalar residualNorm2 = residual.squaredNorm(); if (residualNorm2 < threshold) { @@ -195,7 +195,7 @@ public: /** \internal */ template - void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const + void _solve_with_guess_impl(const Rhs& b, Dest& x) const { typedef typename Base::MatrixWrapper MatrixWrapper; typedef typename Base::ActualMatrixType ActualMatrixType; @@ -211,14 +211,31 @@ public: RowMajorWrapper, typename MatrixWrapper::template ConstSelfAdjointViewReturnType::Type >::type SelfAdjointWrapper; - m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; - RowMajorWrapper row_mat(matrix()); - internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b, x, Base::m_preconditioner, m_iterations, m_error); + for(Index j=0; j + void _solve_impl(const MatrixBase& b, Dest& x) const + { + x.setZero(); + _solve_with_guess_impl(b.derived(),x); + } protected: diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index e5d0308ec..e45c272b4 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -41,7 +41,13 @@ namespace Eigen { * the info() method, then you can either increase the initial shift, or better use another preconditioning technique. * */ -template > +template +#else +NaturalOrdering +#endif +> class IncompleteCholesky : public SparseSolverBase > { protected: @@ -70,12 +76,12 @@ class IncompleteCholesky : public SparseSolverBase - IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_analysisIsOk(false),m_factorizationIsOk(false) + IncompleteCholesky(const MatrixType& matrix) : m_initialShift(1e-3),m_factorizationIsOk(false) { compute(matrix); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h index 09436cb67..338e6f10a 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h +++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h @@ -136,7 +136,7 @@ class IncompleteLUT : public SparseSolverBase::analyzePattern(const _MatrixType& amat) // Compute the Fill-reducing permutation // Since ILUT does not perform any numerical pivoting, // it is highly preferable to keep the diagonal through symmetric permutations. +#ifndef EIGEN_MPL2_ONLY // To this end, let's symmetrize the pattern and perform AMD on it. SparseMatrix mat1 = amat; SparseMatrix mat2 = amat.transpose(); // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice. - // on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred... + // on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered... SparseMatrix AtA = mat2 + mat1; AMDOrdering ordering; ordering(AtA,m_P); m_Pinv = m_P.inverse(); // cache the inverse permutation +#else + // If AMD is not available, (MPL2-only), then let's use the slower COLAMD routine. + SparseMatrix mat1 = amat; + COLAMDOrdering ordering; + ordering(mat1,m_Pinv); + m_P = m_Pinv.inverse(); +#endif + m_analysisIsOk = true; m_factorizationIsOk = false; m_isInitialized = true; diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h index 13ba9a55b..7c2326eb7 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h +++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h @@ -275,7 +275,7 @@ public: const Preconditioner& preconditioner() const { return m_preconditioner; } /** \returns the max number of iterations. - * It is either the value set by setMaxIterations or, by default, + * It is either the value setted by setMaxIterations or, by default, * twice the number of columns of the matrix. */ Index maxIterations() const @@ -331,7 +331,7 @@ public: /** \internal */ template - void _solve_with_guess_impl(const Rhs& b, SparseMatrixBase &aDest) const + void _solve_impl(const Rhs& b, SparseMatrixBase &aDest) const { eigen_assert(rows()==b.rows()); @@ -344,65 +344,15 @@ public: // We do not directly fill dest because sparse expressions have to be free of aliasing issue. // For non square least-square problems, b and dest might not have the same size whereas they might alias each-other. typename DestDerived::PlainObject tmp(cols(),rhsCols); - ComputationInfo global_info = Success; for(Index k=0; k - typename internal::enable_if::type - _solve_with_guess_impl(const Rhs& b, MatrixBase &aDest) const - { - eigen_assert(rows()==b.rows()); - - Index rhsCols = b.cols(); - DestDerived& dest(aDest.derived()); - ComputationInfo global_info = Success; - for(Index k=0; k - typename internal::enable_if::type - _solve_with_guess_impl(const Rhs& b, MatrixBase &dest) const - { - derived()._solve_vector_with_guess_impl(b,dest.derived()); - } - - /** \internal default initial guess = 0 */ - template - void _solve_impl(const Rhs& b, Dest& x) const - { - x.setZero(); - derived()._solve_with_guess_impl(b,x); - } - protected: void init() { diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h index 203fd0ec6..0aea0e099 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h +++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h @@ -182,14 +182,32 @@ public: /** \internal */ template - void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const + void _solve_with_guess_impl(const Rhs& b, Dest& x) const { m_iterations = Base::maxIterations(); m_error = Base::m_tolerance; - internal::least_square_conjugate_gradient(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error); + for(Index j=0; j + void _solve_impl(const MatrixBase& b, Dest& x) const + { + x.setZero(); + _solve_with_guess_impl(b.derived(),x); + } }; diff --git a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h index 79e1e4819..0ace45177 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h +++ b/uppsrc/plugin/Eigen/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h @@ -108,7 +108,7 @@ struct Assignment, interna } }; -} // end namespace internal +} // end namepsace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/Eigen/src/Jacobi/Jacobi.h b/uppsrc/plugin/Eigen/Eigen/src/Jacobi/Jacobi.h index bfb9dcb08..1998c6322 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/Jacobi/Jacobi.h +++ b/uppsrc/plugin/Eigen/Eigen/src/Jacobi/Jacobi.h @@ -11,7 +11,7 @@ #ifndef EIGEN_JACOBI_H #define EIGEN_JACOBI_H -namespace Eigen { +namespace Eigen { /** \ingroup Jacobi_Module * \jacobi_module @@ -37,20 +37,17 @@ template class JacobiRotation typedef typename NumTraits::Real RealScalar; /** Default constructor without any initialization. */ - EIGEN_DEVICE_FUNC JacobiRotation() {} /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */ - EIGEN_DEVICE_FUNC JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {} - EIGEN_DEVICE_FUNC Scalar& c() { return m_c; } - EIGEN_DEVICE_FUNC Scalar c() const { return m_c; } - EIGEN_DEVICE_FUNC Scalar& s() { return m_s; } - EIGEN_DEVICE_FUNC Scalar s() const { return m_s; } + Scalar& c() { return m_c; } + Scalar c() const { return m_c; } + Scalar& s() { return m_s; } + Scalar s() const { return m_s; } /** Concatenates two planar rotation */ - EIGEN_DEVICE_FUNC JacobiRotation operator*(const JacobiRotation& other) { using numext::conj; @@ -59,26 +56,19 @@ template class JacobiRotation } /** Returns the transposed transformation */ - EIGEN_DEVICE_FUNC JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); } /** Returns the adjoint transformation */ - EIGEN_DEVICE_FUNC JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); } template - EIGEN_DEVICE_FUNC bool makeJacobi(const MatrixBase&, Index p, Index q); - EIGEN_DEVICE_FUNC bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z); - EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r=0); protected: - EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type); - EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type); Scalar m_c, m_s; @@ -90,12 +80,10 @@ template class JacobiRotation * \sa MatrixBase::makeJacobi(const MatrixBase&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template -EIGEN_DEVICE_FUNC bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z) { using std::sqrt; using std::abs; - RealScalar deno = RealScalar(2)*abs(y); if(deno < (std::numeric_limits::min)()) { @@ -135,7 +123,6 @@ bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, co */ template template -EIGEN_DEVICE_FUNC inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Index p, Index q) { return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q))); @@ -158,7 +145,6 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template -EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r) { makeGivens(p, q, r, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); @@ -167,13 +153,12 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for complexes template -EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type) { using std::sqrt; using std::abs; using numext::conj; - + if(q==Scalar(0)) { m_c = numext::real(p)<0 ? Scalar(-1) : Scalar(1); @@ -227,7 +212,6 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar // specialization for reals template -EIGEN_DEVICE_FUNC void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type) { using std::sqrt; @@ -273,13 +257,12 @@ void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar namespace internal { /** \jacobi_module - * Applies the clock wise 2D rotation \a j to the set of 2D vectors of coordinates \a x and \a y: + * Applies the clock wise 2D rotation \a j to the set of 2D vectors of cordinates \a x and \a y: * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right ) = J \left ( \begin{array}{cc} x \\ y \end{array} \right ) \f$ * * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template -EIGEN_DEVICE_FUNC void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j); } @@ -291,7 +274,6 @@ void apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& */ template template -EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRotation& j) { RowXpr x(this->row(p)); @@ -307,7 +289,6 @@ inline void MatrixBase::applyOnTheLeft(Index p, Index q, const JacobiRo */ template template -EIGEN_DEVICE_FUNC inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiRotation& j) { ColXpr x(this->col(p)); @@ -321,8 +302,7 @@ template struct apply_rotation_in_the_plane_selector { - static EIGEN_DEVICE_FUNC - inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) { for(Index i=0; i -EIGEN_DEVICE_FUNC void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; @@ -463,7 +442,7 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0); Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0); - + OtherScalar c = j.c(); OtherScalar s = j.s(); if (c==OtherScalar(1) && s==OtherScalar(0)) diff --git a/uppsrc/plugin/Eigen/Eigen/src/KLUSupport/KLUSupport.h b/uppsrc/plugin/Eigen/Eigen/src/KLUSupport/KLUSupport.h deleted file mode 100644 index d2633a935..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/KLUSupport/KLUSupport.h +++ /dev/null @@ -1,358 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2017 Kyle Macfarlan -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_KLUSUPPORT_H -#define EIGEN_KLUSUPPORT_H - -namespace Eigen { - -/* TODO extract L, extract U, compute det, etc... */ - -/** \ingroup KLUSupport_Module - * \brief A sparse LU factorization and solver based on KLU - * - * This class allows to solve for A.X = B sparse linear problems via a LU factorization - * using the KLU library. The sparse matrix A must be squared and full rank. - * The vectors or matrices X and B can be either dense or sparse. - * - * \warning The input matrix A should be in a \b compressed and \b column-major form. - * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix. - * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<> - * - * \implsparsesolverconcept - * - * \sa \ref TutorialSparseSolverConcept, class UmfPackLU, class SparseLU - */ - - -inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B [ ], klu_common *Common, double) { - return klu_solve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), B, Common); -} - -inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complexB[], klu_common *Common, std::complex) { - return klu_z_solve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), &numext::real_ref(B[0]), Common); -} - -inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B[], klu_common *Common, double) { - return klu_tsolve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), B, Common); -} - -inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complexB[], klu_common *Common, std::complex) { - return klu_z_tsolve(Symbolic, Numeric, internal::convert_index(ldim), internal::convert_index(nrhs), &numext::real_ref(B[0]), 0, Common); -} - -inline klu_numeric* klu_factor(int Ap [ ], int Ai [ ], double Ax [ ], klu_symbolic *Symbolic, klu_common *Common, double) { - return klu_factor(Ap, Ai, Ax, Symbolic, Common); -} - -inline klu_numeric* klu_factor(int Ap[], int Ai[], std::complex Ax[], klu_symbolic *Symbolic, klu_common *Common, std::complex) { - return klu_z_factor(Ap, Ai, &numext::real_ref(Ax[0]), Symbolic, Common); -} - - -template -class KLU : public SparseSolverBase > -{ - protected: - typedef SparseSolverBase > Base; - using Base::m_isInitialized; - public: - using Base::_solve_impl; - typedef _MatrixType MatrixType; - typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; - typedef typename MatrixType::StorageIndex StorageIndex; - typedef Matrix Vector; - typedef Matrix IntRowVectorType; - typedef Matrix IntColVectorType; - typedef SparseMatrix LUMatrixType; - typedef SparseMatrix KLUMatrixType; - typedef Ref KLUMatrixRef; - enum { - ColsAtCompileTime = MatrixType::ColsAtCompileTime, - MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime - }; - - public: - - KLU() - : m_dummy(0,0), mp_matrix(m_dummy) - { - init(); - } - - template - explicit KLU(const InputMatrixType& matrix) - : mp_matrix(matrix) - { - init(); - compute(matrix); - } - - ~KLU() - { - if(m_symbolic) klu_free_symbolic(&m_symbolic,&m_common); - if(m_numeric) klu_free_numeric(&m_numeric,&m_common); - } - - inline Index rows() const { return mp_matrix.rows(); } - inline Index cols() const { return mp_matrix.cols(); } - - /** \brief Reports whether previous computation was successful. - * - * \returns \c Success if computation was successful, - * \c NumericalIssue if the matrix.appears to be negative. - */ - ComputationInfo info() const - { - eigen_assert(m_isInitialized && "Decomposition is not initialized."); - return m_info; - } -#if 0 // not implemented yet - inline const LUMatrixType& matrixL() const - { - if (m_extractedDataAreDirty) extractData(); - return m_l; - } - - inline const LUMatrixType& matrixU() const - { - if (m_extractedDataAreDirty) extractData(); - return m_u; - } - - inline const IntColVectorType& permutationP() const - { - if (m_extractedDataAreDirty) extractData(); - return m_p; - } - - inline const IntRowVectorType& permutationQ() const - { - if (m_extractedDataAreDirty) extractData(); - return m_q; - } -#endif - /** Computes the sparse Cholesky decomposition of \a matrix - * Note that the matrix should be column-major, and in compressed format for best performance. - * \sa SparseMatrix::makeCompressed(). - */ - template - void compute(const InputMatrixType& matrix) - { - if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common); - if(m_numeric) klu_free_numeric(&m_numeric, &m_common); - grab(matrix.derived()); - analyzePattern_impl(); - factorize_impl(); - } - - /** Performs a symbolic decomposition on the sparcity of \a matrix. - * - * This function is particularly useful when solving for several problems having the same structure. - * - * \sa factorize(), compute() - */ - template - void analyzePattern(const InputMatrixType& matrix) - { - if(m_symbolic) klu_free_symbolic(&m_symbolic, &m_common); - if(m_numeric) klu_free_numeric(&m_numeric, &m_common); - - grab(matrix.derived()); - - analyzePattern_impl(); - } - - - /** Provides access to the control settings array used by KLU. - * - * See KLU documentation for details. - */ - inline const klu_common& kluCommon() const - { - return m_common; - } - - /** Provides access to the control settings array used by UmfPack. - * - * If this array contains NaN's, the default values are used. - * - * See KLU documentation for details. - */ - inline klu_common& kluCommon() - { - return m_common; - } - - /** Performs a numeric decomposition of \a matrix - * - * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed. - * - * \sa analyzePattern(), compute() - */ - template - void factorize(const InputMatrixType& matrix) - { - eigen_assert(m_analysisIsOk && "KLU: you must first call analyzePattern()"); - if(m_numeric) - klu_free_numeric(&m_numeric,&m_common); - - grab(matrix.derived()); - - factorize_impl(); - } - - /** \internal */ - template - bool _solve_impl(const MatrixBase &b, MatrixBase &x) const; - -#if 0 // not implemented yet - Scalar determinant() const; - - void extractData() const; -#endif - - protected: - - void init() - { - m_info = InvalidInput; - m_isInitialized = false; - m_numeric = 0; - m_symbolic = 0; - m_extractedDataAreDirty = true; - - klu_defaults(&m_common); - } - - void analyzePattern_impl() - { - m_info = InvalidInput; - m_analysisIsOk = false; - m_factorizationIsOk = false; - m_symbolic = klu_analyze(internal::convert_index(mp_matrix.rows()), - const_cast(mp_matrix.outerIndexPtr()), const_cast(mp_matrix.innerIndexPtr()), - &m_common); - if (m_symbolic) { - m_isInitialized = true; - m_info = Success; - m_analysisIsOk = true; - m_extractedDataAreDirty = true; - } - } - - void factorize_impl() - { - - m_numeric = klu_factor(const_cast(mp_matrix.outerIndexPtr()), const_cast(mp_matrix.innerIndexPtr()), const_cast(mp_matrix.valuePtr()), - m_symbolic, &m_common, Scalar()); - - - m_info = m_numeric ? Success : NumericalIssue; - m_factorizationIsOk = m_numeric ? 1 : 0; - m_extractedDataAreDirty = true; - } - - template - void grab(const EigenBase &A) - { - mp_matrix.~KLUMatrixRef(); - ::new (&mp_matrix) KLUMatrixRef(A.derived()); - } - - void grab(const KLUMatrixRef &A) - { - if(&(A.derived()) != &mp_matrix) - { - mp_matrix.~KLUMatrixRef(); - ::new (&mp_matrix) KLUMatrixRef(A); - } - } - - // cached data to reduce reallocation, etc. -#if 0 // not implemented yet - mutable LUMatrixType m_l; - mutable LUMatrixType m_u; - mutable IntColVectorType m_p; - mutable IntRowVectorType m_q; -#endif - - KLUMatrixType m_dummy; - KLUMatrixRef mp_matrix; - - klu_numeric* m_numeric; - klu_symbolic* m_symbolic; - klu_common m_common; - mutable ComputationInfo m_info; - int m_factorizationIsOk; - int m_analysisIsOk; - mutable bool m_extractedDataAreDirty; - - private: - KLU(const KLU& ) { } -}; - -#if 0 // not implemented yet -template -void KLU::extractData() const -{ - if (m_extractedDataAreDirty) - { - eigen_assert(false && "KLU: extractData Not Yet Implemented"); - - // get size of the data - int lnz, unz, rows, cols, nz_udiag; - umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); - - // allocate data - m_l.resize(rows,(std::min)(rows,cols)); - m_l.resizeNonZeros(lnz); - - m_u.resize((std::min)(rows,cols),cols); - m_u.resizeNonZeros(unz); - - m_p.resize(rows); - m_q.resize(cols); - - // extract - umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(), - m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(), - m_p.data(), m_q.data(), 0, 0, 0, m_numeric); - - m_extractedDataAreDirty = false; - } -} - -template -typename KLU::Scalar KLU::determinant() const -{ - eigen_assert(false && "KLU: extractData Not Yet Implemented"); - return Scalar(); -} -#endif - -template -template -bool KLU::_solve_impl(const MatrixBase &b, MatrixBase &x) const -{ - Index rhsCols = b.cols(); - EIGEN_STATIC_ASSERT((XDerived::Flags&RowMajorBit)==0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); - eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()"); - - x = b; - int info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(), const_cast(&m_common), Scalar()); - - m_info = info!=0 ? Success : NumericalIssue; - return true; -} - -} // end namespace Eigen - -#endif // EIGEN_KLUSUPPORT_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/LU/Determinant.h b/uppsrc/plugin/Eigen/Eigen/src/LU/Determinant.h index 3a41e6fcb..d6a3c1e5a 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/LU/Determinant.h +++ b/uppsrc/plugin/Eigen/Eigen/src/LU/Determinant.h @@ -15,7 +15,6 @@ namespace Eigen { namespace internal { template -EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper (const MatrixBase& matrix, int a, int b, int c) { @@ -23,6 +22,14 @@ inline const typename Derived::Scalar bruteforce_det3_helper * (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b)); } +template +const typename Derived::Scalar bruteforce_det4_helper +(const MatrixBase& matrix, int j, int k, int m, int n) +{ + return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1)) + * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3)); +} + template struct determinant_impl @@ -37,8 +44,7 @@ template struct determinant_impl { - static inline EIGEN_DEVICE_FUNC - typename traits::Scalar run(const Derived& m) + static inline typename traits::Scalar run(const Derived& m) { return m.coeff(0,0); } @@ -46,8 +52,7 @@ template struct determinant_impl template struct determinant_impl { - static inline EIGEN_DEVICE_FUNC - typename traits::Scalar run(const Derived& m) + static inline typename traits::Scalar run(const Derived& m) { return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1); } @@ -55,8 +60,7 @@ template struct determinant_impl template struct determinant_impl { - static inline EIGEN_DEVICE_FUNC - typename traits::Scalar run(const Derived& m) + static inline typename traits::Scalar run(const Derived& m) { return bruteforce_det3_helper(m,0,1,2) - bruteforce_det3_helper(m,1,0,2) @@ -66,34 +70,15 @@ template struct determinant_impl template struct determinant_impl { - typedef typename traits::Scalar Scalar; - static EIGEN_DEVICE_FUNC - Scalar run(const Derived& m) + static typename traits::Scalar run(const Derived& m) { - Scalar d2_01 = det2(m, 0, 1); - Scalar d2_02 = det2(m, 0, 2); - Scalar d2_03 = det2(m, 0, 3); - Scalar d2_12 = det2(m, 1, 2); - Scalar d2_13 = det2(m, 1, 3); - Scalar d2_23 = det2(m, 2, 3); - Scalar d3_0 = det3(m, 1,d2_23, 2,d2_13, 3,d2_12); - Scalar d3_1 = det3(m, 0,d2_23, 2,d2_03, 3,d2_02); - Scalar d3_2 = det3(m, 0,d2_13, 1,d2_03, 3,d2_01); - Scalar d3_3 = det3(m, 0,d2_12, 1,d2_02, 2,d2_01); - return internal::pmadd(-m(0,3),d3_0, m(1,3)*d3_1) + - internal::pmadd(-m(2,3),d3_2, m(3,3)*d3_3); - } -protected: - static EIGEN_DEVICE_FUNC - Scalar det2(const Derived& m, Index i0, Index i1) - { - return m(i0,0) * m(i1,1) - m(i1,0) * m(i0,1); - } - - static EIGEN_DEVICE_FUNC - Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1, Index i2, const Scalar& d2) - { - return internal::pmadd(m(i0,2), d0, internal::pmadd(-m(i1,2), d1, m(i2,2)*d2)); + // trick by Martin Costabel to compute 4x4 det with only 30 muls + return bruteforce_det4_helper(m,0,1,2,3) + - bruteforce_det4_helper(m,0,2,1,3) + + bruteforce_det4_helper(m,0,3,1,2) + + bruteforce_det4_helper(m,1,2,0,3) + - bruteforce_det4_helper(m,1,3,0,2) + + bruteforce_det4_helper(m,2,3,0,1); } }; @@ -104,7 +89,6 @@ protected: * \returns the determinant of this matrix */ template -EIGEN_DEVICE_FUNC inline typename internal::traits::Scalar MatrixBase::determinant() const { eigen_assert(rows() == cols()); diff --git a/uppsrc/plugin/Eigen/Eigen/src/LU/FullPivLU.h b/uppsrc/plugin/Eigen/Eigen/src/LU/FullPivLU.h index ef93ec5eb..03b6af706 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/LU/FullPivLU.h +++ b/uppsrc/plugin/Eigen/Eigen/src/LU/FullPivLU.h @@ -18,7 +18,6 @@ template struct traits > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; - typedef int StorageIndex; enum { Flags = 0 }; }; @@ -49,12 +48,12 @@ template struct traits > * The data of the LU decomposition can be directly accessed through the methods matrixLU(), * permutationP(), permutationQ(). * - * As an example, here is how the original matrix can be retrieved: + * As an exemple, here is how the original matrix can be retrieved: * \include class_FullPivLU.cpp * Output: \verbinclude class_FullPivLU.out * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. - * + * * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse() */ template class FullPivLU @@ -63,9 +62,9 @@ template class FullPivLU public: typedef _MatrixType MatrixType; typedef SolverBase Base; - friend class SolverBase; EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU) + // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -219,7 +218,6 @@ template class FullPivLU return internal::image_retval(*this, originalMatrix); } - #ifdef EIGEN_PARSED_BY_DOXYGEN /** \return a solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -239,10 +237,14 @@ template class FullPivLU * * \sa TriangularView::solve(), kernel(), inverse() */ + // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template inline const Solve - solve(const MatrixBase& b) const; - #endif + solve(const MatrixBase& b) const + { + eigen_assert(m_isInitialized && "LU is not initialized."); + return Solve(*this, b.derived()); + } /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -318,7 +320,7 @@ template class FullPivLU return m_usePrescribedThreshold ? m_prescribedThreshold // this formula comes from experimenting (see "LU precision tuning" thread on the list) // and turns out to be identical to Higham's formula used already in LDLt. - : NumTraits::epsilon() * RealScalar(m_lu.diagonalSize()); + : NumTraits::epsilon() * m_lu.diagonalSize(); } /** \returns the rank of the matrix of which *this is the LU decomposition. @@ -409,9 +411,11 @@ template class FullPivLU #ifndef EIGEN_PARSED_BY_DOXYGEN template + EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; template + EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif @@ -527,8 +531,8 @@ void FullPivLU::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; ++i) { - m_rowsTranspositions.coeffRef(i) = internal::convert_index(i); - m_colsTranspositions.coeffRef(i) = internal::convert_index(i); + m_rowsTranspositions.coeffRef(i) = i; + m_colsTranspositions.coeffRef(i) = i; } break; } @@ -539,8 +543,8 @@ void FullPivLU::computeInPlace() // Now that we've found the pivot, we need to apply the row/col swaps to // bring it to the location (k,k). - m_rowsTranspositions.coeffRef(k) = internal::convert_index(row_of_biggest_in_corner); - m_colsTranspositions.coeffRef(k) = internal::convert_index(col_of_biggest_in_corner); + m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner; + m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner; if(k != row_of_biggest_in_corner) { m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner)); ++number_of_transpositions; @@ -753,6 +757,7 @@ void FullPivLU<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); + eigen_assert(rhs.rows() == rows); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -802,6 +807,7 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank(); + eigen_assert(rhs.rows() == cols); const Index smalldim = (std::min)(rows, cols); if(nonzero_pivots == 0) @@ -815,19 +821,29 @@ void FullPivLU<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType // Step 1 c = permutationQ().inverse() * rhs; - // Step 2 - m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView() - .transpose() - .template conjugateIf() - .solveInPlace(c.topRows(nonzero_pivots)); - - // Step 3 - m_lu.topLeftCorner(smalldim, smalldim) - .template triangularView() - .transpose() - .template conjugateIf() - .solveInPlace(c.topRows(smalldim)); + if (Conjugate) { + // Step 2 + m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .adjoint() + .solveInPlace(c.topRows(nonzero_pivots)); + // Step 3 + m_lu.topLeftCorner(smalldim, smalldim) + .template triangularView() + .adjoint() + .solveInPlace(c.topRows(smalldim)); + } else { + // Step 2 + m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots) + .template triangularView() + .transpose() + .solveInPlace(c.topRows(nonzero_pivots)); + // Step 3 + m_lu.topLeftCorner(smalldim, smalldim) + .template triangularView() + .transpose() + .solveInPlace(c.topRows(smalldim)); + } // Step 4 PermutationPType invp = permutationP().inverse().eval(); diff --git a/uppsrc/plugin/Eigen/Eigen/src/LU/InverseImpl.h b/uppsrc/plugin/Eigen/Eigen/src/LU/InverseImpl.h index 1bab00c01..f49f23360 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/LU/InverseImpl.h +++ b/uppsrc/plugin/Eigen/Eigen/src/LU/InverseImpl.h @@ -290,7 +290,6 @@ template struct Assignment, internal::assign_op, Dense2Dense> { typedef Inverse SrcXprType; - EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { Index dstRows = src.rows(); @@ -333,7 +332,6 @@ struct Assignment, internal::assign_op -EIGEN_DEVICE_FUNC inline const Inverse MatrixBase::inverse() const { EIGEN_STATIC_ASSERT(!NumTraits::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES) diff --git a/uppsrc/plugin/Eigen/Eigen/src/LU/PartialPivLU.h b/uppsrc/plugin/Eigen/Eigen/src/LU/PartialPivLU.h index b8938013a..6b10f39fa 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/LU/PartialPivLU.h +++ b/uppsrc/plugin/Eigen/Eigen/src/LU/PartialPivLU.h @@ -19,7 +19,6 @@ template struct traits > { typedef MatrixXpr XprKind; typedef SolverStorage StorageKind; - typedef int StorageIndex; typedef traits<_MatrixType> BaseTraits; enum { Flags = BaseTraits::Flags & RowMajorBit, @@ -80,9 +79,8 @@ template class PartialPivLU typedef _MatrixType MatrixType; typedef SolverBase Base; - friend class SolverBase; - EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU) + // FIXME StorageIndex defined in EIGEN_GENERIC_PUBLIC_INTERFACE should be int enum { MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime @@ -154,7 +152,6 @@ template class PartialPivLU return m_p; } - #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method returns the solution x to the equation Ax=b, where A is the matrix of which * *this is the LU decomposition. * @@ -172,10 +169,14 @@ template class PartialPivLU * * \sa TriangularView::solve(), inverse(), computeInverse() */ + // FIXME this is a copy-paste of the base-class member to add the isInitialized assertion. template inline const Solve - solve(const MatrixBase& b) const; - #endif + solve(const MatrixBase& b) const + { + eigen_assert(m_isInitialized && "PartialPivLU is not initialized."); + return Solve(*this, b.derived()); + } /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is the LU decomposition. @@ -230,6 +231,8 @@ template class PartialPivLU * Step 3: replace c by the solution x to Ux = c. */ + eigen_assert(rhs.rows() == m_lu.rows()); + // Step 1 dst = permutationP() * rhs; @@ -243,21 +246,26 @@ template class PartialPivLU template EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const { - /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P. + /* The decomposition PA = LU can be rewritten as A = P^{-1} L U. * So we proceed as follows: - * Step 1: compute c as the solution to L^T c = b - * Step 2: replace c by the solution x to U^T x = c. - * Step 3: update c = P^-1 c. + * Step 1: compute c = Pb. + * Step 2: replace c by the solution x to Lx = c. + * Step 3: replace c by the solution x to Ux = c. */ eigen_assert(rhs.rows() == m_lu.cols()); - // Step 1 - dst = m_lu.template triangularView().transpose() - .template conjugateIf().solve(rhs); - // Step 2 - m_lu.template triangularView().transpose() - .template conjugateIf().solveInPlace(dst); + if (Conjugate) { + // Step 1 + dst = m_lu.template triangularView().adjoint().solve(rhs); + // Step 2 + m_lu.template triangularView().adjoint().solveInPlace(dst); + } else { + // Step 1 + dst = m_lu.template triangularView().transpose().solve(rhs); + // Step 2 + m_lu.template triangularView().transpose().solveInPlace(dst); + } // Step 3 dst = permutationP().transpose() * dst; } @@ -331,18 +339,17 @@ PartialPivLU::PartialPivLU(EigenBase& matrix) namespace internal { /** \internal This is the blocked version of fullpivlu_unblocked() */ -template +template struct partial_lu_impl { - static const int UnBlockedBound = 16; - static const bool UnBlockedAtCompileTime = SizeAtCompileTime!=Dynamic && SizeAtCompileTime<=UnBlockedBound; - static const int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic; - // Remaining rows and columns at compile-time: - static const int RRows = SizeAtCompileTime==2 ? 1 : Dynamic; - static const int RCols = SizeAtCompileTime==2 ? 1 : Dynamic; - typedef Matrix MatrixType; - typedef Ref MatrixTypeRef; - typedef Ref > BlockType; + // FIXME add a stride to Map, so that the following mapping becomes easier, + // another option would be to create an expression being able to automatically + // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly + // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix, + // and Block. + typedef Map > MapLU; + typedef Block MatrixType; + typedef Block BlockType; typedef typename MatrixType::RealScalar RealScalar; /** \internal performs the LU decomposition in-place of the matrix \a lu @@ -355,22 +362,19 @@ struct partial_lu_impl * * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. */ - static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) + static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) { typedef scalar_score_coeff_op Scoring; typedef typename Scoring::result_type Score; const Index rows = lu.rows(); const Index cols = lu.cols(); const Index size = (std::min)(rows,cols); - // For small compile-time matrices it is worth processing the last row separately: - // speedup: +100% for 2x2, +10% for others. - const Index endk = UnBlockedAtCompileTime ? size-1 : size; nb_transpositions = 0; Index first_zero_pivot = -1; - for(Index k = 0; k < endk; ++k) + for(Index k = 0; k < size; ++k) { - int rrows = internal::convert_index(rows-k-1); - int rcols = internal::convert_index(cols-k-1); + Index rrows = rows-k-1; + Index rcols = cols-k-1; Index row_of_biggest_in_col; Score biggest_in_corner @@ -387,7 +391,9 @@ struct partial_lu_impl ++nb_transpositions; } - lu.col(k).tail(fix(rrows)) /= lu.coeff(k,k); + // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k) + // overflow but not the actual quotient? + lu.col(k).tail(rrows) /= lu.coeff(k,k); } else if(first_zero_pivot==-1) { @@ -397,18 +403,8 @@ struct partial_lu_impl } if(k(rrows),fix(rcols)).noalias() -= lu.col(k).tail(fix(rrows)) * lu.row(k).tail(fix(rcols)); + lu.bottomRightCorner(rrows,rcols).noalias() -= lu.col(k).tail(rrows) * lu.row(k).tail(rcols); } - - // special handling of the last entry - if(UnBlockedAtCompileTime) - { - Index k = endk; - row_transpositions[k] = PivIndex(k); - if (Scoring()(lu(k, k)) == Score(0) && first_zero_pivot == -1) - first_zero_pivot = k; - } - return first_zero_pivot; } @@ -424,17 +420,18 @@ struct partial_lu_impl * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise. * * \note This very low level interface using pointers, etc. is to: - * 1 - reduce the number of instantiations to the strict minimum - * 2 - avoid infinite recursion of the instantiations with Block > > + * 1 - reduce the number of instanciations to the strict minimum + * 2 - avoid infinite recursion of the instanciations with Block > > */ static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256) { - MatrixTypeRef lu = MatrixType::Map(lu_data,rows, cols, OuterStride<>(luStride)); + MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols); + MatrixType lu(lu1,0,0,rows,cols); const Index size = (std::min)(rows,cols); // if the matrix is too small, no blocking: - if(UnBlockedAtCompileTime || size<=UnBlockedBound) + if(size<=16) { return unblocked_lu(lu, row_transpositions, nb_transpositions); } @@ -460,12 +457,12 @@ struct partial_lu_impl // A00 | A01 | A02 // lu = A_0 | A_1 | A_2 = A10 | A11 | A12 // A20 | A21 | A22 - BlockType A_0 = lu.block(0,0,rows,k); - BlockType A_2 = lu.block(0,k+bs,rows,tsize); - BlockType A11 = lu.block(k,k,bs,bs); - BlockType A12 = lu.block(k,k+bs,bs,tsize); - BlockType A21 = lu.block(k+bs,k,trows,bs); - BlockType A22 = lu.block(k+bs,k+bs,trows,tsize); + BlockType A_0(lu,0,0,rows,k); + BlockType A_2(lu,0,k+bs,rows,tsize); + BlockType A11(lu,k,k,bs,bs); + BlockType A12(lu,k,k+bs,bs,tsize); + BlockType A21(lu,k+bs,k,trows,bs); + BlockType A22(lu,k+bs,k+bs,trows,tsize); PivIndex nb_transpositions_in_panel; // recursively call the blocked LU algorithm on [A11^T A21^T]^T @@ -508,9 +505,7 @@ void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, t eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); partial_lu_impl - < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, - typename TranspositionType::StorageIndex, - EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::RowsAtCompileTime,MatrixType::ColsAtCompileTime)> + ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Amd.h b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Amd.h index 7ca3f33b1..f91ecb24e 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Amd.h +++ b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Amd.h @@ -2,22 +2,32 @@ // for linear algebra. // // Copyright (C) 2010 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. /* + NOTE: this routine has been adapted from the CSparse library: Copyright (c) 2006, Timothy A. Davis. http://www.suitesparse.com -The author of CSparse, Timothy A. Davis., has executed a license with Google LLC -to permit distribution of this code and derivative works as part of Eigen under -the Mozilla Public License v. 2.0, as stated at the top of this file. +CSparse is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 2.1 of the License, or (at your option) any later version. + +CSparse is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. + +You should have received a copy of the GNU Lesser General Public +License along with this Module; if not, write to the Free Software +Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ +#include "../Core/util/NonMPL2.h" + #ifndef EIGEN_SPARSE_AMD_H #define EIGEN_SPARSE_AMD_H diff --git a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h index 8e339a704..da85b4d6e 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -13,119 +13,115 @@ // Davis (davis@cise.ufl.edu), University of Florida. The algorithm was // developed in collaboration with John Gilbert, Xerox PARC, and Esmond // Ng, Oak Ridge National Laboratory. -// +// // Date: -// +// // September 8, 2003. Version 2.3. -// +// // Acknowledgements: -// +// // This work was supported by the National Science Foundation, under // grants DMS-9504974 and DMS-9803599. -// +// // Notice: -// +// // Copyright (c) 1998-2003 by the University of Florida. // All Rights Reserved. -// +// // THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY // EXPRESSED OR IMPLIED. ANY USE IS AT YOUR OWN RISK. -// +// // Permission is hereby granted to use, copy, modify, and/or distribute // this program, provided that the Copyright, this License, and the // Availability of the original version is retained on all copies and made // accessible to the end-user of any code or package that includes COLAMD -// or any modified version of COLAMD. -// +// or any modified version of COLAMD. +// // Availability: -// +// // The colamd/symamd library is available at -// +// // http://www.suitesparse.com - + #ifndef EIGEN_COLAMD_H #define EIGEN_COLAMD_H namespace internal { - -namespace Colamd { - /* Ensure that debugging is turned off: */ #ifndef COLAMD_NDEBUG #define COLAMD_NDEBUG #endif /* NDEBUG */ - - /* ========================================================================== */ /* === Knob and statistics definitions ====================================== */ /* ========================================================================== */ /* size of the knobs [ ] array. Only knobs [0..1] are currently used. */ -const int NKnobs = 20; +#define COLAMD_KNOBS 20 /* number of output statistics. Only stats [0..6] are currently used. */ -const int NStats = 20; +#define COLAMD_STATS 20 -/* Indices into knobs and stats array. */ -enum KnobsStatsIndex { - /* knobs [0] and stats [0]: dense row knob and output statistic. */ - DenseRow = 0, +/* knobs [0] and stats [0]: dense row knob and output statistic. */ +#define COLAMD_DENSE_ROW 0 - /* knobs [1] and stats [1]: dense column knob and output statistic. */ - DenseCol = 1, +/* knobs [1] and stats [1]: dense column knob and output statistic. */ +#define COLAMD_DENSE_COL 1 - /* stats [2]: memory defragmentation count output statistic */ - DefragCount = 2, +/* stats [2]: memory defragmentation count output statistic */ +#define COLAMD_DEFRAG_COUNT 2 - /* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ - Status = 3, +/* stats [3]: colamd status: zero OK, > 0 warning or notice, < 0 error */ +#define COLAMD_STATUS 3 - /* stats [4..6]: error info, or info on jumbled columns */ - Info1 = 4, - Info2 = 5, - Info3 = 6 -}; +/* stats [4..6]: error info, or info on jumbled columns */ +#define COLAMD_INFO1 4 +#define COLAMD_INFO2 5 +#define COLAMD_INFO3 6 /* error codes returned in stats [3]: */ -enum Status { - Ok = 0, - OkButJumbled = 1, - ErrorANotPresent = -1, - ErrorPNotPresent = -2, - ErrorNrowNegative = -3, - ErrorNcolNegative = -4, - ErrorNnzNegative = -5, - ErrorP0Nonzero = -6, - ErrorATooSmall = -7, - ErrorColLengthNegative = -8, - ErrorRowIndexOutOfBounds = -9, - ErrorOutOfMemory = -10, - ErrorInternalError = -999 -}; +#define COLAMD_OK (0) +#define COLAMD_OK_BUT_JUMBLED (1) +#define COLAMD_ERROR_A_not_present (-1) +#define COLAMD_ERROR_p_not_present (-2) +#define COLAMD_ERROR_nrow_negative (-3) +#define COLAMD_ERROR_ncol_negative (-4) +#define COLAMD_ERROR_nnz_negative (-5) +#define COLAMD_ERROR_p0_nonzero (-6) +#define COLAMD_ERROR_A_too_small (-7) +#define COLAMD_ERROR_col_length_negative (-8) +#define COLAMD_ERROR_row_index_out_of_bounds (-9) +#define COLAMD_ERROR_out_of_memory (-10) +#define COLAMD_ERROR_internal_error (-999) + /* ========================================================================== */ /* === Definitions ========================================================== */ /* ========================================================================== */ -template -IndexType ones_complement(const IndexType r) { - return (-(r)-1); -} +#define ONES_COMPLEMENT(r) (-(r)-1) /* -------------------------------------------------------------------------- */ -const int Empty = -1; + +#define COLAMD_EMPTY (-1) /* Row and column status */ -enum RowColumnStatus { - Alive = 0, - Dead = -1 -}; +#define ALIVE (0) +#define DEAD (-1) /* Column status */ -enum ColumnStatus { - DeadPrincipal = -1, - DeadNonPrincipal = -2 -}; +#define DEAD_PRINCIPAL (-1) +#define DEAD_NON_PRINCIPAL (-2) + +/* Macros for row and column status update and checking. */ +#define ROW_IS_DEAD(r) ROW_IS_MARKED_DEAD (Row[r].shared2.mark) +#define ROW_IS_MARKED_DEAD(row_mark) (row_mark < ALIVE) +#define ROW_IS_ALIVE(r) (Row [r].shared2.mark >= ALIVE) +#define COL_IS_DEAD(c) (Col [c].start < ALIVE) +#define COL_IS_ALIVE(c) (Col [c].start >= ALIVE) +#define COL_IS_DEAD_PRINCIPAL(c) (Col [c].start == DEAD_PRINCIPAL) +#define KILL_ROW(r) { Row [r].shared2.mark = DEAD ; } +#define KILL_PRINCIPAL_COL(c) { Col [c].start = DEAD_PRINCIPAL ; } +#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; } /* ========================================================================== */ /* === Colamd reporting mechanism =========================================== */ @@ -133,9 +129,9 @@ enum ColumnStatus { // == Row and Column structures == template -struct ColStructure +struct colamd_col { - IndexType start ; /* index for A of first row in this column, or Dead */ + IndexType start ; /* index for A of first row in this column, or DEAD */ /* if column is dead */ IndexType length ; /* number of rows in this column */ union @@ -163,21 +159,11 @@ struct ColStructure IndexType degree_next ; /* next column, if col is in a degree list */ IndexType hash_next ; /* next column, if col is in a hash list */ } shared4 ; - - inline bool is_dead() const { return start < Alive; } - - inline bool is_alive() const { return start >= Alive; } - - inline bool is_dead_principal() const { return start == DeadPrincipal; } - - inline void kill_principal() { start = DeadPrincipal; } - - inline void kill_non_principal() { start = DeadNonPrincipal; } - + }; - + template -struct RowStructure +struct Colamd_Row { IndexType start ; /* index for A of first col in this row */ IndexType length ; /* number of principal columns in this row */ @@ -191,19 +177,13 @@ struct RowStructure IndexType mark ; /* for computing set differences and marking dead rows*/ IndexType first_column ;/* first column in row (used in garbage collection) */ } shared2 ; - - inline bool is_dead() const { return shared2.mark < Alive; } - - inline bool is_alive() const { return shared2.mark >= Alive; } - - inline void kill() { shared2.mark = Dead; } - + }; - + /* ========================================================================== */ /* === Colamd recommended memory size ======================================= */ /* ========================================================================== */ - + /* The recommended length Alen of the array A passed to colamd is given by the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro. It returns -1 if any @@ -212,41 +192,41 @@ struct RowStructure required for the Col and Row arrays, respectively, which are internal to colamd. An additional n_col space is the minimal amount of "elbow room", and nnz/5 more space is recommended for run time efficiency. - + This macro is not needed when using symamd. - + Explicit typecast to IndexType added Sept. 23, 2002, COLAMD version 2.2, to avoid gcc -pedantic warning messages. */ template -inline IndexType colamd_c(IndexType n_col) -{ return IndexType( ((n_col) + 1) * sizeof (ColStructure) / sizeof (IndexType) ) ; } +inline IndexType colamd_c(IndexType n_col) +{ return IndexType( ((n_col) + 1) * sizeof (colamd_col) / sizeof (IndexType) ) ; } template inline IndexType colamd_r(IndexType n_row) -{ return IndexType(((n_row) + 1) * sizeof (RowStructure) / sizeof (IndexType)); } +{ return IndexType(((n_row) + 1) * sizeof (Colamd_Row) / sizeof (IndexType)); } // Prototypes of non-user callable routines template -static IndexType init_rows_cols (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure col [], IndexType A [], IndexType p [], IndexType stats[NStats] ); +static IndexType init_rows_cols (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col col [], IndexType A [], IndexType p [], IndexType stats[COLAMD_STATS] ); template -static void init_scoring (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType head [], double knobs[NKnobs], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); +static void init_scoring (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], double knobs[COLAMD_KNOBS], IndexType *p_n_row2, IndexType *p_n_col2, IndexType *p_max_deg); template -static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); +static IndexType find_ordering (IndexType n_row, IndexType n_col, IndexType Alen, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType head [], IndexType n_col2, IndexType max_deg, IndexType pfree); template -static void order_children (IndexType n_col, ColStructure Col [], IndexType p []); +static void order_children (IndexType n_col, colamd_col Col [], IndexType p []); template -static void detect_super_cols (ColStructure Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ; +static void detect_super_cols (colamd_col Col [], IndexType A [], IndexType head [], IndexType row_start, IndexType row_length ) ; template -static IndexType garbage_collection (IndexType n_row, IndexType n_col, RowStructure Row [], ColStructure Col [], IndexType A [], IndexType *pfree) ; +static IndexType garbage_collection (IndexType n_row, IndexType n_col, Colamd_Row Row [], colamd_col Col [], IndexType A [], IndexType *pfree) ; template -static inline IndexType clear_mark (IndexType n_row, RowStructure Row [] ) ; +static inline IndexType clear_mark (IndexType n_row, Colamd_Row Row [] ) ; /* === No debugging ========================================================= */ @@ -260,37 +240,37 @@ static inline IndexType clear_mark (IndexType n_row, RowStructure Ro /** - * \brief Returns the recommended value of Alen - * - * Returns recommended value of Alen for use by colamd. - * Returns -1 if any input argument is negative. - * The use of this routine or macro is optional. - * Note that the macro uses its arguments more than once, - * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED. - * + * \brief Returns the recommended value of Alen + * + * Returns recommended value of Alen for use by colamd. + * Returns -1 if any input argument is negative. + * The use of this routine or macro is optional. + * Note that the macro uses its arguments more than once, + * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED. + * * \param nnz nonzeros in A * \param n_row number of rows in A * \param n_col number of columns in A * \return recommended value of Alen for use by colamd */ template -inline IndexType recommended ( IndexType nnz, IndexType n_row, IndexType n_col) +inline IndexType colamd_recommended ( IndexType nnz, IndexType n_row, IndexType n_col) { if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0) return (-1); else - return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); + return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); } /** * \brief set default parameters The use of this routine is optional. - * - * Colamd: rows with more than (knobs [DenseRow] * n_col) + * + * Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col) * entries are removed prior to ordering. Columns with more than - * (knobs [DenseCol] * n_row) entries are removed prior to - * ordering, and placed last in the output column ordering. + * (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to + * ordering, and placed last in the output column ordering. * - * DenseRow and DenseCol are defined as 0 and 1, + * COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1, * respectively, in colamd.h. Default values of these two knobs * are both 0.5. Currently, only knobs [0] and knobs [1] are * used, but future versions may use more knobs. If so, they will @@ -299,37 +279,37 @@ inline IndexType recommended ( IndexType nnz, IndexType n_row, IndexType n_col) * not need to change, assuming that you either use * colamd_set_defaults, or pass a (double *) NULL pointer as the * knobs array to colamd or symamd. - * + * * \param knobs parameter settings for colamd */ -static inline void set_defaults(double knobs[NKnobs]) +static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS]) { /* === Local variables ================================================== */ - + int i ; if (!knobs) { return ; /* no knobs to initialize */ } - for (i = 0 ; i < NKnobs ; i++) + for (i = 0 ; i < COLAMD_KNOBS ; i++) { knobs [i] = 0 ; } - knobs [Colamd::DenseRow] = 0.5 ; /* ignore rows over 50% dense */ - knobs [Colamd::DenseCol] = 0.5 ; /* ignore columns over 50% dense */ + knobs [COLAMD_DENSE_ROW] = 0.5 ; /* ignore rows over 50% dense */ + knobs [COLAMD_DENSE_COL] = 0.5 ; /* ignore columns over 50% dense */ } -/** +/** * \brief Computes a column ordering using the column approximate minimum degree ordering - * + * * Computes a column ordering (Q) of A such that P(AQ)=LU or * (AQ)'AQ=LL' have less fill-in and require fewer floating point * operations than factorizing the unpermuted matrix A or A'A, * respectively. - * - * + * + * * \param n_row number of rows in A * \param n_col number of columns in A * \param Alen, size of the array A @@ -339,143 +319,143 @@ static inline void set_defaults(double knobs[NKnobs]) * \param stats colamd output statistics and error codes */ template -static bool compute_ordering(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[NKnobs], IndexType stats[NStats]) +static bool colamd(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p, double knobs[COLAMD_KNOBS], IndexType stats[COLAMD_STATS]) { /* === Local variables ================================================== */ - + IndexType i ; /* loop index */ IndexType nnz ; /* nonzeros in A */ IndexType Row_size ; /* size of Row [], in integers */ IndexType Col_size ; /* size of Col [], in integers */ IndexType need ; /* minimum required length of A */ - Colamd::RowStructure *Row ; /* pointer into A of Row [0..n_row] array */ - Colamd::ColStructure *Col ; /* pointer into A of Col [0..n_col] array */ + Colamd_Row *Row ; /* pointer into A of Row [0..n_row] array */ + colamd_col *Col ; /* pointer into A of Col [0..n_col] array */ IndexType n_col2 ; /* number of non-dense, non-empty columns */ IndexType n_row2 ; /* number of non-dense, non-empty rows */ IndexType ngarbage ; /* number of garbage collections performed */ IndexType max_deg ; /* maximum row degree */ - double default_knobs [NKnobs] ; /* default knobs array */ - - + double default_knobs [COLAMD_KNOBS] ; /* default knobs array */ + + /* === Check the input arguments ======================================== */ - + if (!stats) { COLAMD_DEBUG0 (("colamd: stats not present\n")) ; return (false) ; } - for (i = 0 ; i < NStats ; i++) + for (i = 0 ; i < COLAMD_STATS ; i++) { stats [i] = 0 ; } - stats [Colamd::Status] = Colamd::Ok ; - stats [Colamd::Info1] = -1 ; - stats [Colamd::Info2] = -1 ; - + stats [COLAMD_STATUS] = COLAMD_OK ; + stats [COLAMD_INFO1] = -1 ; + stats [COLAMD_INFO2] = -1 ; + if (!A) /* A is not present */ { - stats [Colamd::Status] = Colamd::ErrorANotPresent ; + stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ; COLAMD_DEBUG0 (("colamd: A not present\n")) ; return (false) ; } - + if (!p) /* p is not present */ { - stats [Colamd::Status] = Colamd::ErrorPNotPresent ; + stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ; COLAMD_DEBUG0 (("colamd: p not present\n")) ; return (false) ; } - + if (n_row < 0) /* n_row must be >= 0 */ { - stats [Colamd::Status] = Colamd::ErrorNrowNegative ; - stats [Colamd::Info1] = n_row ; + stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ; + stats [COLAMD_INFO1] = n_row ; COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ; return (false) ; } - + if (n_col < 0) /* n_col must be >= 0 */ { - stats [Colamd::Status] = Colamd::ErrorNcolNegative ; - stats [Colamd::Info1] = n_col ; + stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ; + stats [COLAMD_INFO1] = n_col ; COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ; return (false) ; } - + nnz = p [n_col] ; if (nnz < 0) /* nnz must be >= 0 */ { - stats [Colamd::Status] = Colamd::ErrorNnzNegative ; - stats [Colamd::Info1] = nnz ; + stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ; + stats [COLAMD_INFO1] = nnz ; COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ; return (false) ; } - + if (p [0] != 0) { - stats [Colamd::Status] = Colamd::ErrorP0Nonzero ; - stats [Colamd::Info1] = p [0] ; + stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ; + stats [COLAMD_INFO1] = p [0] ; COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ; return (false) ; } - + /* === If no knobs, set default knobs =================================== */ - + if (!knobs) { - set_defaults (default_knobs) ; + colamd_set_defaults (default_knobs) ; knobs = default_knobs ; } - + /* === Allocate the Row and Col arrays from array A ===================== */ - + Col_size = colamd_c (n_col) ; Row_size = colamd_r (n_row) ; need = 2*nnz + n_col + Col_size + Row_size ; - + if (need > Alen) { /* not enough space in array A to perform the ordering */ - stats [Colamd::Status] = Colamd::ErrorATooSmall ; - stats [Colamd::Info1] = need ; - stats [Colamd::Info2] = Alen ; + stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ; + stats [COLAMD_INFO1] = need ; + stats [COLAMD_INFO2] = Alen ; COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen)); return (false) ; } - + Alen -= Col_size + Row_size ; - Col = (ColStructure *) &A [Alen] ; - Row = (RowStructure *) &A [Alen + Col_size] ; + Col = (colamd_col *) &A [Alen] ; + Row = (Colamd_Row *) &A [Alen + Col_size] ; /* === Construct the row and column data structures ===================== */ - - if (!Colamd::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) + + if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats)) { /* input matrix is invalid */ COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ; return (false) ; } - + /* === Initialize scores, kill dense rows/columns ======================= */ - Colamd::init_scoring (n_row, n_col, Row, Col, A, p, knobs, + Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs, &n_row2, &n_col2, &max_deg) ; - + /* === Order the supercolumns =========================================== */ - - ngarbage = Colamd::find_ordering (n_row, n_col, Alen, Row, Col, A, p, + + ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p, n_col2, max_deg, 2*nnz) ; - + /* === Order the non-principal columns ================================== */ - - Colamd::order_children (n_col, Col, p) ; - + + Eigen::internal::order_children (n_col, Col, p) ; + /* === Return statistics in stats ======================================= */ - - stats [Colamd::DenseRow] = n_row - n_row2 ; - stats [Colamd::DenseCol] = n_col - n_col2 ; - stats [Colamd::DefragCount] = ngarbage ; - COLAMD_DEBUG0 (("colamd: done.\n")) ; + + stats [COLAMD_DENSE_ROW] = n_row - n_row2 ; + stats [COLAMD_DENSE_COL] = n_col - n_col2 ; + stats [COLAMD_DEFRAG_COUNT] = ngarbage ; + COLAMD_DEBUG0 (("colamd: done.\n")) ; return (true) ; } @@ -485,6 +465,7 @@ static bool compute_ordering(IndexType n_row, IndexType n_col, IndexType Alen, I /* There are no user-callable routines beyond this point in the file */ + /* ========================================================================== */ /* === init_rows_cols ======================================================= */ /* ========================================================================== */ @@ -504,11 +485,11 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ - RowStructure Row [], /* of size n_row+1 */ - ColStructure Col [], /* of size n_col+1 */ + Colamd_Row Row [], /* of size n_row+1 */ + colamd_col Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A, of size Alen */ IndexType p [], /* pointers to columns in A, of size n_col+1 */ - IndexType stats [NStats] /* colamd statistics */ + IndexType stats [COLAMD_STATS] /* colamd statistics */ ) { /* === Local variables ================================================== */ @@ -531,24 +512,24 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ if ((Col [col].length) < 0) // extra parentheses to work-around gcc bug 10200 { /* column pointers must be non-decreasing */ - stats [Colamd::Status] = Colamd::ErrorColLengthNegative ; - stats [Colamd::Info1] = col ; - stats [Colamd::Info2] = Col [col].length ; + stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ; + stats [COLAMD_INFO1] = col ; + stats [COLAMD_INFO2] = Col [col].length ; COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ; return (false) ; } Col [col].shared1.thickness = 1 ; Col [col].shared2.score = 0 ; - Col [col].shared3.prev = Empty ; - Col [col].shared4.degree_next = Empty ; + Col [col].shared3.prev = COLAMD_EMPTY ; + Col [col].shared4.degree_next = COLAMD_EMPTY ; } /* p [0..n_col] no longer needed, used as "head" in subsequent routines */ /* === Scan columns, compute row degrees, and check row indices ========= */ - stats [Info3] = 0 ; /* number of duplicate or unsorted row indices*/ + stats [COLAMD_INFO3] = 0 ; /* number of duplicate or unsorted row indices*/ for (row = 0 ; row < n_row ; row++) { @@ -570,10 +551,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* make sure row indices within range */ if (row < 0 || row >= n_row) { - stats [Colamd::Status] = Colamd::ErrorRowIndexOutOfBounds ; - stats [Colamd::Info1] = col ; - stats [Colamd::Info2] = row ; - stats [Colamd::Info3] = n_row ; + stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ; + stats [COLAMD_INFO1] = col ; + stats [COLAMD_INFO2] = row ; + stats [COLAMD_INFO3] = n_row ; COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ; return (false) ; } @@ -582,10 +563,10 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ { /* row index are unsorted or repeated (or both), thus col */ /* is jumbled. This is a notice, not an error condition. */ - stats [Colamd::Status] = Colamd::OkButJumbled ; - stats [Colamd::Info1] = col ; - stats [Colamd::Info2] = row ; - (stats [Colamd::Info3]) ++ ; + stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ; + stats [COLAMD_INFO1] = col ; + stats [COLAMD_INFO2] = row ; + (stats [COLAMD_INFO3]) ++ ; COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col)); } @@ -623,7 +604,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === Create row form ================================================== */ - if (stats [Status] == OkButJumbled) + if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) { /* if cols jumbled, watch for repeated row indices */ for (col = 0 ; col < n_col ; col++) @@ -665,7 +646,7 @@ static IndexType init_rows_cols /* returns true if OK, or false otherwise */ /* === See if we need to re-create columns ============================== */ - if (stats [Status] == OkButJumbled) + if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED) { COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ; @@ -720,11 +701,11 @@ static void init_scoring IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ - RowStructure Row [], /* of size n_row+1 */ - ColStructure Col [], /* of size n_col+1 */ + Colamd_Row Row [], /* of size n_row+1 */ + colamd_col Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ - double knobs [NKnobs],/* parameters */ + double knobs [COLAMD_KNOBS],/* parameters */ IndexType *p_n_row2, /* number of non-dense, non-empty rows */ IndexType *p_n_col2, /* number of non-dense, non-empty columns */ IndexType *p_max_deg /* maximum row degree */ @@ -751,8 +732,8 @@ static void init_scoring /* === Extract knobs ==================================================== */ - dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseRow] * n_col), n_col)) ; - dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [Colamd::DenseCol] * n_row), n_row)) ; + dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_ROW] * n_col), n_col)) ; + dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs [COLAMD_DENSE_COL] * n_row), n_row)) ; COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ; max_deg = 0 ; n_col2 = n_col ; @@ -769,7 +750,7 @@ static void init_scoring { /* this is a empty column, kill and order it last */ Col [c].shared2.order = --n_col2 ; - Col[c].kill_principal() ; + KILL_PRINCIPAL_COL (c) ; } } COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ; @@ -780,7 +761,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip any dead columns */ - if (Col[c].is_dead()) + if (COL_IS_DEAD (c)) { continue ; } @@ -796,7 +777,7 @@ static void init_scoring { Row [*cp++].shared1.degree-- ; } - Col[c].kill_principal() ; + KILL_PRINCIPAL_COL (c) ; } } COLAMD_DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ; @@ -810,7 +791,7 @@ static void init_scoring if (deg > dense_row_count || deg == 0) { /* kill a dense or empty row */ - Row[r].kill() ; + KILL_ROW (r) ; --n_row2 ; } else @@ -832,7 +813,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* skip dead column */ - if (Col[c].is_dead()) + if (COL_IS_DEAD (c)) { continue ; } @@ -845,7 +826,7 @@ static void init_scoring /* get a row */ row = *cp++ ; /* skip if dead */ - if (Row[row].is_dead()) + if (ROW_IS_DEAD (row)) { continue ; } @@ -864,7 +845,7 @@ static void init_scoring /* and have already been killed) */ COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ; Col [c].shared2.order = --n_col2 ; - Col[c].kill_principal() ; + KILL_PRINCIPAL_COL (c) ; } else { @@ -889,7 +870,7 @@ static void init_scoring /* clear the hash buckets */ for (c = 0 ; c <= n_col ; c++) { - head [c] = Empty ; + head [c] = COLAMD_EMPTY ; } min_score = n_col ; /* place in reverse order, so low column indices are at the front */ @@ -897,7 +878,7 @@ static void init_scoring for (c = n_col-1 ; c >= 0 ; c--) { /* only add principal columns to degree lists */ - if (Col[c].is_alive()) + if (COL_IS_ALIVE (c)) { COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n", c, Col [c].shared2.score, min_score, n_col)) ; @@ -910,16 +891,16 @@ static void init_scoring COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (score >= 0) ; COLAMD_ASSERT (score <= n_col) ; - COLAMD_ASSERT (head [score] >= Empty) ; + COLAMD_ASSERT (head [score] >= COLAMD_EMPTY) ; /* now add this column to dList at proper score location */ next_col = head [score] ; - Col [c].shared3.prev = Empty ; + Col [c].shared3.prev = COLAMD_EMPTY ; Col [c].shared4.degree_next = next_col ; /* if there already was a column with the same score, set its */ /* previous pointer to this new column */ - if (next_col != Empty) + if (next_col != COLAMD_EMPTY) { Col [next_col].shared3.prev = c ; } @@ -958,8 +939,8 @@ static IndexType find_ordering /* return the number of garbage collections */ IndexType n_row, /* number of rows of A */ IndexType n_col, /* number of columns of A */ IndexType Alen, /* size of A, 2*nnz + n_col or larger */ - RowStructure Row [], /* of size n_row+1 */ - ColStructure Col [], /* of size n_col+1 */ + Colamd_Row Row [], /* of size n_row+1 */ + colamd_col Col [], /* of size n_col+1 */ IndexType A [], /* column form and row form of A */ IndexType head [], /* of size n_col+1 */ IndexType n_col2, /* Remaining columns to order */ @@ -1005,7 +986,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* === Initialization and clear mark ==================================== */ max_mark = INT_MAX - n_col ; /* INT_MAX defined in */ - tag_mark = Colamd::clear_mark (n_row, Row) ; + tag_mark = Eigen::internal::clear_mark (n_row, Row) ; min_score = 0 ; ngarbage = 0 ; COLAMD_DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ; @@ -1020,10 +1001,10 @@ static IndexType find_ordering /* return the number of garbage collections */ /* make sure degree list isn't empty */ COLAMD_ASSERT (min_score >= 0) ; COLAMD_ASSERT (min_score <= n_col) ; - COLAMD_ASSERT (head [min_score] >= Empty) ; + COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ; /* get pivot column from head of minimum degree list */ - while (min_score < n_col && head [min_score] == Empty) + while (min_score < n_col && head [min_score] == COLAMD_EMPTY) { min_score++ ; } @@ -1031,12 +1012,12 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ; next_col = Col [pivot_col].shared4.degree_next ; head [min_score] = next_col ; - if (next_col != Empty) + if (next_col != COLAMD_EMPTY) { - Col [next_col].shared3.prev = Empty ; + Col [next_col].shared3.prev = COLAMD_EMPTY ; } - COLAMD_ASSERT (Col[pivot_col].is_alive()) ; + COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ; COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ; /* remember score for defrag check */ @@ -1055,12 +1036,12 @@ static IndexType find_ordering /* return the number of garbage collections */ needed_memory = numext::mini(pivot_col_score, n_col - k) ; if (pfree + needed_memory >= Alen) { - pfree = Colamd::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; + pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ; ngarbage++ ; /* after garbage collection we will have enough */ COLAMD_ASSERT (pfree + needed_memory < Alen) ; /* garbage collection has wiped out the Row[].shared2.mark array */ - tag_mark = Colamd::clear_mark (n_row, Row) ; + tag_mark = Eigen::internal::clear_mark (n_row, Row) ; } @@ -1083,9 +1064,9 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; - COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", Row[row].is_alive(), row)) ; + COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ; /* skip if row is dead */ - if (Row[row].is_dead()) + if (ROW_IS_DEAD (row)) { continue ; } @@ -1097,7 +1078,7 @@ static IndexType find_ordering /* return the number of garbage collections */ col = *rp++ ; /* add the column, if alive and untagged */ col_thickness = Col [col].shared1.thickness ; - if (col_thickness > 0 && Col[col].is_alive()) + if (col_thickness > 0 && COL_IS_ALIVE (col)) { /* tag column in pivot row */ Col [col].shared1.thickness = -col_thickness ; @@ -1124,7 +1105,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* may be killing an already dead row */ row = *cp++ ; COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ; - Row[row].kill() ; + KILL_ROW (row) ; } /* === Select a row index to use as the new pivot row =============== */ @@ -1139,7 +1120,7 @@ static IndexType find_ordering /* return the number of garbage collections */ else { /* there is no pivot row, since it is of zero length */ - pivot_row = Empty ; + pivot_row = COLAMD_EMPTY ; COLAMD_ASSERT (pivot_row_length == 0) ; } COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ; @@ -1176,7 +1157,7 @@ static IndexType find_ordering /* return the number of garbage collections */ while (rp < rp_end) { col = *rp++ ; - COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ; + COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; COLAMD_DEBUG3 (("Col: %d\n", col)) ; /* clear tags used to construct pivot row pattern */ @@ -1191,8 +1172,8 @@ static IndexType find_ordering /* return the number of garbage collections */ next_col = Col [col].shared4.degree_next ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (cur_score >= Empty) ; - if (prev_col == Empty) + COLAMD_ASSERT (cur_score >= COLAMD_EMPTY) ; + if (prev_col == COLAMD_EMPTY) { head [cur_score] = next_col ; } @@ -1200,7 +1181,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { Col [prev_col].shared4.degree_next = next_col ; } - if (next_col != Empty) + if (next_col != COLAMD_EMPTY) { Col [next_col].shared3.prev = prev_col ; } @@ -1213,12 +1194,12 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a row */ row = *cp++ ; + row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (Row[row].is_dead()) + if (ROW_IS_MARKED_DEAD (row_mark)) { continue ; } - row_mark = Row [row].shared2.mark ; COLAMD_ASSERT (row != pivot_row) ; set_difference = row_mark - tag_mark ; /* check if the row has been seen yet */ @@ -1234,7 +1215,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (set_difference == 0) { COLAMD_DEBUG3 (("aggressive absorption. Row: %d\n", row)) ; - Row[row].kill() ; + KILL_ROW (row) ; } else { @@ -1256,7 +1237,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { /* get a column */ col = *rp++ ; - COLAMD_ASSERT (Col[col].is_alive() && col != pivot_col) ; + COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ; hash = 0 ; cur_score = 0 ; cp = &A [Col [col].start] ; @@ -1271,12 +1252,12 @@ static IndexType find_ordering /* return the number of garbage collections */ /* get a row */ row = *cp++ ; COLAMD_ASSERT(row >= 0 && row < n_row) ; + row_mark = Row [row].shared2.mark ; /* skip if dead */ - if (Row [row].is_dead()) + if (ROW_IS_MARKED_DEAD (row_mark)) { continue ; } - row_mark = Row [row].shared2.mark ; COLAMD_ASSERT (row_mark > tag_mark) ; /* compact the column */ *new_cp++ = row ; @@ -1297,7 +1278,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { COLAMD_DEBUG4 (("further mass elimination. Col: %d\n", col)) ; /* nothing left but the pivot row in this column */ - Col[col].kill_principal() ; + KILL_PRINCIPAL_COL (col) ; pivot_row_degree -= Col [col].shared1.thickness ; COLAMD_ASSERT (pivot_row_degree >= 0) ; /* order it */ @@ -1321,7 +1302,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (hash <= n_col) ; head_column = head [hash] ; - if (head_column > Empty) + if (head_column > COLAMD_EMPTY) { /* degree list "hash" is non-empty, use prev (shared3) of */ /* first column in degree list as head of hash bucket */ @@ -1338,7 +1319,7 @@ static IndexType find_ordering /* return the number of garbage collections */ /* save hash function in Col [col].shared3.hash */ Col [col].shared3.hash = (IndexType) hash ; - COLAMD_ASSERT (Col[col].is_alive()) ; + COLAMD_ASSERT (COL_IS_ALIVE (col)) ; } } @@ -1348,11 +1329,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_DEBUG3 (("** Supercolumn detection phase. **\n")) ; - Colamd::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ; + Eigen::internal::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ; /* === Kill the pivotal column ====================================== */ - Col[pivot_col].kill_principal() ; + KILL_PRINCIPAL_COL (pivot_col) ; /* === Clear mark =================================================== */ @@ -1360,7 +1341,7 @@ static IndexType find_ordering /* return the number of garbage collections */ if (tag_mark >= max_mark) { COLAMD_DEBUG2 (("clearing tag_mark\n")) ; - tag_mark = Colamd::clear_mark (n_row, Row) ; + tag_mark = Eigen::internal::clear_mark (n_row, Row) ; } /* === Finalize the new pivot row, and column scores ================ */ @@ -1376,7 +1357,7 @@ static IndexType find_ordering /* return the number of garbage collections */ { col = *rp++ ; /* skip dead columns */ - if (Col[col].is_dead()) + if (COL_IS_DEAD (col)) { continue ; } @@ -1410,11 +1391,11 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (min_score <= n_col) ; COLAMD_ASSERT (cur_score >= 0) ; COLAMD_ASSERT (cur_score <= n_col) ; - COLAMD_ASSERT (head [cur_score] >= Empty) ; + COLAMD_ASSERT (head [cur_score] >= COLAMD_EMPTY) ; next_col = head [cur_score] ; Col [col].shared4.degree_next = next_col ; - Col [col].shared3.prev = Empty ; - if (next_col != Empty) + Col [col].shared3.prev = COLAMD_EMPTY ; + if (next_col != COLAMD_EMPTY) { Col [next_col].shared3.prev = col ; } @@ -1467,7 +1448,7 @@ static inline void order_children /* === Parameters ======================================================= */ IndexType n_col, /* number of columns of A */ - ColStructure Col [], /* of size n_col+1 */ + colamd_col Col [], /* of size n_col+1 */ IndexType p [] /* p [0 ... n_col-1] is the column permutation*/ ) { @@ -1483,15 +1464,15 @@ static inline void order_children for (i = 0 ; i < n_col ; i++) { /* find an un-ordered non-principal column */ - COLAMD_ASSERT (col_is_dead(Col, i)) ; - if (!Col[i].is_dead_principal() && Col [i].shared2.order == Empty) + COLAMD_ASSERT (COL_IS_DEAD (i)) ; + if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == COLAMD_EMPTY) { parent = i ; /* once found, find its principal parent */ do { parent = Col [parent].shared1.parent ; - } while (!Col[parent].is_dead_principal()) ; + } while (!COL_IS_DEAD_PRINCIPAL (parent)) ; /* now, order all un-ordered non-principal columns along path */ /* to this parent. collapse tree at the same time */ @@ -1501,7 +1482,7 @@ static inline void order_children do { - COLAMD_ASSERT (Col [c].shared2.order == Empty) ; + COLAMD_ASSERT (Col [c].shared2.order == COLAMD_EMPTY) ; /* order this column */ Col [c].shared2.order = order++ ; @@ -1512,9 +1493,9 @@ static inline void order_children c = Col [c].shared1.parent ; /* continue until we hit an ordered column. There are */ - /* guaranteed not to be anymore unordered columns */ + /* guarranteed not to be anymore unordered columns */ /* above an ordered column */ - } while (Col [c].shared2.order == Empty) ; + } while (Col [c].shared2.order == COLAMD_EMPTY) ; /* re-order the super_col parent to largest order for this group */ Col [parent].shared2.order = order ; @@ -1566,8 +1547,8 @@ template static void detect_super_cols ( /* === Parameters ======================================================= */ - - ColStructure Col [], /* of size n_col+1 */ + + colamd_col Col [], /* of size n_col+1 */ IndexType A [], /* row indices of A */ IndexType head [], /* head of degree lists and hash buckets */ IndexType row_start, /* pointer to set of columns to check */ @@ -1597,7 +1578,7 @@ static void detect_super_cols while (rp < rp_end) { col = *rp++ ; - if (Col[col].is_dead()) + if (COL_IS_DEAD (col)) { continue ; } @@ -1609,7 +1590,7 @@ static void detect_super_cols /* === Get the first column in this hash bucket ===================== */ head_column = head [hash] ; - if (head_column > Empty) + if (head_column > COLAMD_EMPTY) { first_col = Col [head_column].shared3.headhash ; } @@ -1620,10 +1601,10 @@ static void detect_super_cols /* === Consider each column in the hash bucket ====================== */ - for (super_c = first_col ; super_c != Empty ; + for (super_c = first_col ; super_c != COLAMD_EMPTY ; super_c = Col [super_c].shared4.hash_next) { - COLAMD_ASSERT (Col [super_c].is_alive()) ; + COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ; COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ; length = Col [super_c].length ; @@ -1633,10 +1614,10 @@ static void detect_super_cols /* === Compare super_c with all columns after it ================ */ for (c = Col [super_c].shared4.hash_next ; - c != Empty ; c = Col [c].shared4.hash_next) + c != COLAMD_EMPTY ; c = Col [c].shared4.hash_next) { COLAMD_ASSERT (c != super_c) ; - COLAMD_ASSERT (Col[c].is_alive()) ; + COLAMD_ASSERT (COL_IS_ALIVE (c)) ; COLAMD_ASSERT (Col [c].shared3.hash == hash) ; /* not identical if lengths or scores are different */ @@ -1654,10 +1635,10 @@ static void detect_super_cols for (i = 0 ; i < length ; i++) { /* the columns are "clean" (no dead rows) */ - COLAMD_ASSERT ( cp1->is_alive() ); - COLAMD_ASSERT ( cp2->is_alive() ); + COLAMD_ASSERT (ROW_IS_ALIVE (*cp1)) ; + COLAMD_ASSERT (ROW_IS_ALIVE (*cp2)) ; /* row indices will same order for both supercols, */ - /* no gather scatter necessary */ + /* no gather scatter nessasary */ if (*cp1++ != *cp2++) { break ; @@ -1677,9 +1658,9 @@ static void detect_super_cols Col [super_c].shared1.thickness += Col [c].shared1.thickness ; Col [c].shared1.parent = super_c ; - Col[c].kill_non_principal() ; + KILL_NON_PRINCIPAL_COL (c) ; /* order c later, in order_children() */ - Col [c].shared2.order = Empty ; + Col [c].shared2.order = COLAMD_EMPTY ; /* remove c from hash bucket */ Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ; } @@ -1687,15 +1668,15 @@ static void detect_super_cols /* === Empty this hash bucket ======================================= */ - if (head_column > Empty) + if (head_column > COLAMD_EMPTY) { /* corresponding degree list "hash" is not empty */ - Col [head_column].shared3.headhash = Empty ; + Col [head_column].shared3.headhash = COLAMD_EMPTY ; } else { /* corresponding degree list "hash" is empty */ - head [hash] = Empty ; + head [hash] = COLAMD_EMPTY ; } } } @@ -1707,7 +1688,7 @@ static void detect_super_cols /* Defragments and compacts columns and rows in the workspace A. Used when - all available memory has been used while performing row merging. Returns + all avaliable memory has been used while performing row merging. Returns the index of the first free position in A, after garbage collection. The time taken by this routine is linear is the size of the array A, which is itself linear in the number of nonzeros in the input matrix. @@ -1717,11 +1698,11 @@ template static IndexType garbage_collection /* returns the new value of pfree */ ( /* === Parameters ======================================================= */ - + IndexType n_row, /* number of rows */ IndexType n_col, /* number of columns */ - RowStructure Row [], /* row info */ - ColStructure Col [], /* column info */ + Colamd_Row Row [], /* row info */ + colamd_col Col [], /* column info */ IndexType A [], /* A [0 ... Alen-1] holds the matrix */ IndexType *pfree /* &A [0] ... pfree is in use */ ) @@ -1740,7 +1721,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ pdest = &A[0] ; for (c = 0 ; c < n_col ; c++) { - if (Col[c].is_alive()) + if (COL_IS_ALIVE (c)) { psrc = &A [Col [c].start] ; @@ -1751,7 +1732,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { r = *psrc++ ; - if (Row[r].is_alive()) + if (ROW_IS_ALIVE (r)) { *pdest++ = r ; } @@ -1764,22 +1745,22 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (r = 0 ; r < n_row ; r++) { - if (Row[r].is_alive()) + if (ROW_IS_ALIVE (r)) { if (Row [r].length == 0) { - /* this row is of zero length. cannot compact it, so kill it */ - COLAMD_DEBUG3 (("Defrag row kill\n")) ; - Row[r].kill() ; + /* this row is of zero length. cannot compact it, so kill it */ + COLAMD_DEBUG3 (("Defrag row kill\n")) ; + KILL_ROW (r) ; } else { - /* save first column index in Row [r].shared2.first_column */ - psrc = &A [Row [r].start] ; - Row [r].shared2.first_column = *psrc ; - COLAMD_ASSERT (Row[r].is_alive()) ; - /* flag the start of the row with the one's complement of row */ - *psrc = ones_complement(r) ; + /* save first column index in Row [r].shared2.first_column */ + psrc = &A [Row [r].start] ; + Row [r].shared2.first_column = *psrc ; + COLAMD_ASSERT (ROW_IS_ALIVE (r)) ; + /* flag the start of the row with the one's complement of row */ + *psrc = ONES_COMPLEMENT (r) ; } } @@ -1795,11 +1776,11 @@ static IndexType garbage_collection /* returns the new value of pfree */ { psrc-- ; /* get the row index */ - r = ones_complement(*psrc) ; + r = ONES_COMPLEMENT (*psrc) ; COLAMD_ASSERT (r >= 0 && r < n_row) ; /* restore first column index */ *psrc = Row [r].shared2.first_column ; - COLAMD_ASSERT (Row[r].is_alive()) ; + COLAMD_ASSERT (ROW_IS_ALIVE (r)) ; /* move and compact the row */ COLAMD_ASSERT (pdest <= psrc) ; @@ -1808,7 +1789,7 @@ static IndexType garbage_collection /* returns the new value of pfree */ for (j = 0 ; j < length ; j++) { c = *psrc++ ; - if (Col[c].is_alive()) + if (COL_IS_ALIVE (c)) { *pdest++ = c ; } @@ -1840,7 +1821,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ /* === Parameters ======================================================= */ IndexType n_row, /* number of rows in A */ - RowStructure Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */ + Colamd_Row Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */ ) { /* === Local variables ================================================== */ @@ -1849,7 +1830,7 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ for (r = 0 ; r < n_row ; r++) { - if (Row[r].is_alive()) + if (ROW_IS_ALIVE (r)) { Row [r].shared2.mark = 0 ; } @@ -1857,7 +1838,6 @@ static inline IndexType clear_mark /* return the new value for tag_mark */ return (1) ; } -} // namespace Colamd -} // namespace internal +} // namespace internal #endif diff --git a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Ordering.h b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Ordering.h index c57897014..7ea9b14d7 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Ordering.h +++ b/uppsrc/plugin/Eigen/Eigen/src/OrderingMethods/Ordering.h @@ -31,13 +31,15 @@ void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat) for (int i = 0; i < C.rows(); i++) { for (typename MatrixType::InnerIterator it(C, i); it; ++it) - it.valueRef() = typename MatrixType::Scalar(0); + it.valueRef() = 0.0; } symmat = C + A; } } +#ifndef EIGEN_MPL2_ONLY + /** \ingroup OrderingMethods_Module * \class AMDOrdering * @@ -79,6 +81,8 @@ class AMDOrdering } }; +#endif // EIGEN_MPL2_ONLY + /** \ingroup OrderingMethods_Module * \class NaturalOrdering * @@ -129,17 +133,17 @@ class COLAMDOrdering StorageIndex n = StorageIndex(mat.cols()); StorageIndex nnz = StorageIndex(mat.nonZeros()); // Get the recommended value of Alen to be used by colamd - StorageIndex Alen = internal::Colamd::recommended(nnz, m, n); + StorageIndex Alen = internal::colamd_recommended(nnz, m, n); // Set the default parameters - double knobs [internal::Colamd::NKnobs]; - StorageIndex stats [internal::Colamd::NStats]; - internal::Colamd::set_defaults(knobs); + double knobs [COLAMD_KNOBS]; + StorageIndex stats [COLAMD_STATS]; + internal::colamd_set_defaults(knobs); IndexVector p(n+1), A(Alen); for(StorageIndex i=0; i <= n; i++) p(i) = mat.outerIndexPtr()[i]; for(StorageIndex i=0; i < nnz; i++) A(i) = mat.innerIndexPtr()[i]; // Call Colamd routine to compute the ordering - StorageIndex info = internal::Colamd::compute_ordering(m, n, Alen, A.data(), p.data(), knobs, stats); + StorageIndex info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); EIGEN_UNUSED_VARIABLE(info); eigen_assert( info && "COLAMD failed " ); diff --git a/uppsrc/plugin/Eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h b/uppsrc/plugin/Eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h index 37426877a..160d8a523 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/uppsrc/plugin/Eigen/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -203,7 +203,7 @@ class PastixBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, + * \returns \c Success if computation was succesful, * \c NumericalIssue if the PaStiX reports a problem * \c InvalidInput if the input matrix is invalid * diff --git a/uppsrc/plugin/Eigen/Eigen/src/PardisoSupport/PardisoSupport.h b/uppsrc/plugin/Eigen/Eigen/src/PardisoSupport/PardisoSupport.h index f89b79bd5..f8c7d0780 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/uppsrc/plugin/Eigen/Eigen/src/PardisoSupport/PardisoSupport.h @@ -123,7 +123,6 @@ class PardisoImpl : public SparseSolverBase }; PardisoImpl() - : m_analysisIsOk(false), m_factorizationIsOk(false) { eigen_assert((sizeof(StorageIndex) >= sizeof(_INTEGER_t) && sizeof(StorageIndex) <= 8) && "Non-supported index type"); m_iparm.setZero(); @@ -141,7 +140,7 @@ class PardisoImpl : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, + * \returns \c Success if computation was succesful, * \c NumericalIssue if the matrix appears to be negative. */ ComputationInfo info() const @@ -193,7 +192,8 @@ class PardisoImpl : public SparseSolverBase void pardisoInit(int type) { m_type = type; - bool symmetric = std::abs(m_type) < 10; + EIGEN_USING_STD(abs); + bool symmetric = abs(m_type) < 10; m_iparm[0] = 1; // No solver default m_iparm[1] = 2; // use Metis for the ordering m_iparm[2] = 0; // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??) @@ -386,15 +386,14 @@ class PardisoLU : public PardisoImpl< PardisoLU > { protected: typedef PardisoImpl Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLU >; public: - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; - using Base::compute; using Base::solve; @@ -442,14 +441,14 @@ class PardisoLLT : public PardisoImpl< PardisoLLT > { protected: typedef PardisoImpl< PardisoLLT > Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLLT >; public: - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; enum { UpLo = _UpLo }; using Base::compute; @@ -505,14 +504,14 @@ class PardisoLDLT : public PardisoImpl< PardisoLDLT > { protected: typedef PardisoImpl< PardisoLDLT > Base; + typedef typename Base::Scalar Scalar; + typedef typename Base::RealScalar RealScalar; using Base::pardisoInit; using Base::m_matrix; friend class PardisoImpl< PardisoLDLT >; public: - typedef typename Base::Scalar Scalar; - typedef typename Base::RealScalar RealScalar; typedef typename Base::StorageIndex StorageIndex; using Base::compute; enum { UpLo = Options&(Upper|Lower) }; diff --git a/uppsrc/plugin/Eigen/Eigen/src/QR/ColPivHouseholderQR.h b/uppsrc/plugin/Eigen/Eigen/src/QR/ColPivHouseholderQR.h index 9b677e9bf..a7b47d55d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/QR/ColPivHouseholderQR.h +++ b/uppsrc/plugin/Eigen/Eigen/src/QR/ColPivHouseholderQR.h @@ -17,9 +17,6 @@ namespace internal { template struct traits > : traits<_MatrixType> { - typedef MatrixXpr XprKind; - typedef SolverStorage StorageKind; - typedef int StorageIndex; enum { Flags = 0 }; }; @@ -49,19 +46,20 @@ template struct traits > * \sa MatrixBase::colPivHouseholderQr() */ template class ColPivHouseholderQR - : public SolverBase > { public: typedef _MatrixType MatrixType; - typedef SolverBase Base; - friend class SolverBase; - - EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR) enum { + RowsAtCompileTime = MatrixType::RowsAtCompileTime, + ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + // FIXME should be int + typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type::type HCoeffsType; typedef PermutationMatrix PermutationType; typedef typename internal::plain_row_type::type IntRowVectorType; @@ -158,7 +156,6 @@ template class ColPivHouseholderQR computeInPlace(); } - #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * *this is the QR decomposition, if any exists. * @@ -175,8 +172,11 @@ template class ColPivHouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const; - #endif + solve(const MatrixBase& b) const + { + eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized."); + return Solve(*this, b.derived()); + } HouseholderSequenceType householderQ() const; HouseholderSequenceType matrixQ() const @@ -402,7 +402,7 @@ template class ColPivHouseholderQR */ RealScalar maxPivot() const { return m_maxpivot; } - /** \brief Reports whether the QR factorization was successful. + /** \brief Reports whether the QR factorization was succesful. * * \note This function always returns \c Success. It is provided for compatibility * with other factorization routines. @@ -416,10 +416,8 @@ template class ColPivHouseholderQR #ifndef EIGEN_PARSED_BY_DOXYGEN template + EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; - - template - void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -586,6 +584,8 @@ template template void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { + eigen_assert(rhs.rows() == rows()); + const Index nonzero_pivots = nonzeroPivots(); if(nonzero_pivots == 0) @@ -596,7 +596,11 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & typename RhsType::PlainObject c(rhs); - c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint() ); + // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T + c.applyOnTheLeft(householderSequence(m_qr, m_hCoeffs) + .setLength(nonzero_pivots) + .transpose() + ); m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) .template triangularView() @@ -605,31 +609,6 @@ void ColPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType & for(Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i); for(Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero(); } - -template -template -void ColPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const -{ - const Index nonzero_pivots = nonzeroPivots(); - - if(nonzero_pivots == 0) - { - dst.setZero(); - return; - } - - typename RhsType::PlainObject c(m_colsPermutation.transpose()*rhs); - - m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots) - .template triangularView() - .transpose().template conjugateIf() - .solveInPlace(c.topRows(nonzero_pivots)); - - dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots); - dst.bottomRows(rows()-nonzero_pivots).setZero(); - - dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf() ); -} #endif namespace internal { diff --git a/uppsrc/plugin/Eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/uppsrc/plugin/Eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h index 486d3373a..34c637b70 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h +++ b/uppsrc/plugin/Eigen/Eigen/src/QR/CompleteOrthogonalDecomposition.h @@ -16,9 +16,6 @@ namespace internal { template struct traits > : traits<_MatrixType> { - typedef MatrixXpr XprKind; - typedef SolverStorage StorageKind; - typedef int StorageIndex; enum { Flags = 0 }; }; @@ -47,21 +44,19 @@ struct traits > * * \sa MatrixBase::completeOrthogonalDecomposition() */ -template class CompleteOrthogonalDecomposition - : public SolverBase > -{ +template +class CompleteOrthogonalDecomposition { public: typedef _MatrixType MatrixType; - typedef SolverBase Base; - - template - friend struct internal::solve_assertion; - - EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition) enum { + RowsAtCompileTime = MatrixType::RowsAtCompileTime, + ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename MatrixType::StorageIndex StorageIndex; typedef typename internal::plain_diag_type::type HCoeffsType; typedef PermutationMatrix PermutationType; @@ -136,9 +131,9 @@ template class CompleteOrthogonalDecomposition m_temp(matrix.cols()) { computeInPlace(); - } + } + - #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method computes the minimum-norm solution X to a least squares * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of * which \c *this is the complete orthogonal decomposition. @@ -150,8 +145,11 @@ template class CompleteOrthogonalDecomposition */ template inline const Solve solve( - const MatrixBase& b) const; - #endif + const MatrixBase& b) const { + eigen_assert(m_cpqr.m_isInitialized && + "CompleteOrthogonalDecomposition is not initialized."); + return Solve(*this, b.derived()); + } HouseholderSequenceType householderQ(void) const; HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); } @@ -160,8 +158,8 @@ template class CompleteOrthogonalDecomposition */ MatrixType matrixZ() const { MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols()); - applyZOnTheLeftInPlace(Z); - return Z; + applyZAdjointOnTheLeftInPlace(Z); + return Z.adjoint(); } /** \returns a reference to the matrix where the complete orthogonal @@ -277,7 +275,6 @@ template class CompleteOrthogonalDecomposition */ inline const Inverse pseudoInverse() const { - eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); return Inverse(*this); } @@ -356,7 +353,7 @@ template class CompleteOrthogonalDecomposition inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); } /** \brief Reports whether the complete orthogonal decomposition was - * successful. + * succesful. * * \note This function always returns \c Success. It is provided for * compatibility @@ -370,10 +367,7 @@ template class CompleteOrthogonalDecomposition #ifndef EIGEN_PARSED_BY_DOXYGEN template - void _solve_impl(const RhsType& rhs, DstType& dst) const; - - template - void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; + EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const; #endif protected: @@ -381,22 +375,8 @@ template class CompleteOrthogonalDecomposition EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - template - void _check_solve_assertion(const Rhs& b) const { - EIGEN_ONLY_USED_FOR_DEBUG(b); - eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized."); - eigen_assert((Transpose_?derived().cols():derived().rows())==b.rows() && "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b"); - } - void computeInPlace(); - /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or - * \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate - * is set to \c true. - */ - template - void applyZOnTheLeftInPlace(Rhs& rhs) const; - /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$. */ template @@ -472,7 +452,7 @@ void CompleteOrthogonalDecomposition::computeInPlace() // Apply Z(k) to the first k rows of X_k m_cpqr.m_qr.topRightCorner(k, cols - rank + 1) .applyHouseholderOnTheRight( - m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k), + m_cpqr.m_qr.row(k).tail(cols - rank).transpose(), m_zCoeffs(k), &m_temp(0)); } if (k != rank - 1) { @@ -484,28 +464,6 @@ void CompleteOrthogonalDecomposition::computeInPlace() } } -template -template -void CompleteOrthogonalDecomposition::applyZOnTheLeftInPlace( - Rhs& rhs) const { - const Index cols = this->cols(); - const Index nrhs = rhs.cols(); - const Index rank = this->rank(); - Matrix temp((std::max)(cols, nrhs)); - for (Index k = rank-1; k >= 0; --k) { - if (k != rank - 1) { - rhs.row(k).swap(rhs.row(rank - 1)); - } - rhs.middleRows(rank - 1, cols - rank + 1) - .applyHouseholderOnTheLeft( - matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf(), zCoeffs().template conjugateIf()(k), - &temp(0)); - if (k != rank - 1) { - rhs.row(k).swap(rhs.row(rank - 1)); - } - } -} - template template void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( @@ -513,7 +471,7 @@ void CompleteOrthogonalDecomposition::applyZAdjointOnTheLeftInPlace( const Index cols = this->cols(); const Index nrhs = rhs.cols(); const Index rank = this->rank(); - Matrix temp((std::max)(cols, nrhs)); + Matrix temp((std::max)(cols, nrhs)); for (Index k = 0; k < rank; ++k) { if (k != rank - 1) { rhs.row(k).swap(rhs.row(rank - 1)); @@ -533,6 +491,8 @@ template template void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( const RhsType& rhs, DstType& dst) const { + eigen_assert(rhs.rows() == this->rows()); + const Index rank = this->rank(); if (rank == 0) { dst.setZero(); @@ -540,8 +500,11 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( } // Compute c = Q^* * rhs + // Note that the matrix Q = H_0^* H_1^*... so its inverse is + // Q^* = (H_0 H_1 ...)^T typename RhsType::PlainObject c(rhs); - c.applyOnTheLeft(matrixQ().setLength(rank).adjoint()); + c.applyOnTheLeft( + householderSequence(matrixQTZ(), hCoeffs()).setLength(rank).transpose()); // Solve T z = c(1:rank, :) dst.topRows(rank) = matrixT() @@ -560,45 +523,10 @@ void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl( // Undo permutation to get x = P^{-1} * y. dst = colsPermutation() * dst; } - -template -template -void CompleteOrthogonalDecomposition<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const -{ - const Index rank = this->rank(); - - if (rank == 0) { - dst.setZero(); - return; - } - - typename RhsType::PlainObject c(colsPermutation().transpose()*rhs); - - if (rank < cols()) { - applyZOnTheLeftInPlace(c); - } - - matrixT().topLeftCorner(rank, rank) - .template triangularView() - .transpose().template conjugateIf() - .solveInPlace(c.topRows(rank)); - - dst.topRows(rank) = c.topRows(rank); - dst.bottomRows(rows()-rank).setZero(); - - dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf() ); -} #endif namespace internal { -template -struct traits > > - : traits::PlainObject> -{ - enum { Flags = 0 }; -}; - template struct Assignment >, internal::assign_op::Scalar>, Dense2Dense> { @@ -606,8 +534,7 @@ struct Assignment SrcXprType; static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &) { - typedef Matrix IdentityMatrixType; - dst = src.nestedExpression().solve(IdentityMatrixType::Identity(src.cols(), src.cols())); + dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.rows())); } }; diff --git a/uppsrc/plugin/Eigen/Eigen/src/QR/FullPivHouseholderQR.h b/uppsrc/plugin/Eigen/Eigen/src/QR/FullPivHouseholderQR.h index d0664a1d8..e489bddc2 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/QR/FullPivHouseholderQR.h +++ b/uppsrc/plugin/Eigen/Eigen/src/QR/FullPivHouseholderQR.h @@ -18,9 +18,6 @@ namespace internal { template struct traits > : traits<_MatrixType> { - typedef MatrixXpr XprKind; - typedef SolverStorage StorageKind; - typedef int StorageIndex; enum { Flags = 0 }; }; @@ -58,19 +55,20 @@ struct traits > * \sa MatrixBase::fullPivHouseholderQr() */ template class FullPivHouseholderQR - : public SolverBase > { public: typedef _MatrixType MatrixType; - typedef SolverBase Base; - friend class SolverBase; - - EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR) enum { + RowsAtCompileTime = MatrixType::RowsAtCompileTime, + ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + // FIXME should be int + typedef typename MatrixType::StorageIndex StorageIndex; typedef internal::FullPivHouseholderQRMatrixQReturnType MatrixQReturnType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef Matrix class FullPivHouseholderQR computeInPlace(); } - #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * \c *this is the QR decomposition. * @@ -176,8 +173,11 @@ template class FullPivHouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const; - #endif + solve(const MatrixBase& b) const + { + eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized."); + return Solve(*this, b.derived()); + } /** \returns Expression object representing the matrix Q */ @@ -392,24 +392,22 @@ template class FullPivHouseholderQR * diagonal coefficient of U. */ RealScalar maxPivot() const { return m_maxpivot; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template + EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; - - template - void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - + void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; IntDiagSizeVectorType m_rows_transpositions; @@ -501,15 +499,15 @@ void FullPivHouseholderQR::computeInPlace() m_nonzero_pivots = k; for(Index i = k; i < size; i++) { - m_rows_transpositions.coeffRef(i) = internal::convert_index(i); - m_cols_transpositions.coeffRef(i) = internal::convert_index(i); + m_rows_transpositions.coeffRef(i) = i; + m_cols_transpositions.coeffRef(i) = i; m_hCoeffs.coeffRef(i) = Scalar(0); } break; } - m_rows_transpositions.coeffRef(k) = internal::convert_index(row_of_biggest_in_corner); - m_cols_transpositions.coeffRef(k) = internal::convert_index(col_of_biggest_in_corner); + m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner; + m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner; if(k != row_of_biggest_in_corner) { m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k)); ++number_of_transpositions; @@ -543,6 +541,7 @@ template template void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { + eigen_assert(rhs.rows() == rows()); const Index l_rank = rank(); // FIXME introduce nonzeroPivots() and use it here. and more generally, @@ -555,7 +554,7 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType typename RhsType::PlainObject c(rhs); - Matrix temp(rhs.cols()); + Matrix temp(rhs.cols()); for (Index k = 0; k < l_rank; ++k) { Index remainingSize = rows()-k; @@ -572,42 +571,6 @@ void FullPivHouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType for(Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i); for(Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero(); } - -template -template -void FullPivHouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const -{ - const Index l_rank = rank(); - - if(l_rank == 0) - { - dst.setZero(); - return; - } - - typename RhsType::PlainObject c(m_cols_permutation.transpose()*rhs); - - m_qr.topLeftCorner(l_rank, l_rank) - .template triangularView() - .transpose().template conjugateIf() - .solveInPlace(c.topRows(l_rank)); - - dst.topRows(l_rank) = c.topRows(l_rank); - dst.bottomRows(rows()-l_rank).setZero(); - - Matrix temp(dst.cols()); - const Index size = (std::min)(rows(), cols()); - for (Index k = size-1; k >= 0; --k) - { - Index remainingSize = rows()-k; - - dst.bottomRightCorner(remainingSize, dst.cols()) - .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize-1).template conjugateIf(), - m_hCoeffs.template conjugateIf().coeff(k), &temp.coeffRef(0)); - - dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k))); - } -} #endif namespace internal { diff --git a/uppsrc/plugin/Eigen/Eigen/src/QR/HouseholderQR.h b/uppsrc/plugin/Eigen/Eigen/src/QR/HouseholderQR.h index 801739fbd..3513d995c 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/QR/HouseholderQR.h +++ b/uppsrc/plugin/Eigen/Eigen/src/QR/HouseholderQR.h @@ -14,18 +14,6 @@ namespace Eigen { -namespace internal { -template struct traits > - : traits<_MatrixType> -{ - typedef MatrixXpr XprKind; - typedef SolverStorage StorageKind; - typedef int StorageIndex; - enum { Flags = 0 }; -}; - -} // end namespace internal - /** \ingroup QR_Module * * @@ -54,19 +42,20 @@ template struct traits > * \sa MatrixBase::householderQr() */ template class HouseholderQR - : public SolverBase > { public: typedef _MatrixType MatrixType; - typedef SolverBase Base; - friend class SolverBase; - - EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR) enum { + RowsAtCompileTime = MatrixType::RowsAtCompileTime, + ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; + typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + // FIXME should be int + typedef typename MatrixType::StorageIndex StorageIndex; typedef Matrix MatrixQType; typedef typename internal::plain_diag_type::type HCoeffsType; typedef typename internal::plain_row_type::type RowVectorType; @@ -132,7 +121,6 @@ template class HouseholderQR computeInPlace(); } - #ifdef EIGEN_PARSED_BY_DOXYGEN /** This method finds a solution x to the equation Ax=b, where A is the matrix of which * *this is the QR decomposition, if any exists. * @@ -149,8 +137,11 @@ template class HouseholderQR */ template inline const Solve - solve(const MatrixBase& b) const; - #endif + solve(const MatrixBase& b) const + { + eigen_assert(m_isInitialized && "HouseholderQR is not initialized."); + return Solve(*this, b.derived()); + } /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations. * @@ -213,30 +204,28 @@ template class HouseholderQR inline Index rows() const { return m_qr.rows(); } inline Index cols() const { return m_qr.cols(); } - + /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q. * * For advanced uses only. */ const HCoeffsType& hCoeffs() const { return m_hCoeffs; } - + #ifndef EIGEN_PARSED_BY_DOXYGEN template + EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; - - template - void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: - + static void check_template_parameters() { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } void computeInPlace(); - + MatrixType m_qr; HCoeffsType m_hCoeffs; RowVectorType m_temp; @@ -303,7 +292,7 @@ template struct householder_qr_inplace_blocked { - // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h + // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize=32, typename MatrixQR::Scalar* tempData = 0) { @@ -361,10 +350,15 @@ template void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) const { const Index rank = (std::min)(rows(), cols()); + eigen_assert(rhs.rows() == rows()); typename RhsType::PlainObject c(rhs); - c.applyOnTheLeft(householderQ().setLength(rank).adjoint() ); + // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T + c.applyOnTheLeft(householderSequence( + m_qr.leftCols(rank), + m_hCoeffs.head(rank)).transpose() + ); m_qr.topLeftCorner(rank, rank) .template triangularView() @@ -373,25 +367,6 @@ void HouseholderQR<_MatrixType>::_solve_impl(const RhsType &rhs, DstType &dst) c dst.topRows(rank) = c.topRows(rank); dst.bottomRows(cols()-rank).setZero(); } - -template -template -void HouseholderQR<_MatrixType>::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const -{ - const Index rank = (std::min)(rows(), cols()); - - typename RhsType::PlainObject c(rhs); - - m_qr.topLeftCorner(rank, rank) - .template triangularView() - .transpose().template conjugateIf() - .solveInPlace(c.topRows(rank)); - - dst.topRows(rank) = c.topRows(rank); - dst.bottomRows(rows()-rank).setZero(); - - dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf() ); -} #endif /** Performs the QR factorization of the given matrix \a matrix. The result of diff --git a/uppsrc/plugin/Eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/uppsrc/plugin/Eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 013c7ae7a..953d57c9d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -74,35 +74,13 @@ class SPQR : public SparseSolverBase > }; public: SPQR() - : m_analysisIsOk(false), - m_factorizationIsOk(false), - m_isRUpToDate(false), - m_ordering(SPQR_ORDERING_DEFAULT), - m_allow_tol(SPQR_DEFAULT_TOL), - m_tolerance (NumTraits::epsilon()), - m_cR(0), - m_E(0), - m_H(0), - m_HPinv(0), - m_HTau(0), - m_useDefaultThreshold(true) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); } explicit SPQR(const _MatrixType& matrix) - : m_analysisIsOk(false), - m_factorizationIsOk(false), - m_isRUpToDate(false), - m_ordering(SPQR_ORDERING_DEFAULT), - m_allow_tol(SPQR_DEFAULT_TOL), - m_tolerance (NumTraits::epsilon()), - m_cR(0), - m_E(0), - m_H(0), - m_HPinv(0), - m_HTau(0), - m_useDefaultThreshold(true) + : m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits::epsilon()), m_useDefaultThreshold(true) { cholmod_l_start(&m_cc); compute(matrix); @@ -242,7 +220,7 @@ class SPQR : public SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, + * \returns \c Success if computation was succesful, * \c NumericalIssue if the sparse QR can not be computed */ ComputationInfo info() const diff --git a/uppsrc/plugin/Eigen/Eigen/src/SVD/BDCSVD.h b/uppsrc/plugin/Eigen/Eigen/src/SVD/BDCSVD.h index bcec45f58..a5b73f8f2 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SVD/BDCSVD.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SVD/BDCSVD.h @@ -22,11 +22,6 @@ // #define EIGEN_BDCSVD_DEBUG_VERBOSE // #define EIGEN_BDCSVD_SANITY_CHECKS -#ifdef EIGEN_BDCSVD_SANITY_CHECKS -#undef eigen_internal_assert -#define eigen_internal_assert(X) assert(X); -#endif - namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE @@ -39,7 +34,6 @@ namespace internal { template struct traits > - : traits<_MatrixType> { typedef _MatrixType MatrixType; }; @@ -63,7 +57,7 @@ struct traits > * recommended and can several order of magnitude faster. * * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations. - * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless + * For instance, this concerns Intel's compiler (ICC), which perfroms such optimization by default unless * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will * significantly degrade the accuracy. * @@ -111,7 +105,7 @@ public: * The default constructor is useful in cases in which the user intends to * perform decompositions via BDCSVD::compute(const MatrixType&). */ - BDCSVD() : m_algoswap(16), m_isTranspose(false), m_compU(false), m_compV(false), m_numIters(0) + BDCSVD() : m_algoswap(16), m_numIters(0) {} @@ -218,7 +212,7 @@ public: // Method to allocate and initialize matrix and attributes template -void BDCSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) +void BDCSVD::allocate(Index rows, Index cols, unsigned int computationOptions) { m_isTranspose = (cols > rows); @@ -394,7 +388,7 @@ void BDCSVD::structured_update(Block A, co //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. template -void BDCSVD::divide (Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) +void BDCSVD::divide (Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift) { // requires rows = cols + 1; using std::pow; @@ -574,7 +568,7 @@ void BDCSVD::divide (Eigen::Index firstCol, Eigen::Index lastCol, Ei // handling of round-off errors, be consistent in ordering // For instance, to solve the secular equation using FMM, see http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf template -void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) +void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V) { const RealScalar considerZero = (std::numeric_limits::min)(); using std::abs; @@ -597,7 +591,7 @@ void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma // but others are interleaved and we must ignore them at this stage. // To this end, let's compute a permutation skipping them: Index actual_n = n; - while(actual_n>1 && diag(actual_n-1)==Literal(0)) {--actual_n; eigen_internal_assert(col0(actual_n)==Literal(0)); } + while(actual_n>1 && diag(actual_n-1)==Literal(0)) --actual_n; Index m = 0; // size of the deflated problem for(Index k=0;kconsiderZero) @@ -624,11 +618,13 @@ void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma std::cout << " shift: " << shifts.transpose() << "\n"; { + Index actual_n = n; + while(actual_n>1 && abs(col0(actual_n-1))= 0).all()); std::cout << " check2 (>0) : " << ((singVals.array()-diag) / singVals.array()).head(actual_n).transpose() << "\n\n"; - assert((((singVals.array()-diag) / singVals.array()).head(actual_n) >= 0).all()); + std::cout << " check3 (>0) : " << ((diag.segment(1,actual_n-1)-singVals.head(actual_n-1).array()) / singVals.head(actual_n-1).array()).transpose() << "\n\n\n"; + std::cout << " check4 (>0) : " << ((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).transpose() << "\n\n\n"; } #endif @@ -656,13 +652,13 @@ void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma #endif #ifdef EIGEN_BDCSVD_SANITY_CHECKS + assert(U.allFinite()); + assert(V.allFinite()); + assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 1e-14 * n); + assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 1e-14 * n); assert(m_naiveU.allFinite()); assert(m_naiveV.allFinite()); assert(m_computed.allFinite()); - assert(U.allFinite()); - assert(V.allFinite()); -// assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() < 100*NumTraits::epsilon() * n); -// assert((V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits::epsilon() * n); #endif // Because of deflation, the singular values might not be completely sorted. @@ -677,15 +673,6 @@ void BDCSVD::computeSVDofM(Eigen::Index firstCol, Eigen::Index n, Ma if(m_compV) V.col(i).swap(V.col(i+1)); } } - -#ifdef EIGEN_BDCSVD_SANITY_CHECKS - { - bool singular_values_sorted = (((singVals.segment(1,actual_n-1)-singVals.head(actual_n-1))).array() >= 0).all(); - if(!singular_values_sorted) - std::cout << "Singular values are not sorted: " << singVals.segment(1,actual_n).transpose() << "\n"; - assert(singular_values_sorted); - } -#endif // Reverse order so that singular values in increased order // Because of deflation, the zeros singular-values are already at the end @@ -762,22 +749,19 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar mid = left + (right-left) / Literal(2); RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0)); #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "right-left = " << right-left << "\n"; -// std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, ArrayXr(diag-left), left) -// << " " << secularEq(mid-right, col0, diag, perm, ArrayXr(diag-right), right) << "\n"; - std::cout << " = " << secularEq(left+RealScalar(0.000001)*(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.1) *(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.2) *(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.3) *(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.4) *(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.49) *(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.5) *(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.51) *(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.6) *(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.7) *(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.8) *(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.9) *(right-left), col0, diag, perm, diag, 0) - << " " << secularEq(left+RealScalar(0.999999)*(right-left), col0, diag, perm, diag, 0) << "\n"; + std::cout << right-left << "\n"; + std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, diag-left, left) << " " << secularEq(mid-right, col0, diag, perm, diag-right, right) << "\n"; + std::cout << " = " << secularEq(0.1*(left+right), col0, diag, perm, diag, 0) + << " " << secularEq(0.2*(left+right), col0, diag, perm, diag, 0) + << " " << secularEq(0.3*(left+right), col0, diag, perm, diag, 0) + << " " << secularEq(0.4*(left+right), col0, diag, perm, diag, 0) + << " " << secularEq(0.49*(left+right), col0, diag, perm, diag, 0) + << " " << secularEq(0.5*(left+right), col0, diag, perm, diag, 0) + << " " << secularEq(0.51*(left+right), col0, diag, perm, diag, 0) + << " " << secularEq(0.6*(left+right), col0, diag, perm, diag, 0) + << " " << secularEq(0.7*(left+right), col0, diag, perm, diag, 0) + << " " << secularEq(0.8*(left+right), col0, diag, perm, diag, 0) + << " " << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n"; #endif RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right; @@ -835,16 +819,13 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // And find mu such that f(mu)==0: RealScalar muZero = -a/b; RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift); - -#ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert((numext::isfinite)(fZero)); -#endif muPrev = muCur; fPrev = fCur; muCur = muZero; fCur = fZero; + if (shift == left && (muCur < Literal(0) || muCur > right - left)) useBisection = true; if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true; if (abs(fCur)>abs(fPrev)) useBisection = true; @@ -877,33 +858,20 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d else rightShifted = -(std::numeric_limits::min)(); } - + RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); eigen_internal_assert(fLeft [" << leftShifted << " " << rightShifted << "], shift=" << shift - << " , f(right)=" << secularEq(0, col0, diag, perm, diagShifted, shift) - << " == " << secularEq(right, col0, diag, perm, diag, 0) << " == " << fRight << "\n"; + std::cout << "fLeft: " << leftShifted << " - " << diagShifted.head(10).transpose() << "\n ; " << bool(left==shift) << " " << (left-shift) << "\n"; + std::cout << k << " : " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " << left << " - " << right << " -> " << leftShifted << " " << rightShifted << " shift=" << shift << "\n"; } #endif eigen_internal_assert(fLeft * fRight < Literal(0)); @@ -944,15 +912,6 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d shifts[k] = shift; mus[k] = muCur; -#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - if(k+1=singVals[k-1]); - assert(singVals[k]>=diag(k)); -#endif - // perturb singular value slightly if it equals diagonal entry to avoid division by zero later // (deflation is supposed to avoid this from happening) // - this does no seem to be necessary anymore - @@ -976,7 +935,7 @@ void BDCSVD::perturbCol0 zhat.setZero(); return; } - Index lastIdx = perm(m-1); + Index last = perm(m-1); // The offset permits to skip deflated entries while computing zhat for (Index k = 0; k < n; ++k) { @@ -986,43 +945,15 @@ void BDCSVD::perturbCol0 { // see equation (3.6) RealScalar dk = diag(k); - RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk)); -#ifdef EIGEN_BDCSVD_SANITY_CHECKS - if(prod<0) { - std::cout << "k = " << k << " ; z(k)=" << col0(k) << ", diag(k)=" << dk << "\n"; - std::cout << "prod = " << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx) << " - " << dk << "))" << "\n"; - std::cout << " = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) << "\n"; - } - assert(prod>=0); -#endif + RealScalar prod = (singVals(last) + dk) * (mus(last) + (shifts(last) - dk)); for(Index l = 0; l=k && (l==0 || l-1>=m)) - { - std::cout << "Error in perturbCol0\n"; - std::cout << " " << k << "/" << n << " " << l << "/" << m << " " << i << "/" << n << " ; " << col0(k) << " " << diag(k) << " " << "\n"; - std::cout << " " <=0); -#endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE if(i!=k && numext::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 ) std::cout << " " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk)) @@ -1031,12 +962,9 @@ void BDCSVD::perturbCol0 } } #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(lastIdx) + dk) << " * " << mus(lastIdx) + shifts(lastIdx) << " - " << dk << "\n"; + std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n"; #endif RealScalar tmp = sqrt(prod); -#ifdef EIGEN_BDCSVD_SANITY_CHECKS - assert((numext::isfinite)(tmp)); -#endif zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp); } } @@ -1090,7 +1018,7 @@ void BDCSVD::computeSingVecs // i >= 1, di almost null and zi non null. // We use a rotation to zero out zi applied to the left of M template -void BDCSVD::deflation43(Eigen::Index firstCol, Eigen::Index shift, Eigen::Index i, Eigen::Index size) +void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index size) { using std::abs; using std::sqrt; @@ -1119,7 +1047,7 @@ void BDCSVD::deflation43(Eigen::Index firstCol, Eigen::Index shift, // We apply two rotations to have zj = 0; // TODO deflation44 is still broken and not properly tested template -void BDCSVD::deflation44(Eigen::Index firstColu , Eigen::Index firstColm, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index i, Eigen::Index j, Eigen::Index size) +void BDCSVD::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size) { using std::abs; using std::sqrt; @@ -1146,7 +1074,7 @@ void BDCSVD::deflation44(Eigen::Index firstColu , Eigen::Index first } c/=r; s/=r; - m_computed(firstColm + i, firstColm) = r; + m_computed(firstColm + i, firstColm) = r; m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i); m_computed(firstColm + j, firstColm) = Literal(0); @@ -1159,7 +1087,7 @@ void BDCSVD::deflation44(Eigen::Index firstColu , Eigen::Index first // acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive] template -void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, Eigen::Index k, Eigen::Index firstRowW, Eigen::Index firstColW, Eigen::Index shift) +void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift) { using std::sqrt; using std::abs; @@ -1220,7 +1148,6 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "to be sorted: " << diag.transpose() << "\n\n"; - std::cout << " : " << col0.transpose() << "\n\n"; #endif { // Check for total deflation @@ -1311,7 +1238,7 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, if( (diag(i) - diag(i-1)) < NumTraits::epsilon()*maxDiag ) { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE - std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i-1) << " == " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*/*diag(i)*/maxDiag << "\n"; + std::cout << "deflation 4.4 with i = " << i << " because " << (diag(i) - diag(i-1)) << " < " << NumTraits::epsilon()*diag(i) << "\n"; #endif eigen_internal_assert(abs(diag(i) - diag(i-1))::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif }//end deflation -#if !defined(EIGEN_GPUCC) +#ifndef __CUDACC__ /** \svd_module * * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm diff --git a/uppsrc/plugin/Eigen/Eigen/src/SVD/JacobiSVD.h b/uppsrc/plugin/Eigen/Eigen/src/SVD/JacobiSVD.h index 2b6891105..43488b1e0 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SVD/JacobiSVD.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SVD/JacobiSVD.h @@ -425,7 +425,6 @@ struct svd_precondition_2x2_block_to_be_real template struct traits > - : traits<_MatrixType> { typedef _MatrixType MatrixType; }; @@ -611,7 +610,7 @@ template class JacobiSVD }; template -void JacobiSVD::allocate(Eigen::Index rows, Eigen::Index cols, unsigned int computationOptions) +void JacobiSVD::allocate(Index rows, Index cols, unsigned int computationOptions) { eigen_assert(rows >= 0 && cols >= 0); diff --git a/uppsrc/plugin/Eigen/Eigen/src/SVD/SVDBase.h b/uppsrc/plugin/Eigen/Eigen/src/SVD/SVDBase.h index 34d5c9dd3..53da28488 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SVD/SVDBase.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SVD/SVDBase.h @@ -17,18 +17,6 @@ #define EIGEN_SVDBASE_H namespace Eigen { - -namespace internal { -template struct traits > - : traits -{ - typedef MatrixXpr XprKind; - typedef SolverStorage StorageKind; - typedef int StorageIndex; - enum { Flags = 0 }; -}; -} - /** \ingroup SVD_Module * * @@ -56,18 +44,15 @@ template struct traits > * terminate in finite (and reasonable) time. * \sa class BDCSVD, class JacobiSVD */ -template class SVDBase - : public SolverBase > +template +class SVDBase { -public: - - template - friend struct internal::solve_assertion; +public: typedef typename internal::traits::MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; - typedef typename Eigen::internal::traits::StorageIndex StorageIndex; + typedef typename MatrixType::StorageIndex StorageIndex; typedef Eigen::Index Index; ///< \deprecated since Eigen 3.3 enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, @@ -209,7 +194,6 @@ public: inline Index rows() const { return m_rows; } inline Index cols() const { return m_cols; } - #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A. * * \param b the right-hand-side of the equation to solve. @@ -221,15 +205,17 @@ public: */ template inline const Solve - solve(const MatrixBase& b) const; - #endif - + solve(const MatrixBase& b) const + { + eigen_assert(m_isInitialized && "SVD is not initialized."); + eigen_assert(computeU() && computeV() && "SVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice)."); + return Solve(derived(), b.derived()); + } + #ifndef EIGEN_PARSED_BY_DOXYGEN template + EIGEN_DEVICE_FUNC void _solve_impl(const RhsType &rhs, DstType &dst) const; - - template - void _solve_impl_transposed(const RhsType &rhs, DstType &dst) const; #endif protected: @@ -238,14 +224,6 @@ protected: { EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar); } - - template - void _check_solve_assertion(const Rhs& b) const { - EIGEN_ONLY_USED_FOR_DEBUG(b); - eigen_assert(m_isInitialized && "SVD is not initialized."); - eigen_assert(computeU() && computeV() && "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice)."); - eigen_assert((Transpose_?cols():rows())==b.rows() && "SVDBase::solve(): invalid number of rows of the right hand side matrix b"); - } // return true if already allocated bool allocate(Index rows, Index cols, unsigned int computationOptions) ; @@ -268,10 +246,6 @@ protected: : m_isInitialized(false), m_isAllocated(false), m_usePrescribedThreshold(false), - m_computeFullU(false), - m_computeThinU(false), - m_computeFullV(false), - m_computeThinV(false), m_computationOptions(0), m_rows(-1), m_cols(-1), m_diagSize(0) { @@ -286,30 +260,17 @@ template template void SVDBase::_solve_impl(const RhsType &rhs, DstType &dst) const { + eigen_assert(rhs.rows() == rows()); + // A = U S V^* // So A^{-1} = V S^{-1} U^* - Matrix tmp; + Matrix tmp; Index l_rank = rank(); tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs; tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; dst = m_matrixV.leftCols(l_rank) * tmp; } - -template -template -void SVDBase::_solve_impl_transposed(const RhsType &rhs, DstType &dst) const -{ - // A = U S V^* - // So A^{-*} = U S^{-1} V^* - // And A^{-T} = U_conj S^{-1} V^T - Matrix tmp; - Index l_rank = rank(); - - tmp.noalias() = m_matrixV.leftCols(l_rank).transpose().template conjugateIf() * rhs; - tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp; - dst = m_matrixU.template conjugateIf().leftCols(l_rank) * tmp; -} #endif template diff --git a/uppsrc/plugin/Eigen/Eigen/src/SVD/UpperBidiagonalization.h b/uppsrc/plugin/Eigen/Eigen/src/SVD/UpperBidiagonalization.h index 997defc47..11ac847e1 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SVD/UpperBidiagonalization.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SVD/UpperBidiagonalization.h @@ -127,7 +127,7 @@ void upperbidiagonalization_inplace_unblocked(MatrixType& mat, .makeHouseholderInPlace(mat.coeffRef(k,k+1), upper_diagonal[k]); // apply householder transform to remaining part of mat on the left mat.bottomRightCorner(remainingRows-1, remainingCols) - .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).adjoint(), mat.coeff(k,k+1), tempData); + .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols-1).transpose(), mat.coeff(k,k+1), tempData); } } @@ -202,7 +202,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType y_k( Y.col(k).tail(remainingCols) ); - // let's use the beginning of column k of Y as a temporary vector + // let's use the begining of column k of Y as a temporary vector SubColumnType tmp( Y.col(k).head(k) ); y_k.noalias() = A.block(k,k+1, remainingRows,remainingCols).adjoint() * v_k; // bottleneck tmp.noalias() = V_k1.adjoint() * v_k; @@ -231,7 +231,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, { SubColumnType x_k ( X.col(k).tail(remainingRows-1) ); - // let's use the beginning of column k of X as a temporary vectors + // let's use the begining of column k of X as a temporary vectors // note that tmp0 and tmp1 overlaps SubColumnType tmp0 ( X.col(k).head(k) ), tmp1 ( X.col(k).head(k+1) ); diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h index 06edb8688..369e6804a 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -80,19 +80,11 @@ class SimplicialCholeskyBase : public SparseSolverBase /** Default constructor */ SimplicialCholeskyBase() - : m_info(Success), - m_factorizationIsOk(false), - m_analysisIsOk(false), - m_shiftOffset(0), - m_shiftScale(1) + : m_info(Success), m_shiftOffset(0), m_shiftScale(1) {} explicit SimplicialCholeskyBase(const MatrixType& matrix) - : m_info(Success), - m_factorizationIsOk(false), - m_analysisIsOk(false), - m_shiftOffset(0), - m_shiftScale(1) + : m_info(Success), m_shiftOffset(0), m_shiftScale(1) { derived().compute(matrix); } @@ -109,7 +101,7 @@ class SimplicialCholeskyBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, + * \returns \c Success if computation was succesful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 72e1740c1..7b6183d08 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -2,21 +2,46 @@ // for linear algebra. // // Copyright (C) 2008-2012 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. /* -NOTE: these functions have been adapted from the LDL library: + +NOTE: thes functions vave been adapted from the LDL library: LDL Copyright (c) 2005 by Timothy A. Davis. All Rights Reserved. -The author of LDL, Timothy A. Davis., has executed a license with Google LLC -to permit distribution of this code and derivative works as part of Eigen under -the Mozilla Public License v. 2.0, as stated at the top of this file. +LDL License: + + Your use or distribution of LDL or any modified version of + LDL implies that you agree to this License. + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + USA + + Permission is hereby granted to use or copy this program under the + terms of the GNU LGPL, provided that the Copyright, this License, + and the Availability of the original version is retained on all copies. + User documentation of any code that uses this code or any modified + version of this code must cite the Copyright, this License, the + Availability note, and "Used by permission." Permission to modify + the code and to distribute modified code is granted, provided the + Copyright, this License, and the Availability note are retained, + and a notice that the code was modified is included. */ +#include "../Core/util/NonMPL2.h" + #ifndef EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H #define EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H @@ -97,7 +122,7 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& for(StorageIndex k = 0; k < size; ++k) { // compute nonzero pattern of kth row of L, in topological order - y[k] = Scalar(0); // Y(0:k) is now all zero + y[k] = 0.0; // Y(0:k) is now all zero StorageIndex top = size; // stack for pattern is empty tags[k] = k; // mark node k as visited m_nonZerosPerCol[k] = 0; // count of nonzeros in column k of L @@ -121,12 +146,12 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& /* compute numerical values kth row of L (a sparse triangular solve) */ RealScalar d = numext::real(y[k]) * m_shiftScale + m_shiftOffset; // get D(k,k), apply the shift function, and clear Y(k) - y[k] = Scalar(0); + y[k] = 0.0; for(; top < size; ++top) { Index i = pattern[top]; /* pattern[top:n-1] is pattern of L(:,k) */ Scalar yi = y[i]; /* get and clear Y(i) */ - y[i] = Scalar(0); + y[i] = 0.0; /* the nonzero entry L(k,i) */ Scalar l_ki; diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/CompressedStorage.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/CompressedStorage.h index acd986fab..d89fa0dae 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/CompressedStorage.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/CompressedStorage.h @@ -207,22 +207,6 @@ class CompressedStorage return m_values[id]; } - void moveChunk(Index from, Index to, Index chunkSize) - { - eigen_internal_assert(to+chunkSize <= m_size); - if(to>from && from+chunkSize>to) - { - // move backward - internal::smart_memmove(m_values+from, m_values+from+chunkSize, m_values+to); - internal::smart_memmove(m_indices+from, m_indices+from+chunkSize, m_indices+to); - } - else - { - internal::smart_copy(m_values+from, m_values+from+chunkSize, m_values+to); - internal::smart_copy(m_indices+from, m_indices+from+chunkSize, m_indices+to); - } - } - void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) { Index k = 0; diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseAssign.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseAssign.h index 905485c88..18352a847 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseAssign.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseAssign.h @@ -83,7 +83,7 @@ void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src) // eval without temporary dst.resize(src.rows(), src.cols()); dst.setZero(); - dst.reserve((std::min)(src.rows()*src.cols(), (std::max)(src.rows(),src.cols())*2)); + dst.reserve((std::max)(src.rows(),src.cols())*2); for (Index j=0; j }; // Generic Sparse to Dense assignment -template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak> -struct Assignment +template< typename DstXprType, typename SrcXprType, typename Functor> +struct Assignment { static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { @@ -153,73 +153,6 @@ struct Assignment } }; -// Specialization for dense ?= dense +/- sparse and dense ?= sparse +/- dense -template -struct assignment_from_dense_op_sparse -{ - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - void run(DstXprType &dst, const SrcXprType &src, const InitialFunc& /*func*/) - { - #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN - EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN - #endif - - call_assignment_no_alias(dst, src.lhs(), Func1()); - call_assignment_no_alias(dst, src.rhs(), Func2()); - } - - // Specialization for dense1 = sparse + dense2; -> dense1 = dense2; dense1 += sparse; - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if::Shape,DenseShape>::value>::type - run(DstXprType &dst, const CwiseBinaryOp, const Lhs, const Rhs> &src, - const internal::assign_op& /*func*/) - { - #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN - EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN - #endif - - // Apply the dense matrix first, then the sparse one. - call_assignment_no_alias(dst, src.rhs(), Func1()); - call_assignment_no_alias(dst, src.lhs(), Func2()); - } - - // Specialization for dense1 = sparse - dense2; -> dense1 = -dense2; dense1 += sparse; - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if::Shape,DenseShape>::value>::type - run(DstXprType &dst, const CwiseBinaryOp, const Lhs, const Rhs> &src, - const internal::assign_op& /*func*/) - { - #ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN - EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN - #endif - - // Apply the dense matrix first, then the sparse one. - call_assignment_no_alias(dst, -src.rhs(), Func1()); - call_assignment_no_alias(dst, src.lhs(), add_assign_op()); - } -}; - -#define EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(ASSIGN_OP,BINOP,ASSIGN_OP2) \ - template< typename DstXprType, typename Lhs, typename Rhs, typename Scalar> \ - struct Assignment, const Lhs, const Rhs>, internal::ASSIGN_OP, \ - Sparse2Dense, \ - typename internal::enable_if< internal::is_same::Shape,DenseShape>::value \ - || internal::is_same::Shape,DenseShape>::value>::type> \ - : assignment_from_dense_op_sparse, internal::ASSIGN_OP2 > \ - {} - -EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op, scalar_sum_op,add_assign_op); -EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_sum_op,add_assign_op); -EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_sum_op,sub_assign_op); - -EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op, scalar_difference_op,sub_assign_op); -EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op,scalar_difference_op,sub_assign_op); -EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op,scalar_difference_op,add_assign_op); - - // Specialization for "dst = dec.solve(rhs)" // NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error template @@ -246,22 +179,35 @@ struct Assignment { typedef typename DstXprType::StorageIndex StorageIndex; typedef typename DstXprType::Scalar Scalar; + typedef Array ArrayXI; + typedef Array ArrayXS; + template + static void run(SparseMatrix &dst, const SrcXprType &src, const internal::assign_op &/*func*/) + { + Index dstRows = src.rows(); + Index dstCols = src.cols(); + if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) + dst.resize(dstRows, dstCols); - template - static void run(SparseMatrix &dst, const SrcXprType &src, const AssignFunc &func) - { dst.assignDiagonal(src.diagonal(), func); } + Index size = src.diagonal().size(); + dst.makeCompressed(); + dst.resizeNonZeros(size); + Map(dst.innerIndexPtr(), size).setLinSpaced(0,StorageIndex(size)-1); + Map(dst.outerIndexPtr(), size+1).setLinSpaced(0,StorageIndex(size)); + Map(dst.valuePtr(), size) = src.diagonal(); + } template static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::assign_op &/*func*/) - { dst.derived().diagonal() = src.diagonal(); } + { + dst.diagonal() = src.diagonal(); + } - template - static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) - { dst.derived().diagonal() += src.diagonal(); } + static void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op &/*func*/) + { dst.diagonal() += src.diagonal(); } - template - static void run(SparseMatrixBase &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) - { dst.derived().diagonal() -= src.diagonal(); } + static void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op &/*func*/) + { dst.diagonal() -= src.diagonal(); } }; } // end namespace internal diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseBlock.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseBlock.h index db5090257..511e92b2f 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseBlock.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseBlock.h @@ -164,7 +164,7 @@ public: } else { - if(m_matrix.isCompressed() && nnz!=block_size) + if(m_matrix.isCompressed()) { // no need to realloc, simply copy the tail at its respective position and insert tmp matrix.data().resize(start + nnz + tail_size); @@ -326,6 +326,46 @@ private: //---------- +/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this + * is col-major (resp. row-major). + */ +template +typename SparseMatrixBase::InnerVectorReturnType SparseMatrixBase::innerVector(Index outer) +{ return InnerVectorReturnType(derived(), outer); } + +/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this + * is col-major (resp. row-major). Read-only. + */ +template +const typename SparseMatrixBase::ConstInnerVectorReturnType SparseMatrixBase::innerVector(Index outer) const +{ return ConstInnerVectorReturnType(derived(), outer); } + +/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this + * is col-major (resp. row-major). + */ +template +typename SparseMatrixBase::InnerVectorsReturnType +SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) +{ + return Block(derived(), + IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, + IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); + +} + +/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this + * is col-major (resp. row-major). Read-only. + */ +template +const typename SparseMatrixBase::ConstInnerVectorsReturnType +SparseMatrixBase::innerVectors(Index outerStart, Index outerSize) const +{ + return Block(derived(), + IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, + IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); + +} + /** Generic implementation of sparse Block expression. * Real-only. */ @@ -463,25 +503,22 @@ template class unary_evaluator, IteratorBased>::InnerVectorInnerIterator : public EvalIterator { - // NOTE MSVC fails to compile if we don't explicitely "import" IsRowMajor from unary_evaluator - // because the base class EvalIterator has a private IsRowMajor enum too. (bug #1786) - // NOTE We cannot call it IsRowMajor because it would shadow unary_evaluator::IsRowMajor - enum { XprIsRowMajor = unary_evaluator::IsRowMajor }; + enum { IsRowMajor = unary_evaluator::IsRowMajor }; const XprType& m_block; Index m_end; public: EIGEN_STRONG_INLINE InnerVectorInnerIterator(const unary_evaluator& aEval, Index outer) - : EvalIterator(aEval.m_argImpl, outer + (XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())), + : EvalIterator(aEval.m_argImpl, outer + (IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())), m_block(aEval.m_block), - m_end(XprIsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()) + m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()) { - while( (EvalIterator::operator bool()) && (EvalIterator::index() < (XprIsRowMajor ? m_block.startCol() : m_block.startRow())) ) + while( (EvalIterator::operator bool()) && (EvalIterator::index() < (IsRowMajor ? m_block.startCol() : m_block.startRow())) ) EvalIterator::operator++(); } - inline StorageIndex index() const { return EvalIterator::index() - convert_index(XprIsRowMajor ? m_block.startCol() : m_block.startRow()); } - inline Index outer() const { return EvalIterator::outer() - (XprIsRowMajor ? m_block.startRow() : m_block.startCol()); } + inline StorageIndex index() const { return EvalIterator::index() - convert_index(IsRowMajor ? m_block.startCol() : m_block.startRow()); } + inline Index outer() const { return EvalIterator::outer() - (IsRowMajor ? m_block.startRow() : m_block.startCol()); } inline Index row() const { return EvalIterator::row() - m_block.startRow(); } inline Index col() const { return EvalIterator::col() - m_block.startCol(); } @@ -491,8 +528,7 @@ public: template class unary_evaluator, IteratorBased>::OuterVectorInnerIterator { - // NOTE see above - enum { XprIsRowMajor = unary_evaluator::IsRowMajor }; + enum { IsRowMajor = unary_evaluator::IsRowMajor }; const unary_evaluator& m_eval; Index m_outerPos; const Index m_innerIndex; @@ -502,9 +538,9 @@ public: EIGEN_STRONG_INLINE OuterVectorInnerIterator(const unary_evaluator& aEval, Index outer) : m_eval(aEval), - m_outerPos( (XprIsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ), - m_innerIndex(XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()), - m_end(XprIsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()), + m_outerPos( (IsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow()) ), + m_innerIndex(IsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()), + m_end(IsRowMajor ? aEval.m_block.startCol()+aEval.m_block.blockCols() : aEval.m_block.startRow()+aEval.m_block.blockRows()), m_it(m_eval.m_argImpl, m_outerPos) { EIGEN_UNUSED_VARIABLE(outer); @@ -515,10 +551,10 @@ public: ++(*this); } - inline StorageIndex index() const { return convert_index(m_outerPos - (XprIsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); } + inline StorageIndex index() const { return convert_index(m_outerPos - (IsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow())); } inline Index outer() const { return 0; } - inline Index row() const { return XprIsRowMajor ? 0 : index(); } - inline Index col() const { return XprIsRowMajor ? index() : 0; } + inline Index row() const { return IsRowMajor ? 0 : index(); } + inline Index col() const { return IsRowMajor ? index() : 0; } inline Scalar value() const { return m_it.value(); } inline Scalar& valueRef() { return m_it.valueRef(); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCompressedBase.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCompressedBase.h index 6a2c7a8ce..5ccb46656 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCompressedBase.h @@ -128,28 +128,6 @@ class SparseCompressedBase protected: /** Default constructor. Do nothing. */ SparseCompressedBase() {} - - /** \internal return the index of the coeff at (row,col) or just before if it does not exist. - * This is an analogue of std::lower_bound. - */ - internal::LowerBoundIndex lower_bound(Index row, Index col) const - { - eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); - - const Index outer = Derived::IsRowMajor ? row : col; - const Index inner = Derived::IsRowMajor ? col : row; - - Index start = this->outerIndexPtr()[outer]; - Index end = this->isCompressed() ? this->outerIndexPtr()[outer+1] : this->outerIndexPtr()[outer] + this->innerNonZeroPtr()[outer]; - eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - internal::LowerBoundIndex p; - p.value = std::lower_bound(this->innerIndexPtr()+start, this->innerIndexPtr()+end,inner) - this->innerIndexPtr(); - p.found = (p.valueinnerIndexPtr()[p.value]==inner); - return p; - } - - friend struct internal::evaluator >; - private: template explicit SparseCompressedBase(const SparseCompressedBase&); }; @@ -207,14 +185,6 @@ class SparseCompressedBase::InnerIterator } inline InnerIterator& operator++() { m_id++; return *this; } - inline InnerIterator& operator+=(Index i) { m_id += i ; return *this; } - - inline InnerIterator operator+(Index i) - { - InnerIterator result = *this; - result += i; - return result; - } inline const Scalar& value() const { return m_values[m_id]; } inline Scalar& valueRef() { return const_cast(m_values[m_id]); } @@ -275,14 +245,6 @@ class SparseCompressedBase::ReverseInnerIterator } inline ReverseInnerIterator& operator--() { --m_id; return *this; } - inline ReverseInnerIterator& operator-=(Index i) { m_id -= i; return *this; } - - inline ReverseInnerIterator operator-(Index i) - { - ReverseInnerIterator result = *this; - result -= i; - return result; - } inline const Scalar& value() const { return m_values[m_id-1]; } inline Scalar& valueRef() { return const_cast(m_values[m_id-1]); } @@ -355,8 +317,17 @@ protected: Index find(Index row, Index col) const { - internal::LowerBoundIndex p = m_matrix->lower_bound(row,col); - return p.found ? p.value : Dynamic; + eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); + + const Index outer = Derived::IsRowMajor ? row : col; + const Index inner = Derived::IsRowMajor ? col : row; + + Index start = m_matrix->outerIndexPtr()[outer]; + Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; + eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr(); + + return ((pinnerIndexPtr()[p]==inner)) ? p : Dynamic; } const Derived *m_matrix; diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 6130bab43..e315e3550 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -101,7 +101,7 @@ public: } else { - m_value = Scalar(0); // this is to avoid a compilation warning + m_value = 0; // this is to avoid a compilation warning m_id = -1; } return *this; @@ -212,7 +212,8 @@ public: enum { CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - Flags = XprType::Flags + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(Rhs::Flags)&RowMajorBit) }; explicit binary_evaluator(const XprType& xpr) @@ -299,7 +300,8 @@ public: enum { CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - Flags = XprType::Flags + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(Lhs::Flags)&RowMajorBit) }; explicit binary_evaluator(const XprType& xpr) @@ -531,7 +533,8 @@ public: enum { CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - Flags = XprType::Flags + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(RhsArg::Flags)&RowMajorBit) }; explicit sparse_conjunction_evaluator(const XprType& xpr) @@ -605,7 +608,8 @@ public: enum { CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, - Flags = XprType::Flags + // Expose storage order of the sparse expression + Flags = (XprType::Flags & ~RowMajorBit) | (int(LhsArg::Flags)&RowMajorBit) }; explicit sparse_conjunction_evaluator(const XprType& xpr) diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseDenseProduct.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseDenseProduct.h index f005a18a1..0547db596 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseDenseProduct.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseDenseProduct.h @@ -88,11 +88,10 @@ struct sparse_time_dense_product_impl::type Lhs; typedef typename internal::remove_all::type Rhs; typedef typename internal::remove_all::type Res; - typedef evaluator LhsEval; - typedef typename LhsEval::InnerIterator LhsInnerIterator; + typedef typename evaluator::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) { - LhsEval lhsEval(lhs); + evaluator lhsEval(lhs); for(Index c=0; c::type Lhs; typedef typename internal::remove_all::type Rhs; typedef typename internal::remove_all::type Res; - typedef evaluator LhsEval; - typedef typename LhsEval::InnerIterator LhsInnerIterator; + typedef typename evaluator::InnerIterator LhsInnerIterator; static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha) { - Index n = lhs.rows(); - LhsEval lhsEval(lhs); - -#ifdef EIGEN_HAS_OPENMP - Eigen::initParallel(); - Index threads = Eigen::nbThreads(); - // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems. - // It basically represents the minimal amount of work to be done to be worth it. - if(threads>1 && lhsEval.nonZerosEstimate()*rhs.cols() > 20000) + evaluator lhsEval(lhs); + for(Index j=0; j Base; using Base::convert_index; friend class SparseVector<_Scalar,0,_StorageIndex>; - template - friend struct internal::Assignment; public: using Base::isCompressed; using Base::nonZeros; @@ -329,7 +327,8 @@ class SparseMatrix m_outerIndex[j] = newOuterIndex[j]; m_innerNonZeros[j] = innerNNZ; } - m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1]; + if(m_outerSize>0) + m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1]; m_data.resize(m_outerIndex[m_outerSize]); } @@ -504,8 +503,8 @@ class SparseMatrix m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; } } - - /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */ + + /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerence \a epsilon */ void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits::dummy_precision()) { prune(default_prunning_func(reference,epsilon)); @@ -606,9 +605,9 @@ class SparseMatrix m_outerIndex = newOuterIndex; if (outerChange > 0) { - StorageIndex lastIdx = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize]; + StorageIndex last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize]; for(Index i=m_outerSize; i - void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc) - { - Index n = diagXpr.size(); - - const bool overwrite = internal::is_same >::value; - if(overwrite) - { - if((this->rows()!=n) || (this->cols()!=n)) - this->resize(n, n); - } - - if(m_data.size()==0 || overwrite) - { - typedef Array ArrayXI; - this->makeCompressed(); - this->resizeNonZeros(n); - Eigen::Map(this->innerIndexPtr(), n).setLinSpaced(0,StorageIndex(n)-1); - Eigen::Map(this->outerIndexPtr(), n+1).setLinSpaced(0,StorageIndex(n)); - Eigen::Map > values = this->coeffs(); - values.setZero(); - internal::call_assignment_no_alias(values, diagXpr, assignFunc); - } - else - { - bool isComp = isCompressed(); - internal::evaluator diaEval(diagXpr); - std::vector newEntries; - - // 1 - try in-place update and record insertion failures - for(Index i = 0; ilower_bound(i,i); - Index p = lb.value; - if(lb.found) - { - // the coeff already exists - assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); - } - else if((!isComp) && m_innerNonZeros[i] < (m_outerIndex[i+1]-m_outerIndex[i])) - { - // non compressed mode with local room for inserting one element - m_data.moveChunk(p, p+1, m_outerIndex[i]+m_innerNonZeros[i]-p); - m_innerNonZeros[i]++; - m_data.value(p) = Scalar(0); - m_data.index(p) = StorageIndex(i); - assignFunc.assignCoeff(m_data.value(p), diaEval.coeff(i)); - } - else - { - // defer insertion - newEntries.push_back(IndexPosPair(i,p)); - } - } - // 2 - insert deferred entries - Index n_entries = Index(newEntries.size()); - if(n_entries>0) - { - Storage newData(m_data.size()+n_entries); - Index prev_p = 0; - Index prev_i = 0; - for(Index k=0; k template @@ -1341,7 +1233,7 @@ typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& SparseMatrix<_Sca } m_data.index(p) = convert_index(inner); - return (m_data.value(p) = Scalar(0)); + return (m_data.value(p) = 0); } if(m_data.size() != m_data.allocatedSize()) diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrixBase.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrixBase.h index 229449f02..c6b548f11 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseMatrixBase.h @@ -87,11 +87,6 @@ template class SparseMatrixBase * we are dealing with a column-vector (if there is only one column) or with * a row-vector (if there is only one row). */ - NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0 : bool(IsVectorAtCompileTime) ? 1 : 2, - /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors, - * and 2 for matrices. - */ - Flags = internal::traits::Flags, /**< This stores expression \ref flags flags which may or may not be inherited by new expressions * constructed from this one. See the \ref flags "list of flags". @@ -355,6 +350,18 @@ template class SparseMatrixBase const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); } const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); } + // inner-vector + typedef Block InnerVectorReturnType; + typedef Block ConstInnerVectorReturnType; + InnerVectorReturnType innerVector(Index outer); + const ConstInnerVectorReturnType innerVector(Index outer) const; + + // set of inner-vectors + typedef Block InnerVectorsReturnType; + typedef Block ConstInnerVectorsReturnType; + InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize); + const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const; + DenseMatrixType toDense() const { return DenseMatrixType(derived()); diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseProduct.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseProduct.h index c495a7398..4cbf68781 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseProduct.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseProduct.h @@ -17,7 +17,7 @@ namespace Eigen { * The automatic pruning of the small values can be achieved by calling the pruned() function * in which case a totally different product algorithm is employed: * \code - * C = (A*B).pruned(); // suppress numerical zeros (exact) + * C = (A*B).pruned(); // supress numerical zeros (exact) * C = (A*B).pruned(ref); * C = (A*B).pruned(ref,epsilon); * \endcode diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseRef.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseRef.h index 748f87d62..d91f38f97 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseRef.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseRef.h @@ -201,7 +201,7 @@ class Ref, Options, StrideType ~Ref() { if(m_hasCopy) { - TPlainObjectType* obj = reinterpret_cast(&m_storage); + TPlainObjectType* obj = reinterpret_cast(m_object_bytes); obj->~TPlainObjectType(); } } @@ -213,7 +213,7 @@ class Ref, Options, StrideType { if((Options & int(StandardCompressedFormat)) && (!expr.isCompressed())) { - TPlainObjectType* obj = reinterpret_cast(&m_storage); + TPlainObjectType* obj = reinterpret_cast(m_object_bytes); ::new (obj) TPlainObjectType(expr); m_hasCopy = true; Base::construct(*obj); @@ -227,14 +227,14 @@ class Ref, Options, StrideType template void construct(const Expression& expr, internal::false_type) { - TPlainObjectType* obj = reinterpret_cast(&m_storage); + TPlainObjectType* obj = reinterpret_cast(m_object_bytes); ::new (obj) TPlainObjectType(expr); m_hasCopy = true; Base::construct(*obj); } protected: - typename internal::aligned_storage::type m_storage; + char m_object_bytes[sizeof(TPlainObjectType)]; bool m_hasCopy; }; @@ -319,7 +319,7 @@ class Ref, Options, StrideType ~Ref() { if(m_hasCopy) { - TPlainObjectType* obj = reinterpret_cast(&m_storage); + TPlainObjectType* obj = reinterpret_cast(m_object_bytes); obj->~TPlainObjectType(); } } @@ -335,14 +335,14 @@ class Ref, Options, StrideType template void construct(const Expression& expr, internal::false_type) { - TPlainObjectType* obj = reinterpret_cast(&m_storage); + TPlainObjectType* obj = reinterpret_cast(m_object_bytes); ::new (obj) TPlainObjectType(expr); m_hasCopy = true; Base::construct(*obj); } protected: - typename internal::aligned_storage::type m_storage; + char m_object_bytes[sizeof(TPlainObjectType)]; bool m_hasCopy; }; diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h index 65611b3d4..76117a010 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -453,7 +453,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix struct glue_shapes { typedef SparseSelfAdjointShape type; }; template<> struct glue_shapes { typedef SparseTriangularShape type; }; -// return type of SparseCompressedBase::lower_bound; -struct LowerBoundIndex { - LowerBoundIndex() : value(-1), found(false) {} - LowerBoundIndex(Index val, bool ok) : value(val), found(ok) {} - Index value; - bool found; -}; - } // end namespace internal /** \ingroup SparseCore_Module diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseVector.h b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseVector.h index 05779be68..19b0fbc9d 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseVector.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseCore/SparseVector.h @@ -281,7 +281,7 @@ class SparseVector } /** Swaps the values of \c *this and \a other. - * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only. + * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only. * \sa SparseMatrixBase::swap() */ inline void swap(SparseVector& other) diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU.h b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU.h index 090993adc..87f0efe37 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU.h @@ -26,7 +26,7 @@ template struct SparseLUMatrixURetu * This class implements the supernodal LU factorization for general matrices. * It uses the main techniques from the sequential SuperLU package * (http://crd-legacy.lbl.gov/~xiaoye/SuperLU/). It handles transparently real - * and complex arithmetic with single and double precision, depending on the + * and complex arithmetics with single and double precision, depending on the * scalar type of your input matrix. * The code has been optimized to provide BLAS-3 operations during supernode-panel updates. * It benefits directly from the built-in high-performant Eigen BLAS routines. @@ -193,7 +193,7 @@ class SparseLU : public SparseSolverBase >, /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, + * \returns \c Success if computation was succesful, * \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance * \c InvalidInput if the input matrix is invalid * @@ -501,6 +501,7 @@ void SparseLU::factorize(const MatrixType& matrix) m_isInitialized = true; + // Apply the column permutation computed in analyzepattern() // m_mat = matrix * m_perm_c.inverse(); m_mat = matrix; @@ -703,8 +704,8 @@ struct SparseLUMatrixLReturnType : internal::no_assignment_operator typedef typename MappedSupernodalType::Scalar Scalar; explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL) { } - Index rows() const { return m_mapL.rows(); } - Index cols() const { return m_mapL.cols(); } + Index rows() { return m_mapL.rows(); } + Index cols() { return m_mapL.cols(); } template void solveInPlace( MatrixBase &X) const { @@ -720,8 +721,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU) : m_mapL(mapL),m_mapU(mapU) { } - Index rows() const { return m_mapL.rows(); } - Index cols() const { return m_mapL.cols(); } + Index rows() { return m_mapL.rows(); } + Index cols() { return m_mapL.cols(); } template void solveInPlace(MatrixBase &X) const { @@ -744,9 +745,8 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator } else { - // FIXME: the following lines should use Block expressions and not Map! Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); U = A.template triangularView().solve(U); } diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_Memory.h b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_Memory.h index 349bfd585..4dc42e87b 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_Memory.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_Memory.h @@ -51,7 +51,7 @@ inline Index LUTempSpace(Index&m, Index& w) /** - * Expand the existing storage to accommodate more fill-ins + * Expand the existing storage to accomodate more fill-ins * \param vec Valid pointer to the vector to allocate or expand * \param[in,out] length At input, contain the current length of the vector that is to be increased. At output, length of the newly allocated vector * \param[in] nbElts Current number of elements in the factors diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h index 8583b1b69..721e1883b 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h @@ -75,12 +75,12 @@ class MappedSuperNodalMatrix /** * Number of rows */ - Index rows() const { return m_row; } + Index rows() { return m_row; } /** * Number of columns */ - Index cols() const { return m_col; } + Index cols() { return m_col; } /** * Return the array of nonzero values packed by column diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h index 5a2c941b4..c98b30e32 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_column_dfs.h @@ -151,7 +151,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j StorageIndex ito = glu.xlsub(fsupc+1); glu.xlsub(jcolm1) = ito; StorageIndex istop = ito + jptr - jm1ptr; - xprune(jcolm1) = istop; // initialize xprune(jcol-1) + xprune(jcolm1) = istop; // intialize xprune(jcol-1) glu.xlsub(jcol) = istop; for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) @@ -166,7 +166,7 @@ Index SparseLUImpl::column_dfs(const Index m, const Index j // Tidy up the pointers before exit glu.xsup(nsuper+1) = jcolp1; glu.supno(jcolp1) = nsuper; - xprune(jcol) = StorageIndex(nextl); // Initialize upper bound for pruning + xprune(jcol) = StorageIndex(nextl); // Intialize upper bound for pruning glu.xlsub(jcolp1) = StorageIndex(nextl); return 0; diff --git a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h index e37c2fe0d..95ba7413f 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SparseLU/SparseLU_gemm_kernel.h @@ -215,7 +215,7 @@ void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ pstore(C0+i+(I)*PacketSize, c0); - // aggressive vectorization and peeling + // agressive vectorization and peeling for(Index i=0; i - * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing - * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011. - * - * Even though it is qualified as "rank-revealing", this strategy might fail for some - * rank deficient problems. When this class is used to solve linear or least-square problems - * it is thus strongly recommended to check the accuracy of the computed solution. If it - * failed, it usually helps to increase the threshold with setPivotThreshold. - * * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()). * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix. * @@ -343,7 +331,7 @@ void SparseQR::analyzePattern(const MatrixType& mat) m_R.resize(m, n); m_Q.resize(m, diagSize); - // Allocate space for nonzero elements: rough estimation + // Allocate space for nonzero elements : rough estimation m_R.reserve(2*mat.nonZeros()); //FIXME Get a more accurate estimation through symbolic factorization with the etree m_Q.reserve(2*mat.nonZeros()); m_hcoeffs.resize(diagSize); @@ -652,8 +640,7 @@ struct SparseQR_QProduct : ReturnByValue::value ? numext::mini(j,diagSize-1) : diagSize-1; - for (Index k = start_k; k >=0; k--) + for (Index k = diagSize-1; k >=0; k--) { Scalar tau = Scalar(0); tau = m_qr.m_Q.col(k).dot(res.col(j)); diff --git a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdDeque.h b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdDeque.h index 045da7b4d..af158f425 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdDeque.h +++ b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdDeque.h @@ -36,7 +36,7 @@ namespace std \ deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \ deque(const deque& c) : deque_base(c) {} \ explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \ - deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \ + deque(iterator start, iterator end) : deque_base(start, end) {} \ deque& operator=(const deque& x) { \ deque_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std { : deque_base(first, last, a) {} \ deque(const deque& c) : deque_base(c) {} \ explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \ - deque(iterator start_, iterator end_) : deque_base(start_, end_) {} \ + deque(iterator start, iterator end) : deque_base(start, end) {} \ deque& operator=(const deque& x) { \ deque_base::operator=(x); \ return *this; \ @@ -98,8 +98,10 @@ namespace std { { return deque_base::insert(position,x); } void insert(const_iterator position, size_type new_size, const value_type& x) { deque_base::insert(position, new_size, x); } -#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) +#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) && !EIGEN_GNUC_AT_LEAST(10, 1) // workaround GCC std::deque implementation + // GCC 10.1 doesn't let us access _Deque_impl _M_impl anymore and we have to + // fall-back to the default case void resize(size_type new_size, const value_type& x) { if (new_size < deque_base::size()) @@ -108,7 +110,7 @@ namespace std { deque_base::insert(deque_base::end(), new_size - deque_base::size(), x); } #else - // either GCC 4.1 or non-GCC + // either non-GCC or GCC between 4.1 and 10.1 // default implementation which should always work. void resize(size_type new_size, const value_type& x) { diff --git a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdList.h b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdList.h index 8ba3fada0..e1eba4985 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdList.h +++ b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdList.h @@ -35,7 +35,7 @@ namespace std \ list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \ list(const list& c) : list_base(c) {} \ explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \ - list(iterator start_, iterator end_) : list_base(start_, end_) {} \ + list(iterator start, iterator end) : list_base(start, end) {} \ list& operator=(const list& x) { \ list_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std : list_base(first, last, a) {} \ list(const list& c) : list_base(c) {} \ explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \ - list(iterator start_, iterator end_) : list_base(start_, end_) {} \ + list(iterator start, iterator end) : list_base(start, end) {} \ list& operator=(const list& x) { \ list_base::operator=(x); \ return *this; \ diff --git a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdVector.h b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdVector.h index 9fcf19bce..ec22821d2 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdVector.h +++ b/uppsrc/plugin/Eigen/Eigen/src/StlSupport/StdVector.h @@ -36,7 +36,7 @@ namespace std \ vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : vector_base(first, last, a) {} \ vector(const vector& c) : vector_base(c) {} \ explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \ - vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \ + vector(iterator start, iterator end) : vector_base(start, end) {} \ vector& operator=(const vector& x) { \ vector_base::operator=(x); \ return *this; \ @@ -62,7 +62,7 @@ namespace std { : vector_base(first, last, a) {} \ vector(const vector& c) : vector_base(c) {} \ explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \ - vector(iterator start_, iterator end_) : vector_base(start_, end_) {} \ + vector(iterator start, iterator end) : vector_base(start, end) {} \ vector& operator=(const vector& x) { \ vector_base::operator=(x); \ return *this; \ diff --git a/uppsrc/plugin/Eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h b/uppsrc/plugin/Eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h index 354e33de5..7261c7d0f 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/uppsrc/plugin/Eigen/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -352,7 +352,7 @@ class SuperLUBase : public SparseSolverBase /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, + * \returns \c Success if computation was succesful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const diff --git a/uppsrc/plugin/Eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h b/uppsrc/plugin/Eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h index e3a333f80..91c09ab13 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/uppsrc/plugin/Eigen/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -10,16 +10,6 @@ #ifndef EIGEN_UMFPACKSUPPORT_H #define EIGEN_UMFPACKSUPPORT_H -// for compatibility with super old version of umfpack, -// not sure this is really needed, but this is harmless. -#ifndef SuiteSparse_long -#ifdef UF_long -#define SuiteSparse_long UF_long -#else -#error neither SuiteSparse_long nor UF_long are defined -#endif -#endif - namespace Eigen { /* TODO extract L, extract U, compute det, etc... */ @@ -27,85 +17,42 @@ namespace Eigen { // generic double/complex wrapper functions: - // Defaults -inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double) { umfpack_di_defaults(control); } -inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, int) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex) { umfpack_zi_defaults(control); } -inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, SuiteSparse_long) -{ umfpack_dl_defaults(control); } - -inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex, SuiteSparse_long) -{ umfpack_zl_defaults(control); } - -// Report info -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, int) +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double) { umfpack_di_report_info(control, info);} -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, int) +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex) { umfpack_zi_report_info(control, info);} -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, SuiteSparse_long) -{ umfpack_dl_report_info(control, info);} - -inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex, SuiteSparse_long) -{ umfpack_zl_report_info(control, info);} - -// Report status -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, int) +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double) { umfpack_di_report_status(control, status);} -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, int) +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex) { umfpack_zi_report_status(control, status);} -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, SuiteSparse_long) -{ umfpack_dl_report_status(control, status);} - -inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex, SuiteSparse_long) -{ umfpack_zl_report_status(control, status);} - -// report control -inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int) +inline void umfpack_report_control(double control[UMFPACK_CONTROL], double) { umfpack_di_report_control(control);} -inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, int) +inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex) { umfpack_zi_report_control(control);} -inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, SuiteSparse_long) -{ umfpack_dl_report_control(control);} - -inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex, SuiteSparse_long) -{ umfpack_zl_report_control(control);} - -// Free numeric -inline void umfpack_free_numeric(void **Numeric, double, int) +inline void umfpack_free_numeric(void **Numeric, double) { umfpack_di_free_numeric(Numeric); *Numeric = 0; } -inline void umfpack_free_numeric(void **Numeric, std::complex, int) +inline void umfpack_free_numeric(void **Numeric, std::complex) { umfpack_zi_free_numeric(Numeric); *Numeric = 0; } -inline void umfpack_free_numeric(void **Numeric, double, SuiteSparse_long) -{ umfpack_dl_free_numeric(Numeric); *Numeric = 0; } - -inline void umfpack_free_numeric(void **Numeric, std::complex, SuiteSparse_long) -{ umfpack_zl_free_numeric(Numeric); *Numeric = 0; } - -// Free symbolic -inline void umfpack_free_symbolic(void **Symbolic, double, int) +inline void umfpack_free_symbolic(void **Symbolic, double) { umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; } -inline void umfpack_free_symbolic(void **Symbolic, std::complex, int) +inline void umfpack_free_symbolic(void **Symbolic, std::complex) { umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; } -inline void umfpack_free_symbolic(void **Symbolic, double, SuiteSparse_long) -{ umfpack_dl_free_symbolic(Symbolic); *Symbolic = 0; } - -inline void umfpack_free_symbolic(void **Symbolic, std::complex, SuiteSparse_long) -{ umfpack_zl_free_symbolic(Symbolic); *Symbolic = 0; } - -// Symbolic inline int umfpack_symbolic(int n_row,int n_col, const int Ap[], const int Ai[], const double Ax[], void **Symbolic, const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) @@ -119,21 +66,7 @@ inline int umfpack_symbolic(int n_row,int n_col, { return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); } -inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col, - const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], void **Symbolic, - const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) -{ - return umfpack_dl_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info); -} -inline SuiteSparse_long umfpack_symbolic( SuiteSparse_long n_row,SuiteSparse_long n_col, - const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex Ax[], void **Symbolic, - const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO]) -{ - return umfpack_zl_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info); -} - -// Numeric inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[], void *Symbolic, void **Numeric, const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO]) @@ -147,21 +80,7 @@ inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex Ax[], - void *Symbolic, void **Numeric, - const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO]) -{ - return umfpack_zl_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info); -} - -// solve inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[], double X[], const double B[], void *Numeric, const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) @@ -176,21 +95,6 @@ inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::co return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); } -inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[], - double X[], const double B[], void *Numeric, - const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) -{ - return umfpack_dl_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info); -} - -inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const std::complex Ax[], - std::complex X[], const std::complex B[], void *Numeric, - const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) -{ - return umfpack_zl_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info); -} - -// Get Lunz inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double) { return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); @@ -201,19 +105,6 @@ inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_ return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); } -inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col, - SuiteSparse_long *nz_udiag, void *Numeric, double) -{ - return umfpack_dl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); -} - -inline SuiteSparse_long umfpack_get_lunz( SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row, SuiteSparse_long *n_col, - SuiteSparse_long *nz_udiag, void *Numeric, std::complex) -{ - return umfpack_zl_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric); -} - -// Get Numeric inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[], int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric) { @@ -229,45 +120,18 @@ inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex Lx[], in return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q, Dx?&dx0_real:0,0,do_recip,Rs,Numeric); } -inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], double Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], double Ux[], - SuiteSparse_long P[], SuiteSparse_long Q[], double Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric) -{ - return umfpack_dl_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric); -} -inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], std::complex Lx[], SuiteSparse_long Up[], SuiteSparse_long Ui[], std::complex Ux[], - SuiteSparse_long P[], SuiteSparse_long Q[], std::complex Dx[], SuiteSparse_long *do_recip, double Rs[], void *Numeric) -{ - double& lx0_real = numext::real_ref(Lx[0]); - double& ux0_real = numext::real_ref(Ux[0]); - double& dx0_real = numext::real_ref(Dx[0]); - return umfpack_zl_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q, - Dx?&dx0_real:0,0,do_recip,Rs,Numeric); -} - -// Get Determinant -inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int) +inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO]) { return umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info); } -inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], int) +inline int umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO]) { double& mx_real = numext::real_ref(*Mx); return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); } -inline SuiteSparse_long umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long) -{ - return umfpack_dl_get_determinant(Mx,Ex,NumericHandle,User_Info); -} - -inline SuiteSparse_long umfpack_get_determinant(std::complex *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO], SuiteSparse_long) -{ - double& mx_real = numext::real_ref(*Mx); - return umfpack_zl_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info); -} - /** \ingroup UmfPackSupport_Module * \brief A sparse LU factorization and solver based on UmfPack @@ -300,7 +164,7 @@ class UmfPackLU : public SparseSolverBase > typedef Matrix IntRowVectorType; typedef Matrix IntColVectorType; typedef SparseMatrix LUMatrixType; - typedef SparseMatrix UmfpackMatrixType; + typedef SparseMatrix UmfpackMatrixType; typedef Ref UmfpackMatrixRef; enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -328,8 +192,8 @@ class UmfPackLU : public SparseSolverBase > ~UmfPackLU() { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(), StorageIndex()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(), StorageIndex()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); } inline Index rows() const { return mp_matrix.rows(); } @@ -337,7 +201,7 @@ class UmfPackLU : public SparseSolverBase > /** \brief Reports whether previous computation was successful. * - * \returns \c Success if computation was successful, + * \returns \c Success if computation was succesful, * \c NumericalIssue if the matrix.appears to be negative. */ ComputationInfo info() const @@ -377,8 +241,8 @@ class UmfPackLU : public SparseSolverBase > template void compute(const InputMatrixType& matrix) { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); grab(matrix.derived()); analyzePattern_impl(); factorize_impl(); @@ -393,8 +257,8 @@ class UmfPackLU : public SparseSolverBase > template void analyzePattern(const InputMatrixType& matrix) { - if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar(),StorageIndex()); - if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); + if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); + if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); grab(matrix.derived()); @@ -445,7 +309,7 @@ class UmfPackLU : public SparseSolverBase > { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); if(m_numeric) - umfpack_free_numeric(&m_numeric,Scalar(),StorageIndex()); + umfpack_free_numeric(&m_numeric,Scalar()); grab(matrix.derived()); @@ -456,28 +320,28 @@ class UmfPackLU : public SparseSolverBase > * * \sa umfpackControl() */ - void printUmfpackControl() + void umfpackReportControl() { - umfpack_report_control(m_control.data(), Scalar(),StorageIndex()); + umfpack_report_control(m_control.data(), Scalar()); } /** Prints statistics collected by UmfPack. * * \sa analyzePattern(), compute() */ - void printUmfpackInfo() + void umfpackReportInfo() { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); - umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar(),StorageIndex()); + umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar()); } /** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical factorization). * * \sa analyzePattern(), compute() */ - void printUmfpackStatus() { + void umfpackReportStatus() { eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); - umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar(),StorageIndex()); + umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar()); } /** \internal */ @@ -498,13 +362,13 @@ class UmfPackLU : public SparseSolverBase > m_symbolic = 0; m_extractedDataAreDirty = true; - umfpack_defaults(m_control.data(), Scalar(),StorageIndex()); + umfpack_defaults(m_control.data(), Scalar()); } void analyzePattern_impl() { - m_fact_errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), - internal::convert_index(mp_matrix.cols()), + m_fact_errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), + internal::convert_index(mp_matrix.cols()), mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), &m_symbolic, m_control.data(), m_umfpackInfo.data()); @@ -544,7 +408,7 @@ class UmfPackLU : public SparseSolverBase > // cached data to reduce reallocation, etc. mutable LUMatrixType m_l; - StorageIndex m_fact_errorCode; + int m_fact_errorCode; UmfpackControl m_control; mutable UmfpackInfo m_umfpackInfo; @@ -574,7 +438,7 @@ void UmfPackLU::extractData() const if (m_extractedDataAreDirty) { // get size of the data - StorageIndex lnz, unz, rows, cols, nz_udiag; + int lnz, unz, rows, cols, nz_udiag; umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar()); // allocate data @@ -600,7 +464,7 @@ template typename UmfPackLU::Scalar UmfPackLU::determinant() const { Scalar det; - umfpack_get_determinant(&det, 0, m_numeric, 0, StorageIndex()); + umfpack_get_determinant(&det, 0, m_numeric, 0); return det; } @@ -613,6 +477,7 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet"); eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve"); + int errorCode; Scalar* x_ptr = 0; Matrix x_tmp; if(x.innerStride()!=1) @@ -624,10 +489,9 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas { if(x.innerStride()==1) x_ptr = &x.col(j).coeffRef(0); - StorageIndex errorCode = umfpack_solve(UMFPACK_A, - mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - x_ptr, &b.const_cast_derived().col(j).coeffRef(0), - m_numeric, m_control.data(), m_umfpackInfo.data()); + errorCode = umfpack_solve(UMFPACK_A, + mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), + x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), m_umfpackInfo.data()); if(x.innerStride()!=1) x.col(j) = x_tmp; if (errorCode!=0) diff --git a/uppsrc/plugin/Eigen/Eigen/src/misc/lapacke.h b/uppsrc/plugin/Eigen/Eigen/src/misc/lapacke.h index 3d8e24f5a..8c7e79b03 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/misc/lapacke.h +++ b/uppsrc/plugin/Eigen/Eigen/src/misc/lapacke.h @@ -43,6 +43,10 @@ #include "lapacke_config.h" #endif +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ + #include #ifndef lapack_int @@ -104,11 +108,6 @@ lapack_complex_double lapack_make_complex_double( double re, double im ); #endif - -#ifdef __cplusplus -extern "C" { -#endif /* __cplusplus */ - #ifndef LAPACKE_malloc #define LAPACKE_malloc( size ) malloc( size ) #endif diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 73d5f51c8..1f8a531af 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -75,32 +75,6 @@ max return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); } -/** \returns an expression of the coefficient-wise absdiff of \c *this and \a other - * - * Example: \include Cwise_absolute_difference.cpp - * Output: \verbinclude Cwise_absolute_difference.out - * - * \sa absolute_difference() - */ -EIGEN_MAKE_CWISE_BINARY_OP(absolute_difference,absolute_difference) - -/** \returns an expression of the coefficient-wise absolute_difference of \c *this and scalar \a other - * - * \sa absolute_difference() - */ -EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, - const CwiseNullaryOp, PlainObject> > -#ifdef EIGEN_PARSED_BY_DOXYGEN -absolute_difference -#else -(absolute_difference) -#endif -(const Scalar &other) const -{ - return (absolute_difference)(Derived::PlainObject::Constant(rows(), cols(), other)); -} - /** \returns an expression of the coefficient-wise power of \c *this to the given array of \a exponents. * * This function computes the coefficient-wise power. @@ -340,9 +314,9 @@ polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS &n) const * * It returns the Riemann zeta function of two arguments \c *this and \a q: * + * \param *this is the exposent, it must be > 1 * \param q is the shift, it must be > 0 * - * \note *this is the exponent, it must be > 1. * \note This function supports only float and double scalar types. To support other scalar types, the user has * to provide implementations of zeta(T,T) for any scalar type T to be supported. * diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h index 59a4ee6a0..ebaa3f192 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/uppsrc/plugin/Eigen/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -10,7 +10,6 @@ typedef CwiseUnaryOp, const Derived> Inverse typedef CwiseUnaryOp, const Derived> BooleanNotReturnType; typedef CwiseUnaryOp, const Derived> ExpReturnType; -typedef CwiseUnaryOp, const Derived> Expm1ReturnType; typedef CwiseUnaryOp, const Derived> LogReturnType; typedef CwiseUnaryOp, const Derived> Log1pReturnType; typedef CwiseUnaryOp, const Derived> Log10ReturnType; @@ -21,18 +20,11 @@ typedef CwiseUnaryOp, const Derived> AcosReturn typedef CwiseUnaryOp, const Derived> AsinReturnType; typedef CwiseUnaryOp, const Derived> AtanReturnType; typedef CwiseUnaryOp, const Derived> TanhReturnType; -typedef CwiseUnaryOp, const Derived> LogisticReturnType; typedef CwiseUnaryOp, const Derived> SinhReturnType; -#if EIGEN_HAS_CXX11_MATH -typedef CwiseUnaryOp, const Derived> AtanhReturnType; -typedef CwiseUnaryOp, const Derived> AsinhReturnType; -typedef CwiseUnaryOp, const Derived> AcoshReturnType; -#endif typedef CwiseUnaryOp, const Derived> CoshReturnType; typedef CwiseUnaryOp, const Derived> SquareReturnType; typedef CwiseUnaryOp, const Derived> CubeReturnType; typedef CwiseUnaryOp, const Derived> RoundReturnType; -typedef CwiseUnaryOp, const Derived> RintReturnType; typedef CwiseUnaryOp, const Derived> FloorReturnType; typedef CwiseUnaryOp, const Derived> CeilReturnType; typedef CwiseUnaryOp, const Derived> IsNaNReturnType; @@ -98,20 +90,6 @@ exp() const return ExpReturnType(derived()); } -/** \returns an expression of the coefficient-wise exponential of *this minus 1. - * - * In exact arithmetic, \c x.expm1() is equivalent to \c x.exp() - 1, - * however, with finite precision, this function is much more accurate when \c x is close to zero. - * - * \sa Math functions, exp() - */ -EIGEN_DEVICE_FUNC -inline const Expm1ReturnType -expm1() const -{ - return Expm1ReturnType(derived()); -} - /** \returns an expression of the coefficient-wise logarithm of *this. * * This function computes the coefficient-wise logarithm. The function MatrixBase::log() in the @@ -120,7 +98,7 @@ expm1() const * Example: \include Cwise_log.cpp * Output: \verbinclude Cwise_log.out * - * \sa Math functions, log() + * \sa Math functions, exp() */ EIGEN_DEVICE_FUNC inline const LogReturnType @@ -333,7 +311,7 @@ sinh() const * Example: \include Cwise_cosh.cpp * Output: \verbinclude Cwise_cosh.out * - * \sa Math functions, tanh(), sinh(), cosh() + * \sa Math functions, tan(), sinh(), cosh() */ EIGEN_DEVICE_FUNC inline const CoshReturnType @@ -342,50 +320,6 @@ cosh() const return CoshReturnType(derived()); } -#if EIGEN_HAS_CXX11_MATH -/** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this. - * - * \sa Math functions, atanh(), asinh(), acosh() - */ -EIGEN_DEVICE_FUNC -inline const AtanhReturnType -atanh() const -{ - return AtanhReturnType(derived()); -} - -/** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this. - * - * \sa Math functions, atanh(), asinh(), acosh() - */ -EIGEN_DEVICE_FUNC -inline const AsinhReturnType -asinh() const -{ - return AsinhReturnType(derived()); -} - -/** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this. - * - * \sa Math functions, atanh(), asinh(), acosh() - */ -EIGEN_DEVICE_FUNC -inline const AcoshReturnType -acosh() const -{ - return AcoshReturnType(derived()); -} -#endif - -/** \returns an expression of the coefficient-wise logistic of *this. - */ -EIGEN_DEVICE_FUNC -inline const LogisticReturnType -logistic() const -{ - return LogisticReturnType(derived()); -} - /** \returns an expression of the coefficient-wise inverse of *this. * * Example: \include Cwise_inverse.cpp @@ -428,20 +362,6 @@ cube() const return CubeReturnType(derived()); } -/** \returns an expression of the coefficient-wise rint of *this. - * - * Example: \include Cwise_rint.cpp - * Output: \verbinclude Cwise_rint.out - * - * \sa Math functions, ceil(), floor() - */ -EIGEN_DEVICE_FUNC -inline const RintReturnType -rint() const -{ - return RintReturnType(derived()); -} - /** \returns an expression of the coefficient-wise round of *this. * * Example: \include Cwise_round.cpp @@ -551,12 +471,14 @@ typedef CwiseUnaryOp, const Derived> LgammaRe typedef CwiseUnaryOp, const Derived> DigammaReturnType; typedef CwiseUnaryOp, const Derived> ErfReturnType; typedef CwiseUnaryOp, const Derived> ErfcReturnType; -typedef CwiseUnaryOp, const Derived> NdtriReturnType; /** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|). * * \specialfunctions_module * + * Example: \include Cwise_lgamma.cpp + * Output: \verbinclude Cwise_lgamma.out + * * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar * type T to be supported. @@ -592,6 +514,9 @@ digamma() const * * \specialfunctions_module * + * Example: \include Cwise_erf.cpp + * Output: \verbinclude Cwise_erf.out + * * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar * type T to be supported. @@ -610,6 +535,9 @@ erf() const * * \specialfunctions_module * + * Example: \include Cwise_erfc.cpp + * Output: \verbinclude Cwise_erfc.out + * * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar * type T to be supported. @@ -622,23 +550,3 @@ erfc() const { return ErfcReturnType(derived()); } - -/** \returns an expression of the coefficient-wise inverse of the CDF of the Normal distribution function - * function of *this. - * - * \specialfunctions_module - * - * In other words, considering `x = ndtri(y)`, it returns the argument, x, for which the area under the - * Gaussian probability density function (integrated from minus infinity to x) is equal to y. - * - * \note This function supports only float and double scalar types. To support other scalar types, - * the user has to provide implementations of ndtri(T) for any scalar type T to be supported. - * - * \sa Math functions - */ -EIGEN_DEVICE_FUNC -inline const NdtriReturnType -ndtri() const -{ - return NdtriReturnType(derived()); -} diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/BlockMethods.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/BlockMethods.h index 935a604b6..ac35a0086 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/plugins/BlockMethods.h +++ b/uppsrc/plugin/Eigen/Eigen/src/plugins/BlockMethods.h @@ -40,126 +40,68 @@ typedef const VectorBlock ConstSegmentReturnType; template struct FixedSegmentReturnType { typedef VectorBlock Type; }; template struct ConstFixedSegmentReturnType { typedef const VectorBlock Type; }; -/// \internal inner-vector -typedef Block InnerVectorReturnType; -typedef Block ConstInnerVectorReturnType; - -/// \internal set of inner-vectors -typedef Block InnerVectorsReturnType; -typedef Block ConstInnerVectorsReturnType; - #endif // not EIGEN_PARSED_BY_DOXYGEN -/// \returns an expression of a block in \c *this with either dynamic or fixed sizes. +/// \returns a dynamic-size expression of a block in *this. /// -/// \param startRow the first row in the block -/// \param startCol the first column in the block -/// \param blockRows number of rows in the block, specified at either run-time or compile-time -/// \param blockCols number of columns in the block, specified at either run-time or compile-time -/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. -/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. +/// \param startRow the first row in the block +/// \param startCol the first column in the block +/// \param blockRows the number of rows in the block +/// \param blockCols the number of columns in the block /// -/// Example using runtime (aka dynamic) sizes: \include MatrixBase_block_int_int_int_int.cpp +/// Example: \include MatrixBase_block_int_int_int_int.cpp /// Output: \verbinclude MatrixBase_block_int_int_int_int.out /// -/// \newin{3.4}: -/// -/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic. -/// Here is an example with a fixed number of rows \c NRows and dynamic number of columns \c cols: -/// \code -/// mat.block(i,j,fix,cols) -/// \endcode -/// -/// This function thus fully covers the features offered by the following overloads block(Index, Index), -/// and block(Index, Index, Index, Index) that are thus obsolete. Indeed, this generic version avoids -/// redundancy, it preserves the argument order, and prevents the need to rely on the template keyword in templated code. -/// -/// but with less redundancy and more consistency as it does not modify the argument order -/// and seamlessly enable hybrid fixed/dynamic sizes. -/// -/// \note Even in the case that the returned expression has dynamic size, in the case +/// \note Even though the returned expression has dynamic size, in the case /// when it is applied to a fixed-size matrix, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa class Block, fix, fix(int) +/// \sa class Block, block(Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type -#else -typename FixedBlockXpr<...,...>::Type -#endif -block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) +EIGEN_DEVICE_FUNC +inline BlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) { - return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type( - derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols)); + return BlockXpr(derived(), startRow, startCol, blockRows, blockCols); } -/// This is the const version of block(Index,Index,NRowsType,NColsType) -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type -#else -const typename ConstFixedBlockXpr<...,...>::Type -#endif -block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) const +/// This is the const version of block(Index,Index,Index,Index). */ +EIGEN_DEVICE_FUNC +inline const ConstBlockXpr block(Index startRow, Index startCol, Index blockRows, Index blockCols) const { - return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type( - derived(), startRow, startCol, internal::get_runtime_value(blockRows), internal::get_runtime_value(blockCols)); + return ConstBlockXpr(derived(), startRow, startCol, blockRows, blockCols); } -/// \returns a expression of a top-right corner of \c *this with either dynamic or fixed sizes. + +/// \returns a dynamic-size expression of a top-right corner of *this. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner -/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. -/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// -/// Example with dynamic sizes: \include MatrixBase_topRightCorner_int_int.cpp +/// Example: \include MatrixBase_topRightCorner_int_int.cpp /// Output: \verbinclude MatrixBase_topRightCorner_int_int.out /// -/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type -#else -typename FixedBlockXpr<...,...>::Type -#endif -topRightCorner(NRowsType cRows, NColsType cCols) +EIGEN_DEVICE_FUNC +inline BlockXpr topRightCorner(Index cRows, Index cCols) { - return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type - (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); + return BlockXpr(derived(), 0, cols() - cCols, cRows, cCols); } -/// This is the const version of topRightCorner(NRowsType, NColsType). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type -#else -const typename ConstFixedBlockXpr<...,...>::Type -#endif -topRightCorner(NRowsType cRows, NColsType cCols) const +/// This is the const version of topRightCorner(Index, Index). +EIGEN_DEVICE_FUNC +inline const ConstBlockXpr topRightCorner(Index cRows, Index cCols) const { - return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type - (derived(), 0, cols() - internal::get_runtime_value(cCols), internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); + return ConstBlockXpr(derived(), 0, cols() - cCols, cRows, cCols); } -/// \returns an expression of a fixed-size top-right corner of \c *this. +/// \returns an expression of a fixed-size top-right corner of *this. /// /// \tparam CRows the number of rows in the corner /// \tparam CCols the number of columns in the corner @@ -172,21 +114,21 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block, block(Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedBlockXpr::Type topRightCorner() +EIGEN_DEVICE_FUNC +inline typename FixedBlockXpr::Type topRightCorner() { return typename FixedBlockXpr::Type(derived(), 0, cols() - CCols); } /// This is the const version of topRightCorner(). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const typename ConstFixedBlockXpr::Type topRightCorner() const +EIGEN_DEVICE_FUNC +inline const typename ConstFixedBlockXpr::Type topRightCorner() const { return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - CCols); } -/// \returns an expression of a top-right corner of \c *this. +/// \returns an expression of a top-right corner of *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -206,67 +148,46 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) +inline typename FixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } /// This is the const version of topRightCorner(Index, Index). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const +inline const typename ConstFixedBlockXpr::Type topRightCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), 0, cols() - cCols, cRows, cCols); } -/// \returns an expression of a top-left corner of \c *this with either dynamic or fixed sizes. +/// \returns a dynamic-size expression of a top-left corner of *this. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner -/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. -/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_topLeftCorner_int_int.cpp /// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out /// -/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type -#else -typename FixedBlockXpr<...,...>::Type -#endif -topLeftCorner(NRowsType cRows, NColsType cCols) +EIGEN_DEVICE_FUNC +inline BlockXpr topLeftCorner(Index cRows, Index cCols) { - return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type - (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); + return BlockXpr(derived(), 0, 0, cRows, cCols); } /// This is the const version of topLeftCorner(Index, Index). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type -#else -const typename ConstFixedBlockXpr<...,...>::Type -#endif -topLeftCorner(NRowsType cRows, NColsType cCols) const +EIGEN_DEVICE_FUNC +inline const ConstBlockXpr topLeftCorner(Index cRows, Index cCols) const { - return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type - (derived(), 0, 0, internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); + return ConstBlockXpr(derived(), 0, 0, cRows, cCols); } -/// \returns an expression of a fixed-size top-left corner of \c *this. +/// \returns an expression of a fixed-size top-left corner of *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. /// @@ -275,24 +196,24 @@ topLeftCorner(NRowsType cRows, NColsType cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedBlockXpr::Type topLeftCorner() +EIGEN_DEVICE_FUNC +inline typename FixedBlockXpr::Type topLeftCorner() { return typename FixedBlockXpr::Type(derived(), 0, 0); } /// This is the const version of topLeftCorner(). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const typename ConstFixedBlockXpr::Type topLeftCorner() const +EIGEN_DEVICE_FUNC +inline const typename ConstFixedBlockXpr::Type topLeftCorner() const { return typename ConstFixedBlockXpr::Type(derived(), 0, 0); } -/// \returns an expression of a top-left corner of \c *this. +/// \returns an expression of a top-left corner of *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -312,69 +233,46 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) +inline typename FixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } /// This is the const version of topLeftCorner(Index, Index). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const +inline const typename ConstFixedBlockXpr::Type topLeftCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), 0, 0, cRows, cCols); } -/// \returns an expression of a bottom-right corner of \c *this with either dynamic or fixed sizes. +/// \returns a dynamic-size expression of a bottom-right corner of *this. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner -/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. -/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_bottomRightCorner_int_int.cpp /// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out /// -/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type -#else -typename FixedBlockXpr<...,...>::Type -#endif -bottomRightCorner(NRowsType cRows, NColsType cCols) +EIGEN_DEVICE_FUNC +inline BlockXpr bottomRightCorner(Index cRows, Index cCols) { - return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type - (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols), - internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); + return BlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } -/// This is the const version of bottomRightCorner(NRowsType, NColsType). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type -#else -const typename ConstFixedBlockXpr<...,...>::Type -#endif -bottomRightCorner(NRowsType cRows, NColsType cCols) const +/// This is the const version of bottomRightCorner(Index, Index). +EIGEN_DEVICE_FUNC +inline const ConstBlockXpr bottomRightCorner(Index cRows, Index cCols) const { - return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type - (derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols), - internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); + return ConstBlockXpr(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } -/// \returns an expression of a fixed-size bottom-right corner of \c *this. +/// \returns an expression of a fixed-size bottom-right corner of *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. /// @@ -383,24 +281,24 @@ bottomRightCorner(NRowsType cRows, NColsType cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedBlockXpr::Type bottomRightCorner() +EIGEN_DEVICE_FUNC +inline typename FixedBlockXpr::Type bottomRightCorner() { return typename FixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } /// This is the const version of bottomRightCorner(). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const typename ConstFixedBlockXpr::Type bottomRightCorner() const +EIGEN_DEVICE_FUNC +inline const typename ConstFixedBlockXpr::Type bottomRightCorner() const { return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, cols() - CCols); } -/// \returns an expression of a bottom-right corner of \c *this. +/// \returns an expression of a bottom-right corner of *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -420,69 +318,46 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) +inline typename FixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } /// This is the const version of bottomRightCorner(Index, Index). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const +inline const typename ConstFixedBlockXpr::Type bottomRightCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols); } -/// \returns an expression of a bottom-left corner of \c *this with either dynamic or fixed sizes. +/// \returns a dynamic-size expression of a bottom-left corner of *this. /// /// \param cRows the number of rows in the corner /// \param cCols the number of columns in the corner -/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. -/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp /// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out /// -/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type -#else -typename FixedBlockXpr<...,...>::Type -#endif -bottomLeftCorner(NRowsType cRows, NColsType cCols) +EIGEN_DEVICE_FUNC +inline BlockXpr bottomLeftCorner(Index cRows, Index cCols) { - return typename FixedBlockXpr::value,internal::get_fixed_value::value>::Type - (derived(), rows() - internal::get_runtime_value(cRows), 0, - internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); + return BlockXpr(derived(), rows() - cRows, 0, cRows, cCols); } -/// This is the const version of bottomLeftCorner(NRowsType, NColsType). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type -#else -typename ConstFixedBlockXpr<...,...>::Type -#endif -bottomLeftCorner(NRowsType cRows, NColsType cCols) const +/// This is the const version of bottomLeftCorner(Index, Index). +EIGEN_DEVICE_FUNC +inline const ConstBlockXpr bottomLeftCorner(Index cRows, Index cCols) const { - return typename ConstFixedBlockXpr::value,internal::get_fixed_value::value>::Type - (derived(), rows() - internal::get_runtime_value(cRows), 0, - internal::get_runtime_value(cRows), internal::get_runtime_value(cCols)); + return ConstBlockXpr(derived(), rows() - cRows, 0, cRows, cCols); } -/// \returns an expression of a fixed-size bottom-left corner of \c *this. +/// \returns an expression of a fixed-size bottom-left corner of *this. /// /// The template parameters CRows and CCols are the number of rows and columns in the corner. /// @@ -491,24 +366,24 @@ bottomLeftCorner(NRowsType cRows, NColsType cCols) const /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedBlockXpr::Type bottomLeftCorner() +EIGEN_DEVICE_FUNC +inline typename FixedBlockXpr::Type bottomLeftCorner() { return typename FixedBlockXpr::Type(derived(), rows() - CRows, 0); } /// This is the const version of bottomLeftCorner(). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const typename ConstFixedBlockXpr::Type bottomLeftCorner() const +EIGEN_DEVICE_FUNC +inline const typename ConstFixedBlockXpr::Type bottomLeftCorner() const { return typename ConstFixedBlockXpr::Type(derived(), rows() - CRows, 0); } -/// \returns an expression of a bottom-left corner of \c *this. +/// \returns an expression of a bottom-left corner of *this. /// /// \tparam CRows number of rows in corner as specified at compile-time /// \tparam CCols number of columns in corner as specified at compile-time @@ -528,66 +403,45 @@ EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// \sa class Block /// template -EIGEN_STRONG_INLINE -typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) +inline typename FixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) { return typename FixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } /// This is the const version of bottomLeftCorner(Index, Index). template -EIGEN_STRONG_INLINE -const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const +inline const typename ConstFixedBlockXpr::Type bottomLeftCorner(Index cRows, Index cCols) const { return typename ConstFixedBlockXpr::Type(derived(), rows() - cRows, 0, cRows, cCols); } -/// \returns a block consisting of the top rows of \c *this. +/// \returns a block consisting of the top rows of *this. /// /// \param n the number of rows in the block -/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include MatrixBase_topRows_int.cpp /// Output: \verbinclude MatrixBase_topRows_int.out /// -/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. -/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename NRowsBlockXpr::value>::Type -#else -typename NRowsBlockXpr<...>::Type -#endif -topRows(NRowsType n) +EIGEN_DEVICE_FUNC +inline RowsBlockXpr topRows(Index n) { - return typename NRowsBlockXpr::value>::Type - (derived(), 0, 0, internal::get_runtime_value(n), cols()); + return RowsBlockXpr(derived(), 0, 0, n, cols()); } -/// This is the const version of topRows(NRowsType). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstNRowsBlockXpr::value>::Type -#else -const typename ConstNRowsBlockXpr<...>::Type -#endif -topRows(NRowsType n) const +/// This is the const version of topRows(Index). +EIGEN_DEVICE_FUNC +inline ConstRowsBlockXpr topRows(Index n) const { - return typename ConstNRowsBlockXpr::value>::Type - (derived(), 0, 0, internal::get_runtime_value(n), cols()); + return ConstRowsBlockXpr(derived(), 0, 0, n, cols()); } -/// \returns a block consisting of the top rows of \c *this. +/// \returns a block consisting of the top rows of *this. /// /// \tparam N the number of rows in the block as specified at compile-time /// \param n the number of rows in the block as specified at run-time @@ -600,69 +454,50 @@ topRows(NRowsType n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename NRowsBlockXpr::Type topRows(Index n = N) +EIGEN_DEVICE_FUNC +inline typename NRowsBlockXpr::Type topRows(Index n = N) { return typename NRowsBlockXpr::Type(derived(), 0, 0, n, cols()); } /// This is the const version of topRows(). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename ConstNRowsBlockXpr::Type topRows(Index n = N) const +EIGEN_DEVICE_FUNC +inline typename ConstNRowsBlockXpr::Type topRows(Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), 0, 0, n, cols()); } -/// \returns a block consisting of the bottom rows of \c *this. +/// \returns a block consisting of the bottom rows of *this. /// /// \param n the number of rows in the block -/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include MatrixBase_bottomRows_int.cpp /// Output: \verbinclude MatrixBase_bottomRows_int.out /// -/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. -/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename NRowsBlockXpr::value>::Type -#else -typename NRowsBlockXpr<...>::Type -#endif -bottomRows(NRowsType n) +EIGEN_DEVICE_FUNC +inline RowsBlockXpr bottomRows(Index n) { - return typename NRowsBlockXpr::value>::Type - (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols()); + return RowsBlockXpr(derived(), rows() - n, 0, n, cols()); } -/// This is the const version of bottomRows(NRowsType). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstNRowsBlockXpr::value>::Type -#else -const typename ConstNRowsBlockXpr<...>::Type -#endif -bottomRows(NRowsType n) const +/// This is the const version of bottomRows(Index). +EIGEN_DEVICE_FUNC +inline ConstRowsBlockXpr bottomRows(Index n) const { - return typename ConstNRowsBlockXpr::value>::Type - (derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols()); + return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols()); } -/// \returns a block consisting of the bottom rows of \c *this. +/// \returns a block consisting of the bottom rows of *this. /// /// \tparam N the number of rows in the block as specified at compile-time /// \param n the number of rows in the block as specified at run-time @@ -675,70 +510,51 @@ bottomRows(NRowsType n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename NRowsBlockXpr::Type bottomRows(Index n = N) +EIGEN_DEVICE_FUNC +inline typename NRowsBlockXpr::Type bottomRows(Index n = N) { return typename NRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); } /// This is the const version of bottomRows(). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const +EIGEN_DEVICE_FUNC +inline typename ConstNRowsBlockXpr::Type bottomRows(Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), rows() - n, 0, n, cols()); } -/// \returns a block consisting of a range of rows of \c *this. +/// \returns a block consisting of a range of rows of *this. /// /// \param startRow the index of the first row in the block /// \param n the number of rows in the block -/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index. /// /// Example: \include DenseBase_middleRows_int.cpp /// Output: \verbinclude DenseBase_middleRows_int.out /// -/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. -/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename NRowsBlockXpr::value>::Type -#else -typename NRowsBlockXpr<...>::Type -#endif -middleRows(Index startRow, NRowsType n) +EIGEN_DEVICE_FUNC +inline RowsBlockXpr middleRows(Index startRow, Index n) { - return typename NRowsBlockXpr::value>::Type - (derived(), startRow, 0, internal::get_runtime_value(n), cols()); + return RowsBlockXpr(derived(), startRow, 0, n, cols()); } -/// This is the const version of middleRows(Index,NRowsType). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstNRowsBlockXpr::value>::Type -#else -const typename ConstNRowsBlockXpr<...>::Type -#endif -middleRows(Index startRow, NRowsType n) const +/// This is the const version of middleRows(Index,Index). +EIGEN_DEVICE_FUNC +inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const { - return typename ConstNRowsBlockXpr::value>::Type - (derived(), startRow, 0, internal::get_runtime_value(n), cols()); + return ConstRowsBlockXpr(derived(), startRow, 0, n, cols()); } -/// \returns a block consisting of a range of rows of \c *this. +/// \returns a block consisting of a range of rows of *this. /// /// \tparam N the number of rows in the block as specified at compile-time /// \param startRow the index of the first row in the block @@ -752,69 +568,50 @@ middleRows(Index startRow, NRowsType n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) +EIGEN_DEVICE_FUNC +inline typename NRowsBlockXpr::Type middleRows(Index startRow, Index n = N) { return typename NRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); } /// This is the const version of middleRows(). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const +EIGEN_DEVICE_FUNC +inline typename ConstNRowsBlockXpr::Type middleRows(Index startRow, Index n = N) const { return typename ConstNRowsBlockXpr::Type(derived(), startRow, 0, n, cols()); } -/// \returns a block consisting of the left columns of \c *this. +/// \returns a block consisting of the left columns of *this. /// /// \param n the number of columns in the block -/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_leftCols_int.cpp /// Output: \verbinclude MatrixBase_leftCols_int.out /// -/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. -/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename NColsBlockXpr::value>::Type -#else -typename NColsBlockXpr<...>::Type -#endif -leftCols(NColsType n) +EIGEN_DEVICE_FUNC +inline ColsBlockXpr leftCols(Index n) { - return typename NColsBlockXpr::value>::Type - (derived(), 0, 0, rows(), internal::get_runtime_value(n)); + return ColsBlockXpr(derived(), 0, 0, rows(), n); } -/// This is the const version of leftCols(NColsType). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstNColsBlockXpr::value>::Type -#else -const typename ConstNColsBlockXpr<...>::Type -#endif -leftCols(NColsType n) const +/// This is the const version of leftCols(Index). +EIGEN_DEVICE_FUNC +inline ConstColsBlockXpr leftCols(Index n) const { - return typename ConstNColsBlockXpr::value>::Type - (derived(), 0, 0, rows(), internal::get_runtime_value(n)); + return ConstColsBlockXpr(derived(), 0, 0, rows(), n); } -/// \returns a block consisting of the left columns of \c *this. +/// \returns a block consisting of the left columns of *this. /// /// \tparam N the number of columns in the block as specified at compile-time /// \param n the number of columns in the block as specified at run-time @@ -827,69 +624,50 @@ leftCols(NColsType n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename NColsBlockXpr::Type leftCols(Index n = N) +EIGEN_DEVICE_FUNC +inline typename NColsBlockXpr::Type leftCols(Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, 0, rows(), n); } /// This is the const version of leftCols(). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename ConstNColsBlockXpr::Type leftCols(Index n = N) const +EIGEN_DEVICE_FUNC +inline typename ConstNColsBlockXpr::Type leftCols(Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, 0, rows(), n); } -/// \returns a block consisting of the right columns of \c *this. +/// \returns a block consisting of the right columns of *this. /// /// \param n the number of columns in the block -/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include MatrixBase_rightCols_int.cpp /// Output: \verbinclude MatrixBase_rightCols_int.out /// -/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. -/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename NColsBlockXpr::value>::Type -#else -typename NColsBlockXpr<...>::Type -#endif -rightCols(NColsType n) +EIGEN_DEVICE_FUNC +inline ColsBlockXpr rightCols(Index n) { - return typename NColsBlockXpr::value>::Type - (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n)); + return ColsBlockXpr(derived(), 0, cols() - n, rows(), n); } -/// This is the const version of rightCols(NColsType). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstNColsBlockXpr::value>::Type -#else -const typename ConstNColsBlockXpr<...>::Type -#endif -rightCols(NColsType n) const +/// This is the const version of rightCols(Index). +EIGEN_DEVICE_FUNC +inline ConstColsBlockXpr rightCols(Index n) const { - return typename ConstNColsBlockXpr::value>::Type - (derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n)); + return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n); } -/// \returns a block consisting of the right columns of \c *this. +/// \returns a block consisting of the right columns of *this. /// /// \tparam N the number of columns in the block as specified at compile-time /// \param n the number of columns in the block as specified at run-time @@ -902,70 +680,51 @@ rightCols(NColsType n) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename NColsBlockXpr::Type rightCols(Index n = N) +EIGEN_DEVICE_FUNC +inline typename NColsBlockXpr::Type rightCols(Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); } /// This is the const version of rightCols(). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename ConstNColsBlockXpr::Type rightCols(Index n = N) const +EIGEN_DEVICE_FUNC +inline typename ConstNColsBlockXpr::Type rightCols(Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, cols() - n, rows(), n); } -/// \returns a block consisting of a range of columns of \c *this. +/// \returns a block consisting of a range of columns of *this. /// /// \param startCol the index of the first column in the block /// \param numCols the number of columns in the block -/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index. /// /// Example: \include DenseBase_middleCols_int.cpp /// Output: \verbinclude DenseBase_middleCols_int.out /// -/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. -/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename NColsBlockXpr::value>::Type -#else -typename NColsBlockXpr<...>::Type -#endif -middleCols(Index startCol, NColsType numCols) +EIGEN_DEVICE_FUNC +inline ColsBlockXpr middleCols(Index startCol, Index numCols) { - return typename NColsBlockXpr::value>::Type - (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols)); + return ColsBlockXpr(derived(), 0, startCol, rows(), numCols); } -/// This is the const version of middleCols(Index,NColsType). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstNColsBlockXpr::value>::Type -#else -const typename ConstNColsBlockXpr<...>::Type -#endif -middleCols(Index startCol, NColsType numCols) const +/// This is the const version of middleCols(Index,Index). +EIGEN_DEVICE_FUNC +inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const { - return typename ConstNColsBlockXpr::value>::Type - (derived(), 0, startCol, rows(), internal::get_runtime_value(numCols)); + return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols); } -/// \returns a block consisting of a range of columns of \c *this. +/// \returns a block consisting of a range of columns of *this. /// /// \tparam N the number of columns in the block as specified at compile-time /// \param startCol the index of the first column in the block @@ -979,26 +738,26 @@ middleCols(Index startCol, NColsType numCols) const /// EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) +EIGEN_DEVICE_FUNC +inline typename NColsBlockXpr::Type middleCols(Index startCol, Index n = N) { return typename NColsBlockXpr::Type(derived(), 0, startCol, rows(), n); } /// This is the const version of middleCols(). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const +EIGEN_DEVICE_FUNC +inline typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) const { return typename ConstNColsBlockXpr::Type(derived(), 0, startCol, rows(), n); } -/// \returns a fixed-size expression of a block of \c *this. +/// \returns a fixed-size expression of a block in *this. /// /// The template parameters \a NRows and \a NCols are the number of /// rows and columns in the block. @@ -1009,35 +768,29 @@ typename ConstNColsBlockXpr::Type middleCols(Index startCol, Index n = N) con /// Example: \include MatrixBase_block_int_int.cpp /// Output: \verbinclude MatrixBase_block_int_int.out /// -/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic -/// block(Index,Index,NRowsType,NColsType), here is the one-to-one equivalence: -/// \code -/// mat.template block(i,j) <--> mat.block(i,j,fix,fix) -/// \endcode -/// /// \note since block is a templated member, the keyword template has to be used /// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedBlockXpr::Type block(Index startRow, Index startCol) +EIGEN_DEVICE_FUNC +inline typename FixedBlockXpr::Type block(Index startRow, Index startCol) { return typename FixedBlockXpr::Type(derived(), startRow, startCol); } /// This is the const version of block<>(Index, Index). */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const +EIGEN_DEVICE_FUNC +inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol) const { return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol); } -/// \returns an expression of a block of \c *this. +/// \returns an expression of a block in *this. /// /// \tparam NRows number of rows in block as specified at compile-time /// \tparam NCols number of columns in block as specified at compile-time @@ -1052,25 +805,14 @@ const typename ConstFixedBlockXpr::Type block(Index startRow, Index /// \a NRows is \a Dynamic, and the same for the number of columns. /// /// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp -/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out -/// -/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic -/// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence: -/// \code -/// mat.template block(i,j,rows,cols) <--> mat.block(i,j,fix(rows),fix(cols)) -/// \endcode -/// If we known that, e.g., NRows==Dynamic and NCols!=Dynamic, then the equivalence becomes: -/// \code -/// mat.template block(i,j,rows,NCols) <--> mat.block(i,j,rows,fix) -/// \endcode +/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp /// EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /// -/// \sa block(Index,Index,NRowsType,NColsType), class Block +/// \sa class Block, block(Index,Index,Index,Index) /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedBlockXpr::Type block(Index startRow, Index startCol, +inline typename FixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) { return typename FixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); @@ -1078,14 +820,13 @@ typename FixedBlockXpr::Type block(Index startRow, Index startCol, /// This is the const version of block<>(Index, Index, Index, Index). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, +inline const typename ConstFixedBlockXpr::Type block(Index startRow, Index startCol, Index blockRows, Index blockCols) const { return typename ConstFixedBlockXpr::Type(derived(), startRow, startCol, blockRows, blockCols); } -/// \returns an expression of the \a i-th column of \c *this. Note that the numbering starts at 0. +/// \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0. /// /// Example: \include MatrixBase_col.cpp /// Output: \verbinclude MatrixBase_col.out @@ -1093,20 +834,20 @@ const typename ConstFixedBlockXpr::Type block(Index startRow, Index EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column-major) /** * \sa row(), class Block */ -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -ColXpr col(Index i) +EIGEN_DEVICE_FUNC +inline ColXpr col(Index i) { return ColXpr(derived(), i); } /// This is the const version of col(). -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -ConstColXpr col(Index i) const +EIGEN_DEVICE_FUNC +inline ConstColXpr col(Index i) const { return ConstColXpr(derived(), i); } -/// \returns an expression of the \a i-th row of \c *this. Note that the numbering starts at 0. +/// \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0. /// /// Example: \include MatrixBase_row.cpp /// Output: \verbinclude MatrixBase_row.out @@ -1114,166 +855,109 @@ ConstColXpr col(Index i) const EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row-major) /** * \sa col(), class Block */ -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -RowXpr row(Index i) +EIGEN_DEVICE_FUNC +inline RowXpr row(Index i) { return RowXpr(derived(), i); } /// This is the const version of row(). */ -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -ConstRowXpr row(Index i) const +EIGEN_DEVICE_FUNC +inline ConstRowXpr row(Index i) const { return ConstRowXpr(derived(), i); } -/// \returns an expression of a segment (i.e. a vector block) in \c *this with either dynamic or fixed sizes. +/// \returns a dynamic-size expression of a segment (i.e. a vector block) in *this. /// /// \only_for_vectors /// /// \param start the first coefficient in the segment /// \param n the number of coefficients in the segment -/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. /// /// Example: \include MatrixBase_segment_int_int.cpp /// Output: \verbinclude MatrixBase_segment_int_int.out /// -/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. -/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// -/// \note Even in the case that the returned expression has dynamic size, in the case +/// \note Even though the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// -/// \sa block(Index,Index,NRowsType,NColsType), fix, fix(int), class Block +/// \sa class Block, segment(Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename FixedSegmentReturnType::value>::Type -#else -typename FixedSegmentReturnType<...>::Type -#endif -segment(Index start, NType n) +EIGEN_DEVICE_FUNC +inline SegmentReturnType segment(Index start, Index n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename FixedSegmentReturnType::value>::Type - (derived(), start, internal::get_runtime_value(n)); + return SegmentReturnType(derived(), start, n); } -/// This is the const version of segment(Index,NType). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstFixedSegmentReturnType::value>::Type -#else -const typename ConstFixedSegmentReturnType<...>::Type -#endif -segment(Index start, NType n) const +/// This is the const version of segment(Index,Index). +EIGEN_DEVICE_FUNC +inline ConstSegmentReturnType segment(Index start, Index n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename ConstFixedSegmentReturnType::value>::Type - (derived(), start, internal::get_runtime_value(n)); + return ConstSegmentReturnType(derived(), start, n); } -/// \returns an expression of the first coefficients of \c *this with either dynamic or fixed sizes. +/// \returns a dynamic-size expression of the first coefficients of *this. /// /// \only_for_vectors /// /// \param n the number of coefficients in the segment -/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. /// /// Example: \include MatrixBase_start_int.cpp /// Output: \verbinclude MatrixBase_start_int.out /// -/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. -/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// -/// \note Even in the case that the returned expression has dynamic size, in the case +/// \note Even though the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// /// \sa class Block, block(Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename FixedSegmentReturnType::value>::Type -#else -typename FixedSegmentReturnType<...>::Type -#endif -head(NType n) +EIGEN_DEVICE_FUNC +inline SegmentReturnType head(Index n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename FixedSegmentReturnType::value>::Type - (derived(), 0, internal::get_runtime_value(n)); + return SegmentReturnType(derived(), 0, n); } -/// This is the const version of head(NType). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstFixedSegmentReturnType::value>::Type -#else -const typename ConstFixedSegmentReturnType<...>::Type -#endif -head(NType n) const +/// This is the const version of head(Index). +EIGEN_DEVICE_FUNC +inline ConstSegmentReturnType head(Index n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename ConstFixedSegmentReturnType::value>::Type - (derived(), 0, internal::get_runtime_value(n)); + return ConstSegmentReturnType(derived(), 0, n); } -/// \returns an expression of a last coefficients of \c *this with either dynamic or fixed sizes. +/// \returns a dynamic-size expression of the last coefficients of *this. /// /// \only_for_vectors /// /// \param n the number of coefficients in the segment -/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index. /// /// Example: \include MatrixBase_end_int.cpp /// Output: \verbinclude MatrixBase_end_int.out /// -/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. -/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details. -/// -/// \note Even in the case that the returned expression has dynamic size, in the case +/// \note Even though the returned expression has dynamic size, in the case /// when it is applied to a fixed-size vector, it inherits a fixed maximal size, /// which means that evaluating it does not cause a dynamic memory allocation. /// /// \sa class Block, block(Index,Index) /// -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -typename FixedSegmentReturnType::value>::Type -#else -typename FixedSegmentReturnType<...>::Type -#endif -tail(NType n) +EIGEN_DEVICE_FUNC +inline SegmentReturnType tail(Index n) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename FixedSegmentReturnType::value>::Type - (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n)); + return SegmentReturnType(derived(), this->size() - n, n); } /// This is the const version of tail(Index). -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -#ifndef EIGEN_PARSED_BY_DOXYGEN -const typename ConstFixedSegmentReturnType::value>::Type -#else -const typename ConstFixedSegmentReturnType<...>::Type -#endif -tail(NType n) const +EIGEN_DEVICE_FUNC +inline ConstSegmentReturnType tail(Index n) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return typename ConstFixedSegmentReturnType::value>::Type - (derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n)); + return ConstSegmentReturnType(derived(), this->size() - n, n); } /// \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this @@ -1290,11 +974,11 @@ tail(NType n) const /// Example: \include MatrixBase_template_int_segment.cpp /// Output: \verbinclude MatrixBase_template_int_segment.out /// -/// \sa segment(Index,NType), class Block +/// \sa class Block /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedSegmentReturnType::Type segment(Index start, Index n = N) +EIGEN_DEVICE_FUNC +inline typename FixedSegmentReturnType::Type segment(Index start, Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), start, n); @@ -1302,14 +986,14 @@ typename FixedSegmentReturnType::Type segment(Index start, Index n = N) /// This is the const version of segment(Index). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const +EIGEN_DEVICE_FUNC +inline typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), start, n); } -/// \returns a fixed-size expression of the first coefficients of \c *this. +/// \returns a fixed-size expression of the first coefficients of *this. /// /// \only_for_vectors /// @@ -1322,11 +1006,11 @@ typename ConstFixedSegmentReturnType::Type segment(Index start, Index n = N) /// Example: \include MatrixBase_template_int_start.cpp /// Output: \verbinclude MatrixBase_template_int_start.out /// -/// \sa head(NType), class Block +/// \sa class Block /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedSegmentReturnType::Type head(Index n = N) +EIGEN_DEVICE_FUNC +inline typename FixedSegmentReturnType::Type head(Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), 0, n); @@ -1334,14 +1018,14 @@ typename FixedSegmentReturnType::Type head(Index n = N) /// This is the const version of head(). template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename ConstFixedSegmentReturnType::Type head(Index n = N) const +EIGEN_DEVICE_FUNC +inline typename ConstFixedSegmentReturnType::Type head(Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), 0, n); } -/// \returns a fixed-size expression of the last coefficients of \c *this. +/// \returns a fixed-size expression of the last coefficients of *this. /// /// \only_for_vectors /// @@ -1354,11 +1038,11 @@ typename ConstFixedSegmentReturnType::Type head(Index n = N) const /// Example: \include MatrixBase_template_int_end.cpp /// Output: \verbinclude MatrixBase_template_int_end.out /// -/// \sa tail(NType), class Block +/// \sa class Block /// template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename FixedSegmentReturnType::Type tail(Index n = N) +EIGEN_DEVICE_FUNC +inline typename FixedSegmentReturnType::Type tail(Index n = N) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename FixedSegmentReturnType::Type(derived(), size() - n); @@ -1366,78 +1050,9 @@ typename FixedSegmentReturnType::Type tail(Index n = N) /// This is the const version of tail. template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename ConstFixedSegmentReturnType::Type tail(Index n = N) const +EIGEN_DEVICE_FUNC +inline typename ConstFixedSegmentReturnType::Type tail(Index n = N) const { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return typename ConstFixedSegmentReturnType::Type(derived(), size() - n); } - -/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this -/// is col-major (resp. row-major). -/// -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -InnerVectorReturnType innerVector(Index outer) -{ return InnerVectorReturnType(derived(), outer); } - -/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this -/// is col-major (resp. row-major). Read-only. -/// -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const ConstInnerVectorReturnType innerVector(Index outer) const -{ return ConstInnerVectorReturnType(derived(), outer); } - -/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this -/// is col-major (resp. row-major). -/// -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -InnerVectorsReturnType -innerVectors(Index outerStart, Index outerSize) -{ - return Block(derived(), - IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, - IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); - -} - -/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this -/// is col-major (resp. row-major). Read-only. -/// -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -const ConstInnerVectorsReturnType -innerVectors(Index outerStart, Index outerSize) const -{ - return Block(derived(), - IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart, - IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize); - -} - -/** \returns the i-th subvector (column or vector) according to the \c Direction - * \sa subVectors() - */ -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename internal::conditional::type -subVector(Index i) -{ - return typename internal::conditional::type(derived(),i); -} - -/** This is the const version of subVector(Index) */ -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename internal::conditional::type -subVector(Index i) const -{ - return typename internal::conditional::type(derived(),i); -} - -/** \returns the number of subvectors (rows or columns) in the direction \c Direction - * \sa subVector(Index) - */ -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Index subVectors() const -{ return (Direction==Vertical)?cols():rows(); } - diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h index 42ff901ca..89f4faaac 100644 --- a/uppsrc/plugin/Eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h +++ b/uppsrc/plugin/Eigen/Eigen/src/plugins/CommonCwiseUnaryOps.h @@ -64,49 +64,6 @@ cast() const return typename CastXpr::Type(derived()); } -template struct ShiftRightXpr { - typedef CwiseUnaryOp, const Derived> Type; -}; - -/// \returns an expression of \c *this with the \a Scalar type arithmetically -/// shifted right by \a N bit positions. -/// -/// The template parameter \a N specifies the number of bit positions to shift. -/// -EIGEN_DOC_UNARY_ADDONS(cast,conversion function) -/// -/// \sa class CwiseUnaryOp -/// -template -EIGEN_DEVICE_FUNC -typename ShiftRightXpr::Type -shift_right() const -{ - return typename ShiftRightXpr::Type(derived()); -} - - -template struct ShiftLeftXpr { - typedef CwiseUnaryOp, const Derived> Type; -}; - -/// \returns an expression of \c *this with the \a Scalar type logically -/// shifted left by \a N bit positions. -/// -/// The template parameter \a N specifies the number of bit positions to shift. -/// -EIGEN_DOC_UNARY_ADDONS(cast,conversion function) -/// -/// \sa class CwiseUnaryOp -/// -template -EIGEN_DEVICE_FUNC -typename ShiftLeftXpr::Type -shift_left() const -{ - return typename ShiftLeftXpr::Type(derived()); -} - /// \returns an expression of the complex conjugate of \c *this. /// EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) @@ -119,20 +76,6 @@ conjugate() const return ConjugateReturnType(derived()); } -/// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise. -/// -EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) -/// -/// \sa conjugate() -template -EIGEN_DEVICE_FUNC -inline typename internal::conditional::type -conjugateIf() const -{ - typedef typename internal::conditional::type ReturnType; - return ReturnType(derived()); -} - /// \returns a read-only expression of the real part of \c *this. /// EIGEN_DOC_UNARY_ADDONS(real,real part function) diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/IndexedViewMethods.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/IndexedViewMethods.h deleted file mode 100644 index 5bfb19ac6..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/plugins/IndexedViewMethods.h +++ /dev/null @@ -1,262 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2017 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if !defined(EIGEN_PARSED_BY_DOXYGEN) - -// This file is automatically included twice to generate const and non-const versions - -#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS -#define EIGEN_INDEXED_VIEW_METHOD_CONST const -#define EIGEN_INDEXED_VIEW_METHOD_TYPE ConstIndexedViewType -#else -#define EIGEN_INDEXED_VIEW_METHOD_CONST -#define EIGEN_INDEXED_VIEW_METHOD_TYPE IndexedViewType -#endif - -#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS -protected: - -// define some aliases to ease readability - -template -struct IvcRowType : public internal::IndexedViewCompatibleType {}; - -template -struct IvcColType : public internal::IndexedViewCompatibleType {}; - -template -struct IvcType : public internal::IndexedViewCompatibleType {}; - -typedef typename internal::IndexedViewCompatibleType::type IvcIndex; - -template -typename IvcRowType::type -ivcRow(const Indices& indices) const { - return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().rows()),Specialized); -} - -template -typename IvcColType::type -ivcCol(const Indices& indices) const { - return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().cols()),Specialized); -} - -template -typename IvcColType::type -ivcSize(const Indices& indices) const { - return internal::makeIndexedViewCompatible(indices, internal::variable_if_dynamic(derived().size()),Specialized); -} - -public: - -#endif - -template -struct EIGEN_INDEXED_VIEW_METHOD_TYPE { - typedef IndexedView::type, - typename IvcColType::type> type; -}; - -// This is the generic version - -template -typename internal::enable_if::value - && internal::traits::type>::ReturnAsIndexedView, - typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type >::type -operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - return typename EIGEN_INDEXED_VIEW_METHOD_TYPE::type - (derived(), ivcRow(rowIndices), ivcCol(colIndices)); -} - -// The following overload returns a Block<> object - -template -typename internal::enable_if::value - && internal::traits::type>::ReturnAsBlock, - typename internal::traits::type>::BlockType>::type -operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - typedef typename internal::traits::type>::BlockType BlockType; - typename IvcRowType::type actualRowIndices = ivcRow(rowIndices); - typename IvcColType::type actualColIndices = ivcCol(colIndices); - return BlockType(derived(), - internal::first(actualRowIndices), - internal::first(actualColIndices), - internal::size(actualRowIndices), - internal::size(actualColIndices)); -} - -// The following overload returns a Scalar - -template -typename internal::enable_if::value - && internal::traits::type>::ReturnAsScalar, - CoeffReturnType >::type -operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - return Base::operator()(internal::eval_expr_given_size(rowIndices,rows()),internal::eval_expr_given_size(colIndices,cols())); -} - -#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE - -// The following three overloads are needed to handle raw Index[N] arrays. - -template -IndexedView::type> -operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndices& colIndices) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - return IndexedView::type> - (derived(), rowIndices, ivcCol(colIndices)); -} - -template -IndexedView::type, const ColIndicesT (&)[ColIndicesN]> -operator()(const RowIndices& rowIndices, const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - return IndexedView::type,const ColIndicesT (&)[ColIndicesN]> - (derived(), ivcRow(rowIndices), colIndices); -} - -template -IndexedView -operator()(const RowIndicesT (&rowIndices)[RowIndicesN], const ColIndicesT (&colIndices)[ColIndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - return IndexedView - (derived(), rowIndices, colIndices); -} - -#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE - -// Overloads for 1D vectors/arrays - -template -typename internal::enable_if< - IsRowMajor && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), - IndexedView::type> >::type -operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return IndexedView::type> - (derived(), IvcIndex(0), ivcCol(indices)); -} - -template -typename internal::enable_if< - (!IsRowMajor) && (!(internal::get_compile_time_incr::type>::value==1 || internal::is_valid_index_type::value)), - IndexedView::type,IvcIndex> >::type -operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return IndexedView::type,IvcIndex> - (derived(), ivcRow(indices), IvcIndex(0)); -} - -template -typename internal::enable_if< - (internal::get_compile_time_incr::type>::value==1) && (!internal::is_valid_index_type::value) && (!symbolic::is_symbolic::value), - VectorBlock::value> >::type -operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - typename IvcType::type actualIndices = ivcSize(indices); - return VectorBlock::value> - (derived(), internal::first(actualIndices), internal::size(actualIndices)); -} - -template -typename internal::enable_if::value, CoeffReturnType >::type -operator()(const IndexType& id) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - return Base::operator()(internal::eval_expr_given_size(id,size())); -} - -#if EIGEN_HAS_STATIC_ARRAY_TEMPLATE - -template -typename internal::enable_if >::type -operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return IndexedView - (derived(), IvcIndex(0), indices); -} - -template -typename internal::enable_if >::type -operator()(const IndicesT (&indices)[IndicesN]) EIGEN_INDEXED_VIEW_METHOD_CONST -{ - EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) - return IndexedView - (derived(), indices, IvcIndex(0)); -} - -#endif // EIGEN_HAS_STATIC_ARRAY_TEMPLATE - -#undef EIGEN_INDEXED_VIEW_METHOD_CONST -#undef EIGEN_INDEXED_VIEW_METHOD_TYPE - -#ifndef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS -#define EIGEN_INDEXED_VIEW_METHOD_2ND_PASS -#include "IndexedViewMethods.h" -#undef EIGEN_INDEXED_VIEW_METHOD_2ND_PASS -#endif - -#else // EIGEN_PARSED_BY_DOXYGEN - -/** - * \returns a generic submatrix view defined by the rows and columns indexed \a rowIndices and \a colIndices respectively. - * - * Each parameter must either be: - * - An integer indexing a single row or column - * - Eigen::all indexing the full set of respective rows or columns in increasing order - * - An ArithmeticSequence as returned by the Eigen::seq and Eigen::seqN functions - * - Any %Eigen's vector/array of integers or expressions - * - Plain C arrays: \c int[N] - * - And more generally any type exposing the following two member functions: - * \code - * operator[]() const; - * size() const; - * \endcode - * where \c stands for any integer type compatible with Eigen::Index (i.e. \c std::ptrdiff_t). - * - * The last statement implies compatibility with \c std::vector, \c std::valarray, \c std::array, many of the Range-v3's ranges, etc. - * - * If the submatrix can be represented using a starting position \c (i,j) and positive sizes \c (rows,columns), then this - * method will returns a Block object after extraction of the relevant information from the passed arguments. This is the case - * when all arguments are either: - * - An integer - * - Eigen::all - * - An ArithmeticSequence with compile-time increment strictly equal to 1, as returned by Eigen::seq(a,b), and Eigen::seqN(a,N). - * - * Otherwise a more general IndexedView object will be returned, after conversion of the inputs - * to more suitable types \c RowIndices' and \c ColIndices'. - * - * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but taking a single parameter. - * - * See also this question and its answer for an example of how to duplicate coefficients. - * - * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index) - */ -template -IndexedView_or_Block -operator()(const RowIndices& rowIndices, const ColIndices& colIndices); - -/** This is an overload of operator()(const RowIndices&, const ColIndices&) for 1D vectors or arrays - * - * \only_for_vectors - */ -template -IndexedView_or_VectorBlock -operator()(const Indices& indices); - -#endif // EIGEN_PARSED_BY_DOXYGEN diff --git a/uppsrc/plugin/Eigen/Eigen/src/plugins/ReshapedMethods.h b/uppsrc/plugin/Eigen/Eigen/src/plugins/ReshapedMethods.h deleted file mode 100644 index 482a6b045..000000000 --- a/uppsrc/plugin/Eigen/Eigen/src/plugins/ReshapedMethods.h +++ /dev/null @@ -1,149 +0,0 @@ - -#ifdef EIGEN_PARSED_BY_DOXYGEN - -/// \returns an expression of \c *this with reshaped sizes. -/// -/// \param nRows the number of rows in the reshaped expression, specified at either run-time or compile-time, or AutoSize -/// \param nCols the number of columns in the reshaped expression, specified at either run-time or compile-time, or AutoSize -/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor), -/// or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor. -/// \tparam NRowsType the type of the value handling the number of rows, typically Index. -/// \tparam NColsType the type of the value handling the number of columns, typically Index. -/// -/// Dynamic size example: \include MatrixBase_reshaped_int_int.cpp -/// Output: \verbinclude MatrixBase_reshaped_int_int.out -/// -/// The number of rows \a nRows and columns \a nCols can also be specified at compile-time by passing Eigen::fix, -/// or Eigen::fix(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N equals Eigen::Dynamic. -/// Here is an example with a fixed number of rows and columns: -/// \include MatrixBase_reshaped_fixed.cpp -/// Output: \verbinclude MatrixBase_reshaped_fixed.out -/// -/// Finally, one of the sizes parameter can be automatically deduced from the other one by passing AutoSize as in the following example: -/// \include MatrixBase_reshaped_auto.cpp -/// Output: \verbinclude MatrixBase_reshaped_auto.out -/// AutoSize does preserve compile-time sizes when possible, i.e., when the sizes of the input are known at compile time \b and -/// that the other size is passed at compile-time using Eigen::fix as above. -/// -/// \sa class Reshaped, fix, fix(int) -/// -template -EIGEN_DEVICE_FUNC -inline Reshaped -reshaped(NRowsType nRows, NColsType nCols); - -/// This is the const version of reshaped(NRowsType,NColsType). -template -EIGEN_DEVICE_FUNC -inline const Reshaped -reshaped(NRowsType nRows, NColsType nCols) const; - -/// \returns an expression of \c *this with columns (or rows) stacked to a linear column vector -/// -/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in row-major-order (RowMajor), -/// or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor. -/// -/// This overloads is essentially a shortcut for `A.reshaped(AutoSize,fix<1>)`. -/// -/// - If `Order==ColMajor` (the default), then it returns a column-vector from the stacked columns of \c *this. -/// - If `Order==RowMajor`, then it returns a column-vector from the stacked rows of \c *this. -/// - If `Order==AutoOrder`, then it returns a column-vector with elements stacked following the storage order of \c *this. -/// This mode is the recommended one when the particular ordering of the element is not relevant. -/// -/// Example: -/// \include MatrixBase_reshaped_to_vector.cpp -/// Output: \verbinclude MatrixBase_reshaped_to_vector.out -/// -/// If you want more control, you can still fall back to reshaped(NRowsType,NColsType). -/// -/// \sa reshaped(NRowsType,NColsType), class Reshaped -/// -template -EIGEN_DEVICE_FUNC -inline Reshaped -reshaped(); - -/// This is the const version of reshaped(). -template -EIGEN_DEVICE_FUNC -inline const Reshaped -reshaped() const; - -#else - -// This file is automatically included twice to generate const and non-const versions - -#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS -#define EIGEN_RESHAPED_METHOD_CONST const -#else -#define EIGEN_RESHAPED_METHOD_CONST -#endif - -#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS - -// This part is included once - -#endif - -template -EIGEN_DEVICE_FUNC -inline Reshaped::value, - internal::get_compiletime_reshape_size::value> -reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST -{ - return Reshaped::value, - internal::get_compiletime_reshape_size::value> - (derived(), - internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()), - internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size())); -} - -template -EIGEN_DEVICE_FUNC -inline Reshaped::value, - internal::get_compiletime_reshape_size::value, - internal::get_compiletime_reshape_order::value> -reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST -{ - return Reshaped::value, - internal::get_compiletime_reshape_size::value, - internal::get_compiletime_reshape_order::value> - (derived(), - internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()), - internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size())); -} - -// Views as linear vectors - -EIGEN_DEVICE_FUNC -inline Reshaped -reshaped() EIGEN_RESHAPED_METHOD_CONST -{ - return Reshaped(derived(),size(),1); -} - -template -EIGEN_DEVICE_FUNC -inline Reshaped::value> -reshaped() EIGEN_RESHAPED_METHOD_CONST -{ - EIGEN_STATIC_ASSERT(Order==RowMajor || Order==ColMajor || Order==AutoOrder, INVALID_TEMPLATE_PARAMETER); - return Reshaped::value> - (derived(), size(), 1); -} - -#undef EIGEN_RESHAPED_METHOD_CONST - -#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS -#define EIGEN_RESHAPED_METHOD_2ND_PASS -#include "ReshapedMethods.h" -#undef EIGEN_RESHAPED_METHOD_2ND_PASS -#endif - -#endif // EIGEN_PARSED_BY_DOXYGEN diff --git a/uppsrc/plugin/Eigen/srcdoc.tpp/Eigen_en-us.tpp b/uppsrc/plugin/Eigen/srcdoc.tpp/Eigen_en-us.tpp index 8b935cdde..bef98af12 100644 --- a/uppsrc/plugin/Eigen/srcdoc.tpp/Eigen_en-us.tpp +++ b/uppsrc/plugin/Eigen/srcdoc.tpp/Eigen_en-us.tpp @@ -17,9 +17,8 @@ vectors, numerical solvers and related algorithms.]&] [s0; [C2 -|Matrix2d res `= a`*b;-|// Just multiply them using `*]&] [s0;#2 &] [s0;#2 &] -[s0;# [2 Eigen package is a wrapper of Eigen library, updated to master -branch ][^https`:`/`/gitlab`.com`/libeigen`/eigen`/`-`/commit`/c1d944dd913d05180b7d2d1229072c9c52a11f29^2 c -ommit C1D944DD][2 (9/May/2020). It includes the library and helper +[s0;# [2 Eigen package is a wrapper of Eigen library, updated to stable +release 3.3.8 (05/10/2020). It includes the library and helper functions to integrate better Eigen with U`+`+. Starting from the 3.1.1 version, it is licensed under the ][^http`:`/`/www`.mozilla`.org`/MPL`/2`.0`/^2 M PL2][2 , which is a simple weak copyleft license. Common questions diff --git a/uppsrc/plugin/Eigen/unsupported/CMakeLists.txt b/uppsrc/plugin/Eigen/unsupported/CMakeLists.txt new file mode 100644 index 000000000..9a5666105 --- /dev/null +++ b/uppsrc/plugin/Eigen/unsupported/CMakeLists.txt @@ -0,0 +1,9 @@ +add_subdirectory(Eigen) +add_subdirectory(doc EXCLUDE_FROM_ALL) +if(BUILD_TESTING) + if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) + add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest + else() + add_subdirectory(test EXCLUDE_FROM_ALL) + endif() +endif() diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/AdolcForward b/uppsrc/plugin/Eigen/unsupported/Eigen/AdolcForward index 9b8d3cd1a..15f5f0731 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/AdolcForward +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/AdolcForward @@ -40,7 +40,7 @@ # undef realloc #endif -#include "../../Eigen/Core" +#include namespace Eigen { diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/AlignedVector3 b/uppsrc/plugin/Eigen/unsupported/Eigen/AlignedVector3 index 4fa1842ac..47a86d4c0 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/AlignedVector3 +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/AlignedVector3 @@ -10,9 +10,7 @@ #ifndef EIGEN_ALIGNED_VECTOR3 #define EIGEN_ALIGNED_VECTOR3 -#include "../../Eigen/Geometry" - -#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" +#include namespace Eigen { @@ -78,9 +76,6 @@ template class AlignedVector3 { return m_coeffs.coeffRef(index);} - inline AlignedVector3() - {} - inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z) : m_coeffs(x, y, z, Scalar(0)) {} @@ -134,9 +129,6 @@ template class AlignedVector3 inline AlignedVector3 operator-(const AlignedVector3& other) const { return AlignedVector3(m_coeffs - other.m_coeffs); } - inline AlignedVector3 operator-() const - { return AlignedVector3(-m_coeffs); } - inline AlignedVector3 operator-=(const AlignedVector3& other) { m_coeffs -= other.m_coeffs; return *this; } @@ -229,6 +221,4 @@ struct evaluator > } -#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" - #endif // EIGEN_ALIGNED_VECTOR3 diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/ArpackSupport b/uppsrc/plugin/Eigen/unsupported/Eigen/ArpackSupport index 28c95ffa2..a0d4820e1 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/ArpackSupport +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/ArpackSupport @@ -9,7 +9,7 @@ #ifndef EIGEN_ARPACKSUPPORT_MODULE_H #define EIGEN_ARPACKSUPPORT_MODULE_H -#include "../../Eigen/Core" +#include /** \defgroup ArpackSupport_Module Arpack support module * @@ -20,12 +20,12 @@ * \endcode */ -#include "../../Eigen/SparseCholesky" +#include -#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" +#include #include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h" -#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" +#include #endif // EIGEN_ARPACKSUPPORT_MODULE_H /* vim: set filetype=cpp et sw=2 ts=2 ai: */ diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/AutoDiff b/uppsrc/plugin/Eigen/unsupported/Eigen/AutoDiff index 7a4ff460c..abf5b7d67 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/AutoDiff +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/AutoDiff @@ -28,17 +28,11 @@ namespace Eigen { //@{ } -#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" - #include "src/AutoDiff/AutoDiffScalar.h" // #include "src/AutoDiff/AutoDiffVector.h" #include "src/AutoDiff/AutoDiffJacobian.h" -#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" - - - namespace Eigen { //@} } diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/BVH b/uppsrc/plugin/Eigen/unsupported/Eigen/BVH index 666c9835f..0161a5402 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/BVH +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/BVH @@ -10,9 +10,9 @@ #ifndef EIGEN_BVH_MODULE_H #define EIGEN_BVH_MODULE_H -#include "../../Eigen/Core" -#include "../../Eigen/Geometry" -#include "../../Eigen/StdVector" +#include +#include +#include #include #include diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CMakeLists.txt b/uppsrc/plugin/Eigen/unsupported/Eigen/CMakeLists.txt new file mode 100644 index 000000000..631a06014 --- /dev/null +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CMakeLists.txt @@ -0,0 +1,32 @@ +set(Eigen_HEADERS + AdolcForward + AlignedVector3 + ArpackSupport + AutoDiff + BVH + EulerAngles + FFT + IterativeSolvers + KroneckerProduct + LevenbergMarquardt + MatrixFunctions + MoreVectorization + MPRealSupport + NonLinearOptimization + NumericalDiff + OpenGLSupport + Polynomials + Skyline + SparseExtra + SpecialFunctions + Splines + ) + +install(FILES + ${Eigen_HEADERS} + DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel + ) + +install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h") + +add_subdirectory(CXX11) diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/Tensor b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/Tensor index 2640f9565..bb6523d15 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/Tensor +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/Tensor @@ -13,11 +13,21 @@ #include "../../../Eigen/Core" -#if EIGEN_HAS_CXX11 +#ifdef EIGEN_USE_SYCL +#undef min +#undef max +#undef isnan +#undef isinf +#undef isfinite +#include +#include +#include +#include +#endif + +#include #include "../SpecialFunctions" - -#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" #include "src/util/CXX11Meta.h" #include "src/util/MaxSizeVector.h" @@ -36,7 +46,6 @@ #include #include #include -#include #ifdef _WIN32 typedef __int16 int16_t; @@ -45,10 +54,12 @@ typedef __int32 int32_t; typedef unsigned __int32 uint32_t; typedef __int64 int64_t; typedef unsigned __int64 uint64_t; -#include #else #include -#include +#endif + +#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 +#include #endif #ifdef _WIN32 @@ -59,19 +70,17 @@ typedef unsigned __int64 uint64_t; #include #endif -#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL) +#ifdef EIGEN_USE_THREADS #include "ThreadPool" #endif #ifdef EIGEN_USE_GPU - #include - #if defined(EIGEN_USE_HIP) - #include - #else - #include - #endif - #include - #include +#include +#include +#if __cplusplus >= 201103L +#include +#include +#endif #endif #include "src/Tensor/TensorMacros.h" @@ -81,10 +90,7 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorCostModel.h" #include "src/Tensor/TensorDeviceDefault.h" #include "src/Tensor/TensorDeviceThreadPool.h" -#include "src/Tensor/TensorDeviceGpu.h" -#ifndef gpu_assert -#define gpu_assert(x) -#endif +#include "src/Tensor/TensorDeviceCuda.h" #include "src/Tensor/TensorDeviceSycl.h" #include "src/Tensor/TensorIndexList.h" #include "src/Tensor/TensorDimensionList.h" @@ -97,19 +103,18 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGlobalFunctions.h" #include "src/Tensor/TensorBase.h" -#include "src/Tensor/TensorBlock.h" #include "src/Tensor/TensorEvaluator.h" #include "src/Tensor/TensorExpr.h" #include "src/Tensor/TensorReduction.h" -#include "src/Tensor/TensorReductionGpu.h" +#include "src/Tensor/TensorReductionCuda.h" #include "src/Tensor/TensorArgMax.h" #include "src/Tensor/TensorConcatenation.h" #include "src/Tensor/TensorContractionMapper.h" #include "src/Tensor/TensorContractionBlocking.h" #include "src/Tensor/TensorContraction.h" #include "src/Tensor/TensorContractionThreadPool.h" -#include "src/Tensor/TensorContractionGpu.h" +#include "src/Tensor/TensorContractionCuda.h" #include "src/Tensor/TensorConversion.h" #include "src/Tensor/TensorConvolution.h" #include "src/Tensor/TensorFFT.h" @@ -131,15 +136,8 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorGenerator.h" #include "src/Tensor/TensorAssign.h" #include "src/Tensor/TensorScan.h" -#include "src/Tensor/TensorTrace.h" - -#ifdef EIGEN_USE_SYCL -#include "src/Tensor/TensorReductionSycl.h" -#include "src/Tensor/TensorConvolutionSycl.h" -#include "src/Tensor/TensorContractionSycl.h" -#include "src/Tensor/TensorScanSycl.h" -#endif +#include "src/Tensor/TensorSycl.h" #include "src/Tensor/TensorExecutor.h" #include "src/Tensor/TensorDevice.h" @@ -151,7 +149,6 @@ typedef unsigned __int64 uint64_t; #include "src/Tensor/TensorIO.h" -#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" +#include -#endif // EIGEN_HAS_CXX11 //#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/TensorSymmetry b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/TensorSymmetry index b09c5e472..fb1b0c0fb 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/TensorSymmetry +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/TensorSymmetry @@ -10,9 +10,9 @@ #ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE #define EIGEN_CXX11_TENSORSYMMETRY_MODULE -#include "Tensor" +#include -#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" +#include #include "src/util/CXX11Meta.h" @@ -33,7 +33,7 @@ #include "src/TensorSymmetry/StaticSymmetry.h" #include "src/TensorSymmetry/DynamicSymmetry.h" -#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" +#include #endif // EIGEN_CXX11_TENSORSYMMETRY_MODULE diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/ThreadPool b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/ThreadPool index 71a6afe39..09d637e9a 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/ThreadPool +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/ThreadPool @@ -12,7 +12,7 @@ #include "../../../Eigen/Core" -#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" +#include /** \defgroup CXX11_ThreadPool_Module C++11 ThreadPool Module * @@ -44,32 +44,22 @@ #include #include #include -#include - -// There are non-parenthesized calls to "max" in the header, -// which trigger a check in test/main.h causing compilation to fail. -// We work around the check here by removing the check for max in -// the case where we have to emulate thread_local. -#ifdef max -#undef max -#endif -#include #include "src/util/CXX11Meta.h" #include "src/util/MaxSizeVector.h" #include "src/ThreadPool/ThreadLocal.h" #include "src/ThreadPool/ThreadYield.h" -#include "src/ThreadPool/ThreadCancel.h" #include "src/ThreadPool/EventCount.h" #include "src/ThreadPool/RunQueue.h" #include "src/ThreadPool/ThreadPoolInterface.h" #include "src/ThreadPool/ThreadEnvironment.h" -#include "src/ThreadPool/Barrier.h" +#include "src/ThreadPool/SimpleThreadPool.h" #include "src/ThreadPool/NonBlockingThreadPool.h" #endif -#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" +#include #endif // EIGEN_CXX11_THREADPOOL_MODULE + diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/README.md b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/README.md index 9b6f14204..da70fa216 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -83,7 +83,7 @@ large enough to hold all the data. // You can also map fixed-size tensors. Here we get a 1d view of // the 2d fixed-size tensor. - TensorFixedSize> t_4x3; + TensorFixedSize> t_4x3; TensorMap> t_12(t_4x3.data(), 12); @@ -430,11 +430,8 @@ This is exactly the same as not inserting a `device()` call. #### Evaluating with a Thread Pool - // Create the Eigen ThreadPool - Eigen::ThreadPool pool(8 /* number of threads in pool */) - // Create the Eigen ThreadPoolDevice. - Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */); + Eigen::ThreadPoolDevice my_device(4 /* number of threads to use */); // Now just use the device when evaluating expressions. Eigen::Tensor c(30, 50); @@ -1178,58 +1175,6 @@ Reduce a tensor using a user-defined reduction operator. See `SumReducer` in TensorFunctors.h for information on how to implement a reduction operator. -## Trace - -A *Trace* operation returns a tensor with fewer dimensions than the original -tensor. It returns a tensor whose elements are the sum of the elements of the -original tensor along the main diagonal for a list of specified dimensions, the -"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions -are passed as an input parameter to the operation, are of type `::``Dimensions` -, and have the same requirements when passed as an input parameter. In addition, -the trace dimensions must have the same size. - -Example: Trace along 2 dimensions. - - // Create a tensor of 3 dimensions - Eigen::Tensor a(2, 2, 3); - a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}}); - // Specify the dimensions along which the trace will be computed. - // In this example, the trace can only be computed along the dimensions - // with indices 0 and 1 - Eigen::array dims({0, 1}); - // The output tensor contains all but the trace dimensions. - Tensor a_trace = a.trace(dims); - cout << "a_trace:" << endl; - cout << a_trace << endl; - => - a_trace: - 11 - 13 - 15 - - -### ` trace(const Dimensions& new_dims)` -### ` trace()` - -As a special case, if no parameter is passed to the operation, trace is computed -along *all* dimensions of the input tensor. - -Example: Trace along all dimensions. - - // Create a tensor of 3 dimensions, with all dimensions having the same size. - Eigen::Tensor a(3, 3, 3); - a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, - {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}}, - {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}}); - // Result is a zero dimension tensor - Tensor a_trace = a.trace(); - cout<<"a_trace:"< - a_trace: - 42 - - ## Scan Operations A *Scan* operation returns a tensor with the same dimensions as the original @@ -1630,81 +1575,81 @@ dimension in RowMajor layout. For example, given the following input tensor: - Eigen::Tensor tensor(3,4); - tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, - {4.0f, 5.0f, 6.0f, 7.0f}, - {8.0f, 9.0f, 10.0f, 11.0f}}); + Eigen::Tensor tensor(3,4); + tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f}, + {4.0f, 5.0f, 6.0f, 7.0f}, + {8.0f, 9.0f, 10.0f, 11.0f}}); - cout << "tensor: " << endl << tensor << endl; - => - tensor: - 0 1 2 3 - 4 5 6 7 - 8 9 10 11 + cout << "tensor: " << endl << tensor << endl; +=> +tensor: + 0 1 2 3 + 4 5 6 7 + 8 9 10 11 Six 2x2 patches can be extracted and indexed using the following code: - Eigen::Tensor patch; - Eigen::array patch_dims; - patch_dims[0] = 2; - patch_dims[1] = 2; - patch = tensor.extract_patches(patch_dims); - for (int k = 0; k < 6; ++k) { - cout << "patch index: " << k << endl; - for (int i = 0; i < 2; ++i) { - for (int j = 0; j < 2; ++j) { - if (DataLayout == ColMajor) { - cout << patch(i, j, k) << " "; - } else { - cout << patch(k, i, j) << " "; - } - } - cout << endl; + Eigen::Tensor patch; + Eigen::array patch_dims; + patch_dims[0] = 2; + patch_dims[1] = 2; + patch = tensor.extract_patches(patch_dims); + for (int k = 0; k < 6; ++k) { + cout << "patch index: " << k << endl; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 2; ++j) { + if (DataLayout == ColMajor) { + cout << patch(i, j, k) << " "; + } else { + cout << patch(k, i, j) << " "; + } } + cout << endl; } + } This code results in the following output when the data layout is ColMajor: - patch index: 0 - 0 1 - 4 5 - patch index: 1 - 4 5 - 8 9 - patch index: 2 - 1 2 - 5 6 - patch index: 3 - 5 6 - 9 10 - patch index: 4 - 2 3 - 6 7 - patch index: 5 - 6 7 - 10 11 +patch index: 0 +0 1 +4 5 +patch index: 1 +4 5 +8 9 +patch index: 2 +1 2 +5 6 +patch index: 3 +5 6 +9 10 +patch index: 4 +2 3 +6 7 +patch index: 5 +6 7 +10 11 This code results in the following output when the data layout is RowMajor: (NOTE: the set of patches is the same as in ColMajor, but are indexed differently). - patch index: 0 - 0 1 - 4 5 - patch index: 1 - 1 2 - 5 6 - patch index: 2 - 2 3 - 6 7 - patch index: 3 - 4 5 - 8 9 - patch index: 4 - 5 6 - 9 10 - patch index: 5 - 6 7 - 10 11 +patch index: 0 +0 1 +4 5 +patch index: 1 +1 2 +5 6 +patch index: 2 +2 3 +6 7 +patch index: 3 +4 5 +8 9 +patch index: 4 +5 6 +9 10 +patch index: 5 +6 7 +10 11 ### ` extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)` @@ -1736,30 +1681,28 @@ sizes: *) columns: 5 *) batch: 7 - Tensor tensor(2,3,5,7); - Tensor tensor_row_major = tensor.swap_layout(); + Tensor tensor(2,3,5,7); + Tensor tensor_row_major = tensor.swap_layout(); 2x2 image patches can be extracted and indexed using the following code: *) 2D patch: ColMajor (patch indexed by second-to-last dimension) - - Tensor twod_patch; - twod_patch = tensor.extract_image_patches<2, 2>(); - // twod_patch.dimension(0) == 2 - // twod_patch.dimension(1) == 2 - // twod_patch.dimension(2) == 2 - // twod_patch.dimension(3) == 3*5 - // twod_patch.dimension(4) == 7 + Tensor twod_patch; + twod_patch = tensor.extract_image_patches<2, 2>(); + // twod_patch.dimension(0) == 2 + // twod_patch.dimension(1) == 2 + // twod_patch.dimension(2) == 2 + // twod_patch.dimension(3) == 3*5 + // twod_patch.dimension(4) == 7 *) 2D patch: RowMajor (patch indexed by the second dimension) - - Tensor twod_patch_row_major; - twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); - // twod_patch_row_major.dimension(0) == 7 - // twod_patch_row_major.dimension(1) == 3*5 - // twod_patch_row_major.dimension(2) == 2 - // twod_patch_row_major.dimension(3) == 2 - // twod_patch_row_major.dimension(4) == 2 + Tensor twod_patch_row_major; + twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>(); + // twod_patch_row_major.dimension(0) == 7 + // twod_patch_row_major.dimension(1) == 3*5 + // twod_patch_row_major.dimension(2) == 2 + // twod_patch_row_major.dimension(3) == 2 + // twod_patch_row_major.dimension(4) == 2 ## Special Operations diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 17cee495f..00295a255 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -112,7 +112,7 @@ class Tensor : public TensorBase - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -398,21 +398,6 @@ class Tensor : public TensorBase::run(assign, DefaultDevice()); } - #if EIGEN_HAS_RVALUE_REFERENCES - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor(Self&& other) - : Tensor() - { - m_storage.swap(other.m_storage); - } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Tensor& operator=(Self&& other) - { - m_storage.swap(other.m_storage); - return *this; - } - #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) { @@ -477,18 +462,6 @@ class Tensor : public TensorBase - EIGEN_DEVICE_FUNC - void resize(const Eigen::IndexList& dimensions) { - array dims; - for (int i = 0; i < NumIndices; ++i) { - dims[i] = static_cast(dimensions[i]); - } - resize(dims); - } -#endif - /** Custom Dimension */ #ifdef EIGEN_HAS_SFINAE template > : public traits template struct eval, Eigen::Dense> { - typedef const TensorIndexTupleOpEIGEN_DEVICE_REF type; + typedef const TensorIndexTupleOp& type; }; template @@ -82,23 +82,16 @@ struct TensorEvaluator, Device> typedef typename TensorEvaluator::Dimensions Dimensions; static const int NumDims = internal::array_size::value; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = /*TensorEvaluator::PacketAccess*/ false, BlockAccess = false, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { } @@ -106,7 +99,7 @@ struct TensorEvaluator, Device> return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -124,13 +117,7 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1); } - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: TensorEvaluator m_impl; @@ -160,7 +147,7 @@ struct traits > : public traits struct eval, Eigen::Dense> { - typedef const TensorTupleReducerOpEIGEN_DEVICE_REF type; + typedef const TensorTupleReducerOp& type; }; template @@ -185,7 +172,7 @@ class TensorTupleReducerOp : public TensorBase, Devi typedef typename TensorEvaluator , Device>::Dimensions InputDimensions; static const int NumDims = internal::array_size::value; typedef array StrideDims; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - typedef StorageMemory TupleStorageMem; enum { - IsAligned = /*TensorEvaluator::IsAligned*/ false, - PacketAccess = /*TensorEvaluator::PacketAccess*/ false, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess, - Layout = TensorEvaluator >, Device>::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = /*TensorEvaluator::IsAligned*/ false, + PacketAccess = /*TensorEvaluator::PacketAccess*/ false, + BlockAccess = false, + Layout = TensorEvaluator >, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), - m_return_dim(op.return_dim()) - { + m_return_dim(op.return_dim()) { + gen_strides(m_orig_impl.dimensions(), m_strides); if (Layout == static_cast(ColMajor)) { const Index total_size = internal::array_prod(m_orig_impl.dimensions()); @@ -252,18 +231,15 @@ struct TensorEvaluator, Devi } else { const Index total_size = internal::array_prod(m_orig_impl.dimensions()); m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size; - } - // If m_return_dim is not a valid index, returns 1 or this can crash on Windows. - m_stride_div = ((m_return_dim >= 0) && - (m_return_dim < static_cast(m_strides.size()))) - ? m_strides[m_return_dim] : 1; + } + m_stride_div = m_strides[m_return_dim]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -276,13 +252,7 @@ struct TensorEvaluator, Devi return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div; } - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } -#ifdef EIGEN_USE_SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - m_orig_impl.bind(cgh); - } -#endif + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -318,7 +288,7 @@ struct TensorEvaluator, Devi protected: TensorEvaluator, Device> m_orig_impl; TensorEvaluator >, Device> m_impl; - const Index m_return_dim; + const int m_return_dim; StrideDims m_strides; Index m_stride_mod; Index m_stride_div; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 72f072cf2..166be200c 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -34,7 +34,6 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const std::size_t NumDimensions = internal::traits::NumDimensions; static const int Layout = internal::traits::Layout; - typedef typename traits::PointerType PointerType; enum { Flags = 0 @@ -68,8 +67,6 @@ class TensorAssignOp : public TensorBase typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - static const int NumDims = Eigen::internal::traits::NumDimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs) : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {} @@ -97,41 +94,20 @@ struct TensorEvaluator, Device> typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename TensorEvaluator::Dimensions Dimensions; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - - static const int PacketSize = PacketType::size; - static const int NumDims = XprType::NumDims; + static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess | - TensorEvaluator::PreferBlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = TensorEvaluator::RawAccess }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename TensorEvaluator::TensorBlock - RightTensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { - EIGEN_STATIC_ASSERT( - (static_cast(TensorEvaluator::Layout) == - static_cast(TensorEvaluator::Layout)), - YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const @@ -142,7 +118,7 @@ struct TensorEvaluator, Device> return m_rightImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); m_leftImpl.evalSubExprsIfNeeded(NULL); // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non @@ -151,18 +127,6 @@ struct TensorEvaluator, Device> // by the rhs to the lhs. return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data()); } - -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { - m_rightImpl.evalSubExprsIfNeededAsync( - m_leftImpl.data(), [done](bool need_assign) { done(need_assign); }); - }); - } -#endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -172,7 +136,6 @@ struct TensorEvaluator, Device> m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) { - const int LhsStoreMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; const int RhsLoadMode = TensorEvaluator::IsAligned ? Aligned : Unaligned; m_leftImpl.template writePacket(i, m_rightImpl.template packet(i)); @@ -200,41 +163,12 @@ struct TensorEvaluator, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return internal::TensorBlockResourceRequirements::merge( - m_leftImpl.getResourceRequirements(), - m_rightImpl.getResourceRequirements()); - } + /// required by sycl in order to extract the accessor + const TensorEvaluator& left_impl() const { return m_leftImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& right_impl() const { return m_rightImpl; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( - TensorBlockDesc& desc, TensorBlockScratch& scratch) { - if (TensorEvaluator::RawAccess && - m_leftImpl.data() != NULL) { - // If destination has raw data access, we pass it as a potential - // destination for a block descriptor evaluation. - desc.template AddDestinationBuffer( - /*dst_base=*/m_leftImpl.data() + desc.offset(), - /*dst_strides=*/internal::strides(m_leftImpl.dimensions())); - } - - RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true); - // If block was evaluated into a destination, there is no need to do assignment. - if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { - m_leftImpl.writeBlock(desc, block); - } - block.cleanup(); - } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_leftImpl.bind(cgh); - m_rightImpl.bind(cgh); - } -#endif - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_leftImpl.data(); } private: TensorEvaluator m_leftImpl; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index bb0969f49..f573608d9 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -20,7 +20,7 @@ namespace Eigen { * \brief The tensor base class. * * This class is the common parent of the Tensor and TensorMap class, thus - * making it possible to use either class interchangeably in expressions. + * making it possible to use either class interchangably in expressions. */ #ifndef EIGEN_PARSED_BY_DOXYGEN // FIXME Doxygen does not like the inheritance with different template parameters @@ -135,78 +135,6 @@ class TensorBase return unaryExpr(internal::scalar_digamma_op()); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_i0() const { - return unaryExpr(internal::scalar_bessel_i0_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_i0e() const { - return unaryExpr(internal::scalar_bessel_i0e_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_i1() const { - return unaryExpr(internal::scalar_bessel_i1_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_i1e() const { - return unaryExpr(internal::scalar_bessel_i1e_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_j0() const { - return unaryExpr(internal::scalar_bessel_j0_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_y0() const { - return unaryExpr(internal::scalar_bessel_y0_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_j1() const { - return unaryExpr(internal::scalar_bessel_j1_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_y1() const { - return unaryExpr(internal::scalar_bessel_y1_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_k0() const { - return unaryExpr(internal::scalar_bessel_k0_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_k0e() const { - return unaryExpr(internal::scalar_bessel_k0e_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_k1() const { - return unaryExpr(internal::scalar_bessel_k1_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - bessel_k1e() const { - return unaryExpr(internal::scalar_bessel_k1e_op()); - } - // igamma(a = this, x = other) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> @@ -214,20 +142,6 @@ class TensorBase return binaryExpr(other.derived(), internal::scalar_igamma_op()); } - // igamma_der_a(a = this, x = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - igamma_der_a(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op()); - } - - // gamma_sample_der_alpha(alpha = this, sample = other) - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp, const Derived, const OtherDerived> - gamma_sample_der_alpha(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op()); - } - // igammac(a = this, x = other) template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> @@ -262,15 +176,9 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - ndtri() const { - return unaryExpr(internal::scalar_ndtri_op()); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> sigmoid() const { - return unaryExpr(internal::scalar_logistic_op()); + return unaryExpr(internal::scalar_sigmoid_op()); } EIGEN_DEVICE_FUNC @@ -279,12 +187,6 @@ class TensorBase return unaryExpr(internal::scalar_exp_op()); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - expm1() const { - return unaryExpr(internal::scalar_expm1_op()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> log() const { @@ -304,17 +206,9 @@ class TensorBase } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - clip(Scalar min, Scalar max) const { - return unaryExpr(internal::scalar_clamp_op(min, max)); - } - - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const typename internal::conditional::IsComplex, - TensorCwiseUnaryOp, const Derived>, - Derived>::type + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> conjugate() const { - return choose(Cond::IsComplex>(), unaryExpr(internal::scalar_conjugate_op()), derived()); + return unaryExpr(internal::scalar_conjugate_op()); } EIGEN_DEVICE_FUNC @@ -407,13 +301,10 @@ class TensorBase return cwiseMin(constant(threshold)); } - template - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const typename internal::conditional::value, - Derived, - TensorConversionOp >::type + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorConversionOp cast() const { - return choose(Cond::value>(), derived(), TensorConversionOp(derived())); + return TensorConversionOp(derived()); } EIGEN_DEVICE_FUNC @@ -422,12 +313,6 @@ class TensorBase return unaryExpr(internal::scalar_round_op()); } - EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> - rint() const { - return unaryExpr(internal::scalar_rint_op()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp, const Derived> ceil() const { @@ -596,15 +481,9 @@ class TensorBase typedef Eigen::IndexPair DimensionPair; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorContractionOp + const TensorContractionOp contract(const OtherDerived& other, const Dimensions& dims) const { - return TensorContractionOp(derived(), other.derived(), dims); - } - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorContractionOp - contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const { - return TensorContractionOp(derived(), other.derived(), dims, output_kernel); + return TensorContractionOp(derived(), other.derived(), dims); } // Convolutions. @@ -617,8 +496,8 @@ class TensorBase // Fourier transforms template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorFFTOp - fft(const FFT& dims) const { - return TensorFFTOp(derived(), dims); + fft(const FFT& fft) const { + return TensorFFTOp(derived(), fft); } // Scan. @@ -705,26 +584,26 @@ class TensorBase } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp::value, Derived, TensorConversionOp >::type > + const TensorReductionOp > all(const Dims& dims) const { return cast().reduce(dims, internal::AndReducer()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const typename internal::conditional::value, Derived, TensorConversionOp >::type > + const TensorReductionOp, const TensorConversionOp > all() const { DimensionList in_dims; return cast().reduce(in_dims, internal::AndReducer()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp::value, Derived, TensorConversionOp >::type > + const TensorReductionOp > any(const Dims& dims) const { return cast().reduce(dims, internal::OrReducer()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorReductionOp, const typename internal::conditional::value, Derived, TensorConversionOp >::type > + const TensorReductionOp, const TensorConversionOp > any() const { DimensionList in_dims; return cast().reduce(in_dims, internal::OrReducer()); @@ -736,7 +615,7 @@ class TensorBase const array, const Derived> argmax() const { array in_dims; - for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMaxTupleReducer >, const array, @@ -749,7 +628,7 @@ class TensorBase const array, const Derived> argmin() const { array in_dims; - for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d; + for (int d = 0; d < NumDimensions; ++d) in_dims[d] = d; return TensorTupleReducerOp< internal::ArgMinTupleReducer >, const array, @@ -760,7 +639,7 @@ class TensorBase const TensorTupleReducerOp< internal::ArgMaxTupleReducer >, const array, const Derived> - argmax(const Index return_dim) const { + argmax(const int return_dim) const { array in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< @@ -773,7 +652,7 @@ class TensorBase const TensorTupleReducerOp< internal::ArgMinTupleReducer >, const array, const Derived> - argmin(const Index return_dim) const { + argmin(const int return_dim) const { array in_dims; in_dims[0] = return_dim; return TensorTupleReducerOp< @@ -788,22 +667,10 @@ class TensorBase return TensorReductionOp(derived(), dims, reducer); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorTraceOp - trace(const Dims& dims) const { - return TensorTraceOp(derived(), dims); - } - - const TensorTraceOp, const Derived> - trace() const { - DimensionList in_dims; - return TensorTraceOp, const Derived>(derived(), in_dims); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorBroadcastingOp - broadcast(const Broadcast& bcast) const { - return TensorBroadcastingOp(derived(), bcast); + broadcast(const Broadcast& broadcast) const { + return TensorBroadcastingOp(derived(), broadcast); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -911,8 +778,8 @@ class TensorBase } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorShufflingOp - shuffle(const Shuffle& shfl) const { - return TensorShufflingOp(derived(), shfl); + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorStridingOp @@ -953,8 +820,7 @@ class TensorBase protected: template friend class Tensor; template friend class TensorFixedSize; - // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 - template friend class Eigen::TensorBase; + template friend class TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast(this); } }; @@ -970,8 +836,7 @@ class TensorBase : public TensorBase { template friend class Tensor; template friend class TensorFixedSize; - // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0 - template friend class Eigen::TensorBase; + template friend class TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& setZero() { @@ -1109,13 +974,13 @@ class TensorBase : public TensorBase { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorShufflingOp - shuffle(const Shuffle& shfl) const { - return TensorShufflingOp(derived(), shfl); + shuffle(const Shuffle& shuffle) const { + return TensorShufflingOp(derived(), shuffle); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp - shuffle(const Shuffle& shfl) { - return TensorShufflingOp(derived(), shfl); + shuffle(const Shuffle& shuffle) { + return TensorShufflingOp(derived(), shuffle); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1131,14 +996,8 @@ class TensorBase : public TensorBase { // Select the device on which to evaluate the expression. template - TensorDevice device(const DeviceType& dev) { - return TensorDevice(dev, derived()); - } - - // Select the async device on which to evaluate the expression. - template - TensorAsyncDevice device(const DeviceType& dev, DoneCallback done) { - return TensorAsyncDevice(dev, derived(), std::move(done)); + TensorDevice device(const DeviceType& device) { + return TensorDevice(device, derived()); } protected: diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h deleted file mode 100644 index 1e55d12c4..000000000 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ /dev/null @@ -1,1559 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H -#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H - -namespace Eigen { -namespace internal { - -// -------------------------------------------------------------------------- // -// Forward declarations for templates defined below. -template -class TensorBlockIO; - -// -------------------------------------------------------------------------- // -// Helper function to compute strides for densely stored buffer of given -// dimensions. - -// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use -// this function instead everywhere. -template -EIGEN_ALWAYS_INLINE DSizes strides( - const DSizes& dimensions) { - DSizes strides; - if (NumDims == 0) return strides; - - // TODO(ezhulenev): Use templates to unroll this loop (similar to - // h_array_reduce in CXX11meta.h)? Benchmark it. - if (static_cast(Layout) == static_cast(ColMajor)) { - strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - strides[i] = strides[i - 1] * dimensions[i - 1]; - } - } else { - strides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * dimensions[i + 1]; - } - } - - return strides; -} - -template -EIGEN_ALWAYS_INLINE DSizes strides( - const Eigen::array& dimensions) { - return strides(DSizes(dimensions)); -} - -template -EIGEN_STRONG_INLINE DSizes strides( - const Sizes& sizes) { - return strides(DSizes(sizes)); -} - -// -------------------------------------------------------------------------- // - -// Tensor block shape type defines what are the shape preference for the blocks -// extracted from the larger tensor. -// -// Example: blocks of 100 elements from the large 100x100 tensor: -// - tensor: 100x100 -// - target_block_size: 100 -// -// TensorBlockShapeType: -// - kUniformAllDims: 100 blocks of size 10x10 -// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column -// or row major layout) -enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; - -struct TensorBlockResourceRequirements { - TensorBlockShapeType shape_type; // target block shape - size_t size; // target block size - TensorOpCost cost_per_coeff; // cost of computing a single block element - -#ifdef EIGEN_HIPCC - // For HIPCC, we need to explicitly declare as a "device fun", the constructor - // which is implicitly invoked in the "merge" / "any" routines. else HIPCC - // errors out complaining about the lack of a matching constructor - EIGEN_DEVICE_FUNC - TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, - TensorOpCost cost_) - : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) - {} -#endif - - template - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( - TensorBlockShapeType shape_type, size_t size_in_bytes, - TensorOpCost cost) { - const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar)); - return {shape_type, size, cost}; - } - - template - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( - TensorBlockShapeType shape_type, size_t size_in_bytes) { - // This default cost per coefficient is valid for most materialized tensor - // block evaluation implementations, because they typically just read - // coefficients from the underlying tensor storage, and write to the tensor - // block buffer (scratch or destination memory, reads and writes have linear - // access pattern). We ignore the fixed cost of block evaluation, because in - // practice it should negligible. - // - // Lazy block evaluation adds the cost of calling a functor for each - // coefficient. - // - // All non-trivial block evaluation implementations must provide their own - // cost approximation (e.g. shuffling inner dimension has a much higher cost - // because it reads memory randomly, although the total number of moved - // bytes is the same). - return withShapeAndSize(shape_type, size_in_bytes, - {/*bytes_loaded=*/sizeof(Scalar), - /*bytes_stored=*/sizeof(Scalar), - /*compute_cycles=*/0}); - } - - template - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed( - size_t size_in_bytes) { - return withShapeAndSize(TensorBlockShapeType::kSkewedInnerDims, - size_in_bytes); - } - - template - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform( - size_t size_in_bytes) { - return withShapeAndSize(TensorBlockShapeType::kUniformAllDims, - size_in_bytes); - } - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorBlockResourceRequirements - merge(const TensorBlockResourceRequirements& lhs, - const TensorBlockResourceRequirements& rhs) { - return {merge(lhs.shape_type, rhs.shape_type), // shape_type - merge(lhs.size, rhs.size), // size - merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff - } - - EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff( - TensorOpCost cost) { - cost_per_coeff += cost; - return *this; - } - - // This is a resource requirement that should be returned from expressions - // that do not have any block evaluation preference (e.g. default tensor - // expression with raw buffer access). - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() { - return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}}; - } - - private: - using Requirements = TensorBlockResourceRequirements; - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) { - return numext::maxi(lhs_size, rhs_size); - } - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorBlockShapeType - merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) { - return (lhs == TensorBlockShapeType::kSkewedInnerDims || - rhs == TensorBlockShapeType::kSkewedInnerDims) - ? TensorBlockShapeType::kSkewedInnerDims - : TensorBlockShapeType::kUniformAllDims; - } - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, - TensorOpCost rhs_cost) { - return lhs_cost + rhs_cost; - } -}; - -// -------------------------------------------------------------------------- // -// TensorBlockDescriptor specifies a block offset within a tensor and the block -// sizes along each of the tensor dimensions. - -template -class TensorBlockDescriptor { - public: - typedef DSizes Dimensions; - - // If we evaluate a Tensor assignment, and expression on the left, already has - // a memory buffer, then we might do performance optimization, and evaluate - // the root expression directly into the final output memory. Some time it's - // possible to reuse it for materializing subexpressions inside an expression - // tree, to to avoid dynamic memory allocation. - // - // The pointer type of the underlying storage is erased, because passing - // Scalar type through all the expression evaluation layers is way too many - // templates. In practice destination buffer type should always match the - // evaluated expression scalar type. - class DestinationBuffer { - public: - enum DestinationBufferKind : int { - // The above explicit specification of "int" as the enum basetype is - // needed to get around a HIPCC link error ("the field type is not - // amp-compatible") - // which is issued for class members with the enum type. - // TODO(rocm): - // remove the "int" basetype once HIPCC has been fixed to not error out - // in the above scenario. - - // Destination buffer is not defined (`m_data` == nullptr). - kEmpty, - - // Tensor block defined by an owning tensor block descriptor can fit - // contiguously into the destination buffer. In this case it's safe to - // materialize tensor block in the destination buffer, wrap it in a - // TensorMap, and use to build Eigen expression on top of it. - kContiguous, - - // Destination buffer strides do not match strides of the contiguously - // stored block, and it's impossible to define a TensorMap over this - // buffer. However if we are evaluating a root of an expression tree, we - // still can materialize an output into this destination, because we can - // guarantee that no one will ever access it through block API. - // - // In theory it is possible to build valid TensorStriding - // expression on top of this destination buffer, however it has - // inefficient coeff/packet access, and defeats the purpose of fast block - // evaluation API. - kStrided - }; - - template - Scalar* data() const { - eigen_assert(m_data_type_size == sizeof(Scalar)); - return static_cast(m_data); - } - - const Dimensions& strides() const { return m_strides; } - const DestinationBufferKind& kind() const { return m_kind; } - - private: - friend class TensorBlockDescriptor; - - DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} - - template - DestinationBuffer(Scalar* data, const Dimensions& strides, - DestinationBufferKind kind) - : m_data(static_cast(data)), - m_data_type_size(sizeof(Scalar)), - m_strides(strides), - m_kind(kind) {} - - template - static DestinationBuffer make(const TensorBlockDescriptor& desc, - Scalar* data, const Dimensions& strides) { - return DestinationBuffer(data, strides, kind(desc, strides)); - } - - template - static DestinationBufferKind kind(const TensorBlockDescriptor& desc, - const Dimensions& strides) { - const Dimensions& desc_dims = desc.dimensions(); - const Dimensions& desc_strides = internal::strides(desc_dims); - for (int i = 0; i < NumDims; ++i) { - if (desc_dims[i] == 1) continue; - if (desc_strides[i] != strides[i]) return kStrided; - } - return kContiguous; - } - - // Storage pointer is type erased, to reduce template bloat, but we still - // keep the size of the underlying element type for error checking. - void* m_data; - size_t m_data_type_size; - - // Destination buffer dimensions always match the dimensions of a tensor - // block descriptor it belongs to, however strides might be different. - Dimensions m_strides; - - DestinationBufferKind m_kind; - }; - - TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, - const DestinationBuffer& destination) - : m_offset(offset), - m_dimensions(dimensions), - m_destination(destination) {} - - TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) - : m_offset(offset), - m_dimensions(dimensions), - m_destination(DestinationBuffer()) {} - - IndexType offset() const { return m_offset; } - const Dimensions& dimensions() const { return m_dimensions; } - IndexType dimension(int index) const { return m_dimensions[index]; } - IndexType size() const { return array_prod(m_dimensions); } - - const DestinationBuffer& destination() const { return m_destination; } - - template - void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { - eigen_assert(dst_base != NULL); - m_destination = - DestinationBuffer::template make(*this, dst_base, dst_strides); - } - - template - void AddDestinationBuffer( - Scalar* dst_base, - const DSizes& dst_strides) { - // DSizes constructor will do index type promotion if it's safe. - AddDestinationBuffer(dst_base, Dimensions(dst_strides)); - } - - TensorBlockDescriptor& DropDestinationBuffer() { - m_destination.m_data = NULL; - m_destination.m_kind = DestinationBuffer::kEmpty; - return *this; - } - - bool HasDestinationBuffer() const { - return m_destination.kind() != DestinationBuffer::kEmpty; - } - - // Returns a copy of `*this` with updated offset. - TensorBlockDescriptor WithOffset(IndexType offset) const { - return TensorBlockDescriptor(offset, m_dimensions, m_destination); - } - - private: - // Offset and dimensions are immutable after construction. Block descriptor - // can only be mutated by adding or dropping destination. - const IndexType m_offset; - const Dimensions m_dimensions; - DestinationBuffer m_destination; -}; - -// -------------------------------------------------------------------------- // -// TensorBlockMapper is responsible for iterating over the blocks of a tensor. - -template -class TensorBlockMapper { - typedef TensorBlockDescriptor BlockDescriptor; - - public: - typedef DSizes Dimensions; - - TensorBlockMapper() = default; - TensorBlockMapper(const DSizes& dimensions, - const TensorBlockResourceRequirements& requirements) - : m_tensor_dimensions(dimensions), m_requirements(requirements) { - // Compute block dimensions and the total number of blocks. - InitializeBlockDimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { - return m_total_block_count; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { - return m_block_dimensions.TotalSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& - blockDimensions() const { - return m_block_dimensions; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor - blockDescriptor(IndexType block_index) const { - static const bool isColMajor = Layout == static_cast(ColMajor); - - IndexType offset = 0; - DSizes dimensions; - - if (NumDims == 0) return BlockDescriptor(offset, dimensions); - - // Iterate outer -> inner dimensions. - for (int i = NumDims - 1; i >= 0; --i) { - const int dim = isColMajor ? i : NumDims - i - 1; - - const IndexType idx = block_index / m_block_strides[dim]; - block_index -= idx * m_block_strides[dim]; - - const IndexType coord = idx * m_block_dimensions[dim]; - dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, - m_block_dimensions[dim]); - offset += coord * m_tensor_strides[dim]; - } - - return {offset, dimensions}; - } - - private: - void InitializeBlockDimensions() { - // Requested block shape and size. - const TensorBlockShapeType shape_type = m_requirements.shape_type; - IndexType target_block_size = - numext::maxi(1, static_cast(m_requirements.size)); - - IndexType tensor_size = m_tensor_dimensions.TotalSize(); - - // Corner case: one of the dimensions is zero. Logic below is too complex - // to handle this case on a general basis, just use unit block size. - // Note: we must not yield blocks with zero dimensions (recipe for - // overflows/underflows, divisions by zero and NaNs later). - if (tensor_size == 0) { - for (int i = 0; i < NumDims; ++i) { - m_block_dimensions[i] = 1; - } - m_total_block_count = 0; - return; - } - - // If tensor fits into a target block size, evaluate it as a single block. - if (tensor_size <= target_block_size) { - m_block_dimensions = m_tensor_dimensions; - m_total_block_count = 1; - // The only valid block index is `0`, and in this case we do not need - // to compute real strides for tensor or blocks (see blockDescriptor). - for (int i = 0; i < NumDims; ++i) { - m_tensor_strides[i] = 0; - m_block_strides[i] = 1; - } - return; - } - - static const bool isColMajor = Layout == static_cast(ColMajor); - - // Block shape skewed towards inner dimension. - if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { - IndexType coeff_to_allocate = target_block_size; - - for (int i = 0; i < NumDims; ++i) { - const int dim = isColMajor ? i : NumDims - i - 1; - m_block_dimensions[dim] = - numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); - coeff_to_allocate = divup( - coeff_to_allocate, - numext::maxi(static_cast(1), m_block_dimensions[dim])); - } - eigen_assert(coeff_to_allocate == 1); - - } else if (shape_type == TensorBlockShapeType::kUniformAllDims) { - // Tensor will not fit within 'target_block_size' budget: calculate tensor - // block dimension sizes based on "square" dimension size target. - const IndexType dim_size_target = convert_index( - std::pow(static_cast(target_block_size), - 1.0f / static_cast(m_block_dimensions.rank()))); - - for (int i = 0; i < NumDims; ++i) { - // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it - // a multiple of the packet size. Note that reducing - // 'block_dim_size' in this manner can increase the number of - // blocks, and so will amplify any per-block overhead. - m_block_dimensions[i] = - numext::mini(dim_size_target, m_tensor_dimensions[i]); - } - - // Add any un-allocated coefficients to inner dimension(s). - IndexType total_size = m_block_dimensions.TotalSize(); - for (int i = 0; i < NumDims; ++i) { - const int dim = isColMajor ? i : NumDims - i - 1; - - if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) { - const IndexType total_size_other_dims = - total_size / m_block_dimensions[dim]; - const IndexType alloc_avail = - divup(target_block_size, total_size_other_dims); - if (alloc_avail == m_block_dimensions[dim]) { - // Insufficient excess coefficients to allocate. - break; - } - m_block_dimensions[dim] = - numext::mini(m_tensor_dimensions[dim], alloc_avail); - total_size = total_size_other_dims * m_block_dimensions[dim]; - } - } - - } else { - eigen_assert(false); // unknown block shape - } - - eigen_assert(m_block_dimensions.TotalSize() >= - numext::mini(target_block_size, - m_tensor_dimensions.TotalSize())); - - // Calculate block counts by dimension and total block count. - DSizes block_count; - for (int i = 0; i < NumDims; ++i) { - block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]); - } - m_total_block_count = array_prod(block_count); - - // Calculate block strides (used for enumerating blocks). - m_tensor_strides = strides(m_tensor_dimensions); - m_block_strides = strides(block_count); - } - - DSizes m_tensor_dimensions; - TensorBlockResourceRequirements m_requirements; - - DSizes m_block_dimensions; - IndexType m_total_block_count; - - DSizes m_tensor_strides; - DSizes m_block_strides; -}; - -// -------------------------------------------------------------------------- // -// TensorBlockScratchAllocator is responsible for allocating temporary buffers -// for block evaluation (output or input block materialization). Given that -// Eigen expression traversal order is deterministic, all temporary allocations -// are happening in the same order, and usually have exactly the same size. -// Scratch allocator keeps a trace of all dynamic allocations, and after the -// first block evaluation is completed, we should be able to reuse all the -// temporary buffers for the next block evaluation. - -template -class TensorBlockScratchAllocator { - public: - explicit TensorBlockScratchAllocator(const Device& device) - : m_device(device), m_allocation_index(0) {} - - ~TensorBlockScratchAllocator() { - for (size_t i = 0; i < m_allocations.size(); ++i) { - m_device.deallocate(m_allocations[i].ptr); - } - } - - void* allocate(size_t size) { - // TODO(ezhulenev): Remove when replaced with inlined vector. - if (m_allocations.capacity() == 0) m_allocations.reserve(8); - - // Check if we already have an existing allocation att current index. - const int num_allocations = static_cast(m_allocations.size()); - const bool has_allocation = m_allocation_index < num_allocations; - - // Allocation index can't be larger than the number of allocations. - eigen_assert(m_allocation_index <= num_allocations); - - // If we have existing allocation, and its size is larger or equal to - // requested size, we do nothing. - - // If current allocation can't fit requested size, we deallocate it, and - // replace with a larger allocation. - if (has_allocation && m_allocations[m_allocation_index].size < size) { - m_device.deallocate(m_allocations[m_allocation_index].ptr); - m_allocations[m_allocation_index].ptr = m_device.allocate(size); - m_allocations[m_allocation_index].size = size; - } - - // Make a new allocation if we don't have and existing one. - if (!has_allocation) { - Allocation allocation; - allocation.ptr = m_device.allocate(size); - allocation.size = size; - m_allocations.push_back(allocation); - } - - eigen_assert(m_allocations[m_allocation_index].ptr != NULL); - eigen_assert(m_allocations[m_allocation_index].size >= size); - - return m_allocations[m_allocation_index++].ptr; - } - - void reset() { m_allocation_index = 0; } - - private: - struct Allocation { - void* ptr; - size_t size; - }; - - const Device& m_device; - int m_allocation_index; - // TODO(ezhulenev): This should be an inlined vector. - std::vector m_allocations; -}; - -// -------------------------------------------------------------------------- // -// TensorBlockKind represents all possible block kinds, that can be produced by -// TensorEvaluator::evalBlock function. -enum TensorBlockKind { - // Tensor block that is a lazy expression that must be assigned to a - // destination using TensorBlockAssign. - kExpr, - - // Tensor block that is a view into a memory buffer owned by an underlying - // Tensor expression (e.g. it can be a view into a Tensor buffer). - kView, - - // Tensor block that was materialized in a scratch memory buffer, allocated - // with TensorBlockScratchAllocator. This block must be copied to a - // destination, similar to a block of `kExpr` type. - kMaterializedInScratch, - - // Tensor block that was materialized directly into the final output memory - // buffer. For example if the left side of an assignment is a Tensor, we can - // directly materialize the block in the destination memory. - // - // If strides in the output buffer do not match tensor block strides, the - // Tensor expression will be invalid, and should not be used by - // TensorBlockAssign or for constructing another block expression. - kMaterializedInOutput -}; - -// -------------------------------------------------------------------------- // -// TensorBlockNotImplemented should be used to defined TensorBlock typedef in -// TensorEvaluators that do not support block evaluation. - -class TensorBlockNotImplemented { - public: - typedef void XprType; -}; - -// -------------------------------------------------------------------------- // -// XprScalar extracts Scalar type from the Eigen expressions (if expression type -// is not void). It's required to be able to define lazy block expression for -// argument types, that do not support block evaluation. - -template -struct XprScalar { - typedef typename XprType::Scalar type; -}; -template <> -struct XprScalar { - typedef void type; -}; - -// -------------------------------------------------------------------------- // -// TensorMaterializedBlock is a fully evaluated block of the original tensor, -// and XprType is just a TensorMap over the data. This block type is typically -// used to materialize blocks of tensor expressions, that can't be efficiently -// represented as lazy Tensor expressions with fast coeff/packet operations, -// e.g. we materialize all broadcasts into evaluated blocks. -// -// TensorMaterializedBlock does not own its memory buffer, it's either a memory -// buffer that backs the original expression (e.g. block is just a view into a -// Tensor), or a memory buffer allocated with scratch allocator, and in this -// case the scratch allocator will deallocate it at the end of block based -// expression execution. -// -// If the block was evaluated directly into the output buffer, and strides in -// the output buffer do not match block strides, the TensorMap expression will -// be invalid, and should never be used in block assignment or any other tensor -// expression. - -template -class TensorMaterializedBlock { - public: - typedef DSizes Dimensions; - typedef TensorMap > XprType; - - TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, - const Dimensions& dimensions, bool valid_expr = true) - : m_kind(kind), - m_data(data), - m_dimensions(dimensions), - m_expr(m_data, m_dimensions), - m_valid_expr(valid_expr) { - eigen_assert(m_kind == internal::TensorBlockKind::kView || - m_kind == internal::TensorBlockKind::kMaterializedInScratch || - m_kind == internal::TensorBlockKind::kMaterializedInOutput); - } - - TensorBlockKind kind() const { return m_kind; } - // NOTE(ezhulenev): Returning XprType by value like in other block types - // causes asan failures. The theory is that XprType::Nested doesn't work - // properly for TensorMap. - const XprType& expr() const { - eigen_assert(m_valid_expr); - return m_expr; - } - const Scalar* data() const { return m_data; } - void cleanup() {} - - typedef internal::TensorBlockDescriptor TensorBlockDesc; - - // TensorMaterializedBlock can be backed by different types of storage: - // - // (1) Contiguous block of memory allocated with scratch allocator. - // (2) Contiguous block of memory reused from tensor block descriptor - // destination buffer. - // (3) Strided block of memory reused from tensor block descriptor - // destination buffer. - // - class Storage { - public: - Scalar* data() const { return m_data; } - const Dimensions& dimensions() const { return m_dimensions; } - const Dimensions& strides() const { return m_strides; } - - TensorMaterializedBlock AsTensorMaterializedBlock() const { - return TensorMaterializedBlock( - m_materialized_in_output - ? internal::TensorBlockKind::kMaterializedInOutput - : internal::TensorBlockKind::kMaterializedInScratch, - m_data, m_dimensions, !m_strided_storage); - } - - private: - friend class TensorMaterializedBlock; - - Storage(Scalar* data, const Dimensions& dimensions, - const Dimensions& strides, bool materialized_in_output, - bool strided_storage) - : m_data(data), - m_dimensions(dimensions), - m_strides(strides), - m_materialized_in_output(materialized_in_output), - m_strided_storage(strided_storage) {} - - Scalar* m_data; - Dimensions m_dimensions; - Dimensions m_strides; - bool m_materialized_in_output; - bool m_strided_storage; - }; - - // Creates a storage for materialized block either from the block descriptor - // destination buffer, or allocates a new buffer with scratch allocator. - template - EIGEN_STRONG_INLINE static Storage prepareStorage( - TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool allow_strided_storage = false) { - // Try to reuse destination as an output block buffer. - typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer; - - if (desc.destination().kind() == DestinationBuffer::kContiguous) { - Scalar* buffer = desc.destination().template data(); - desc.DropDestinationBuffer(); - return Storage(buffer, desc.dimensions(), - internal::strides(desc.dimensions()), - /*materialized_in_output=*/true, - /*strided_storage=*/false); - - } else if (desc.destination().kind() == DestinationBuffer::kStrided && - allow_strided_storage) { - Scalar* buffer = desc.destination().template data(); - desc.DropDestinationBuffer(); - return Storage(buffer, desc.dimensions(), desc.destination().strides(), - /*materialized_in_output=*/true, /*strided_storage=*/true); - - } else { - void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); - return Storage(static_cast(mem), desc.dimensions(), - internal::strides(desc.dimensions()), - /*materialized_in_output=*/false, - /*strided_storage=*/false); - } - } - - // Creates a materialized block for the given descriptor from a memory buffer. - template - EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize( - const Scalar* data, const DataDimensions& data_dims, - TensorBlockDesc& desc, TensorBlockScratch& scratch) { - eigen_assert(array_size::value == desc.dimensions().size()); - - // If a tensor block dimensions covers a contiguous block of the underlying - // memory, we can skip block buffer memory allocation, and construct a block - // from existing `data` memory buffer. - // - // Example: (RowMajor layout) - // data_dims: [11, 12, 13, 14] - // desc.dimensions(): [1, 1, 3, 14] - // - // In this case we can construct a TensorBlock starting at - // `data + desc.offset()`, with a `desc.dimensions()` block sizes. - static const bool is_col_major = Layout == ColMajor; - - // Find out how many inner dimensions have a matching size. - int num_matching_inner_dims = 0; - for (int i = 0; i < NumDims; ++i) { - int dim = is_col_major ? i : NumDims - i - 1; - if (data_dims[dim] != desc.dimensions()[dim]) break; - ++num_matching_inner_dims; - } - - // All the outer dimensions must be of size `1`, except a single dimension - // before the matching inner dimension (`3` in the example above). - bool can_use_direct_access = true; - for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { - int dim = is_col_major ? i : NumDims - i - 1; - if (desc.dimension(dim) != 1) { - can_use_direct_access = false; - break; - } - } - - if (can_use_direct_access) { - const Scalar* block_start = data + desc.offset(); - return TensorMaterializedBlock(internal::TensorBlockKind::kView, - block_start, desc.dimensions()); - - } else { - // Reuse destination buffer or allocate new buffer with scratch allocator. - const Storage storage = prepareStorage(desc, scratch); - - typedef internal::TensorBlockIO - TensorBlockIO; - typedef typename TensorBlockIO::Dst TensorBlockIODst; - typedef typename TensorBlockIO::Src TensorBlockIOSrc; - - TensorBlockIOSrc src(internal::strides(Dimensions(data_dims)), - data, desc.offset()); - TensorBlockIODst dst(storage.dimensions(), storage.strides(), - storage.data()); - - TensorBlockIO::Copy(dst, src); - return storage.AsTensorMaterializedBlock(); - } - } - - private: - TensorBlockKind m_kind; - const Scalar* m_data; - Dimensions m_dimensions; - XprType m_expr; - bool m_valid_expr; -}; - -// -------------------------------------------------------------------------- // -// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp -// functor to the blocks produced by the underlying Tensor expression. - -template -class TensorCwiseUnaryBlock { - static const bool NoArgBlockAccess = - internal::is_void::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - TensorCwiseUnaryOp >:: - type XprType; - - typedef typename XprScalar::type Scalar; - - TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor) - : m_arg_block(arg_block), m_functor(functor) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - - XprType expr() const { return XprType(m_arg_block.expr(), m_functor); } - const Scalar* data() const { return NULL; } - void cleanup() { m_arg_block.cleanup(); } - - private: - ArgTensorBlock m_arg_block; - UnaryOp m_functor; -}; - -// -------------------------------------------------------------------------- // -// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp -// functor to the blocks produced by the underlying Tensor expression. - -template -class TensorCwiseBinaryBlock { - static const bool NoArgBlockAccess = - internal::is_void::value || - internal::is_void::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - TensorCwiseBinaryOp >::type - XprType; - - typedef typename XprScalar::type Scalar; - - TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, - const RhsTensorBlock& right_block, - const BinaryOp& functor) - : m_left_block(left_block), - m_right_block(right_block), - m_functor(functor) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - - XprType expr() const { - return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); - } - - const Scalar* data() const { return NULL; } - - void cleanup() { - m_left_block.cleanup(); - m_right_block.cleanup(); - } - - private: - LhsTensorBlock m_left_block; - RhsTensorBlock m_right_block; - BinaryOp m_functor; -}; - -// -------------------------------------------------------------------------- // -// TensorUnaryExprBlock is a lazy tensor expression block that can construct -// an arbitrary tensor expression from a block of the underlying type (this is a -// generalization of the TensorCwiseUnaryBlock for arbitrary expressions). - -template -class TensorUnaryExprBlock { - typedef typename ArgTensorBlock::XprType ArgXprType; - static const bool NoArgBlockAccess = internal::is_void::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - typename BlockFactory::template XprType::type>::type XprType; - - typedef typename XprScalar::type Scalar; - - TensorUnaryExprBlock(const ArgTensorBlock& arg_block, - const BlockFactory& factory) - : m_arg_block(arg_block), m_factory(factory) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - XprType expr() const { return m_factory.expr(m_arg_block.expr()); } - const Scalar* data() const { return NULL; } - void cleanup() { m_arg_block.cleanup(); } - - private: - ArgTensorBlock m_arg_block; - BlockFactory m_factory; -}; - -// -------------------------------------------------------------------------- // -// TensorTernaryExprBlock is a lazy tensor expression block that can construct -// an arbitrary tensor expression from three blocks of the underlying type. - -template -class TensorTernaryExprBlock { - typedef typename Arg1TensorBlock::XprType Arg1XprType; - typedef typename Arg2TensorBlock::XprType Arg2XprType; - typedef typename Arg3TensorBlock::XprType Arg3XprType; - - static const bool NoArgBlockAccess = internal::is_void::value || - internal::is_void::value || - internal::is_void::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - typename BlockFactory::template XprType::type>::type XprType; - - typedef typename XprScalar::type Scalar; - - TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, - const Arg2TensorBlock& arg2_block, - const Arg3TensorBlock& arg3_block, - const BlockFactory& factory) - : m_arg1_block(arg1_block), - m_arg2_block(arg2_block), - m_arg3_block(arg3_block), - m_factory(factory) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - XprType expr() const { - return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), - m_arg3_block.expr()); - } - const Scalar* data() const { return NULL; } - void cleanup() { - m_arg1_block.cleanup(); - m_arg2_block.cleanup(); - m_arg3_block.cleanup(); - } - - private: - Arg1TensorBlock m_arg1_block; - Arg2TensorBlock m_arg2_block; - Arg3TensorBlock m_arg3_block; - BlockFactory m_factory; -}; - -// -------------------------------------------------------------------------- // -// StridedLinearBufferCopy provides a method to copy data between two linear -// buffers with different strides, with optimized paths for scatter/gather. - -template -class StridedLinearBufferCopy { - typedef typename packet_traits::type Packet; - enum { - Vectorizable = packet_traits::Vectorizable, - PacketSize = packet_traits::size - }; - - public: - // Specifying linear copy kind statically gives ~30% speedup for small sizes. - enum class Kind { - Linear = 0, // src_stride == 1 && dst_stride == 1 - Scatter = 1, // src_stride == 1 && dst_stride != 1 - FillLinear = 2, // src_stride == 0 && dst_stride == 1 - FillScatter = 3, // src_stride == 0 && dst_stride != 1 - Gather = 4, // dst_stride == 1 - Random = 5 // everything else - }; - - struct Dst { - Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} - - IndexType offset; - IndexType stride; - Scalar* data; - }; - - struct Src { - Src(IndexType o, IndexType s, const Scalar* d) - : offset(o), stride(s), data(d) {} - - IndexType offset; - IndexType stride; - const Scalar* data; - }; - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, - const Src& src, - const size_t count) { - Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, - src.data); - } - - private: - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const IndexType count, const IndexType dst_offset, - const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, - const IndexType src_offset, const IndexType src_stride, - const Scalar* EIGEN_RESTRICT src_data) { - const Scalar* src = &src_data[src_offset]; - Scalar* dst = &dst_data[dst_offset]; - - if (!Vectorizable) { - for (Index i = 0; i < count; ++i) { - dst[i * dst_stride] = src[i * src_stride]; - } - return; - } - - const IndexType vectorized_size = count - PacketSize; - IndexType i = 0; - - if (kind == StridedLinearBufferCopy::Kind::Linear) { - // ******************************************************************** // - // Linear copy from `src` to `dst`. - const IndexType unrolled_size = count - 4 * PacketSize; - eigen_assert(src_stride == 1 && dst_stride == 1); - for (; i <= unrolled_size; i += 4 * PacketSize) { - for (int j = 0; j < 4; ++j) { - Packet p = ploadu(src + i + j * PacketSize); - pstoreu(dst + i + j * PacketSize, p); - } - } - for (; i <= vectorized_size; i += PacketSize) { - Packet p = ploadu(src + i); - pstoreu(dst + i, p); - } - for (; i < count; ++i) { - dst[i] = src[i]; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { - // Scatter from `src` to `dst`. - eigen_assert(src_stride == 1 && dst_stride != 1); - for (; i <= vectorized_size; i += PacketSize) { - Packet p = ploadu(src + i); - pscatter(dst + i * dst_stride, p, dst_stride); - } - for (; i < count; ++i) { - dst[i * dst_stride] = src[i]; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { - // Fill `dst` with value at `*src`. - eigen_assert(src_stride == 0 && dst_stride == 1); - const IndexType unrolled_size = count - 4 * PacketSize; - Packet p = pload1(src); - for (; i <= unrolled_size; i += 4 * PacketSize) { - for (int j = 0; j < 4; ++j) { - pstoreu(dst + i + j * PacketSize, p); - } - } - for (; i <= vectorized_size; i += PacketSize) { - pstoreu(dst + i, p); - } - for (; i < count; ++i) { - dst[i] = *src; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) { - // Scatter `*src` into `dst`. - eigen_assert(src_stride == 0 && dst_stride != 1); - Packet p = pload1(src); - for (; i <= vectorized_size; i += PacketSize) { - pscatter(dst + i * dst_stride, p, dst_stride); - } - for (; i < count; ++i) { - dst[i * dst_stride] = *src; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::Gather) { - // Gather from `src` into `dst`. - eigen_assert(dst_stride == 1); - for (; i <= vectorized_size; i += PacketSize) { - Packet p = pgather(src + i * src_stride, src_stride); - pstoreu(dst + i, p); - } - for (; i < count; ++i) { - dst[i] = src[i * src_stride]; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::Random) { - // Random. - for (; i < count; ++i) { - dst[i * dst_stride] = src[i * src_stride]; - } - } else { - eigen_assert(false); - } - } -}; - -// -------------------------------------------------------------------------- // -// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. -// It's possible to specify src->dst dimension mapping for the copy operation. -// Dimensions of `dst` specify how many elements have to be copied, for the -// `src` we need to know only stride to navigate through source memory buffer. - -template -class TensorBlockIO { - static const bool IsColMajor = (Layout == ColMajor); - - typedef StridedLinearBufferCopy LinCopy; - - public: - typedef DSizes Dimensions; - typedef DSizes DimensionsMap; - - struct Dst { - Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, - IndexType dst_offset = 0) - : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} - - Dimensions dims; - Dimensions strides; - Scalar* data; - IndexType offset; - }; - - struct Src { - Src(const Dimensions& src_strides, const Scalar* src, - IndexType src_offset = 0) - : strides(src_strides), data(src), offset(src_offset) {} - - Dimensions strides; - const Scalar* data; - IndexType offset; - }; - - // Copies data to `dst` from `src`, using provided dimensions mapping: - // - // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] - // - // Returns the number of copied elements. - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy( - const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) { - // Copy single scalar value from `src` to `dst`. - if (NumDims == 0) { - *(dst.data + dst.offset) = *(src.data + src.offset); - return 1; - } - - // Both `dst` and `src` must have contiguous innermost dimension. We also - // accept the special case with stride '0', because it's used as a trick to - // implement broadcasting. - { - int inner_dim = IsColMajor ? 0 : NumDims - 1; - EIGEN_UNUSED_VARIABLE(inner_dim); - eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); - eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); - } - - // Give a shorter name to `dst_to_src_dim_map`. - const DimensionsMap& dim_map = dst_to_src_dim_map; - - // Do not squeeze reordered inner dimensions. - int num_squeezable_dims = NumSqueezableInnerDims(dim_map); - - // NOTE: We find the innermost dimension (contiguous in memory) in the dst - // block, and we write data linearly into that dimension, reading it from - // the src. If dimensions are reordered, we might end up reading data from - // the src with `stride != 1`. - // - // NOTE: Random-Read/Linear-Write can be up to ~2X faster than - // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 - - // Find the innermost dimension in the dst whose size is not 1. This is the - // effective inner dim. - int num_size_one_inner_dims = 0; - for (int i = 0; i < num_squeezable_dims; ++i) { - const int dst_dim = IsColMajor ? i : NumDims - i - 1; - if (dst.dims[dst_dim] != 1) break; - num_size_one_inner_dims++; - } - - // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. - if (num_size_one_inner_dims == NumDims) { - *(dst.data + dst.offset) = *(src.data + src.offset); - return 1; - } - - // Outermost dimension in the dst with `stride == 1` (contiguous in memory). - const int dst_stride1_dim = IsColMajor - ? num_size_one_inner_dims - : NumDims - num_size_one_inner_dims - 1; - - // Dimension in the src that corresponds to the dst innermost dimension. - const int src_dim_for_dst_stride1_dim = - NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; - - // Size of the innermost dimension (length of contiguous blocks of memory). - IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; - - // Squeeze multiple inner dims into one if they are contiguous in `dst` and - // `src` memory, so we can do less linear copy calls. - for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { - const int dst_dim = IsColMajor ? i : NumDims - i - 1; - const IndexType dst_stride = dst.strides[dst_dim]; - const IndexType src_stride = src.strides[dim_map[dst_dim]]; - if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { - dst_inner_dim_size *= dst.dims[dst_dim]; - ++num_size_one_inner_dims; - } else { - break; - } - } - - // Setup strides to read data from `src` and write to `dst`. - IndexType input_offset = src.offset; - IndexType output_offset = dst.offset; - IndexType input_stride = - NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; - IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; - - const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; - array it; - - // Initialize block iterator state. Squeeze away any dimension of size 1. - int idx = 0; // currently initialized iterator state index - for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { - const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; - if (dst.dims[dst_dim] == 1) continue; - - it[idx].size = dst.dims[dst_dim]; - it[idx].input_stride = src.strides[dim_map[dst_dim]]; - it[idx].output_stride = dst.strides[dst_dim]; - - it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); - it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); - - idx++; - } - - // Iterate copying data from src to dst. - const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize(); - -#define COPY_INNER_DIM(KIND) \ - IndexType num_copied = 0; \ - for (num_copied = 0; num_copied < block_total_size; \ - num_copied += dst_inner_dim_size) { \ - LinCopy::template Run( \ - typename LinCopy::Dst(output_offset, output_stride, dst.data), \ - typename LinCopy::Src(input_offset, input_stride, src.data), \ - dst_inner_dim_size); \ - \ - for (int j = 0; j < idx; ++j) { \ - if (++it[j].count < it[j].size) { \ - input_offset += it[j].input_stride; \ - output_offset += it[j].output_stride; \ - break; \ - } \ - it[j].count = 0; \ - input_offset -= it[j].input_span; \ - output_offset -= it[j].output_span; \ - } \ - } \ - return num_copied; - - if (input_stride == 1 && output_stride == 1) { - COPY_INNER_DIM(LinCopy::Kind::Linear); - } else if (input_stride == 1 && output_stride != 1) { - COPY_INNER_DIM(LinCopy::Kind::Scatter); - } else if (input_stride == 0 && output_stride == 1) { - COPY_INNER_DIM(LinCopy::Kind::FillLinear); - } else if (input_stride == 0 && output_stride != 1) { - COPY_INNER_DIM(LinCopy::Kind::FillScatter); - } else if (output_stride == 1) { - COPY_INNER_DIM(LinCopy::Kind::Gather); - } else { - COPY_INNER_DIM(LinCopy::Kind::Random); - } - -#undef COPY_INNER_DIM - } - - // Copy from `src` to `dst` with an identity src->dst dimension map. Returns - // the number of copied elements. - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, - const Src& src) { - DimensionsMap dst_to_src_map; - for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; - return Copy(dst, src, dst_to_src_map); - } - - private: - struct BlockIteratorState { - BlockIteratorState() - : size(0), - count(0), - input_stride(0), - output_stride(0), - input_span(0), - output_span(0) {} - - IndexType size; - IndexType count; - IndexType input_stride; - IndexType output_stride; - IndexType input_span; - IndexType output_span; - }; - - // Compute how many inner dimensions it's allowed to squeeze when doing IO - // between two tensor blocks. It's safe to squeeze inner dimensions, only - // if they are not reordered. - static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { - int num_squeezable_dims = 0; - for (int i = 0; i < NumDims; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - if (dim_map[dim] != dim) break; - num_squeezable_dims++; - } - return num_squeezable_dims; - } -}; - -// -------------------------------------------------------------------------- // -// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to -// a Tensor block defined by `desc`, backed by a memory buffer at `target`. -// -// Currently there is no way to write from a Tensor expression to a block of -// memory, if dimensions are reordered. If you need to do that, you should -// materialize a Tensor block expression into a memory buffer, and then use -// TensorBlockIO to copy data between two memory buffers with a custom -// `target->src` dimension map (see definition above). -// -// Also currently the innermost dimension of `target` must have a stride '1' -// (contiguous in memory). This restriction could be lifted with a `pscatter`, -// but in practice it's never needed, and there is a similar TensorBlockIO -// workaround for that. -// -// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO -// where `src` is a tensor expression. Explore if it is possible to rewrite IO -// to use expressions instead of pointers, and after that TensorBlockAssignment -// will become an alias to IO. -template -class TensorBlockAssignment { - // We will use coeff/packet path to evaluate block expressions. - typedef TensorEvaluator - TensorBlockEvaluator; - - typedef DSizes Dimensions; - - enum { - Vectorizable = packet_traits::Vectorizable, - PacketSize = packet_traits::size - }; - - template - struct InnerDimAssign { - EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, - const Evaluator& eval, - IndexType eval_offset) { - for (IndexType i = 0; i < count; ++i) { - target[i] = eval.coeff(eval_offset + i); - } - } - }; - - template - struct InnerDimAssign { - EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, - const Evaluator& eval, - IndexType eval_offset) { - typedef typename packet_traits::type Packet; - - const IndexType unrolled_size = count - 4 * PacketSize; - const IndexType vectorized_size = count - PacketSize; - IndexType i = 0; - - for (; i <= unrolled_size; i += 4 * PacketSize) { - for (int j = 0; j < 4; ++j) { - const IndexType idx = eval_offset + i + j * PacketSize; - Packet p = eval.template packet(idx); - pstoreu(target + i + j * PacketSize, p); - } - } - - for (; i <= vectorized_size; i += PacketSize) { - Packet p = eval.template packet(eval_offset + i); - pstoreu(target + i, p); - } - - for (; i < count; ++i) { - target[i] = eval.coeff(eval_offset + i); - } - } - }; - - public: - struct Target { - Target(const Dimensions& target_dims, const Dimensions& target_strides, - Scalar* target_data, IndexType target_offset = 0) - : dims(target_dims), - strides(target_strides), - data(target_data), - offset(target_offset) {} - - Dimensions dims; - Dimensions strides; - Scalar* data; - IndexType offset; - }; - - static Target target(const Dimensions& target_dims, - const Dimensions& target_strides, Scalar* target_data, - IndexType target_offset = 0) { - return Target(target_dims, target_strides, target_data, target_offset); - } - - template - static Target target( - const DSizes& target_dims, - const DSizes& target_strides, - Scalar* target_data, IndexType target_offset = 0) { - // DSizes constructor will do index type promotion if it's safe. - return Target(Dimensions(target_dims), Dimensions(target_strides), - target_data, target_offset); - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Target& target, const TensorBlockExpr& expr) { - // Prepare evaluator for block expression. - DefaultDevice default_device; - TensorBlockEvaluator eval(expr, default_device); - - // Tensor block expression dimension should match destination dimensions. - eigen_assert(dimensions_match(target.dims, eval.dimensions())); - - static const int Layout = TensorBlockEvaluator::Layout; - static const bool is_col_major = Layout == ColMajor; - - // Initialize output inner dimension size based on a layout. - const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); - const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; - IndexType output_inner_dim_size = target.dims[inner_dim_idx]; - - // Target inner dimension stride must be '1'. - eigen_assert(target.strides[inner_dim_idx] == 1); - - // Squeeze multiple inner dims into one if they are contiguous in `target`. - IndexType num_squeezed_dims = 0; - for (Index i = 1; i < NumDims; ++i) { - const Index dim = is_col_major ? i : NumDims - i - 1; - const IndexType target_stride = target.strides[dim]; - - if (output_inner_dim_size == target_stride) { - output_inner_dim_size *= target.dims[dim]; - num_squeezed_dims++; - } else { - break; - } - } - - // Initialize output block iterator state. Dimension in this array are - // always in inner_most -> outer_most order (col major layout). - array it; - - int idx = 0; // currently initialized iterator state index - for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { - const Index dim = is_col_major ? i + 1 : NumDims - i - 2; - - it[idx].count = 0; - it[idx].size = target.dims[dim]; - it[idx].output_stride = target.strides[dim]; - it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); - idx++; - } - - // We read block expression from the beginning, and start writing data to - // `target` at given offset. - IndexType input_offset = 0; - IndexType output_offset = target.offset; - - // Iterate copying data from `eval` to `target`. - for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { - // Assign to `target` at current offset. - InnerDimAssign::Run(target.data + output_offset, - output_inner_dim_size, eval, - input_offset); - - // Move input offset forward by the number of assigned coefficients. - input_offset += output_inner_dim_size; - - // Update index. - for (int j = 0; j < idx; ++j) { - if (++it[j].count < it[j].size) { - output_offset += it[j].output_stride; - break; - } - it[j].count = 0; - output_offset -= it[j].output_span; - } - } - } - - private: - struct BlockIteratorState { - BlockIteratorState() - : count(0), size(0), output_stride(0), output_span(0) {} - - IndexType count; - IndexType size; - IndexType output_stride; - IndexType output_span; - }; -}; - -// -------------------------------------------------------------------------- // - -} // namespace internal -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 3408f90d1..4cfe300eb 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -31,13 +31,12 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorBroadcastingOp EIGEN_DEVICE_REF type; + typedef const TensorBroadcastingOp& type; }; template @@ -55,7 +54,7 @@ struct is_input_scalar > { static const bool value = true; }; #ifndef EIGEN_EMULATE_CXX11_META_H -template +template struct is_input_scalar > { static const bool value = (Sizes::total_size == 1); }; @@ -104,58 +103,27 @@ struct TensorEvaluator, Device> typedef typename TensorEvaluator::Dimensions InputDimensions; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; - protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator wont be standard layout; - bool isCopy, nByOne, oneByN; - public: - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - PreferBlockAccess = true, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - typedef typename internal::remove_const::type ScalarNoConst; - - // We do block based broadcasting using a trick with 2x tensor rank and 0 - // strides. See block method implementation for details. - typedef DSizes BroadcastDimensions; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename TensorEvaluator::TensorBlock - ArgTensorBlock; - - typedef typename internal::TensorMaterializedBlock - TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) - : isCopy(false), nByOne(false), oneByN(false), - m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_broadcast(op.broadcast()),m_impl(op.expression(), device) { - // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); const InputDimensions& input_dims = m_impl.dimensions(); - isCopy = true; + const Broadcast& broadcast = op.broadcast(); for (int i = 0; i < NumDims; ++i) { eigen_assert(input_dims[i] > 0); - m_dimensions[i] = input_dims[i] * m_broadcast[i]; - if (m_broadcast[i] != 1) { - isCopy = false; - } + m_dimensions[i] = input_dims[i] * broadcast[i]; } if (static_cast(Layout) == static_cast(ColMajor)) { @@ -173,57 +141,15 @@ struct TensorEvaluator, Device> m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1]; } } - - if (input_dims[0] == 1) { - oneByN = true; - for (int i = 1; i < NumDims; ++i) { - if (m_broadcast[i] != 1) { - oneByN = false; - break; - } - } - } else if (input_dims[NumDims-1] == 1) { - nByOne = true; - for (int i = 0; i < NumDims-1; ++i) { - if (m_broadcast[i] != 1) { - nByOne = false; - break; - } - } - } - - // Handle special format like NCHW, its input shape is '[1, N..., 1]' and - // broadcast shape is '[N, 1..., N]' - if (!oneByN && !nByOne) { - if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) { - nByOne = true; - oneByN = true; - for (int i = 1; i < NumDims-1; ++i) { - if (m_broadcast[i] != 1) { - nByOne = false; - oneByN = false; - break; - } - } - } - } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -235,24 +161,16 @@ struct TensorEvaluator, Device> } if (static_cast(Layout) == static_cast(ColMajor)) { - if (isCopy) { - return m_impl.coeff(index); - } else { - return coeffColMajor(index); - } + return coeffColMajor(index); } else { - if (isCopy) { - return m_impl.coeff(index); - } else { - return coeffRowMajor(index); - } + return coeffRowMajor(index); } } // TODO: attempt to speed this up. The integer divisions and modulo are slow - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const + { Index inputIndex = 0; - EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -277,17 +195,12 @@ struct TensorEvaluator, Device> inputIndex += (index % m_impl.dimensions()[0]); } } - return inputIndex; + return m_impl.coeff(inputIndex); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const { - return m_impl.coeff(indexColMajor(index)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const { Index inputIndex = 0; - EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -302,22 +215,17 @@ struct TensorEvaluator, Device> } index -= idx * m_outputStrides[i]; } - if (internal::index_statically_eq(NumDims - 1, 1)) { - eigen_assert(index < m_impl.dimensions()[NumDims - 1]); + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index < m_impl.dimensions()[NumDims-1]); inputIndex += index; } else { - if (internal::index_statically_eq(NumDims - 1, 1)) { - eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0); + if (internal::index_statically_eq(NumDims-1, 1)) { + eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0); } else { - inputIndex += (index % m_impl.dimensions()[NumDims - 1]); + inputIndex += (index % m_impl.dimensions()[NumDims-1]); } } - return inputIndex; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const - { - return m_impl.coeff(indexRowMajor(index)); + return m_impl.coeff(inputIndex); } template @@ -328,148 +236,9 @@ struct TensorEvaluator, Device> } if (static_cast(Layout) == static_cast(ColMajor)) { - if (isCopy) { - #ifdef EIGEN_GPU_COMPILE_PHASE - // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing - // unaligned loads here. The reason is unclear though. - return m_impl.template packet(index); - #else - return m_impl.template packet(index); - #endif - } else if (oneByN && !nByOne) { - return packetNByOne(index); - } else if (!oneByN && nByOne) { - return packetOneByN(index); - } else if (oneByN && nByOne) { - return packetOneByNByOne(index); - } else { - return packetColMajor(index); - } + return packetColMajor(index); } else { - if (isCopy) { - #ifdef EIGEN_GPU_COMPILE_PHASE - // See above. - return m_impl.template packet(index); - #else - return m_impl.template packet(index); - #endif - } else if (oneByN && !nByOne) { - return packetOneByN(index); - } else if (!oneByN && nByOne) { - return packetNByOne(index); - } else if (oneByN && nByOne) { - return packetOneByNByOne(index); - } else { - return packetRowMajor(index); - } - } - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne - (Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - Index startDim, endDim; - Index inputIndex, outputOffset, batchedIndex; - - if (static_cast(Layout) == static_cast(ColMajor)) { - startDim = NumDims - 1; - endDim = 1; - } else { - startDim = 0; - endDim = NumDims - 2; - } - - batchedIndex = index % m_outputStrides[startDim]; - inputIndex = batchedIndex / m_outputStrides[endDim]; - outputOffset = batchedIndex % m_outputStrides[endDim]; - - if (outputOffset + PacketSize <= m_outputStrides[endDim]) { - values[0] = m_impl.coeff(inputIndex); - return internal::pload1(values); - } else { - EIGEN_UNROLL_LOOP - for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { - if (outputOffset + cur < m_outputStrides[endDim]) { - values[i] = m_impl.coeff(inputIndex); - } else { - ++inputIndex; - inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex); - values[i] = m_impl.coeff(inputIndex); - outputOffset = 0; - cur = 0; - } - } - return internal::pload(values); - } - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - Index dim, inputIndex; - - if (static_cast(Layout) == static_cast(ColMajor)) { - dim = NumDims - 1; - } else { - dim = 0; - } - - inputIndex = index % m_inputStrides[dim]; - if (inputIndex + PacketSize <= m_inputStrides[dim]) { - return m_impl.template packet(inputIndex); - } else { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - if (inputIndex > m_inputStrides[dim]-1) { - inputIndex = 0; - } - values[i] = m_impl.coeff(inputIndex++); - } - return internal::pload(values); - } - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const - { - EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - Index dim, inputIndex, outputOffset; - - if (static_cast(Layout) == static_cast(ColMajor)) { - dim = 1; - } else { - dim = NumDims - 2; - } - - inputIndex = index / m_outputStrides[dim]; - outputOffset = index % m_outputStrides[dim]; - if (outputOffset + PacketSize <= m_outputStrides[dim]) { - values[0] = m_impl.coeff(inputIndex); - return internal::pload1(values); - } else { - EIGEN_UNROLL_LOOP - for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { - if (outputOffset + cur < m_outputStrides[dim]) { - values[i] = m_impl.coeff(inputIndex); - } else { - values[i] = m_impl.coeff(++inputIndex); - outputOffset = 0; - cur = 0; - } - } - return internal::pload(values); + return packetRowMajor(index); } } @@ -484,7 +253,6 @@ struct TensorEvaluator, Device> const Index originalIndex = index; Index inputIndex = 0; - EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -520,13 +288,8 @@ struct TensorEvaluator, Device> } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); - EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { - if (innermostLoc + i < m_impl.dimensions()[0]) { - values[i] = m_impl.coeff(inputIndex+i); - } else { - values[i] = coeffColMajor(originalIndex+i); - } + values[i] = coeffColMajor(originalIndex+i); } PacketReturnType rslt = internal::pload(values); return rslt; @@ -542,7 +305,6 @@ struct TensorEvaluator, Device> const Index originalIndex = index; Index inputIndex = 0; - EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (internal::index_statically_eq(i, 1)) { @@ -578,13 +340,8 @@ struct TensorEvaluator, Device> } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; values[0] = m_impl.coeff(inputIndex); - EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize; ++i) { - if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) { - values[i] = m_impl.coeff(inputIndex+i); - } else { - values[i] = coeffRowMajor(originalIndex+i); - } + values[i] = coeffRowMajor(originalIndex+i); } PacketReturnType rslt = internal::pload(values); return rslt; @@ -594,8 +351,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { double compute_cost = TensorOpCost::AddCost(); - if (!isCopy && NumDims > 0) { - EIGEN_UNROLL_LOOP + if (NumDims > 0) { for (int i = NumDims - 1; i > 0; --i) { compute_cost += TensorOpCost::DivCost(); if (internal::index_statically_eq(i, 1)) { @@ -616,472 +372,14 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large - // tensors. But this might need further tuning. - const size_t target_size = m_device.firstLevelCacheSize(); - return internal::TensorBlockResourceRequirements::merge( - m_impl.getResourceRequirements(), - internal::TensorBlockResourceRequirements::skewed(target_size)); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - BlockBroadcastingParams params = blockBroadcastingParams(desc); - - if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) { - return emptyBlock(); - } - - // Prepare storage for the materialized broadcasting result. - const typename TensorBlock::Storage block_storage = - TensorBlock::prepareStorage(desc, scratch); - ScalarNoConst* materialized_output = block_storage.data(); - - // We potentially will need to materialize input blocks. - size_t materialized_input_size = 0; - ScalarNoConst* materialized_input = NULL; - - // Initialize block broadcating iterator state for outer dimensions (outer - // with regard to bcast dimension). Dimension in this array are always in - // inner_most -> outer_most order (col major layout). - array it; - int idx = 0; - - for (int i = params.inner_dim_count + 1; i < NumDims; ++i) { - const Index dim = IsColMajor ? i : NumDims - 1 - i; - it[idx].size = params.output_dims[dim]; - it[idx].count = 0; - it[idx].output_stride = m_outputStrides[dim]; - it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); - idx++; - } - - // Write output into the beginning of `materialized_output`. - Index output_offset = 0; - - // We will fill output block by broadcasting along the bcast dim, and - // iterating over outer dimension. - const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize(); - - for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) { - ScalarNoConst* bcast_output = materialized_output + num_output_coeffs; - Index bcast_offset = desc.offset() + output_offset; - - // Broadcast along the bcast dimension. - num_output_coeffs += BroadcastBlockAlongBcastDim( - params, bcast_offset, scratch, bcast_output, &materialized_input, - &materialized_input_size); - - // Switch to the next outer dimension. - for (int j = 0; j < idx; ++j) { - if (++it[j].count < it[j].size) { - output_offset += it[j].output_stride; - break; - } - it[j].count = 0; - output_offset -= it[j].output_span; - } - } - - return block_storage.AsTensorMaterializedBlock(); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } const TensorEvaluator& impl() const { return m_impl; } Broadcast functor() const { return m_broadcast; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind( - cl::sycl::handler& cgh) const { - m_impl.bind(cgh); - } -#endif - private: - static const bool IsColMajor = - static_cast(Layout) == static_cast(ColMajor); - // We will build a general case block broadcasting on top of broadcasting - // primitive that will do broadcasting only for the inner dimension(s) along - // the first dimension smaller than the input size (it's called `bcast_dim`). - // - // Example: - // dim: 0 1 2 (ColMajor) - // input size: [9, 3, 6] - // block size: [9, 2, 6] - // - // We will compute broadcasted block by iterating over the outer dimensions - // before `bcast_dim` (only dimension `2` in this example) and computing - // broadcasts along the `bcast_dim` (dimension `1` in this example). - - // BlockBroadcastingParams holds precomputed parameters for broadcasting a - // single block along the broadcasting dimension. Sizes and strides along the - // `bcast_dim` might be invalid, they will be adjusted later in - // `BroadcastBlockAlongBcastDim`. - struct BlockBroadcastingParams { - Dimensions input_dims; // input expression dimensions - Dimensions output_dims; // output block sizes - Dimensions output_strides; // output block strides - - int inner_dim_count; // count inner dimensions matching in size - int bcast_dim; // broadcasting dimension index - Index bcast_dim_size; // broadcasting dimension size - Index inner_dim_size; // inner dimensions size - - // Block sizes and strides for the input block where all dimensions before - // `bcast_dim` are equal to `1`. - Dimensions input_block_sizes; - Dimensions input_block_strides; - - // Block sizes and strides for blocks with extra dimensions and strides `0`. - BroadcastDimensions bcast_block_sizes; - BroadcastDimensions bcast_block_strides; - BroadcastDimensions bcast_input_strides; - }; - - struct BlockBroadcastingIteratorState { - Index size; - Index count; - Index output_stride; - Index output_span; - }; - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams - blockBroadcastingParams(TensorBlockDesc& desc) const { - BlockBroadcastingParams params; - - params.input_dims = Dimensions(m_impl.dimensions()); - - // Output block sizes and strides. - params.output_dims = desc.dimensions(); - params.output_strides = internal::strides(params.output_dims); - - // Find the broadcasting dimension (first dimension with output size smaller - // that the input size). - params.bcast_dim = 0; - params.bcast_dim_size = 1; - params.inner_dim_size = 1; - - // Count the number of inner dimensions that have the same size in the block - // and in the broadcast expression. - params.inner_dim_count = 0; - - for (int i = 0; i < NumDims; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - - if (params.output_dims[dim] == m_dimensions[dim]) { - params.inner_dim_size *= params.output_dims[dim]; - ++params.inner_dim_count; - continue; - } - - // First non-matching dimension is the broadcasting dimension. - eigen_assert(params.output_dims[dim] < m_dimensions[dim]); - params.bcast_dim = dim; - params.bcast_dim_size = params.output_dims[dim]; - break; - } - - // Calculate the input block size for looking into the input. - for (int i = 0; i < params.inner_dim_count; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - params.input_block_sizes[dim] = params.input_dims[dim]; - } - for (int i = params.inner_dim_count; i < NumDims; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - params.input_block_sizes[dim] = 1; - } - params.input_block_strides = - internal::strides(params.input_block_sizes); - - // Broadcast with the 0-stride trick: Create 1 extra dim for each - // broadcast, set the input stride to 0. - // - // When ColMajor: - // - // - bcast_block_sizes: - // [d_0, b_0, d_1, b_1, ...] - // - // - bcast_block_strides: - // [output_block_strides[0], output_block_strides[0] * d_0, - // output_block_strides[1], output_block_strides[1] * d_1, - // ...] - // - // - bcast_input_strides: - // [input_block_strides[0], 0, - // input_block_strides[1], 0, - // ...]. - // - for (int i = 0; i < params.inner_dim_count; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - - const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1; - const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1; - - params.bcast_block_sizes[copy_dim] = params.input_dims[dim]; - params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim]; - params.bcast_block_strides[copy_dim] = params.output_strides[dim]; - params.bcast_block_strides[broadcast_dim] = - params.output_strides[dim] * params.input_dims[dim]; - params.bcast_input_strides[copy_dim] = params.input_block_strides[dim]; - params.bcast_input_strides[broadcast_dim] = 0; - } - - for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) { - const int dim = IsColMajor ? i : 2 * NumDims - i - 1; - params.bcast_block_sizes[dim] = 1; - params.bcast_block_strides[dim] = 0; - params.bcast_input_strides[dim] = 0; - } - - return params; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const { - DSizes dimensions; - for (int i = 0; i < NumDims; ++i) dimensions[i] = 0; - return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim( - BlockBroadcastingParams params, Index bcast_offset, - TensorBlockScratch& scratch, ScalarNoConst* materialized_output, - ScalarNoConst** materialized_input, - size_t* materialized_input_size) const { - if (params.bcast_dim_size == 1) { - // We just need one block read using the ready-set values above. - return BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, 0, scratch, - materialized_output, materialized_input, materialized_input_size); - - } else if (params.input_dims[params.bcast_dim] == 1) { - // Broadcast bcast dimension (< NumDims) by bcast_dim_size. - const int broadcast_bcast_dim = - IsColMajor ? 2 * params.inner_dim_count + 1 - : 2 * NumDims - 2 * params.inner_dim_count - 2; - - params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size; - params.bcast_input_strides[broadcast_bcast_dim] = 0; - params.bcast_block_strides[broadcast_bcast_dim] = - params.output_strides[params.bcast_dim]; - - return BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, 0, scratch, - materialized_output, materialized_input, materialized_input_size); - - } else { - // Keep track of the total number of the coefficients written to the - // output block. - Index num_output_coeffs = 0; - - // The general case. Let's denote the output block as - // - // x[..., a:a+bcast_dim_size, :, ..., :] - // - // where a:a+bcast_dim_size is a slice on the bcast_dim dimension - // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3 - // sub-blocks: - // - // (1) a:b, where b is the smallest multiple of - // input_dims[bcast_dim_start] in [a, a+bcast_dim_size]. - // - // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start] - // in [a, a+bcast_dim_size]. - // - // (3) c:a+bcast_dim_size . - // - // Or, when b and c do not exist, we just need to process the whole block - // together. - - // Find a. - const Index bcast_dim_left_index = - bcast_offset / m_outputStrides[params.bcast_dim]; - - // Find b and c. - const Index input_bcast_dim_size = params.input_dims[params.bcast_dim]; - - // First multiple after a. This is b when <= bcast_dim_left_index + - // bcast_dim_size. - const Index first_multiple = - divup(bcast_dim_left_index, input_bcast_dim_size) * - input_bcast_dim_size; - - if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) { - // b exists, so does c. Find it. - const Index last_multiple = - (bcast_dim_left_index + params.bcast_dim_size) / - input_bcast_dim_size * input_bcast_dim_size; - const int copy_bcast_dim = - IsColMajor ? 2 * params.inner_dim_count - : 2 * NumDims - 2 * params.inner_dim_count - 1; - const int broadcast_bcast_dim = - IsColMajor ? 2 * params.inner_dim_count + 1 - : 2 * NumDims - 2 * params.inner_dim_count - 2; - - if (first_multiple > bcast_dim_left_index) { - const Index head_size = first_multiple - bcast_dim_left_index; - params.input_block_sizes[params.bcast_dim] = head_size; - params.bcast_block_sizes[copy_bcast_dim] = head_size; - params.bcast_input_strides[copy_bcast_dim] = - params.input_block_strides[params.bcast_dim]; - params.bcast_block_strides[copy_bcast_dim] = - params.output_strides[params.bcast_dim]; - params.bcast_block_sizes[broadcast_bcast_dim] = 1; - params.bcast_input_strides[broadcast_bcast_dim] = 0; - params.bcast_block_strides[broadcast_bcast_dim] = - params.output_strides[params.bcast_dim] * - params.input_dims[params.bcast_dim]; - - num_output_coeffs += BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, 0, scratch, - materialized_output, materialized_input, materialized_input_size); - } - if (first_multiple < last_multiple) { - params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size; - params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size; - params.bcast_input_strides[copy_bcast_dim] = - params.input_block_strides[params.bcast_dim]; - params.bcast_block_strides[copy_bcast_dim] = - params.output_strides[params.bcast_dim]; - params.bcast_block_sizes[broadcast_bcast_dim] = - (last_multiple - first_multiple) / input_bcast_dim_size; - params.bcast_input_strides[broadcast_bcast_dim] = 0; - params.bcast_block_strides[broadcast_bcast_dim] = - params.output_strides[params.bcast_dim] * - params.input_dims[params.bcast_dim]; - const Index offset = (first_multiple - bcast_dim_left_index) * - m_outputStrides[params.bcast_dim]; - - num_output_coeffs += BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, offset, scratch, - materialized_output, materialized_input, materialized_input_size); - } - if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) { - const Index tail_size = - bcast_dim_left_index + params.bcast_dim_size - last_multiple; - params.input_block_sizes[params.bcast_dim] = tail_size; - params.bcast_block_sizes[copy_bcast_dim] = tail_size; - params.bcast_input_strides[copy_bcast_dim] = - params.input_block_strides[params.bcast_dim]; - params.bcast_block_strides[copy_bcast_dim] = - params.output_strides[params.bcast_dim]; - params.bcast_block_sizes[broadcast_bcast_dim] = 1; - params.bcast_input_strides[broadcast_bcast_dim] = 0; - params.bcast_block_strides[broadcast_bcast_dim] = - params.output_strides[params.bcast_dim] * - params.input_dims[params.bcast_dim]; - const Index offset = (last_multiple - bcast_dim_left_index) * - m_outputStrides[params.bcast_dim]; - - num_output_coeffs += BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, offset, scratch, - materialized_output, materialized_input, materialized_input_size); - } - } else { - // b and c do not exist. - const int copy_bcast_dim = - IsColMajor ? 2 * params.inner_dim_count - : 2 * NumDims - 2 * params.inner_dim_count - 1; - params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size; - params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size; - params.bcast_input_strides[copy_bcast_dim] = - params.input_block_strides[params.bcast_dim]; - params.bcast_block_strides[copy_bcast_dim] = - params.output_strides[params.bcast_dim]; - - num_output_coeffs += BroadcastBlock( - params.input_block_sizes, params.input_block_strides, - params.bcast_block_sizes, params.bcast_block_strides, - params.bcast_input_strides, bcast_offset, 0, scratch, - materialized_output, materialized_input, materialized_input_size); - } - - return num_output_coeffs; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock( - const Dimensions& input_block_sizes, - const Dimensions& input_block_strides, - const BroadcastDimensions& bcast_block_sizes, - const BroadcastDimensions& bcast_block_strides, - const BroadcastDimensions& bcast_input_strides, Index bcast_offset, - Index offset, TensorBlockScratch& scratch, - ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, - size_t* materialized_input_size) const { - // ---------------------------------------------------------------------- // - // Tensor block descriptor for reading block from the input. - const Index input_offset = bcast_offset + offset; - TensorBlockDesc input_desc( - IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset), - input_block_sizes); - - ArgTensorBlock input_block = m_impl.block(input_desc, scratch); - - // ---------------------------------------------------------------------- // - // Materialize input block into a temporary memory buffer only if it's not - // already available in the arg block. - const ScalarNoConst* input_buffer = NULL; - - if (input_block.data() != NULL) { - // Input block already has raw data, there is no need to materialize it. - input_buffer = input_block.data(); - - } else { - // Otherwise we have to do block assignment into a temporary buffer. - - // Maybe reuse previously allocated buffer, or allocate a new one with a - // scratch allocator. - const size_t input_total_size = input_block_sizes.TotalSize(); - if (*materialized_input == NULL || - *materialized_input_size < input_total_size) { - *materialized_input_size = input_total_size; - void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar)); - *materialized_input = static_cast(mem); - } - - typedef internal::TensorBlockAssignment< - ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index> - TensorBlockAssignment; - - TensorBlockAssignment::Run( - TensorBlockAssignment::target(input_block_sizes, input_block_strides, - *materialized_input), - input_block.expr()); - - input_buffer = *materialized_input; - } - - // ---------------------------------------------------------------------- // - // Copy data from materialized input block to the materialized output, using - // given broadcast strides (strides with zeroes). - typedef internal::TensorBlockIO - TensorBlockIO; - - typename TensorBlockIO::Src src(bcast_input_strides, input_buffer); - typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides, - materialized_output + offset); - - return TensorBlockIO::Copy(dst, src); - } - -protected: - const Device EIGEN_DEVICE_REF m_device; - const typename internal::remove_reference::type m_broadcast; + protected: + const Broadcast m_broadcast; Dimensions m_dimensions; array m_outputStrides; array m_inputStrides; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 5b28e706d..1ba7ef170 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -32,13 +32,12 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions - 1; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorChippingOp EIGEN_DEVICE_REF type; + typedef const TensorChippingOp& type; }; template @@ -51,7 +50,6 @@ template struct DimensionId { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) { - EIGEN_UNUSED_VARIABLE(dim); eigen_assert(dim == DimId); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { @@ -138,48 +136,19 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; + enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets. - IsAligned = false, - Layout = TensorEvaluator::Layout, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - // Chipping of outer-most dimension is a trivial operation, because we can - // read and write directly from the underlying tensor using single offset. - IsOuterChipping = (static_cast(Layout) == ColMajor && DimId == NumInputDims - 1) || - (static_cast(Layout) == RowMajor && DimId == 0), - // Chipping inner-most dimension. - IsInnerChipping = (static_cast(Layout) == ColMajor && DimId == 0) || - (static_cast(Layout) == RowMajor && DimId == NumInputDims - 1), - // Prefer block access if the underlying expression prefers it, otherwise - // only if chipping is not trivial. - PreferBlockAccess = TensorEvaluator::PreferBlockAccess || - !IsOuterChipping, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - typedef typename internal::remove_const::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef internal::TensorBlockDescriptor - ArgTensorBlockDesc; - typedef typename TensorEvaluator::TensorBlock - ArgTensorBlock; - - typedef typename internal::TensorMaterializedBlock - TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { @@ -216,7 +185,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -236,20 +205,21 @@ struct TensorEvaluator, Device> EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - if (isInnerChipping()) { + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); Index inputIndex = index * m_inputStride + m_inputOffset; EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = m_impl.coeff(inputIndex); inputIndex += m_inputStride; } PacketReturnType rslt = internal::pload(values); return rslt; - } else if (isOuterChipping()) { - // m_stride is always greater than index, so let's avoid the integer division. + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims - 1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); return m_impl.template packet(index + m_inputOffset); } else { @@ -261,7 +231,6 @@ struct TensorEvaluator, Device> } else { // Cross the stride boundary. Fallback to slow path. EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index); ++index; @@ -294,100 +263,29 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - const size_t target_size = m_device.lastLevelCacheSize(); - return internal::TensorBlockResourceRequirements::merge( - internal::TensorBlockResourceRequirements::skewed(target_size), - m_impl.getResourceRequirements()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool root_of_expr_ast = false) const { - const Index chip_dim = m_dim.actualDim(); - - DSizes input_block_dims; - for (int i = 0; i < NumInputDims; ++i) { - input_block_dims[i] - = i < chip_dim ? desc.dimension(i) - : i > chip_dim ? desc.dimension(i - 1) - : 1; - } - - ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims); - - // Try to reuse destination buffer for materializing argument block. - if (desc.HasDestinationBuffer()) { - DSizes arg_destination_strides; - for (int i = 0; i < NumInputDims; ++i) { - arg_destination_strides[i] - = i < chip_dim ? desc.destination().strides()[i] - : i > chip_dim ? desc.destination().strides()[i - 1] - : 0; // for dimensions of size `1` stride should never be used. - } - - arg_desc.template AddDestinationBuffer( - desc.destination().template data(), - arg_destination_strides); - } - - ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast); - if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); - - if (arg_block.data() != NULL) { - // Forward argument block buffer if possible. - return TensorBlock(arg_block.kind(), arg_block.data(), - desc.dimensions()); - - } else { - // Assign argument block expression to a buffer. - - // Prepare storage for the materialized chipping result. - const typename TensorBlock::Storage block_storage = - TensorBlock::prepareStorage(desc, scratch); - - typedef internal::TensorBlockAssignment< - ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index> - TensorBlockAssignment; - - TensorBlockAssignment::Run( - TensorBlockAssignment::target( - arg_desc.dimensions(), - internal::strides(arg_desc.dimensions()), - block_storage.data()), - arg_block.expr()); - - return block_storage.AsTensorMaterializedBlock(); - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { - typename Storage::Type result = constCast(m_impl.data()); - if (isOuterChipping() && result) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { + CoeffReturnType* result = const_cast(m_impl.data()); + if (((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumDims) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) && + result) { return result + m_inputOffset; } else { return NULL; } } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex; - if (isInnerChipping()) { + if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == 0) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(m_stride == 1); inputIndex = index * m_inputStride + m_inputOffset; - } else if (isOuterChipping()) { - // m_stride is always greater than index, so let's avoid the integer - // division. + } else if ((static_cast(Layout) == static_cast(ColMajor) && m_dim.actualDim() == NumInputDims-1) || + (static_cast(Layout) == static_cast(RowMajor) && m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(m_stride > index); inputIndex = index + m_inputOffset; } else { @@ -399,25 +297,13 @@ struct TensorEvaluator, Device> return inputIndex; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const { - return IsInnerChipping || - (static_cast(Layout) == ColMajor && m_dim.actualDim() == 0) || - (static_cast(Layout) == RowMajor && m_dim.actualDim() == NumInputDims - 1); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const { - return IsOuterChipping || - (static_cast(Layout) == ColMajor && m_dim.actualDim() == NumInputDims-1) || - (static_cast(Layout) == RowMajor && m_dim.actualDim() == 0); - } - Dimensions m_dimensions; Index m_stride; Index m_inputOffset; Index m_inputStride; TensorEvaluator m_impl; const internal::DimensionId m_dim; - const Device EIGEN_DEVICE_REF m_device; + const Device& m_device; }; @@ -435,20 +321,14 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; + static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::RawAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -463,19 +343,20 @@ struct TensorEvaluator, Device> { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - if (this->isInnerChipping()) { + if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == 0) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { // m_stride is equal to 1, so let's avoid the integer division. eigen_assert(this->m_stride == 1); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); Index inputIndex = index * this->m_inputStride + this->m_inputOffset; - EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->m_impl.coeffRef(inputIndex) = values[i]; inputIndex += this->m_inputStride; } - } else if (this->isOuterChipping()) { - // m_stride is always greater than index, so let's avoid the integer division. + } else if ((static_cast(this->Layout) == static_cast(ColMajor) && this->m_dim.actualDim() == NumInputDims-1) || + (static_cast(this->Layout) == static_cast(RowMajor) && this->m_dim.actualDim() == 0)) { + // m_stride is aways greater than index, so let's avoid the integer division. eigen_assert(this->m_stride > index); this->m_impl.template writePacket(index + this->m_inputOffset, x); } else { @@ -488,7 +369,6 @@ struct TensorEvaluator, Device> // Cross stride boundary. Fallback to slow path. EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; internal::pstore(values, x); - EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { this->coeffRef(index) = values[i]; ++index; @@ -496,36 +376,6 @@ struct TensorEvaluator, Device> } } } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( - const TensorBlockDesc& desc, const TensorBlock& block) { - assert(this->m_impl.data() != NULL); - - const Index chip_dim = this->m_dim.actualDim(); - - DSizes input_block_dims; - for (int i = 0; i < NumInputDims; ++i) { - input_block_dims[i] = i < chip_dim ? desc.dimension(i) - : i > chip_dim ? desc.dimension(i - 1) - : 1; - } - - typedef TensorReshapingOp, - const typename TensorBlock::XprType> - TensorBlockExpr; - - typedef internal::TensorBlockAssignment - TensorBlockAssign; - - TensorBlockAssign::Run( - TensorBlockAssign::target( - input_block_dims, - internal::strides(this->m_impl.dimensions()), - this->m_impl.data(), this->srcCoeff(desc.offset())), - block.expr().reshape(input_block_dims)); - } }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 5968ff4b7..59bf90d93 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -37,8 +37,6 @@ struct traits > static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; enum { Flags = 0 }; - typedef typename conditional::val, - typename traits::PointerType, typename traits::PointerType>::type PointerType; }; template @@ -119,23 +117,13 @@ struct TensorEvaluator::type PacketReturnType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess && - TensorEvaluator::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess || - TensorEvaluator::PreferBlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) { @@ -189,7 +177,7 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { left_index = subs[0]; - EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; } } else { left_index = subs[NumDims - 1]; - EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { left_index += (subs[i] % left_dims[i]) * m_leftStrides[i]; } @@ -245,13 +231,11 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { right_index = subs[0]; - EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; } } else { right_index = subs[NumDims - 1]; - EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { right_index += (subs[i] % right_dims[i]) * m_rightStrides[i]; } @@ -264,12 +248,11 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = PacketType::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; - EIGEN_UNROLL_LOOP for (int i = 0; i < packetSize; ++i) { values[i] = coeff(index+i); } @@ -292,15 +275,7 @@ struct TensorEvaluator XprType; typedef typename Base::Dimensions Dimensions; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess && - TensorEvaluator::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess || - TensorEvaluator::PreferBlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) : Base(op, device) { @@ -377,7 +344,7 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - const int packetSize = PacketType::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 605d72c8d..20b29e5fd 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -21,8 +21,8 @@ namespace Eigen { */ namespace internal { -template -struct traits > +template +struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename gebp_traits::type, @@ -38,305 +38,53 @@ struct traits::type _RhsNested; // From NumDims below. - static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; + static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; static const int Layout = traits::Layout; - typedef typename conditional::val, - typename traits::PointerType, - typename traits::PointerType>::type - PointerType; enum { Flags = 0 }; }; -template -struct eval, Eigen::Dense> +template +struct eval, Eigen::Dense> { - typedef const TensorContractionOp& type; + typedef const TensorContractionOp& type; }; -template -struct nested, 1, typename eval >::type> +template +struct nested, 1, typename eval >::type> { - typedef TensorContractionOp type; + typedef TensorContractionOp type; }; -template -struct traits, Device_> > { +template +struct traits, Device_> > { typedef Indices_ Indices; typedef LeftArgType_ LeftArgType; typedef RightArgType_ RightArgType; - typedef OutputKernelType_ OutputKernelType; typedef Device_ Device; // From NumDims below. static const int NumDimensions = traits::NumDimensions + traits::NumDimensions - 2 * array_size::value; }; -// Helper class to allocate and deallocate temporary memory for packed buffers. -template -struct TensorContractionBlockMemAllocator { - typedef void* BlockMemHandle; - - template - EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm, - const Index bk, - const Index bn, - LhsScalar** lhs_block, - RhsScalar** rhs_block) { - eigen_assert(lhs_block); - eigen_assert(rhs_block); - BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); - char* block_mem = static_cast(d.allocate(sz.lhs_size + sz.rhs_size)); - eigen_assert(block_mem); - *lhs_block = reinterpret_cast(block_mem); - *rhs_block = reinterpret_cast(block_mem + sz.lhs_size); - return block_mem; - } - - template - EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices( - Device& d, const Index bm, const Index bk, const Index bn, - const Index num_lhs, const Index num_rhs, const Index num_slices, - std::vector* lhs_blocks, - std::vector* rhs_blocks) { - eigen_assert(num_slices > 0); - eigen_assert(num_lhs >= 0 && num_rhs >= 0); - eigen_assert(num_lhs == 0 || lhs_blocks); - eigen_assert(num_rhs == 0 || rhs_blocks); - BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); - void* block_mem = d.allocate( - (num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices); - eigen_assert(block_mem); - char* mem = static_cast(block_mem); - - for (Index x = 0; x < num_slices; x++) { - if (num_lhs > 0) lhs_blocks[x].resize(num_lhs); - for (Index m = 0; m < num_lhs; m++) { - lhs_blocks[x][m] = reinterpret_cast(mem); - mem += sz.lhs_size; - } - if (num_rhs > 0) rhs_blocks[x].resize(num_rhs); - for (Index n = 0; n < num_rhs; n++) { - rhs_blocks[x][n] = reinterpret_cast(mem); - mem += sz.rhs_size; - } - } - - return block_mem; - } - - template - EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { - d.deallocate(handle); - } - - private: - struct BlockSizes { - Index lhs_size; - Index rhs_size; - }; - EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm, - const Index bk, - const Index bn) { - Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); - BlockSizes sz; - sz.lhs_size = divup(bm * bk * sizeof(LhsScalar), align) * align; - sz.rhs_size = divup(bn * bk * sizeof(RhsScalar), align) * align; - return sz; - } -}; - -// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in -// ColMajor storage order. This property is guaranteed by the -// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack -// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix -// multiplication for these blocks. Default tensor contraction uses -// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see -// GeneralBlocPanelKernel.h for details). -// -// By specializing contraction kernels we can use other low level libraries to -// perform matrix multiplication, and still rely on Eigen contraction evaluator. -// This also includes full support in TensorContractionThreadPool, assuming that -// underlying gemm do not use it's own threading. -// -// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of -// multiplication, lhs tensor and rhs tensor respectively. -// -// - StorageIndex - index type for the tensor expressions. In practice almost -// always is Eigen::Index. -// -// - OutputMapper provides access to the memory of the output matrix. In -// practice it's always column major blas_data_mapper (it must be of ResScalar -// type). -// -// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional -// view into the Lhs/Rhs tensor expressions. In practice it's -// TensorContractionInputMapper, or some specialization of it based on the -// type of tensor expression (e.g. TensorImagePatchOp has optimized input -// mapper). -template -struct TensorContractionKernel { - // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C` - // (otherwise beta should be always equal to 1). - enum { HasBeta = false }; - - EIGEN_DEVICE_FUNC - TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_, - StorageIndex bm_, StorageIndex bk_, StorageIndex bn_) - : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {} - - // Pack blocks of Lhs and Rhs into contiguous blocks in memory. - typedef LhsScalar* LhsBlock; - typedef RhsScalar* RhsBlock; - - // Packed Lhs/Rhs block memory allocator. - typedef TensorContractionBlockMemAllocator - BlockMemAllocator; - typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle; - - typedef typename internal::gebp_traits Traits; - - typedef internal::gemm_pack_lhs< - LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr, - Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor> - LhsPacker; - - typedef internal::gemm_pack_rhs - RhsPacker; - - typedef internal::gebp_kernel - GebpKernel; - - template - EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block, - RhsBlock* rhs_block) { - return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block); - } - - template - EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices( - Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs, - const StorageIndex num_slices, std::vector* lhs_blocks, - std::vector* rhs_blocks) { - return BlockMemAllocator::allocateSlices( - d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks); - } - - template - EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) { - BlockMemAllocator::deallocate(d, handle); - } - - EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs( - LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper, - const StorageIndex depth, const StorageIndex rows) { - LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0, - /*offset*/ 0); - } - - EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs( - RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper, - const StorageIndex depth, const StorageIndex cols) { - RhsPacker()(*rhsBlock, data_mapper, depth, cols); - } - - EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke( - const OutputMapper& output_mapper, const LhsBlock& lhsBlock, - const RhsBlock& rhsBlock, const StorageIndex rows, - const StorageIndex depth, const StorageIndex cols, - const ResScalar alpha, const ResScalar beta) { - // Default GEBP kernel does not support beta. - eigen_assert(beta == ResScalar(1)); - static const int kComputeStrideFromBlockDimensions = -1; - GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha, - /*strideA*/ kComputeStrideFromBlockDimensions, - /*strideB*/ kComputeStrideFromBlockDimensions, - /*offsetA*/ 0, /*offsetB*/ 0); - } - - private: - // These are dimensions of the original Tensors, and selected block sizes. The - // actual block sizes passed to all function above might be smaller because of - // the partial blocks at the end. - const StorageIndex m; - const StorageIndex k; - const StorageIndex n; - const StorageIndex bm; - const StorageIndex bk; - const StorageIndex bn; -}; - } // end namespace internal -// Tensor contraction params that should enable to get from output matrix -// 2-dimensional coordinates to the output tensor dimensions. -struct TensorContractionParams { - // TensorContraction evaluator assumes that both tensors are in ColMajor - // layout, if tensors are in RowMajor evaluator swap lhs with rhs. - bool swapped_arguments; -}; - -// Output kernel allows to fuse operations into the tensor contraction. -// -// Examples: -// 1. Elementwise Relu transformation following Conv2D. -// 2. AddBias to the Conv2D output channels dimension. -// -// The NoOpOutputKernel implements an output kernel that does absolutely nothing. -struct NoOpOutputKernel { - /** - * Tensor contraction evaluator calls this kernel after finishing each block - * of output matrix. Output blocks belong to the 2-dimensional output tensor. - * - * TensorContractionParams contains contraction dimensions information - * required to map output 2-d space into the expected output tensor space - * (potentially higher dimensional). - * - * \param[in] output_mapper Access to output tensor memory - * \param[in] params Tensor contraction parameters - * \param[in] i Index of a first row available through output_mapper - * \param[in] j Index of a first column available through output_mapper - * \param[in] num_rows Number of available rows - * \param[in] num_cols Number of available columns - */ - template - EIGEN_ALWAYS_INLINE void operator()( - const internal::blas_data_mapper& output_mapper, - const TensorContractionParams& params, Index i, - Index j, Index num_rows, Index num_cols) const { - EIGEN_UNUSED_VARIABLE(output_mapper); - EIGEN_UNUSED_VARIABLE(params); - EIGEN_UNUSED_VARIABLE(i); - EIGEN_UNUSED_VARIABLE(j); - EIGEN_UNUSED_VARIABLE(num_rows); - EIGEN_UNUSED_VARIABLE(num_cols); - } -}; - -template -class TensorContractionOp : public TensorBase, ReadOnlyAccessors> +template +class TensorContractionOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; typedef typename internal::gebp_traits::ResScalar CoeffReturnType; + typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType; typedef typename Eigen::internal::nested::type Nested; typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp( - const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims, - const OutputKernelType& output_kernel = OutputKernelType()) - : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims), - m_output_kernel(output_kernel) {} + const LhsXprType& lhs, const RhsXprType& rhs, const Indices& dims) + : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims) {} EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; } @@ -350,14 +98,10 @@ class TensorContractionOp : public TensorBase::type& rhsExpression() const { return m_rhs_xpr; } - EIGEN_DEVICE_FUNC - const OutputKernelType& outputKernel() const { return m_output_kernel; } - protected: typename LhsXprType::Nested m_lhs_xpr; typename RhsXprType::Nested m_rhs_xpr; const Indices m_indices; - const OutputKernelType m_output_kernel; }; @@ -367,31 +111,22 @@ struct TensorContractionEvaluatorBase typedef typename internal::traits::Indices Indices; typedef typename internal::traits::LeftArgType LeftArgType; typedef typename internal::traits::RightArgType RightArgType; - typedef typename internal::traits::OutputKernelType OutputKernelType; typedef typename internal::traits::Device Device; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = true, - PacketAccess = (PacketType::size > 1), - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true + IsAligned = true, + PacketAccess = (internal::unpacket_traits::size > 1), + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - // Most of the code is assuming that both input tensors are ColMajor. If the // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: // If we want to compute A * B = C, where A is LHS and B is RHS, the code @@ -401,9 +136,6 @@ struct TensorContractionEvaluatorBase typedef typename internal::conditional< static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - typedef TensorEvaluator LeftEvaluatorType; - typedef TensorEvaluator RightEvaluatorType; - static const int LDims = internal::array_size::Dimensions>::value; static const int RDims = @@ -419,15 +151,14 @@ struct TensorContractionEvaluatorBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) - : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), op.rhsExpression()), device), - m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), - op.rhsExpression(), op.lhsExpression()), device), + m_rightImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), + op.rhsExpression(), op.lhsExpression()), device), m_device(device), - m_output_kernel(op.outputKernel()), m_result(NULL) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == - static_cast(TensorEvaluator::Layout)), + static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -502,7 +233,7 @@ struct TensorContractionEvaluatorBase // dimensions and right non-contracting dimensions. m_lhs_inner_dim_contiguous = true; int dim_idx = 0; - Index nocontract_idx = 0; + unsigned int nocontract_idx = 0; for (int i = 0; i < LDims; i++) { // find if we are contracting on index i of left tensor @@ -592,140 +323,64 @@ struct TensorContractionEvaluatorBase numext::swap(m_dimensions[i], m_dimensions[j]); } } - - // A set of parameters that will allow output kernel to get from output - // tensor dimensions (i, j) into the original tensor dimensions. - // TODO(ezhulenev): Add parameters required to infer output tensor index for - // more complex contractions than 2x2 on internal dimension. - m_tensor_contraction_params.swapped_arguments = static_cast(Layout) == RowMajor; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { evalTo(data); return false; } else { - m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); evalTo(m_result); return true; } } -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType dest, EvalSubExprsCallback done) { - m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { - m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { - if (dest) { - evalToAsync(dest, [done]() { done(false); }); - } else { - m_result = static_cast( - m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - evalToAsync(m_result, [done]() { done(true); }); - } - }); - }); - } -#endif // EIGEN_USE_THREADS - -#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ - if (this->m_lhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD ARGS; \ - } else { \ - METHOD ARGS; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD ARGS; \ - } else { \ - METHOD ARGS; \ - } \ - } \ - } else { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD ARGS; \ - } else { \ - METHOD ARGS; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - METHOD ARGS; \ - } else { \ - METHOD ARGS; \ - } \ - } \ - } - -#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ - if (this->m_lhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD ARGS)->FN; \ - } else { \ - (new METHOD ARGS)->FN; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD ARGS)->FN; \ - } else { \ - (new METHOD ARGS)->FN; \ - } \ - } \ - } else { \ - if (this->m_rhs_inner_dim_contiguous) { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD ARGS)->FN; \ - } else { \ - (new METHOD ARGS)->FN; \ - } \ - } else { \ - if (this->m_rhs_inner_dim_reordered) { \ - (new METHOD ARGS)->FN; \ - } else { \ - (new METHOD ARGS)->FN; \ - } \ - } \ - } - EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { - static_cast(this)->template evalProduct(buffer); - } - -#ifdef EIGEN_USE_THREADS - template - void evalToAsync(Scalar* buffer, EvalToCallback done) const { - static_cast(this) - ->template evalProductAsync(buffer, - std::move(done)); - } -#endif // EIGEN_USE_THREADS - - template - void evalProductSequential(Scalar* buffer) const { - if (this->m_j_size == 1) { - this->template evalGemv(buffer); - } else { - this->template evalGemm(buffer); + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + static_cast(this)->template evalProduct(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + static_cast(this)->template evalProduct(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + static_cast(this)->template evalProduct(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + static_cast(this)->template evalProduct(buffer); + } + else { + static_cast(this)->template evalProduct(buffer); + } + } } } template - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - void evalGemv(Scalar* buffer) const { + EIGEN_DEVICE_FUNC void evalGemv(Scalar* buffer) const { const Index rows = m_i_size; const Index cols = m_k_size; @@ -763,41 +418,12 @@ struct TensorContractionEvaluatorBase internal::general_matrix_vector_product::run( rows, cols, lhs, rhs, buffer, resIncr, alpha); - - typedef internal::blas_data_mapper OutputMapper; - m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params, - static_cast(0), static_cast(0), rows, - static_cast(1)); } template - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - void evalGemm(Scalar* buffer) const { + EIGEN_DEVICE_FUNC void evalGemm(Scalar* buffer) const { // columns in left side, rows in right side const Index k = this->m_k_size; - this->template evalGemmPartial(buffer, 0, k, 1); - } - - template - EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel( - Scalar* buffer, Index k_start, Index k_end, int num_threads) const { - evalGemmPartial(buffer, k_start, k_end, - num_threads); - } - - template - EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const { - eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size); - // columns in slice on left side, rows on right side - const Index k_slice = k_end - k_start; // rows in left side const Index m = this->m_i_size; @@ -805,9 +431,16 @@ struct TensorContractionEvaluatorBase // columns in right side const Index n = this->m_j_size; - // define data mappers for Lhs and Rhs + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + // define mr, nr, and all of my data mapper types typedef typename internal::remove_const::type LhsScalar; typedef typename internal::remove_const::type RhsScalar; + typedef typename internal::gebp_traits Traits; + + const Index nr = Traits::nr; + const Index mr = Traits::mr; typedef TensorEvaluator LeftEvaluator; typedef TensorEvaluator RightEvaluator; @@ -829,9 +462,11 @@ struct TensorContractionEvaluatorBase typedef internal::blas_data_mapper OutputMapper; - typedef internal::TensorContractionKernel< - Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper> - TensorContractionKernel; + // Declare GEBP packing and kernel structs + internal::gemm_pack_lhs pack_lhs; + internal::gemm_pack_rhs pack_rhs; + + internal::gebp_kernel gebp; // initialize data mappers LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, @@ -843,69 +478,39 @@ struct TensorContractionEvaluatorBase OutputMapper output(buffer, m); // Sizes of the blocks to load in cache. See the Goto paper for details. - internal::TensorContractionBlocking - blocking(k_slice, m, n, num_threads); + internal::TensorContractionBlocking blocking(k, m, n, 1); const Index kc = blocking.kc(); const Index mc = numext::mini(m, blocking.mc()); const Index nc = numext::mini(n, blocking.nc()); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; - typedef typename TensorContractionKernel::LhsBlock LhsBlock; - typedef typename TensorContractionKernel::RhsBlock RhsBlock; - - LhsBlock blockA; - RhsBlock blockB; - - TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc); - - typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; - const BlockMemHandle packed_mem = - kernel.allocate(this->m_device, &blockA, &blockB); - - // If a contraction kernel does not support beta, explicitly initialize - // output buffer with zeroes. - if (!TensorContractionKernel::HasBeta) { - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - } + LhsScalar* blockA = static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar))); + RhsScalar* blockB = static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar))); for(Index i2=0; i2= k_end) { - m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2, - actual_mc, actual_nc); - } + gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0); } } } - kernel.deallocate(this->m_device, packed_mem); + this->m_device.deallocate(blockA); + this->m_device.deallocate(blockB); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -931,9 +536,9 @@ struct TensorContractionEvaluatorBase return internal::ploadt(m_result + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { return m_result; } -protected: + protected: // Prevent assignment TensorContractionEvaluatorBase& operator = (const TensorContractionEvaluatorBase&); Dimensions m_dimensions; @@ -955,25 +560,22 @@ protected: Index m_j_size; Index m_k_size; - TensorContractionParams m_tensor_contraction_params; - TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; - const Device EIGEN_DEVICE_REF m_device; - OutputKernelType m_output_kernel; - EvaluatorPointerType m_result; + const Device& m_device; + Scalar* m_result; }; // evaluator for default device -template -struct TensorEvaluator, Device> : +template +struct TensorEvaluator, Device> : public TensorContractionEvaluatorBase< - TensorEvaluator, Device> > { - typedef TensorEvaluator, Device> Self; + TensorEvaluator, Device> > { + typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -1010,9 +612,14 @@ struct TensorEvaluator - void evalProduct(Scalar* buffer) const { - TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer)); + template + EIGEN_DEVICE_FUNC void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + this->template evalGemm(buffer); } }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h index 974feb0ad..5cf7b4f71 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h @@ -21,28 +21,14 @@ enum { // Default Blocking Strategy -template +template class TensorContractionBlocking { public: - /* - adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h` - requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h` - which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in `GeneralBlockPanelKernel.h` - which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` - (else HIPCC will error out) + typedef typename LhsMapper::Scalar LhsScalar; + typedef typename RhsMapper::Scalar RhsScalar; - However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h` - results in NVCC erroring out with the following error - - ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901: - dynamic initialization is not supported for function-scope static variables within a __device__/__global__ function - */ - - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1) : + EIGEN_DEVICE_FUNC TensorContractionBlocking(Index k, Index m, Index n, Index num_threads = 1) : kc_(k), mc_(m), nc_(n) { if (ShardingType == ShardByCol) { @@ -51,22 +37,19 @@ class TensorContractionBlocking { else { computeProductBlockingSizes(kc_, nc_, mc_, num_threads); } - - const int rhs_packet_size = internal::packet_traits::size; - kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ? - kc_ : (kc_ / rhs_packet_size) * rhs_packet_size; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index kc() const { return kc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index mc() const { return mc_; } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index nc() const { return nc_; } private: - StorageIndex kc_; - StorageIndex mc_; - StorageIndex nc_; + Index kc_; + Index mc_; + Index nc_; }; + } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 3f315fedc..d65dbb40f 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -1,6 +1,1391 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014-2015 Benoit Steiner +// Copyright (C) 2015 Navdeep Jaitly +// Copyright (C) 2014 Eric Martin +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#if defined(__clang__) || defined(__GNUC__) -#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file" -#endif +#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H -#include "TensorContractionGpu.h" +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +namespace Eigen { + +template +__device__ EIGEN_STRONG_INLINE void +EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, + const Index m_size, const Index n_size, const Index k_size) { + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + // declare and initialize 64 registers for output 8x8 block + + // prefetch registers + Scalar lhs_pf0; + Scalar lhs_pf1; + Scalar lhs_pf2; + Scalar lhs_pf3; + Scalar lhs_pf4; + Scalar lhs_pf5; + Scalar lhs_pf6; + Scalar lhs_pf7; + + Scalar rhs_pf0; + Scalar rhs_pf1; + Scalar rhs_pf2; + Scalar rhs_pf3; + Scalar rhs_pf4; + Scalar rhs_pf5; + Scalar rhs_pf6; + Scalar rhs_pf7; + + // shared memory is formatted + // (contract idx in block, nocontract idx in block, block idx) + // where block idx is column major. This transposition limits the number of + // bank conflicts when reading the LHS. The core idea is that since the contracting + // index is shared by both sides, then the contracting index should be in threadIdx.x. + + // On the LHS, we pad each row inside of each block with an extra element. This makes + // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts + // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. + + // On the RHS we just add 8 padding elements to the end of each block. This gives no bank + // conflicts on writes and also none on reads. + + // storage indices + const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; + const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; + + const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; + const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; + const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; + const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; + const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; + const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; + const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; + const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; + + const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; + const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; + const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; + const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; + const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; + const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; + const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; + const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; + + // in the loading code, the following variables are important: + // threadIdx.x: the vertical position in an 8x8 block + // threadIdx.y: the vertical index of the 8x8 block in the grid + // threadIdx.z: the horizontal position in an 8x8 block + // k: the horizontal index of the 8x8 block in the grid + // + // The k parameter is implicit (it was the loop counter for a loop that went + // from 0 to <8, but now that loop is unrolled in the below code. + + const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; + const Index lhs_vert = base_m + load_idx_vert; + +#define prefetchIntoRegisters(base_k) \ + { \ + lhs_pf0 = conv(0); \ + lhs_pf1 = conv(0); \ + lhs_pf2 = conv(0); \ + lhs_pf3 = conv(0); \ + lhs_pf4 = conv(0); \ + lhs_pf5 = conv(0); \ + lhs_pf6 = conv(0); \ + lhs_pf7 = conv(0); \ + \ + rhs_pf0 = conv(0); \ + rhs_pf1 = conv(0); \ + rhs_pf2 = conv(0); \ + rhs_pf3 = conv(0); \ + rhs_pf4 = conv(0); \ + rhs_pf5 = conv(0); \ + rhs_pf6 = conv(0); \ + rhs_pf7 = conv(0); \ + \ + if (!needs_edge_check || lhs_vert < m_size) { \ + const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ + const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ + const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ + const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ + const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ + const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ + const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ + const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ + \ + if (!needs_edge_check || lhs_horiz_7 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ + } else if (lhs_horiz_6 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ + } else if (lhs_horiz_5 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ + } else if (lhs_horiz_4 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ + } else if (lhs_horiz_3 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ + } else if (lhs_horiz_2 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ + } else if (lhs_horiz_1 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ + } else if (lhs_horiz_0 < k_size) { \ + lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ + } \ + } \ + \ + const Index rhs_vert = base_k + load_idx_vert; \ + if (!needs_edge_check || rhs_vert < k_size) { \ + const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ + const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ + const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ + const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ + const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ + const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ + const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ + const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ + \ + if (rhs_horiz_7 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ + } else if (rhs_horiz_6 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ + } else if (rhs_horiz_5 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ + } else if (rhs_horiz_4 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ + } else if (rhs_horiz_3 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ + } else if (rhs_horiz_2 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ + } else if (rhs_horiz_1 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ + } else if (rhs_horiz_0 < n_size) { \ + rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ + } \ + } \ + } \ + +#define writeRegToShmem(_) \ + lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ + rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ + \ + lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ + rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ + \ + lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ + rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ + \ + lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ + rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ + \ + lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ + rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ + \ + lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ + rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ + \ + lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ + rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ + \ + lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ + rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ + + // declare and initialize result array +#define res(i, j) _res_##i##j +#define initResultRow(i) \ + Scalar res(i, 0) = conv(0); \ + Scalar res(i, 1) = conv(0); \ + Scalar res(i, 2) = conv(0); \ + Scalar res(i, 3) = conv(0); \ + Scalar res(i, 4) = conv(0); \ + Scalar res(i, 5) = conv(0); \ + Scalar res(i, 6) = conv(0); \ + Scalar res(i, 7) = conv(0); \ + + internal::scalar_cast_op conv; + initResultRow(0); + initResultRow(1); + initResultRow(2); + initResultRow(3); + initResultRow(4); + initResultRow(5); + initResultRow(6); + initResultRow(7); +#undef initResultRow + + for (Index base_k = 0; base_k < k_size; base_k += 64) { + // wait for previous iteration to finish with shmem. Despite common sense, + // the code is a bit faster with this here then at bottom of loop + __syncthreads(); + + prefetchIntoRegisters(base_k); + writeRegToShmem(); + + #undef prefetchIntoRegisters + #undef writeRegToShmem + + // wait for shared mem packing to be done before starting computation + __syncthreads(); + + // compute 8x8 matrix product by outer product. This involves packing one column + // of LHS and one row of RHS into registers (takes 16 registers). + +#define lcol(i) _lcol##i + Scalar lcol(0); + Scalar lcol(1); + Scalar lcol(2); + Scalar lcol(3); + Scalar lcol(4); + Scalar lcol(5); + Scalar lcol(6); + Scalar lcol(7); + +#define rrow(j) _rrow##j + Scalar rrow(0); + Scalar rrow(1); + Scalar rrow(2); + Scalar rrow(3); + Scalar rrow(4); + Scalar rrow(5); + Scalar rrow(6); + Scalar rrow(7); + + // Now x corresponds to k, y to m, and z to n + const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; + const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; + +#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] +#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] + +#define loadData(i, j) \ + lcol(0) = lhs_element(0, j); \ + rrow(0) = rhs_element(i, 0); \ + lcol(1) = lhs_element(1, j); \ + rrow(1) = rhs_element(i, 1); \ + lcol(2) = lhs_element(2, j); \ + rrow(2) = rhs_element(i, 2); \ + lcol(3) = lhs_element(3, j); \ + rrow(3) = rhs_element(i, 3); \ + lcol(4) = lhs_element(4, j); \ + rrow(4) = rhs_element(i, 4); \ + lcol(5) = lhs_element(5, j); \ + rrow(5) = rhs_element(i, 5); \ + lcol(6) = lhs_element(6, j); \ + rrow(6) = rhs_element(i, 6); \ + lcol(7) = lhs_element(7, j); \ + rrow(7) = rhs_element(i, 7); \ + +#define computeCol(j) \ + res(0, j) += lcol(0) * rrow(j); \ + res(1, j) += lcol(1) * rrow(j); \ + res(2, j) += lcol(2) * rrow(j); \ + res(3, j) += lcol(3) * rrow(j); \ + res(4, j) += lcol(4) * rrow(j); \ + res(5, j) += lcol(5) * rrow(j); \ + res(6, j) += lcol(6) * rrow(j); \ + res(7, j) += lcol(7) * rrow(j); \ + +#define computePass(i) \ + loadData(i, i); \ + \ + computeCol(0); \ + computeCol(1); \ + computeCol(2); \ + computeCol(3); \ + computeCol(4); \ + computeCol(5); \ + computeCol(6); \ + computeCol(7); \ + + computePass(0); + computePass(1); + computePass(2); + computePass(3); + computePass(4); + computePass(5); + computePass(6); + computePass(7); + +#undef lcol +#undef rrow +#undef lhs_element +#undef rhs_element +#undef loadData +#undef computeCol +#undef computePass + } // end loop over k + + // we've now iterated over all of the large (ie width 64) k blocks and + // accumulated results in registers. At this point thread (x, y, z) contains + // the sum across all big k blocks of the product of little k block of index (x, y) + // with block of index (y, z). To compute the final output, we need to reduce + // the 8 threads over y by summation. +#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) + +#define reduceRow(i, mask) \ + shuffleInc(i, 0, mask); \ + shuffleInc(i, 1, mask); \ + shuffleInc(i, 2, mask); \ + shuffleInc(i, 3, mask); \ + shuffleInc(i, 4, mask); \ + shuffleInc(i, 5, mask); \ + shuffleInc(i, 6, mask); \ + shuffleInc(i, 7, mask); \ + +#define reduceMatrix(mask) \ + reduceRow(0, mask); \ + reduceRow(1, mask); \ + reduceRow(2, mask); \ + reduceRow(3, mask); \ + reduceRow(4, mask); \ + reduceRow(5, mask); \ + reduceRow(6, mask); \ + reduceRow(7, mask); \ + + // actually perform the reduction, now each thread of index (_, y, z) + // contains the correct values in its registers that belong in the output + // block + reduceMatrix(1); + reduceMatrix(2); + reduceMatrix(4); + +#undef shuffleInc +#undef reduceRow +#undef reduceMatrix + + // now we need to copy the 64 values into main memory. We can't split work + // among threads because all variables are in registers. There's 2 ways + // to do this: + // (1) have 1 thread do 64 writes from registers into global memory + // (2) have 1 thread do 64 writes into shared memory, and then 8 threads + // each do 8 writes into global memory. We can just overwrite the shared + // memory from the problem we just solved. + // (2) is slightly faster than (1) due to less branching and more ILP + + // TODO: won't yield much gain, but could just use currently unused shared mem + // and then we won't have to sync + // wait for shared mem to be out of use + __syncthreads(); + +#define writeResultShmem(i, j) \ + lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ + +#define writeRow(i) \ + writeResultShmem(i, 0); \ + writeResultShmem(i, 1); \ + writeResultShmem(i, 2); \ + writeResultShmem(i, 3); \ + writeResultShmem(i, 4); \ + writeResultShmem(i, 5); \ + writeResultShmem(i, 6); \ + writeResultShmem(i, 7); \ + + if (threadIdx.x == 0) { + writeRow(0); + writeRow(1); + writeRow(2); + writeRow(3); + writeRow(4); + writeRow(5); + writeRow(6); + writeRow(7); + } +#undef writeResultShmem +#undef writeRow + + const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + + if (threadIdx.x < max_i_write) { + if (max_j_write == 8) { + // TODO: can i trade bank conflicts for coalesced writes? + Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; + Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; + Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; + Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; + Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; + Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; + Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; + Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; + + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; + } else { +#pragma unroll 7 + for (int j = 0; j < max_j_write; j++) { + Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; + output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; + } + } + } +#undef res +} + + +template +__global__ void +__launch_bounds__(512) +EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ Scalar lhs_shmem[72 * 64]; + __shared__ Scalar rhs_shmem[72 * 64]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size && base_n + 63 < n_size) { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } else { + EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][16], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, rhs_pf0; + + float4 results[4]; + for (int i=0; i < 4; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + +#define prefetch_lhs(reg, row, col) \ + if (!CHECK_LHS_BOUNDARY) { \ + if (col < k_size) { \ + reg =lhs.loadPacket(row, col); \ + } \ + } else { \ + if (col < k_size) { \ + if (row + 3 < m_size) { \ + reg =lhs.loadPacket(row, col); \ + } else if (row + 2 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + reg.z =lhs(row + 2, col); \ + } else if (row + 1 < m_size) { \ + reg.x =lhs(row + 0, col); \ + reg.y =lhs(row + 1, col); \ + } else if (row < m_size) { \ + reg.x =lhs(row + 0, col); \ + } \ + } \ + } \ + + + Index lhs_vert = base_m+threadIdx.x*4; + + for (Index k = 0; k < k_size; k += 16) { + lhs_pf0 = internal::pset1(0); + rhs_pf0 = internal::pset1(0); + + Index lhs_horiz = threadIdx.y+k; + prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) + + Index rhs_vert = k+(threadIdx.x%4)*4; + Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; + + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } else { + if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + float x1, x2 ; + // the following can be a bitwise operation..... some day. + if((threadIdx.x%8) < 4) { + x1 = rhs_pf0.y; + x2 = rhs_pf0.w; + } else { + x1 = rhs_pf0.x; + x2 = rhs_pf0.z; + } + x1 = __shfl_xor(x1, 4); + x2 = __shfl_xor(x2, 4); + if((threadIdx.x%8) < 4) { + rhs_pf0.y = x1; + rhs_pf0.w = x2; + } else { + rhs_pf0.x = x1; + rhs_pf0.z = x2; + } + + // We have 64 features. + // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. + // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. + // ... + // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 + // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 + // ... + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); + rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); + + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // ... + // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) + // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) + // ... + + lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); + + +#define add_vals(fl1, fl2, fr1, fr2)\ + results[0].x += fl1.x * fr1.x;\ + results[0].y += fl1.y * fr1.x;\ + results[0].z += fl2.x * fr1.x;\ + results[0].w += fl2.y * fr1.x;\ +\ + results[1].x += fl1.x * fr1.y;\ + results[1].y += fl1.y * fr1.y;\ + results[1].z += fl2.x * fr1.y;\ + results[1].w += fl2.y * fr1.y;\ +\ + results[2].x += fl1.x * fr2.x;\ + results[2].y += fl1.y * fr2.x;\ + results[2].z += fl2.x * fr2.x;\ + results[2].w += fl2.y * fr2.x;\ +\ + results[3].x += fl1.x * fr2.y;\ + results[3].y += fl1.y * fr2.y;\ + results[3].z += fl2.x * fr2.y;\ + results[3].w += fl2.y * fr2.y;\ + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 16; koff ++) { + // 32 x threads. + float2 fl1 = lhs_shmem2[koff][threadIdx.x]; + float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; + + int start_feature = threadIdx.y * 4; + float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; + + add_vals(fl1, fl2, fr1, fr2) + } + __syncthreads(); + } + +#undef prefetch_lhs +#undef add_vals + + Index horiz_base = threadIdx.y*4+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + // CHECK LHS + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 4; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK RHS + /* + int ncols_rem = fminf(n_size- horiz_base, 4); + for (int i = 0; i < ncols_rem; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + }*/ + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 4; i++) { + if (horiz_base+i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__device__ EIGEN_STRONG_INLINE void +EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, float2 lhs_shmem2[][32], + float2 rhs_shmem2[][8], const Index m_size, + const Index n_size, const Index k_size, + const Index base_m, const Index base_n) { + typedef float Scalar; + + // prefetch registers + float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; + float4 rhs_pf0, rhs_pf1; + + float4 results[8]; + for (int i=0; i < 8; i++) { + results[i].x = results[i].y = results[i].z = results[i].w = 0; + } + + + Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; + for (Index k = 0; k < k_size; k += 32) { + lhs_pf0 = internal::pset1(0); + lhs_pf1 = internal::pset1(0); + lhs_pf2 = internal::pset1(0); + lhs_pf3 = internal::pset1(0); + + rhs_pf0 = internal::pset1(0); + rhs_pf1 = internal::pset1(0); + + if (!CHECK_LHS_BOUNDARY) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else { + // just CHECK_LHS_BOUNDARY + if (lhs_vert + 3 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + lhs_pf3 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + lhs_pf2 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + lhs_pf1 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0 =lhs.loadPacket(lhs_vert, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 2 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); + } + } else if (lhs_vert + 1 < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); + } + } else if (lhs_vert < m_size) { + if ((threadIdx.y/4+k+24) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); + } else if ((threadIdx.y/4+k+16) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); + } else if ((threadIdx.y/4+k+8) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); + } else if ((threadIdx.y/4+k) < k_size) { + lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); + } + } + } + __syncthreads(); + Index rhs_vert = k+threadIdx.x*4; + Index rhs_horiz0 = threadIdx.y*2+base_n; + Index rhs_horiz1 = threadIdx.y*2+1+base_n; + if (!CHECK_RHS_BOUNDARY) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (rhs_vert + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else { + if (rhs_horiz1 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + rhs_pf1 = rhs.loadPacket(rhs_vert, rhs_horiz1); + } else if (rhs_vert + 2 < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); + } else if (k+threadIdx.x*4 + 1 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); + } else if (k+threadIdx.x*4 < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); + } + } else if (rhs_horiz0 < n_size) { + if ((rhs_vert + 3) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0 = rhs.loadPacket(rhs_vert, rhs_horiz0); + } else if ((rhs_vert + 2) < k_size) { + // just CHECK_RHS_BOUNDARY + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); + } else if ((rhs_vert + 1) < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); + } else if (rhs_vert < k_size) { + rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); + } + } + } + __syncthreads(); + // Loaded. Do computation + // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. + // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. + // .. + // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 + rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); + // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. + // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. + // .. + rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); + // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. + // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. + rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); + // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. + // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. + rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); + + // LHS. + // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) + // ... + // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) + + +#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ + results[0].x += a_feat1.x * f1.x;\ + results[1].x += a_feat1.x * f1.y;\ + results[2].x += a_feat1.x * f2.x;\ + results[3].x += a_feat1.x * f2.y;\ + results[4].x += a_feat1.x * f3.x;\ + results[5].x += a_feat1.x * f3.y;\ + results[6].x += a_feat1.x * f4.x;\ + results[7].x += a_feat1.x * f4.y;\ +\ + results[0].y += a_feat1.y * f1.x;\ + results[1].y += a_feat1.y * f1.y;\ + results[2].y += a_feat1.y * f2.x;\ + results[3].y += a_feat1.y * f2.y;\ + results[4].y += a_feat1.y * f3.x;\ + results[5].y += a_feat1.y * f3.y;\ + results[6].y += a_feat1.y * f4.x;\ + results[7].y += a_feat1.y * f4.y;\ +\ + results[0].z += a_feat2.x * f1.x;\ + results[1].z += a_feat2.x * f1.y;\ + results[2].z += a_feat2.x * f2.x;\ + results[3].z += a_feat2.x * f2.y;\ + results[4].z += a_feat2.x * f3.x;\ + results[5].z += a_feat2.x * f3.y;\ + results[6].z += a_feat2.x * f4.x;\ + results[7].z += a_feat2.x * f4.y;\ +\ + results[0].w += a_feat2.y * f1.x;\ + results[1].w += a_feat2.y * f1.y;\ + results[2].w += a_feat2.y * f2.x;\ + results[3].w += a_feat2.y * f2.y;\ + results[4].w += a_feat2.y * f3.x;\ + results[5].w += a_feat2.y * f3.y;\ + results[6].w += a_feat2.y * f4.x;\ + results[7].w += a_feat2.y * f4.y;\ + + lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); + lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); + lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); + lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); + + lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); + lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); + lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); + lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); + + __syncthreads(); + + // Do the multiplies. + #pragma unroll + for (int koff = 0; koff < 32; koff ++) { + float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; + float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; + + // first feature is at (threadIdx.y/4) * 8 last is at start + 8. + int start_feature = (threadIdx.y / 4) * 8; + + float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; + float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; + float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; + float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; + + add_vals(a3, a4, br1, br2, br3, br4) + } + __syncthreads(); + } // end loop over k + + + __syncthreads(); + Index horiz_base = (threadIdx.y/4)*8+base_n; + if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (!CHECK_RHS_BOUNDARY) { + if (lhs_vert + 3 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } else if (lhs_vert + 2 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + } + } else if (lhs_vert + 1 < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + } + } else if (lhs_vert < m_size) { + for (int i = 0; i < 8; i++) { + output(lhs_vert, horiz_base + i) = results[i].x; + } + } + } else if (!CHECK_LHS_BOUNDARY) { + // CHECK BOUNDARY_B + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + output(lhs_vert, horiz_base + i) = results[i].x; + output(lhs_vert + 1, horiz_base + i) = results[i].y; + output(lhs_vert + 2, horiz_base + i) = results[i].z; + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } else { + // CHECK both boundaries. + for (int i = 0; i < 8; i++) { + if (horiz_base + i < n_size) { + if (lhs_vert < m_size) + output(lhs_vert, horiz_base + i) = results[i].x; + if (lhs_vert + 1 < m_size) + output(lhs_vert + 1, horiz_base + i) = results[i].y; + if (lhs_vert + 2 < m_size) + output(lhs_vert + 2, horiz_base + i) = results[i].z; + if (lhs_vert + 3 < m_size) + output(lhs_vert + 3, horiz_base + i) = results[i].w; + } + } + } +} + + +template +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[64*32]; + __shared__ float2 rhs_shmem[128*8]; + + typedef float2 LHS_MEM[64][32]; + typedef float2 RHS_MEM[128][8]; + + typedef float2 LHS_MEM16x16[32][16]; + typedef float2 RHS_MEM16x16[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 128 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + bool check_rhs = (base_n + 63) >= n_size; + bool check_lhs128 = (base_m + 127) >= m_size; + + if (!check_rhs) { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } else { + if (!check_lhs128) { + // >= 128 rows left + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal( + lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); + } + } +} + +template +__global__ void +__launch_bounds__(256) +EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, + const OutputMapper output, + const Index m_size, const Index n_size, const Index k_size) { + __shared__ float2 lhs_shmem[32][16]; + __shared__ float2 rhs_shmem[64][8]; + + const Index m_block_idx = blockIdx.x; + const Index n_block_idx = blockIdx.y; + + const Index base_m = 64 * m_block_idx; + const Index base_n = 64 * n_block_idx; + + if (base_m + 63 < m_size) { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } else { + if (base_n + 63 < n_size) { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } else { + EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); + } + } +} + + +template +struct TensorEvaluator, GpuDevice> : + public TensorContractionEvaluatorBase, GpuDevice> > { + + typedef GpuDevice Device; + + typedef TensorEvaluator, Device> Self; + typedef TensorContractionEvaluatorBase Base; + + typedef TensorContractionOp XprType; + typedef typename internal::remove_const::type Scalar; + typedef typename XprType::Index Index; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + + enum { + Layout = TensorEvaluator::Layout, + }; + + // Most of the code is assuming that both input tensors are ColMajor. If the + // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: + // If we want to compute A * B = C, where A is LHS and B is RHS, the code + // will pretend B is LHS and A is RHS. + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; + typedef typename internal::conditional< + static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; + + static const int LDims = + internal::array_size::Dimensions>::value; + static const int RDims = + internal::array_size::Dimensions>::value; + static const int ContractDims = internal::array_size::value; + + typedef array left_dim_mapper_t; + typedef array right_dim_mapper_t; + + typedef array contract_t; + typedef array left_nocontract_t; + typedef array right_nocontract_t; + + static const int NumDims = LDims + RDims - 2 * ContractDims; + + typedef DSizes Dimensions; + + // typedefs needed in evalTo + typedef typename internal::remove_const::type LhsScalar; + typedef typename internal::remove_const::type RhsScalar; + + typedef TensorEvaluator LeftEvaluator; + typedef TensorEvaluator RightEvaluator; + + typedef typename LeftEvaluator::Dimensions LeftDimensions; + typedef typename RightEvaluator::Dimensions RightDimensions; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + Base(op, device) {} + + // We need to redefine this method to make nvcc happy + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + this->m_leftImpl.evalSubExprsIfNeeded(NULL); + this->m_rightImpl.evalSubExprsIfNeeded(NULL); + if (data) { + evalTo(data); + return false; + } else { + this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); + evalTo(this->m_result); + return true; + } + } + + void evalTo(Scalar* buffer) const { + if (this->m_lhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + else { + if (this->m_rhs_inner_dim_contiguous) { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + else { + if (this->m_rhs_inner_dim_reordered) { + evalTyped(buffer); + } + else { + evalTyped(buffer); + } + } + } + } + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 8, 8); + LAUNCH_CUDA_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + }; + + template struct LaunchKernels { + static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { + if (m < 768 || n < 768) { + const Index m_blocks = (m + 63) / 64; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(16, 16, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } else { + const Index m_blocks = (m + 127) / 128; + const Index n_blocks = (n + 63) / 64; + const dim3 num_blocks(m_blocks, n_blocks, 1); + const dim3 block_size(8, 32, 1); + LAUNCH_CUDA_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); + } + } + }; + + template + void evalTyped(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + EIGEN_UNUSED_VARIABLE(k) + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + setCudaSharedMemConfig(cudaSharedMemBankSizeEightByte); + LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); + } +}; + +} // end namespace Eigen + +#endif // EIGEN_USE_GPU and __CUDACC__ +#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_CUDA_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h deleted file mode 100644 index bb990b378..000000000 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ /dev/null @@ -1,1413 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014-2015 Benoit Steiner -// Copyright (C) 2015 Navdeep Jaitly -// Copyright (C) 2014 Eric Martin -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H - -#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) - -namespace Eigen { - -template -__device__ EIGEN_STRONG_INLINE void -EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, Scalar* lhs_shmem, Scalar* rhs_shmem, - const Index m_size, const Index n_size, const Index k_size) { - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - // declare and initialize 64 registers for output 8x8 block - - // prefetch registers - Scalar lhs_pf0; - Scalar lhs_pf1; - Scalar lhs_pf2; - Scalar lhs_pf3; - Scalar lhs_pf4; - Scalar lhs_pf5; - Scalar lhs_pf6; - Scalar lhs_pf7; - - Scalar rhs_pf0; - Scalar rhs_pf1; - Scalar rhs_pf2; - Scalar rhs_pf3; - Scalar rhs_pf4; - Scalar rhs_pf5; - Scalar rhs_pf6; - Scalar rhs_pf7; - - // shared memory is formatted - // (contract idx in block, nocontract idx in block, block idx) - // where block idx is column major. This transposition limits the number of - // bank conflicts when reading the LHS. The core idea is that since the contracting - // index is shared by both sides, then the contracting index should be in threadIdx.x. - - // On the LHS, we pad each row inside of each block with an extra element. This makes - // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts - // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks. - - // On the RHS we just add 8 padding elements to the end of each block. This gives no bank - // conflicts on writes and also none on reads. - - // storage indices - const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z; - const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x; - - const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0; - const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1; - const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2; - const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3; - const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4; - const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5; - const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6; - const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7; - - const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0; - const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1; - const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2; - const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3; - const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4; - const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5; - const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6; - const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7; - - // in the loading code, the following variables are important: - // threadIdx.x: the vertical position in an 8x8 block - // threadIdx.y: the vertical index of the 8x8 block in the grid - // threadIdx.z: the horizontal position in an 8x8 block - // k: the horizontal index of the 8x8 block in the grid - // - // The k parameter is implicit (it was the loop counter for a loop that went - // from 0 to <8, but now that loop is unrolled in the below code. - - const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y; - const Index lhs_vert = base_m + load_idx_vert; - -#define prefetchIntoRegisters(base_k) \ - { \ - lhs_pf0 = conv(0); \ - lhs_pf1 = conv(0); \ - lhs_pf2 = conv(0); \ - lhs_pf3 = conv(0); \ - lhs_pf4 = conv(0); \ - lhs_pf5 = conv(0); \ - lhs_pf6 = conv(0); \ - lhs_pf7 = conv(0); \ - \ - rhs_pf0 = conv(0); \ - rhs_pf1 = conv(0); \ - rhs_pf2 = conv(0); \ - rhs_pf3 = conv(0); \ - rhs_pf4 = conv(0); \ - rhs_pf5 = conv(0); \ - rhs_pf6 = conv(0); \ - rhs_pf7 = conv(0); \ - \ - if (!needs_edge_check || lhs_vert < m_size) { \ - const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \ - const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \ - const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \ - const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \ - const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \ - const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \ - const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \ - const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \ - \ - if (!needs_edge_check || lhs_horiz_7 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - lhs_pf7 = lhs(lhs_vert, lhs_horiz_7); \ - } else if (lhs_horiz_6 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - lhs_pf6 = lhs(lhs_vert, lhs_horiz_6); \ - } else if (lhs_horiz_5 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - lhs_pf5 = lhs(lhs_vert, lhs_horiz_5); \ - } else if (lhs_horiz_4 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - lhs_pf4 = lhs(lhs_vert, lhs_horiz_4); \ - } else if (lhs_horiz_3 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - lhs_pf3 = lhs(lhs_vert, lhs_horiz_3); \ - } else if (lhs_horiz_2 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - lhs_pf2 = lhs(lhs_vert, lhs_horiz_2); \ - } else if (lhs_horiz_1 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - lhs_pf1 = lhs(lhs_vert, lhs_horiz_1); \ - } else if (lhs_horiz_0 < k_size) { \ - lhs_pf0 = lhs(lhs_vert, lhs_horiz_0); \ - } \ - } \ - \ - const Index rhs_vert = base_k + load_idx_vert; \ - if (!needs_edge_check || rhs_vert < k_size) { \ - const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \ - const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \ - const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \ - const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \ - const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \ - const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \ - const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \ - const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \ - \ - if (rhs_horiz_7 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - rhs_pf7 = rhs(rhs_vert, rhs_horiz_7); \ - } else if (rhs_horiz_6 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - rhs_pf6 = rhs(rhs_vert, rhs_horiz_6); \ - } else if (rhs_horiz_5 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - rhs_pf5 = rhs(rhs_vert, rhs_horiz_5); \ - } else if (rhs_horiz_4 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - rhs_pf4 = rhs(rhs_vert, rhs_horiz_4); \ - } else if (rhs_horiz_3 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - rhs_pf3 = rhs(rhs_vert, rhs_horiz_3); \ - } else if (rhs_horiz_2 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - rhs_pf2 = rhs(rhs_vert, rhs_horiz_2); \ - } else if (rhs_horiz_1 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - rhs_pf1 = rhs(rhs_vert, rhs_horiz_1); \ - } else if (rhs_horiz_0 < n_size) { \ - rhs_pf0 = rhs(rhs_vert, rhs_horiz_0); \ - } \ - } \ - } \ - -#define writeRegToShmem(_) \ - lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ - rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ - \ - lhs_shmem[lhs_store_idx_1] = lhs_pf1; \ - rhs_shmem[rhs_store_idx_1] = rhs_pf1; \ - \ - lhs_shmem[lhs_store_idx_2] = lhs_pf2; \ - rhs_shmem[rhs_store_idx_2] = rhs_pf2; \ - \ - lhs_shmem[lhs_store_idx_3] = lhs_pf3; \ - rhs_shmem[rhs_store_idx_3] = rhs_pf3; \ - \ - lhs_shmem[lhs_store_idx_4] = lhs_pf4; \ - rhs_shmem[rhs_store_idx_4] = rhs_pf4; \ - \ - lhs_shmem[lhs_store_idx_5] = lhs_pf5; \ - rhs_shmem[rhs_store_idx_5] = rhs_pf5; \ - \ - lhs_shmem[lhs_store_idx_6] = lhs_pf6; \ - rhs_shmem[rhs_store_idx_6] = rhs_pf6; \ - \ - lhs_shmem[lhs_store_idx_7] = lhs_pf7; \ - rhs_shmem[rhs_store_idx_7] = rhs_pf7; \ - - // declare and initialize result array -#define res(i, j) _res_##i##j -#define initResultRow(i) \ - Scalar res(i, 0) = conv(0); \ - Scalar res(i, 1) = conv(0); \ - Scalar res(i, 2) = conv(0); \ - Scalar res(i, 3) = conv(0); \ - Scalar res(i, 4) = conv(0); \ - Scalar res(i, 5) = conv(0); \ - Scalar res(i, 6) = conv(0); \ - Scalar res(i, 7) = conv(0); \ - - internal::scalar_cast_op conv; - initResultRow(0); - initResultRow(1); - initResultRow(2); - initResultRow(3); - initResultRow(4); - initResultRow(5); - initResultRow(6); - initResultRow(7); -#undef initResultRow - - for (Index base_k = 0; base_k < k_size; base_k += 64) { - // wait for previous iteration to finish with shmem. Despite common sense, - // the code is a bit faster with this here then at bottom of loop - __syncthreads(); - - prefetchIntoRegisters(base_k); - writeRegToShmem(); - - #undef prefetchIntoRegisters - #undef writeRegToShmem - - // wait for shared mem packing to be done before starting computation - __syncthreads(); - - // compute 8x8 matrix product by outer product. This involves packing one column - // of LHS and one row of RHS into registers (takes 16 registers). - -#define lcol(i) _lcol##i - Scalar lcol(0); - Scalar lcol(1); - Scalar lcol(2); - Scalar lcol(3); - Scalar lcol(4); - Scalar lcol(5); - Scalar lcol(6); - Scalar lcol(7); - -#define rrow(j) _rrow##j - Scalar rrow(0); - Scalar rrow(1); - Scalar rrow(2); - Scalar rrow(3); - Scalar rrow(4); - Scalar rrow(5); - Scalar rrow(6); - Scalar rrow(7); - - // Now x corresponds to k, y to m, and z to n - const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y]; - const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z]; - -#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))] -#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))] - -#define loadData(i, j) \ - lcol(0) = lhs_element(0, j); \ - rrow(0) = rhs_element(i, 0); \ - lcol(1) = lhs_element(1, j); \ - rrow(1) = rhs_element(i, 1); \ - lcol(2) = lhs_element(2, j); \ - rrow(2) = rhs_element(i, 2); \ - lcol(3) = lhs_element(3, j); \ - rrow(3) = rhs_element(i, 3); \ - lcol(4) = lhs_element(4, j); \ - rrow(4) = rhs_element(i, 4); \ - lcol(5) = lhs_element(5, j); \ - rrow(5) = rhs_element(i, 5); \ - lcol(6) = lhs_element(6, j); \ - rrow(6) = rhs_element(i, 6); \ - lcol(7) = lhs_element(7, j); \ - rrow(7) = rhs_element(i, 7); \ - -#define computeCol(j) \ - res(0, j) += lcol(0) * rrow(j); \ - res(1, j) += lcol(1) * rrow(j); \ - res(2, j) += lcol(2) * rrow(j); \ - res(3, j) += lcol(3) * rrow(j); \ - res(4, j) += lcol(4) * rrow(j); \ - res(5, j) += lcol(5) * rrow(j); \ - res(6, j) += lcol(6) * rrow(j); \ - res(7, j) += lcol(7) * rrow(j); \ - -#define computePass(i) \ - loadData(i, i); \ - \ - computeCol(0); \ - computeCol(1); \ - computeCol(2); \ - computeCol(3); \ - computeCol(4); \ - computeCol(5); \ - computeCol(6); \ - computeCol(7); \ - - computePass(0); - computePass(1); - computePass(2); - computePass(3); - computePass(4); - computePass(5); - computePass(6); - computePass(7); - -#undef lcol -#undef rrow -#undef lhs_element -#undef rhs_element -#undef loadData -#undef computeCol -#undef computePass - } // end loop over k - - // we've now iterated over all of the large (ie width 64) k blocks and - // accumulated results in registers. At this point thread (x, y, z) contains - // the sum across all big k blocks of the product of little k block of index (x, y) - // with block of index (y, z). To compute the final output, we need to reduce - // the 8 threads over y by summation. -#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) -#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask) -#else -#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask) -#endif - -#define reduceRow(i, mask) \ - shuffleInc(i, 0, mask); \ - shuffleInc(i, 1, mask); \ - shuffleInc(i, 2, mask); \ - shuffleInc(i, 3, mask); \ - shuffleInc(i, 4, mask); \ - shuffleInc(i, 5, mask); \ - shuffleInc(i, 6, mask); \ - shuffleInc(i, 7, mask); \ - -#define reduceMatrix(mask) \ - reduceRow(0, mask); \ - reduceRow(1, mask); \ - reduceRow(2, mask); \ - reduceRow(3, mask); \ - reduceRow(4, mask); \ - reduceRow(5, mask); \ - reduceRow(6, mask); \ - reduceRow(7, mask); \ - - // actually perform the reduction, now each thread of index (_, y, z) - // contains the correct values in its registers that belong in the output - // block - reduceMatrix(1); - reduceMatrix(2); - reduceMatrix(4); - -#undef shuffleInc -#undef reduceRow -#undef reduceMatrix - - // now we need to copy the 64 values into main memory. We can't split work - // among threads because all variables are in registers. There's 2 ways - // to do this: - // (1) have 1 thread do 64 writes from registers into global memory - // (2) have 1 thread do 64 writes into shared memory, and then 8 threads - // each do 8 writes into global memory. We can just overwrite the shared - // memory from the problem we just solved. - // (2) is slightly faster than (1) due to less branching and more ILP - - // TODO: won't yield much gain, but could just use currently unused shared mem - // and then we won't have to sync - // wait for shared mem to be out of use - __syncthreads(); - -#define writeResultShmem(i, j) \ - lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j); \ - -#define writeRow(i) \ - writeResultShmem(i, 0); \ - writeResultShmem(i, 1); \ - writeResultShmem(i, 2); \ - writeResultShmem(i, 3); \ - writeResultShmem(i, 4); \ - writeResultShmem(i, 5); \ - writeResultShmem(i, 6); \ - writeResultShmem(i, 7); \ - - if (threadIdx.x == 0) { - writeRow(0); - writeRow(1); - writeRow(2); - writeRow(3); - writeRow(4); - writeRow(5); - writeRow(6); - writeRow(7); - } -#undef writeResultShmem -#undef writeRow - - const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); - const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); - - if (threadIdx.x < max_i_write) { - if (max_j_write == 8) { - // TODO: can i trade bank conflicts for coalesced writes? - Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0]; - Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1]; - Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2]; - Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3]; - Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4]; - Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5]; - Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6]; - Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7]; - - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7; - } else { -#pragma unroll 7 - for (int j = 0; j < max_j_write; j++) { - Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j]; - output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val; - } - } - } -#undef res -} - - -template -__global__ void -#if defined(EIGEN_HIPCC) -__launch_bounds__(512, 1) -#else -__launch_bounds__(512) -#endif -EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ Scalar lhs_shmem[72 * 64]; - __shared__ Scalar rhs_shmem[72 * 64]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size && base_n + 63 < n_size) { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } else { - EigenContractionKernelInternal(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size); - } -} - - -template -__device__ __forceinline__ void -EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][16], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - - // prefetch registers - float4 lhs_pf0, rhs_pf0; - - float4 results[4]; - for (int i=0; i < 4; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - -#define prefetch_lhs(reg, row, col) \ - if (!CHECK_LHS_BOUNDARY) { \ - if (col < k_size) { \ - reg =lhs.template loadPacket(row, col); \ - } \ - } else { \ - if (col < k_size) { \ - if (row + 3 < m_size) { \ - reg =lhs.template loadPacket(row, col); \ - } else if (row + 2 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - reg.z =lhs(row + 2, col); \ - } else if (row + 1 < m_size) { \ - reg.x =lhs(row + 0, col); \ - reg.y =lhs(row + 1, col); \ - } else if (row < m_size) { \ - reg.x =lhs(row + 0, col); \ - } \ - } \ - } \ - - Index lhs_vert = base_m+threadIdx.x*4; - - for (Index k = 0; k < k_size; k += 16) { - - lhs_pf0 = internal::pset1(0); - rhs_pf0 = internal::pset1(0); - - Index lhs_horiz = threadIdx.y+k; - prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz) - - Index rhs_vert = k+(threadIdx.x%4)*4; - Index rhs_horiz0 = (threadIdx.x>>2)+threadIdx.y*4+base_n; - - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } else { - if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - float x1, x2 ; - // the following can be a bitwise operation..... some day. - if((threadIdx.x%8) < 4) { - x1 = rhs_pf0.y; - x2 = rhs_pf0.w; - } else { - x1 = rhs_pf0.x; - x2 = rhs_pf0.z; - } - #if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000) - x1 = __shfl_xor(x1, 4); - x2 = __shfl_xor(x2, 4); - #else - x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4); - x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4); - #endif - if((threadIdx.x%8) < 4) { - rhs_pf0.y = x1; - rhs_pf0.w = x2; - } else { - rhs_pf0.x = x1; - rhs_pf0.z = x2; - } - - // We have 64 features. - // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1. - // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3. - // ... - // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63 - // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1 - // ... - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2][threadIdx.x%8] = make_float2(rhs_pf0.x, rhs_pf0.y); - rhs_shmem2[(threadIdx.x>>3)+ threadIdx.y*2+32][threadIdx.x%8] = make_float2(rhs_pf0.z, rhs_pf0.w); - - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // ... - // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) - // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) - // ... - - lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y+16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w); - - -#define add_vals(fl1, fl2, fr1, fr2)\ - results[0].x += fl1.x * fr1.x;\ - results[0].y += fl1.y * fr1.x;\ - results[0].z += fl2.x * fr1.x;\ - results[0].w += fl2.y * fr1.x;\ -\ - results[1].x += fl1.x * fr1.y;\ - results[1].y += fl1.y * fr1.y;\ - results[1].z += fl2.x * fr1.y;\ - results[1].w += fl2.y * fr1.y;\ -\ - results[2].x += fl1.x * fr2.x;\ - results[2].y += fl1.y * fr2.x;\ - results[2].z += fl2.x * fr2.x;\ - results[2].w += fl2.y * fr2.x;\ -\ - results[3].x += fl1.x * fr2.y;\ - results[3].y += fl1.y * fr2.y;\ - results[3].z += fl2.x * fr2.y;\ - results[3].w += fl2.y * fr2.y;\ - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 16; koff ++) { - // 32 x threads. - float2 fl1 = lhs_shmem2[koff][threadIdx.x]; - float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x]; - - int start_feature = threadIdx.y * 4; - float2 fr1 = rhs_shmem2[(start_feature>>1) + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - float2 fr2 = rhs_shmem2[(start_feature>>1) + 1 + 32*((koff%4)/2)][koff/4 + (koff%2)*4]; - - add_vals(fl1, fl2, fr1, fr2) - } - __syncthreads(); - } - -#undef prefetch_lhs -#undef add_vals - - Index horiz_base = threadIdx.y*4+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - // CHECK LHS - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 4; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK RHS - /* - int ncols_rem = fminf(n_size- horiz_base, 4); - for (int i = 0; i < ncols_rem; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - }*/ - for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. - for (int i = 0; i < 4; i++) { - if (horiz_base+i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__device__ __forceinline__ void -EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, float2 lhs_shmem2[][32], - float2 rhs_shmem2[][8], const Index m_size, - const Index n_size, const Index k_size, - const Index base_m, const Index base_n) { - - // prefetch registers - float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3; - float4 rhs_pf0, rhs_pf1; - - float4 results[8]; - for (int i=0; i < 8; i++) { - results[i].x = results[i].y = results[i].z = results[i].w = 0; - } - - Index lhs_vert = base_m+threadIdx.x*4+(threadIdx.y%4)*32; - for (Index k = 0; k < k_size; k += 32) { - lhs_pf0 = internal::pset1(0); - lhs_pf1 = internal::pset1(0); - lhs_pf2 = internal::pset1(0); - lhs_pf3 = internal::pset1(0); - - rhs_pf0 = internal::pset1(0); - rhs_pf1 = internal::pset1(0); - - if (!CHECK_LHS_BOUNDARY) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - } - } else { - // just CHECK_LHS_BOUNDARY - if (lhs_vert + 3 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - lhs_pf3 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - lhs_pf2 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - lhs_pf1 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0 =lhs.template loadPacket(lhs_vert, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 2 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - lhs_pf3.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf2.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf1.z =lhs(lhs_vert + 2, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf0.z =lhs(lhs_vert + 2, (threadIdx.y/4+k)); - } - } else if (lhs_vert + 1 < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - lhs_pf3.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf2.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf1.y =lhs(lhs_vert + 1, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf0.y =lhs(lhs_vert + 1, (threadIdx.y/4+k)); - } - } else if (lhs_vert < m_size) { - if ((threadIdx.y/4+k+24) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - lhs_pf3.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+24)); - } else if ((threadIdx.y/4+k+16) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - lhs_pf2.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+16)); - } else if ((threadIdx.y/4+k+8) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - lhs_pf1.x =lhs(lhs_vert + 0, (threadIdx.y/4+k+8)); - } else if ((threadIdx.y/4+k) < k_size) { - lhs_pf0.x =lhs(lhs_vert + 0, (threadIdx.y/4+k)); - } - } - } - __syncthreads(); - Index rhs_vert = k+threadIdx.x*4; - Index rhs_horiz0 = threadIdx.y*2+base_n; - Index rhs_horiz1 = threadIdx.y*2+1+base_n; - if (!CHECK_RHS_BOUNDARY) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (rhs_vert + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else { - if (rhs_horiz1 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - rhs_pf1 = rhs.template loadPacket(rhs_vert, rhs_horiz1); - } else if (rhs_vert + 2 < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1); - } else if (k+threadIdx.x*4 + 1 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1); - } else if (k+threadIdx.x*4 < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf1.x = rhs(rhs_vert, rhs_horiz1); - } - } else if (rhs_horiz0 < n_size) { - if ((rhs_vert + 3) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0 = rhs.template loadPacket(rhs_vert, rhs_horiz0); - } else if ((rhs_vert + 2) < k_size) { - // just CHECK_RHS_BOUNDARY - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0); - } else if ((rhs_vert + 1) < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0); - } else if (rhs_vert < k_size) { - rhs_pf0.x = rhs(rhs_vert, rhs_horiz0); - } - } - } - __syncthreads(); - // Loaded. Do computation - // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1. - // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3. - // .. - // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63 - rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x); - // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1. - // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3. - // .. - rhs_shmem2[threadIdx.y+32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y); - // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1. - // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3. - rhs_shmem2[threadIdx.y+64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z); - // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1. - // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3. - rhs_shmem2[threadIdx.y+96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w); - - // LHS. - // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), .. (60, 61) .. (124, 125) - // ... - // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), .. (62, 63) .. (126, 127) - - -#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4)\ - results[0].x += a_feat1.x * f1.x;\ - results[1].x += a_feat1.x * f1.y;\ - results[2].x += a_feat1.x * f2.x;\ - results[3].x += a_feat1.x * f2.y;\ - results[4].x += a_feat1.x * f3.x;\ - results[5].x += a_feat1.x * f3.y;\ - results[6].x += a_feat1.x * f4.x;\ - results[7].x += a_feat1.x * f4.y;\ -\ - results[0].y += a_feat1.y * f1.x;\ - results[1].y += a_feat1.y * f1.y;\ - results[2].y += a_feat1.y * f2.x;\ - results[3].y += a_feat1.y * f2.y;\ - results[4].y += a_feat1.y * f3.x;\ - results[5].y += a_feat1.y * f3.y;\ - results[6].y += a_feat1.y * f4.x;\ - results[7].y += a_feat1.y * f4.y;\ -\ - results[0].z += a_feat2.x * f1.x;\ - results[1].z += a_feat2.x * f1.y;\ - results[2].z += a_feat2.x * f2.x;\ - results[3].z += a_feat2.x * f2.y;\ - results[4].z += a_feat2.x * f3.x;\ - results[5].z += a_feat2.x * f3.y;\ - results[6].z += a_feat2.x * f4.x;\ - results[7].z += a_feat2.x * f4.y;\ -\ - results[0].w += a_feat2.y * f1.x;\ - results[1].w += a_feat2.y * f1.y;\ - results[2].w += a_feat2.y * f2.x;\ - results[3].w += a_feat2.y * f2.y;\ - results[4].w += a_feat2.y * f3.x;\ - results[5].w += a_feat2.y * f3.y;\ - results[6].w += a_feat2.y * f4.x;\ - results[7].w += a_feat2.y * f4.y;\ - - lhs_shmem2[threadIdx.y/4][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.x, lhs_pf0.y); - lhs_shmem2[threadIdx.y/4+8][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.x, lhs_pf1.y); - lhs_shmem2[threadIdx.y/4+16][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.x, lhs_pf2.y); - lhs_shmem2[threadIdx.y/4+24][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.x, lhs_pf3.y); - - lhs_shmem2[threadIdx.y/4 + 32][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf0.z, lhs_pf0.w); - lhs_shmem2[threadIdx.y/4 + 40][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf1.z, lhs_pf1.w); - lhs_shmem2[threadIdx.y/4 + 48][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf2.z, lhs_pf2.w); - lhs_shmem2[threadIdx.y/4 + 56][threadIdx.x+(threadIdx.y%4)*8] = make_float2(lhs_pf3.z, lhs_pf3.w); - - __syncthreads(); - - // Do the multiplies. - #pragma unroll - for (int koff = 0; koff < 32; koff ++) { - float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8]; - float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8]; - - // first feature is at (threadIdx.y/4) * 8 last is at start + 8. - int start_feature = (threadIdx.y / 4) * 8; - - float2 br1 = rhs_shmem2[start_feature/2 + (koff % 4) * 32][koff/4]; - float2 br2 = rhs_shmem2[start_feature/2 + 1 + (koff % 4) * 32][koff/4]; - float2 br3 = rhs_shmem2[start_feature/2 + 2 + (koff % 4) * 32][koff/4]; - float2 br4 = rhs_shmem2[start_feature/2 + 3 + (koff % 4) * 32][koff/4]; - - add_vals(a3, a4, br1, br2, br3, br4) - } - __syncthreads(); - } // end loop over k - - __syncthreads(); - Index horiz_base = (threadIdx.y/4)*8+base_n; - if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (!CHECK_RHS_BOUNDARY) { - if (lhs_vert + 3 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } else if (lhs_vert + 2 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - } - } else if (lhs_vert + 1 < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - } - } else if (lhs_vert < m_size) { - for (int i = 0; i < 8; i++) { - output(lhs_vert, horiz_base + i) = results[i].x; - } - } - } else if (!CHECK_LHS_BOUNDARY) { - // CHECK BOUNDARY_B - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - output(lhs_vert, horiz_base + i) = results[i].x; - output(lhs_vert + 1, horiz_base + i) = results[i].y; - output(lhs_vert + 2, horiz_base + i) = results[i].z; - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } else { - // CHECK both boundaries. - for (int i = 0; i < 8; i++) { - if (horiz_base + i < n_size) { - if (lhs_vert < m_size) - output(lhs_vert, horiz_base + i) = results[i].x; - if (lhs_vert + 1 < m_size) - output(lhs_vert + 1, horiz_base + i) = results[i].y; - if (lhs_vert + 2 < m_size) - output(lhs_vert + 2, horiz_base + i) = results[i].z; - if (lhs_vert + 3 < m_size) - output(lhs_vert + 3, horiz_base + i) = results[i].w; - } - } - } -} - - -template -__global__ void -#if defined(EIGEN_HIPCC) -__launch_bounds__(256, 1) -#else -__launch_bounds__(256) -#endif -EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[64*32]; - __shared__ float2 rhs_shmem[128*8]; - - typedef float2 LHS_MEM[64][32]; - typedef float2 RHS_MEM[128][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 128 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - bool check_rhs = (base_n + 63) >= n_size; - bool check_lhs128 = (base_m + 127) >= m_size; - - if (!check_rhs) { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } else { - if (!check_lhs128) { - // >= 128 rows left - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal( - lhs, rhs, output, *((LHS_MEM *) lhs_shmem), *((RHS_MEM *) rhs_shmem), m_size, n_size, k_size, base_m, base_n); - } - } -} - -template -__global__ void -#if defined(EIGEN_HIPCC) -__launch_bounds__(256, 1) -#else -__launch_bounds__(256) -#endif -EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, - const OutputMapper output, - const Index m_size, const Index n_size, const Index k_size) { - __shared__ float2 lhs_shmem[32][16]; - __shared__ float2 rhs_shmem[64][8]; - - const Index m_block_idx = blockIdx.x; - const Index n_block_idx = blockIdx.y; - - const Index base_m = 64 * m_block_idx; - const Index base_n = 64 * n_block_idx; - - if (base_m + 63 < m_size) { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } else { - if (base_n + 63 < n_size) { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } else { - EigenFloatContractionKernelInternal16x16(lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n); - } - } -} - - -template -struct TensorEvaluator, GpuDevice> : - public TensorContractionEvaluatorBase, GpuDevice> > { - - typedef GpuDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index Index; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - - enum { - Layout = TensorEvaluator::Layout, - }; - - // Most of the code is assuming that both input tensors are ColMajor. If the - // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS: - // If we want to compute A * B = C, where A is LHS and B is RHS, the code - // will pretend B is LHS and A is RHS. - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), LeftArgType, RightArgType>::type EvalLeftArgType; - typedef typename internal::conditional< - static_cast(Layout) == static_cast(ColMajor), RightArgType, LeftArgType>::type EvalRightArgType; - - static const int LDims = - internal::array_size::Dimensions>::value; - static const int RDims = - internal::array_size::Dimensions>::value; - static const int ContractDims = internal::array_size::value; - - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes Dimensions; - - // typedefs needed in evalTo - typedef typename internal::remove_const::type LhsScalar; - typedef typename internal::remove_const::type RhsScalar; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : - Base(op, device) - { - EIGEN_STATIC_ASSERT( (internal::is_same::value), - GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS); - } - - // We need to redefine this method to make nvcc happy - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (data) { - evalTo(data); - return false; - } else { - this->m_result = static_cast(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar))); - evalTo(this->m_result); - return true; - } - } - - void evalTo(Scalar* buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } - else { - evalTyped(buffer); - } - } - } - } - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 8, 8); - LAUNCH_GPU_KERNEL((EigenContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - }; - - template struct LaunchKernels { - static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k, const GpuDevice& device) { - if (m < 768 || n < 768) { - const Index m_blocks = (m + 63) / 64; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(16, 16, 1); - LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } else { - const Index m_blocks = (m + 127) / 128; - const Index n_blocks = (n + 63) / 64; - const dim3 num_blocks(m_blocks, n_blocks, 1); - const dim3 block_size(8, 32, 1); - LAUNCH_GPU_KERNEL((EigenFloatContractionKernel), num_blocks, block_size, 0, device, lhs, rhs, output, m, n, k); - } - } - }; - - template - void evalTyped(Scalar* buffer) const { - // columns in left side, rows in right side - const Index k = this->m_k_size; - EIGEN_UNUSED_VARIABLE(k) - - // rows in left side - const Index m = this->m_i_size; - - // columns in right side - const Index n = this->m_j_size; - - // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) - this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); - - typedef internal::TensorContractionInputMapper LhsMapper; - - typedef internal::TensorContractionInputMapper RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - - OutputMapper output(buffer, m); - -#if defined(EIGEN_USE_HIP) - setGpuSharedMemConfig(hipSharedMemBankSizeEightByte); -#else - setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte); -#endif - - LaunchKernels::Run(lhs, rhs, output, m, n, k, this->m_device); - } -}; - -} // end namespace Eigen - -#endif // EIGEN_USE_GPU and EIGEN_GPUCC -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 9ab900b4a..c28a10dd4 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -22,19 +22,8 @@ enum { /* * Implementation of the Eigen blas_data_mapper class for tensors. */ -/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the default make pointer is used which -/// is scalar * for CoeffLoader. -template class MakePointer_ = MakePointer> -struct CoeffLoader; -template class MakePointer_ = MakePointer> -class BaseTensorContractionMapper; - -template class MakePointer_> -struct CoeffLoader { +template struct CoeffLoader { enum { DirectOffsets = false }; @@ -45,12 +34,6 @@ struct CoeffLoader { eigen_assert(false && "unsupported"); } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type - data() const { - eigen_assert(false && "unsupported"); - return NULL; - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return m_tensor.coeff(index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -59,19 +42,12 @@ struct CoeffLoader { return m_tensor.template packet(index); } - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_tensor.bind(cgh); - } - #endif private: const Tensor m_tensor; }; -template class MakePointer_> -struct CoeffLoader { +template struct CoeffLoader { enum { DirectOffsets = true }; @@ -82,11 +58,6 @@ struct CoeffLoader { m_data += offset; } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_::Type - data() const { - return m_data; - } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const { return loadConstant(m_data+index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -94,23 +65,15 @@ struct CoeffLoader { { return internal::ploadt_ro(m_data + index); } - - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_data.bind(cgh); - } - #endif private: typedef typename Tensor::Scalar Scalar; - - typename MakePointer_::Type m_data; + const Scalar* m_data; }; template class MakePointer_ = MakePointer> + int packet_size, bool inner_dim_contiguous, int Alignment> class SimpleTensorContractionMapper { public: EIGEN_DEVICE_FUNC @@ -126,7 +89,7 @@ class SimpleTensorContractionMapper { m_k_strides(k_strides) { } enum { - DirectOffsets = CoeffLoader::DirectOffsets + DirectOffsets = CoeffLoader::DirectOffsets }; EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { @@ -153,7 +116,6 @@ class SimpleTensorContractionMapper { EIGEN_UNUSED_VARIABLE(left); // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963 Index nocontract_val = left ? row : col; Index linidx = 0; - EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx = nocontract_val / m_ij_strides[i]; linidx += idx * m_nocontract_strides[i]; @@ -170,7 +132,6 @@ class SimpleTensorContractionMapper { Index contract_val = left ? col : row; if(array_size::value > 0) { - EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx = contract_val / m_k_strides[i]; linidx += idx * m_contract_strides[i]; @@ -195,7 +156,6 @@ class SimpleTensorContractionMapper { Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; Index linidx[2] = {0, 0}; if (array_size::value > array_size::value) { - EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx0 = nocontract_val[0] / m_ij_strides[i]; const Index idx1 = nocontract_val[1] / m_ij_strides[i]; @@ -216,7 +176,6 @@ class SimpleTensorContractionMapper { Index contract_val[2] = {left ? col : row, left ? col : row + distance}; if (array_size::value> 0) { - EIGEN_UNROLL_LOOP for (int i = static_cast(array_size::value) - 1; i > 0; i--) { const Index idx0 = contract_val[0] / m_k_strides[i]; const Index idx1 = contract_val[1] / m_k_strides[i]; @@ -248,41 +207,24 @@ class SimpleTensorContractionMapper { return ((side == Lhs) && inner_dim_contiguous && array_size::value > 0) ? m_contract_strides[0] : 1; } - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_tensor.bind(cgh); - } - #endif - - const CoeffLoader& tensor() const { - return m_tensor; - } - - const nocontract_t& nocontract_strides() const { - return m_nocontract_strides; - } - const nocontract_t& ij_strides() const { return m_ij_strides; } - const contract_t& contract_strides() const { return m_contract_strides; } - const contract_t& k_strides() const { return m_k_strides; } - protected: - CoeffLoader m_tensor; + CoeffLoader m_tensor; const nocontract_t m_nocontract_strides; const nocontract_t m_ij_strides; const contract_t m_contract_strides; const contract_t m_k_strides; }; + template class MakePointer_> -class BaseTensorContractionMapper : public SimpleTensorContractionMapper + bool inner_dim_reordered, int Alignment> +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -292,11 +234,12 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if::size==packet_size,PacketT>::type - load(Index i, Index j) const - { + typedef typename Tensor::PacketReturnType Packet; + typedef typename unpacket_traits::half HalfPacket; + + template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { // whole method makes column major assumption // don't need to add offsets for now (because operator handles that) @@ -311,7 +254,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper indexPair = this->computeIndexPair(i, j, packet_size - 1); const Index first = indexPair.first; - const Index lastIdx = indexPair.second; + const Index last = indexPair.second; // We can always do optimized packet reads from left hand side right now, because // the vertical matrix dimension on the left hand side is never contracting. @@ -319,7 +262,7 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapper::value <= 1 || !inner_dim_reordered) && - (lastIdx - first) == (packet_size - 1)) { + (last - first) == (packet_size - 1)) { return this->m_tensor.template packet(first); } @@ -327,44 +270,31 @@ class BaseTensorContractionMapper : public SimpleTensorContractionMapperm_tensor.coeff(first); - EIGEN_UNROLL_LOOP for (Index k = 1; k < packet_size - 1; k += 2) { const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); data[k] = this->m_tensor.coeff(internal_pair.first); data[k + 1] = this->m_tensor.coeff(internal_pair.second); } - data[packet_size - 1] = this->m_tensor.coeff(lastIdx); + data[packet_size - 1] = this->m_tensor.coeff(last); - return pload(data); + return pload(data); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if::size!=packet_size,PacketT>::type - load(Index i, Index j) const - { - const Index requested_packet_size = internal::unpacket_traits::size; - EIGEN_ALIGN_MAX Scalar data[requested_packet_size]; - - const IndexPair indexPair = this->computeIndexPair(i, j, requested_packet_size - 1); - const Index first = indexPair.first; - const Index lastIdx = indexPair.second; - - data[0] = this->m_tensor.coeff(first); - for (Index k = 1; k < requested_packet_size - 1; k += 2) { - const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); - data[k] = this->m_tensor.coeff(internal_pair.first); - data[k + 1] = this->m_tensor.coeff(internal_pair.second); - } - data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx); - - return pload(data); - } - - template + template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { - return this->load(i,j); + EIGEN_STRONG_INLINE HalfPacket loadHalfPacket(Index i, Index j) const { + // whole method makes column major assumption + + // don't need to add offsets for now (because operator handles that) + const Index half_packet_size = unpacket_traits::size; + if (half_packet_size == packet_size) { + return loadPacket(i, j); + } + EIGEN_ALIGN_MAX Scalar data[half_packet_size]; + for (Index k = 0; k < half_packet_size; k++) { + data[k] = operator()(i + k, j); + } + return pload(data); } }; @@ -373,12 +303,11 @@ template class MakePointer_> -class BaseTensorContractionMapper - : public SimpleTensorContractionMapper + bool inner_dim_reordered, int Alignment> +class BaseTensorContractionMapper : public SimpleTensorContractionMapper { public: - typedef SimpleTensorContractionMapper ParentMapper; + typedef SimpleTensorContractionMapper ParentMapper; EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, @@ -388,17 +317,16 @@ class BaseTensorContractionMapper EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { + typedef typename Tensor::PacketReturnType Packet; + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadPacket(Index i, Index j) const { EIGEN_ALIGN_MAX Scalar data[1]; data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload(data); + return pload(data); } - template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { - EIGEN_ALIGN_MAX Scalar data[1]; - data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); - return pload(data); + template EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE Packet loadHalfPacket(Index i, Index j) const { + return loadPacket(i, j); } }; @@ -407,12 +335,14 @@ template class MakePointer_=MakePointer> + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionSubMapper { public: + typedef typename Tensor::PacketReturnType Packet; + typedef typename unpacket_traits::half HalfPacket; - typedef BaseTensorContractionMapper ParentMapper; - typedef TensorContractionSubMapper Self; + typedef BaseTensorContractionMapper ParentMapper; + typedef TensorContractionSubMapper Self; typedef Self LinearMapper; enum { @@ -444,32 +374,27 @@ class TensorContractionSubMapper { return m_base_mapper(i + m_vert_offset, j + m_horiz_offset); } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); + return m_base_mapper.template loadPacket(i, 0); } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + } + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + if (UseDirectOffsets) { + return m_base_mapper.template loadPacket(i, j); + } + return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HalfPacket loadHalfPacket(Index i) const { if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, j); + return m_base_mapper.template loadHalfPacket(i, 0); } - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); + return m_base_mapper.template loadHalfPacket(i + m_vert_offset, m_horiz_offset); } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const { - if (UseDirectOffsets) { - return m_base_mapper.template load(i, j); - } - return m_base_mapper.template loadPacket(i + m_vert_offset, j + m_horiz_offset); - } - - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const { + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Packet p) const { if (UseDirectOffsets) { m_base_mapper.storePacket(i, 0, p); } @@ -485,30 +410,19 @@ class TensorContractionSubMapper { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const { - EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned; if (UseDirectOffsets) { - return m_base_mapper.template loadPacket(i, 0); + return m_base_mapper.template loadPacket(i, 0); } - return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); + return m_base_mapper.template loadPacket(i + m_vert_offset, m_horiz_offset); } - template + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { return false; } - #ifdef EIGEN_USE_SYCL - // The placeholder accessors require to be bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_base_mapper.bind(cgh); - } - #endif - - const ParentMapper& base_mapper() const { return m_base_mapper; } - Index vert_offset() const { return m_vert_offset; } - Index horiz_offset() const { return m_horiz_offset; } - private: ParentMapper m_base_mapper; const Index m_vert_offset; @@ -520,14 +434,14 @@ template class MakePointer_=MakePointer> + bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment> class TensorContractionInputMapper - : public BaseTensorContractionMapper { + : public BaseTensorContractionMapper { public: typedef Scalar_ Scalar; - typedef BaseTensorContractionMapper Base; - typedef TensorContractionSubMapper SubMapper; + typedef BaseTensorContractionMapper Base; + typedef TensorContractionSubMapper SubMapper; typedef SubMapper VectorMapper; EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, @@ -545,29 +459,9 @@ class TensorContractionInputMapper EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const { return VectorMapper(*this, i, j); } - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader& get_tensor() const { - return Base::m_tensor; - } }; -template struct TensorContractionInputMapperTrait; - -template class MakePointer_> -struct TensorContractionInputMapperTrait > { - - typedef Tensor_ XprType; - static const bool inner_dim_contiguous = inner_dim_contiguous_; - static const bool inner_dim_reordered = inner_dim_reordered_; - }; - } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h deleted file mode 100644 index a6ca1777a..000000000 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ /dev/null @@ -1,1650 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// -// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not -// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -/***************************************************************** - * TensorContractionSycl.h - * - * \brief: - * TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend - * - *****************************************************************/ - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H - -namespace Eigen { - -namespace TensorSycl { -namespace internal { - -#ifndef EIGEN_SYCL_DISABLE_GEMV -/*! - * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector - * contraction kernel on various hardware devices. - * - * \tparam Scalar: determines the element type of the tensor/vector - * - * \tparam StorageIndex determines the Index type. - * - * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group - * - * \tparam CFactor: determines the number of contracting element to be process by each thread - * - * \tparam NCFactor: determines the number of non-contracting element to be process by each thread - */ -template -struct TVPanelSize { - // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0; - // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1; - // TileSizeDimNC: determines the tile size for the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = NCWindow / NCFactor; - // TileSizeDimC: determines the tile size for the contracting dimension - static EIGEN_CONSTEXPR StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC; - // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC; - // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC; - // BC : determines if supporting bank conflict is required - static EIGEN_CONSTEXPR bool BC = false; -}; -#endif - -/*! - * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor - contraction kernel on various hardware devices. - * - * \tparam Scalar: determines the element type of the tensor - * - * \tparam StorageIndex: determines the Index type. - * - * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the - available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro). - * - * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the - available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro). - * - * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered - */ - -template -struct TTPanelSize { - // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered - static EIGEN_CONSTEXPR StorageIndex TileSizeDimK = TSDK; - // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the - // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro// -#ifndef EIGEN_SYCL_REG_M - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = REG_SIZE_M; -#else - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M; -#endif -// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the -// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro -#ifndef EIGEN_SYCL_REG_N - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = REG_SIZE_N; -#else - static EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N; -#endif - // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0; - // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension - static EIGEN_CONSTEXPR StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1; - // TileSizeDimM: determines the tile size for the m dimension - static EIGEN_CONSTEXPR StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM; - // TileSizeDimN: determines the tile size for the n dimension - static EIGEN_CONSTEXPR StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN; - // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisable by packetsize - static EIGEN_CONSTEXPR StorageIndex LoadPerThreadLhs = - ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN)); - // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisable by packetsize - static EIGEN_CONSTEXPR StorageIndex LoadPerThreadRhs = - ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM)); - // BC : determines if supporting bank conflict is required - static EIGEN_CONSTEXPR bool BC = true; - // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by - // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device doesnot have sufficient local memory) - static EIGEN_CONSTEXPR bool DoubleBuffer = -#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER - false; -#else - true; -#endif -}; - -/* ! - * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to - * specialize the contraction algorithm based on device support for dedicated local memory. - */ -enum class contraction_type { local, no_local }; -/* ! - * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private). - */ -enum class data_source { global_mem, local_mem, private_mem }; - -/*! - * \brief read, a template function used for loading the data from global - memory. This function is used to guarantee coalesced and vectorized load whenever possible - * - * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode - * - * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and - vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the - contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case - when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. - * - * \tparam PacketType: determines the type of packet - * - * \tparam TensorMapper: determines the input tensor mapper type - * - * \tparam StorageIndex: determines the Index type - - * \param tensorMapper: is the input tensor - * - * \param NCIndex: is the non-contracting dim index - * - * \param CIndex is the contracting dim index - * - * \param ld: is the leading dimension of the flattened tensor - */ -template -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( - const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) { - const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex; - const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex; - return tensorMapper.get_tensor().template packet(row + (col * ld)); -} - -/*! - * \brief read, special overload of read function, when the read access is not vectorized - * - * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode - * - * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and - vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the - contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case - when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. - * - * \tparam PacketType: determines the type of packet - * - * \tparam TensorMapper: determines the input tensor mapper type - * - * \tparam StorageIndex: determines the Index type - - * \param tensorMapper: is the input tensor - * - * \param NCIndex: is the non-contracting dim index - * - * \param CIndex: is the contracting dim index - */ -template -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type read( - const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) { - const StorageIndex row = (IsRhs) ? CIndex : NCIndex; - const StorageIndex col = (IsRhs) ? NCIndex : CIndex; - return tensorMapper(row, col); -} - -/*! - * \brief write, a template function used for storing the data to local memory. This function is used to guarantee - * coalesced and vectorized store whenever possible. - * - * \tparam StorageIndex: determines the Index type - * - * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory - * - * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. - * - * \tparam PacketType: determines the type of packet - * - * \tparam DataScalar: determines the output data type - * - * \param packet_data: the data to be written in the local memory - * - * \param ptr: a pointer to the local memory - * - * \param CIndex is the contracting dim index - */ - -template -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if

::type - write(PacketType &packet_data, DataScalar ptr) { - EIGEN_CONSTEXPR int PacketSize = Eigen::internal::unpacket_traits::size; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; i++) { - *ptr = PacketWrapper::scalarize(i, packet_data); - ptr += ld; - } -} - -/*! - * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function - * is used to guarantee coalesced and vectorized store whenever possible. - * - * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. - * - * \tparam PacketType: determines the type of packet - * - * \tparam DataScalar: determines the output data type - * - * \param packet_data: the data to be written in the local memory - * - * \param ptr: a pointer to the local memory - */ - -template -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< - Eigen::internal::unpacket_traits::size != 1 && dt == data_source::global_mem, void>::type -write(PacketType &packet_data, DataScalar *ptr) { - ::Eigen::internal::pstoreu(ptr, packet_data); -} - -/*! - * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled. - * - * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy. - * - * \tparam PacketType: determines the type of packet - * - * \tparam DataScalar: determines the output data type - * - * \param packet_data: the data to be written in the local memory - * - * \param ptr: a pointer to the local memory - */ -template -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if< - Eigen::internal::unpacket_traits::size == 1 && dt == data_source::global_mem, void>::type -write(PacketType &packet_data, DataScalar *ptr) { - *ptr = packet_data; -} - -/*! - * \brief check_boundary: is used to check the edge condition for non-internal blocks. - * - * \tparam is_internal: determines if the block is internal - */ -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) { - return true; -} - -/*! - * \brief check_boundary: specialization of the check_boundary for non-internal blocks. - * - * \param cond: true when the data is in range. Otherwise false - */ -template <> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool cond) { - return cond; -} - -/*! - * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed - * by each workgroup. - * - * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed - * - * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode - * - * \tparam PacketType: determines the type of packet - * - * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be - * packetType; Otherwise it will be scalar Type - * - * \param elements_per_access determines the size of each element based on OutType - * - * \param is_coalesced_layout determines whether or not the Tensor data in a memory can be access coalesced and - * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the - * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case - * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed. - * - * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the - * Tensor Block for each workgroup - * - * \param c_stride determines the stride of contracting dimension to access the next adjustment element within the - * Tensor Block for each workgroup - */ -template -struct BlockProperties { - static EIGEN_CONSTEXPR bool packet_load = packet_load_; - typedef typename Eigen::internal::unpacket_traits::type OutScalar; - static EIGEN_CONSTEXPR bool is_rhs = is_rhs_; - typedef typename Eigen::internal::conditional::type OutType; - static EIGEN_CONSTEXPR int elements_per_access = Eigen::internal::unpacket_traits::size; - static EIGEN_CONSTEXPR bool is_coalesced_layout = !(is_transposed ^ is_rhs); - static EIGEN_CONSTEXPR int nc_stride = (is_coalesced_layout ? elements_per_access : 1); - static EIGEN_CONSTEXPR int c_stride = (is_coalesced_layout ? 1 : elements_per_access); -}; - -/*! - * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup. Please see - * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup, - * work-items - * - * \tparam StorageIndex: determines the StorageIndex Type - * - * \param linearLocalThreadId: determines the linearized location of a thread within a work-group - * - * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when - * tall/skinny algorithm is used - * - * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of - * the flattened tensor. - * - * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the - * flattened tensor. It will be > 1 when tall/skinny algorithm is used. - * - * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a - * flattened tensor. The position determines the distance of each thread within the workgroup from each other - * independent from their global position. - * - * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a - * flattened tensor. The position determines the distance of each thread within the workgroup from each other - * independent from their global position. - * - * \param mGlobalOffset: determines the logical start position of each thread a thread for the m dimension on a - * flattened tensor - * - * \param nGlobalOffset: determines the logical start position of each thread a thread for the n dimension on a - * flattened tensor - * - * \param kSize : determine the number of the k elements of the flattened Tensor to be processed by each thread for the - * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used. - * - * \param is_internal : this will determined if the thread within the work-group computes an internal block of tensor or - * the edge blocks. When it is internal, there is no need to check the boundaries and all the if stantement can be - * resolve by compiler. - */ -template -struct ThreadProperties { - const StorageIndex linearLocalThreadId; - const StorageIndex kGroupId; - const StorageIndex mGroupOffset; - const StorageIndex nGroupOffset; - const StorageIndex kGroupOffset; - const StorageIndex mLocalOffset; - const StorageIndex nLocalOffset; - const StorageIndex mGlobalOffset; - const StorageIndex nGlobalOffset; - StorageIndex kSize; - const bool is_internal; - // this is used to adjust the last block - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties( - const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_, - const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_, - const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_, - StorageIndex kSize_, const bool is_internal_) - : linearLocalThreadId(linearLocalThreadId_), - kGroupId(kGroupId_), - mGroupOffset(mGroupOffset_), - nGroupOffset(nGroupOffset_), - kGroupOffset(kGroupOffset_), - mLocalOffset(mLocalOffset_), - nLocalOffset(nLocalOffset_), - mGlobalOffset(mGlobalOffset_), - nGlobalOffset(nGlobalOffset_), - kSize(kSize_), - is_internal(is_internal_) {} -}; - -/*! - * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation. - * - * \tparam OutScalar: determines the output scalar type - * - * \tparam LhsScalar: determines the left-hand-side scalar type - * - * \tparam RhsScalar: determines the right-hand-side scalar type - * - * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification - (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) - * - * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix - * - * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix - * - * \tparam StorageIndex: determines the StorageIndex Type - * - * \tparam Properties: determines the Contraction Panel properties - * - * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix - * - * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. - * - * \tparam input_mapper_properties : determine if the input tensors are matrix. If they are matrix, special memory - access is used to guarantee that always the memory access are coalesced. - * - * \tptaram IsFinal : determine if this is the final kernel. If so, the result will be written in a final output. - Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny - contraction is used. So in this case, a final reduction step is required to compute final output. - - * \tparam contraction_tp: it is an enum value representing whether the local memroy/no local memory implementation of - the algorithm to be used - * - * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group - * - * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) - * - * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) - * - * \param out_res: determines the output tensor containing the contraction result - * - * \param groupSizeM: a logical number determining the number of work-group for m dimension - * - * \param groupSizeN: a logical number determining the number of work-group for n dimension - * - * \param numTiles: determines total number of tiles on the k dimension - * - * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix - */ -template -class TensorContractionKernel { - public: - typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType - PacketReturnType; - static EIGEN_CONSTEXPR int PacketSize = - Eigen::TensorSycl::internal::Vectorise::PacketSize; - static EIGEN_CONSTEXPR bool is_lhs_transposed = - !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; - static EIGEN_CONSTEXPR bool is_rhs_transposed = - !::Eigen::internal::TensorContractionInputMapperTrait::inner_dim_contiguous; - - typedef BlockProperties - LHSBlockProperties; - - typedef BlockProperties - RHSBlockProperties; - - static EIGEN_CONSTEXPR StorageIndex NStride = - contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride; - - typedef cl::sycl::accessor Scratch; - typedef cl::sycl::multi_ptr local_ptr; - typedef OutScalar * /*cl::sycl::multi_ptr*/ private_ptr; - typedef - typename ::Eigen::internal::conditional::type - tile_ptr; - static EIGEN_CONSTEXPR StorageIndex LSDL = contraction_tp == contraction_type::local - ? Properties::TileSizeDimM + Properties::BC - : Properties::WorkLoadPerThreadM; - static EIGEN_CONSTEXPR StorageIndex LSDR = contraction_tp == contraction_type::local - ? Properties::TileSizeDimN + Properties::BC - : Properties::WorkLoadPerThreadN; - static EIGEN_CONSTEXPR StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; - - /** - * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not - * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to - * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting - * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out - * different type of memory needed when local/no_local memory computation is called. - * - * \tparam contraction_type: it is an enum value representing whether the local memroy/no local memory implementation - of the algorithm to be used - * \tparam the private memory size - * \param ptr the tile memory pointer type - */ - template - struct MemHolder { - tile_ptr ptr; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {} - }; - /** - * \brief specialization of memHolder class when no local memory kernel is used. - */ - template - struct MemHolder { - OutScalar ptr[MemSize] = {OutScalar{0}}; - }; - /** - * \brief TiledMemory: contains required memory pointer for loading each tile of the TensorContraction panel from - * global memory to local/private memory when local/no_local algorithm used. - * - * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the - * selected contraction_type. - * - * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the - * selected contraction_type. - * - * \param lhs_extract_index: determins the position of each thread on a local memory for lhs input. When private - * memory is used this is set to zero as this is not applicable in case of private memory. - * - * \param rhs_extract_index: determins the position of each thread on a local memory for rhs input. When private - * memory is used this is set to zero as this is not applicable in case of private memory. - * - * \param lhs_scratch_compute : determines the location to load for computation for lhs_local memory. This is the - * same as lhs_scratch_extract for private memory. - * - * \param rhs_scratch_compute : determines the location to load for computation for rhs_local memory. This is the - * same as rhs_scratch_extract for private memory. - */ - struct TiledMemory { - MemHolder lhs_scratch_extract; - MemHolder rhs_scratch_extract; - tile_ptr lhs_scratch_ptr_compute; - tile_ptr rhs_scratch_ptr_compute; - const std::pair lhs_extract_index; - const std::pair rhs_extract_index; - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TiledMemory(const ThreadProperties &, local_ptr, - typename ::Eigen::internal::enable_if::type * = 0) - : lhs_scratch_extract{}, - rhs_scratch_extract{}, - lhs_scratch_ptr_compute(lhs_scratch_extract.ptr), - rhs_scratch_ptr_compute(rhs_scratch_extract.ptr), - lhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})), - rhs_extract_index(std::pair(StorageIndex{0}, StorageIndex{0})) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - TiledMemory(const ThreadProperties &thread_properties, local_ptr block_start_ptr, - typename ::Eigen::internal::enable_if::type * = 0) - : lhs_scratch_extract{block_start_ptr}, - rhs_scratch_extract{lhs_scratch_extract.ptr + - ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)}, - lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset), - rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset), - lhs_extract_index( - local_id_extract(thread_properties.linearLocalThreadId)), - rhs_extract_index( - local_id_extract(thread_properties.linearLocalThreadId)) {} - }; - - Scratch scratch; - const LhsMapper lhs; - const RhsMapper rhs; - OutAccessor out_res; - const StorageIndex groupSizeM; - const StorageIndex groupSizeN; - const StorageIndex numTiles; - const TripleDim triple_dim; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, - const RhsMapper rhs_, OutAccessor out_res_, - const StorageIndex groupSizeM_, - const StorageIndex groupSizeN_, - const StorageIndex numTiles_, - const TripleDim triple_dim_) - : scratch(scratch_), - lhs(lhs_), - rhs(rhs_), - out_res(out_res_), - groupSizeM(groupSizeM_), - groupSizeN(groupSizeN_), - numTiles(numTiles_), - triple_dim(triple_dim_) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_, - const RhsMapper rhs_, OutAccessor out_res_, - const StorageIndex groupSizeM_, - const StorageIndex numTiles_, - const TripleDim triple_dim_) - : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { - const StorageIndex linearLocalThreadId = itemID.get_local_id(0); - const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM; - const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM; - const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM; - const StorageIndex tmp = itemID.get_group(0) / groupSizeM; - const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN; - const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN; - const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM; - const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN; - const StorageIndex mLocalOffset = PacketSize * mLocalThreadId; - const StorageIndex nLocalOffset = NStride * nLocalThreadId; - const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset; - const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset; - - const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK; - StorageIndex kGroupOffset = kGroupId * kSizePerWG; - const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM && - triple_dim.N - nGroupOffset >= Properties::TileSizeDimN && - triple_dim.K - kGroupOffset >= kSizePerWG; - // this is used to adjust the last block - StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset); - // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to - // tile - kGroupOffset += kSize; - - auto thread_properties = - ThreadProperties(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset, - mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal); - - auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N); - - (thread_properties.is_internal) ? compute_panel(itemID, thread_properties, out_ptr) - : compute_panel(itemID, thread_properties, out_ptr); - } - // The compute block computes the contraction operation private block for each thread and store the resutl in the - // privateRes memory of Each computation the compute block function is independent of local and no local concepts as - // it only compute the block on each thread's private memory space - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr, - PacketReturnType *privateRes) { - StorageIndex idx = 0; - EIGEN_CONSTEXPR StorageIndex lhs_stride = - contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1; - EIGEN_UNROLL_LOOP - for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) { - auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)}; - StorageIndex lhs_index = 0; - EIGEN_UNROLL_LOOP - for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { - PacketReturnType lhsPack{}; - Eigen::TensorSycl::internal::PacketWrapper::set_packet(lhsPack, - lhs_block_ptr + lhs_index); - privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]); - - lhs_index += lhs_stride; - idx++; - } - } - } - // The store function write the computed contraction operation in the private memory of each thread to the global - // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base - // class. - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes, - StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) { - auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC { - return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N); - }; - // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is - // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId* - // WorkLoadPerThreadN slice of N - EIGEN_CONSTEXPR StorageIndex GlobalNStride = - contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN; - EIGEN_UNROLL_LOOP - for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) { - // output leading dimension - StorageIndex outputLD = 0; - // When local memory is used the PrivateNstride is always 1 because the coalesed access on N is loaded into Local - // memory and extracting from local to global is the same as no transposed version. However, when local memory is - // not used and RHS is transposed we packetize the load for RHS. - EIGEN_UNROLL_LOOP - for (StorageIndex nId = 0; nId < PrivateNStride; nId++) { - StorageIndex globalRow = mGlobalOffset; - EIGEN_UNROLL_LOOP - for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) { - PacketReturnType privetOut = privateRes[wLPTM]; - if (check_boundary(chk_bound(globalRow, nId))) { - // Store the final results in C. The C matrix has always M as a first StorageIndex and N as a second - // StorageIndex Therefore it is always coalesced layout - write(privetOut, out_ptr + outputLD + globalRow); - } else { - EIGEN_UNROLL_LOOP - for (StorageIndex mId = 0; mId < PacketSize; mId++) { - StorageIndex mOffset = globalRow + mId; - if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) { - out_ptr[mOffset + outputLD] = - Eigen::TensorSycl::internal::PacketWrapper::scalarize(mId, privetOut); - } - } - } - globalRow += (PacketSize * Properties::LocalThreadSizeM); - } - outputLD += triple_dim.M; - privateRes += Properties::WorkLoadPerThreadM / PacketSize; - } - out_ptr += (GlobalNStride * outputLD); - - nGlobalOffset += (PrivateNStride * GlobalNStride); - } - } - // when no local memory is used the following extract_block will be enabled - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if::type - extract_block(const Input &inpt, PrivateReg private_ptr, const std::pair &, - const StorageIndex &ncOffset, const StorageIndex cOffset) { - EIGEN_CONSTEXPR StorageIndex LocalThreadSizeNC = - InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM; - EIGEN_CONSTEXPR StorageIndex WorkLoadPerThreadNC = - InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM; - const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; - - auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { - return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && - (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); - }; - const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; - StorageIndex cIndex = cOffset; - - EIGEN_UNROLL_LOOP - for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) { - StorageIndex ncIndex = ncOffset; - EIGEN_UNROLL_LOOP - for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) { - if (check_boundary(chk_bound(cIndex, ncIndex))) { - auto val = - read(inpt, ncIndex, cIndex, ld); - - write(val, private_ptr); - } else { - EIGEN_UNROLL_LOOP - for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { - const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); - const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); - OutScalar val = - (ncInd < NC && cInd < triple_dim.K) - ? read( - inpt, ncInd, cInd, ld) - : OutScalar(0); - write( - val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) + - ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC)); - } - } - - // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So - // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread. - ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1) - ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC - : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC); - private_ptr += InputBlockProperties::nc_stride; - } - // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC - private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC; - cIndex += InputBlockProperties::c_stride; - } - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair local_id_extract( - const StorageIndex &linearLocalThreadId) { - const StorageIndex localThreadNC = - (InputBlockProperties::is_coalesced_layout) - ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride) - : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride); - const StorageIndex localThreadC = - (InputBlockProperties::is_coalesced_layout) - ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride) - : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride); - return std::pair(localThreadNC, localThreadC); - } - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if::type - sync_mem(const cl::sycl::nd_item<1> &, bool &db_offset) noexcept { - db_offset = !db_offset; - } - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if::type - sync_mem(const cl::sycl::nd_item<1> &itemID, bool &) noexcept { - itemID.barrier(cl::sycl::access::fence_space::local_space); - } - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if::type - sync_mem(const cl::sycl::nd_item<1> &, bool &) noexcept { - return; - } - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if::type - sync_thread(const cl::sycl::nd_item<1> & -#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION - itemID -#endif - ) noexcept { -#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION - itemID.barrier(cl::sycl::access::fence_spacce::local_space); -#else - return; -#endif - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if::type - sync_thread(const cl::sycl::nd_item<1> &itemID) { - itemID.barrier(cl::sycl::access::fence_space::local_space); - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if::type sync_thread( - const cl::sycl::nd_item<1> &) { - return; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID, - ThreadProperties &thread_properties, - TiledMemory &tiled_input_block, - PacketReturnType *privateRes, bool &db_offset) { - // Tiling the Rhs block from global to local memory - extract_block( - rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR), - tiled_input_block.rhs_extract_index, - contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset, - thread_properties.kGroupOffset - thread_properties.kSize); - - sync_thread(itemID); - - // Tiling the Lhs block from global to local memory - extract_block( - lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK), - tiled_input_block.lhs_extract_index, - contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset, - thread_properties.kGroupOffset - thread_properties.kSize); - - // itemID.barrier(cl::sycl::access::fence_space::local_space); - sync_thread(itemID); - // switch to compute mede - StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK); - StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR); - // Loop over the values of a single tile - for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) { - compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset, - tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes); - lhs_offset += LSDL; - rhs_offset += LSDR; - } - // computing the K index for the next tile - thread_properties.kSize -= Properties::TileSizeDimK; - sync_mem(itemID, db_offset); - } - - // when local memory is available the following compute_panel will be enabled - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID, - ThreadProperties &thread_properties, - OutPtr out_ptr) { - auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()}; - // Allocate register space - PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = { - PacketReturnType{0}}; - bool db_offset = 0; - - while (thread_properties.kSize >= Properties::TileSizeDimK) { - compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); - } - if (thread_properties.kSize > 0) { - compute_tile_per_panel(itemID, thread_properties, tiled_input_block, privateRes, db_offset); - } - - // Storing the final results in the output - store(1) : RHSBlockProperties::nc_stride>( - out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset, - thread_properties.nGlobalOffset); - } - // When local memory is available the following extract_block will be enabled - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename ::Eigen::internal::enable_if::type - extract_block(const Input &inpt, Local local_ptr, const std::pair& local_index, - const StorageIndex &ncOffset, const StorageIndex cOffset) { - EIGEN_CONSTEXPR StorageIndex TileSizeDimNC = - InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM; - EIGEN_CONSTEXPR StorageIndex LoadPerThread = - InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs; - EIGEN_CONSTEXPR StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL; - static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) && - (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)), - " LocalOffset must be divisable by stride"); - const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M; - StorageIndex localThreadNC = local_index.first; - StorageIndex localThreadC = local_index.second; - auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC { - return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) && - (NCIndex + InputBlockProperties::nc_stride - 1 < NC)); - }; - EIGEN_UNROLL_LOOP - for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) { - const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC); - const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC); - const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K; - if (check_boundary(chk_bound(CIndex, NCIndex))) { - auto val = - read(inpt, NCIndex, CIndex, ld); - write( - val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + - (InputBlockProperties::c_stride * localThreadC * LSD)); - } else { - EIGEN_UNROLL_LOOP - for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { - const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0); - const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i); - OutScalar val = - (nCInd < NC && cInd < triple_dim.K) - ? read( - inpt, nCInd, cInd, ld) - : OutScalar(0); - - write( - val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) + - (InputBlockProperties::is_coalesced_layout ? i : 0) + - ((InputBlockProperties::c_stride * localThreadC + - (InputBlockProperties::is_coalesced_layout ? 0 : i)) * - LSD)); - } - } - localThreadNC += (InputBlockProperties::is_coalesced_layout) - ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) - : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride); - localThreadC += (InputBlockProperties::is_coalesced_layout) - ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride) - : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride); - } - } -}; - -#ifndef EIGEN_SYCL_DISABLE_GEMV - -/*! - * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special - * case of Tensor Tensor contraction. - * - * \tparam OutScalar: determines the output scalar type - * - * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification - * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) - * - * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs) - * - * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs) - * - * \tparam StorageIndex: determines the StorageIndex Type - * - * \tparam Properties: determines the Contraction Panel properties - * - * \tparam KFactor: determines the number of elements in K dimension in a Tile - * - * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. - * - * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector - * - * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output. - * Otherwise, the result of contraction will be written iin a temporary buffer. - * - * \param scratch: determines the local memory containing the vector block for each work-group - * - * \param vec: determines the vector input (tensor mapper) - * - * \param mat: determines the tensor input (tensor mapper) - * - * \param out_res: determines the output vector containing the contraction result - * - * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension - * - * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor - * - * \param contractDim: determines the size of non contracting dimension for the flattened tensor - * - */ -template -struct GeneralVectorTensor { - typedef typename Eigen::TensorSycl::internal::Vectorise::PacketReturnType - PacketReturnType; - static EIGEN_CONSTEXPR int PacketSize = - Eigen::TensorSycl::internal::Vectorise::PacketSize; - typedef cl::sycl::accessor Scratch; - - static EIGEN_CONSTEXPR StorageIndex OutScratchOffset = - KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; - - // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make - // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true. - typedef BlockProperties - VecBlockProperties; - - Scratch scratch; - const VectorMapper vec; - const TensorMapper mat; - OutAccessor out_res; - const StorageIndex nonContractGroupSize; - const StorageIndex nonContractDim; - const StorageIndex contractDim; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_, - const TensorMapper mat_, OutAccessor out_res_, - const StorageIndex nonContractGroupSize_, - const StorageIndex nonContractDim_, - const StorageIndex contractDim_) - : scratch(scratch_), - vec(vec_), - mat(mat_), - out_res(out_res_), - nonContractGroupSize(nonContractGroupSize_), - nonContractDim(nonContractDim_), - contractDim(contractDim_) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) { - auto scratch_ptr = scratch.get_pointer(); - const StorageIndex linearLocalThreadId = itemID.get_local_id(0); - StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC - : linearLocalThreadId % Properties::LocalThreadSizeNC; - StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC - : linearLocalThreadId / Properties::LocalThreadSizeNC; - const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize; - const StorageIndex nonContractGroupId = - is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize; - const StorageIndex contractGroupId = - is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize; - auto out_ptr = out_res.get_pointer() + (IsFinal ? 0 : contractGroupId * nonContractDim); - - const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC; - const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC; - auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; - const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId; - const StorageIndex globalContractDimOffset = contractGroupOffset + contractId; - auto local_output = scratch_ptr + OutScratchOffset; - const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC && - contractDim - contractGroupOffset >= Properties::TileSizeDimC; - is_internal - ? compute_panel(itemID, vec, mat, local_output, out_ptr, -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - scratch_ptr, contractGroupOffset, -#endif - nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, - nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex) - : compute_panel(itemID, vec, mat, local_output, out_ptr, -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - scratch_ptr, contractGroupOffset, -#endif - nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId, - nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex); - } - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel( - const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output, - OutPtr out_ptr, -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - OutScalar *scratch_ptr, const StorageIndex contractGroupOffset, -#endif - const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim, - StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId, - StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) { - OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)}; - // Reading the vector -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId; - extract_block(vec, scratch_ptr, linearLocalThreadId, - vectorOffset, contractDim); - - itemID.barrier(cl::sycl::access::fence_space::local_space); - auto in_scratch_ptr = scratch_ptr + contractId; -#endif - - StorageIndex privateOffsetC = 0; - EIGEN_UNROLL_LOOP - for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) { - StorageIndex privateOffsetNC = 0; - bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim); -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - auto vecScalar = *in_scratch_ptr; -#else - auto vecScalar = (check_boundary(contract_conds)) - ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC, - is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0)) - : OutScalar(0); -#endif - EIGEN_UNROLL_LOOP - for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { - auto matScalar = (check_boundary( - contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim))) - ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC - : globalNonContractDimOffset + privateOffsetNC, - is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC - : globalContractDimOffset + privateOffsetC) - : OutScalar(0); - - outScalar[j] = cl::sycl::mad(matScalar, vecScalar, outScalar[j]); - privateOffsetNC += Properties::LocalThreadSizeNC; - } - privateOffsetC += Properties::LocalThreadSizeC; -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - in_scratch_ptr += Properties::LocalThreadSizeC; -#endif - } - - auto out_scratch_ptr = local_output + outScratchIndex; - // Each block of 16*16 element in shared memory should reduce to 16*1 - EIGEN_UNROLL_LOOP - for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { - *out_scratch_ptr = outScalar[j]; - - out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); - } - if (is_lhs_vec) { - nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC; - contractId = linearLocalThreadId / Properties::LocalThreadSizeNC; - outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC; - } - - out_scratch_ptr = local_output + outScratchIndex; - EIGEN_UNROLL_LOOP - for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { - EIGEN_UNROLL_LOOP - for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) { - itemID.barrier(cl::sycl::access::fence_space::local_space); - if (contractId < offset) { - StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset); - *out_scratch_ptr += out_scratch_ptr[myNeigbourId]; - } - } - // moving to next 16 by 16 block - out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); - } - - if (contractId == 0) { - out_scratch_ptr = local_output + nonContractId; - StorageIndex global_final_offset = nonContractGroupOffset + nonContractId; - out_ptr += global_final_offset; - EIGEN_UNROLL_LOOP - for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) { - if (check_boundary(global_final_offset < nonContractDim)) { - auto res = *out_scratch_ptr; - - *out_ptr = res; - out_ptr += Properties::LocalThreadSizeNC; - } - // moving to next 16 by 16 block to ge the next 16 reduced elements - out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC); - if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC; - } - } - } - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr, - const StorageIndex &linearLocalThreadId, - const StorageIndex &cOffset, const StorageIndex &C) { - local_ptr += InputBlockProperties::c_stride * linearLocalThreadId; - StorageIndex cIndex = cOffset; - for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) { - if (check_boundary(cIndex + InputBlockProperties::c_stride - 1 < C)) { - auto val = read(inpt, StorageIndex(0), - cIndex, StorageIndex(1)); - write(val, local_ptr); - } else { - EIGEN_UNROLL_LOOP - for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) { - OutScalar val = - (cIndex + i < C) - ? read( - inpt, StorageIndex(0), cIndex + i, StorageIndex(1)) - : OutScalar(0); - write(val, local_ptr + i); - } - } - local_ptr += InputBlockProperties::c_stride * GroupSize; - cIndex += InputBlockProperties::c_stride * GroupSize; - } - } -}; -#endif - -#ifndef EIGEN_SYCL_DISABLE_SCALAR - -/*! - * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction - * operation, when all the dimensions are contracting dimensions. This Kernel reduces two tensors to an scalar - * - * \tparam OutScalar: determines the output scalar type - * - * \tparam LhsScalar: determines the left-hand-side scalar type - * - * \tparam RhsScalar: determines the right-hand-side scalar type - * - * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification - * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition) - * - * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix - * - * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix - * - * \tparam StorageIndex: determines the StorageIndex Type - * - * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression. - * - * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group - * - * \param lhs: determines the left-hand-side flattened tensor (tensor mapper) - * - * \param rhs: determines the right-hand-side flattened tensor (tensor mapper) - * - * \param out_res: determines the output tensor containing the contraction result - * - * \param rng: determins the total input data size - */ -template -struct GeneralScalarContraction { - typedef cl::sycl::accessor Scratch; - Scratch scratch; - const LhsMapper lhs; - const RhsMapper rhs; - OutAccessor out_res; - const StorageIndex rng; - - EIGEN_DEVICE_FUNC - GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_, OutAccessor out_res_, - const StorageIndex rng_) - : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {} - - EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) { - auto out_ptr = out_res.get_pointer(); - auto scratch_ptr = scratch.get_pointer().get(); - - StorageIndex globalid = itemID.get_global_id(0); - StorageIndex localid = itemID.get_local_id(0); - OutScalar accumulator = OutScalar(0); - for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) { - accumulator = cl::sycl::mad(lhs(0, i), rhs(i, 0), accumulator); - } - auto out_scratch_ptr = scratch_ptr + localid; - *out_scratch_ptr = accumulator; - for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) { - itemID.barrier(cl::sycl::access::fence_space::local_space); - if (localid < offset) { - *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]); - } - } - if (localid == 0) { - out_ptr[itemID.get_group(0)] = accumulator; - } - } -}; -#endif - -} // namespace internal -} // namespace TensorSycl - -template -struct TensorEvaluator, - Eigen::SyclDevice> - : public TensorContractionEvaluatorBase, Eigen::SyclDevice>> { - static_assert(std::is_same::value, - "SYCL tensor contraction does not support output kernels."); - - typedef Eigen::SyclDevice Device; - - typedef TensorEvaluator, Device> Self; - typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; - typedef typename internal::remove_const::type Scalar; - typedef typename XprType::Index StorageIndex; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename Base::Storage Storage; - typedef typename Base::EvaluatorPointerType EvaluatorPointerType; - struct TripleDim { - const StorageIndex M; - const StorageIndex N; - const StorageIndex K; - TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {} - }; - enum { - Layout = TensorEvaluator::Layout, - PacketAccess = (PacketType::size > 1), - BlockAccess = false, - }; - - static EIGEN_CONSTEXPR int LDims = Base::LDims; - static EIGEN_CONSTEXPR int RDims = Base::RDims; - static EIGEN_CONSTEXPR int ContractDims = Base::ContractDims; - - typedef array left_dim_mapper_t; - typedef array right_dim_mapper_t; - - typedef array contract_t; - typedef array left_nocontract_t; - typedef array right_nocontract_t; - - static const int NumDims = LDims + RDims - 2 * ContractDims; - - typedef DSizes Dimensions; - - typedef TensorEvaluator LeftEvaluator; - typedef TensorEvaluator RightEvaluator; - typedef typename Eigen::internal::remove_const::type LhsScalar; - typedef typename Eigen::internal::remove_const::type RhsScalar; - - typedef typename LeftEvaluator::Dimensions LeftDimensions; - typedef typename RightEvaluator::Dimensions RightDimensions; - - template - struct input_mapper_propertis { - static EIGEN_CONSTEXPR bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous; - static EIGEN_CONSTEXPR bool is_rhs_matrix = - (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered); - }; - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {} - - // We need to redefine this method to make nvcc happy - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) { - this->m_leftImpl.evalSubExprsIfNeeded(NULL); - this->m_rightImpl.evalSubExprsIfNeeded(NULL); - if (!data) { - this->m_result = this->m_device.get( - static_cast(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar)))); - data = this->m_result; - } - evalToSycl(data); - return (this->m_result != NULL); - } - const Eigen::SyclDevice &device() const { return this->m_device; } - void evalToSycl(typename Base::EvaluatorPointerType buffer) const { - if (this->m_lhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } else { - evalTyped(buffer); - } - } else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } else { - evalTyped(buffer); - } - } - } else { - if (this->m_rhs_inner_dim_contiguous) { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } else { - evalTyped(buffer); - } - } else { - if (this->m_rhs_inner_dim_reordered) { - evalTyped(buffer); - } else { - evalTyped(buffer); - } - } - } - } - - template - void evalTyped(typename Base::EvaluatorPointerType buffer) const { - const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size}; - typedef internal::TensorContractionInputMapper< - LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t, - PacketType::size, lhs_inner_dim_contiguous, false, Unaligned, MakeSYCLPointer> - LhsMapper; - - typedef internal::TensorContractionInputMapper::size, rhs_inner_dim_contiguous, - rhs_inner_dim_reordered, Unaligned, MakeSYCLPointer> - RhsMapper; - - // initialize data mappers - LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, - this->m_left_contracting_strides, this->m_k_strides); - - RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, - this->m_right_contracting_strides, this->m_k_strides); - -#ifndef EIGEN_SYCL_DISABLE_SCALAR - if (triple_dim.M == 1 && triple_dim.N == 1) { - launchSC(buffer, lhs, rhs, triple_dim.K); - } else -#endif -#ifndef EIGEN_SYCL_DISABLE_GEMV - if (triple_dim.M != 1 && triple_dim.N == 1) { - LaunchVT(buffer, rhs, lhs, triple_dim.M, triple_dim.K); - } else if (triple_dim.M == 1 && triple_dim.N != 1) { - LaunchVT(buffer, lhs, rhs, triple_dim.N, triple_dim.K); - } else // This is equivalent of if (m!=1 && n!=1) -#endif - { - typedef input_mapper_propertis - inpt_mapper_properties; -#ifndef EIGEN_SYCL_DISABLE_SKINNY - bool skinny = false; - auto platform_name = this->device().getPlatformName(); - // This is based on empirical calculation for AMD r9-nano and Fiji - if (platform_name.find("AMD") == 0) { - skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) && - ((triple_dim.M < 1024 && triple_dim.N < 1024) || - (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K))); - } else { - skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) || - ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) || - ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100)); - } - if (skinny) - adjustTT(buffer, lhs, rhs, triple_dim); - else -#endif // EIGEN_SYCL_DISABLE_SKINNY - adjustTT(buffer, lhs, rhs, triple_dim); - } - } - - template - void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, - const TripleDim &triple_dim) const { -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON - if (device().has_local_memory()) { - typedef TensorSycl::internal::TTPanelSize PanelParameters; - launchTT( - buffer, lhs, rhs, triple_dim); - } -#endif -#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF - if (!(device().has_local_memory())) { - typedef TensorSycl::internal::TTPanelSize PanelParameters; - launchTT( - buffer, lhs, rhs, triple_dim); - } -#endif - } - - template - void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, - const TripleDim &triple_dim) const { - const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM); - const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN); - const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM; - const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN; - - const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK); - StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK; - StorageIndex groupSizeK = - skinny - ? std::max(std::min(totalTilesK, - (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) / - (groupSizeM * groupSizeN)), - StorageIndex(1)) - : StorageIndex(1); - - const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK; - - const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK; - - const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN; - const StorageIndex globalRange = totalGroupSize * localRange; - - const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local) - ? ((Properties::DoubleBuffer + 1) * - (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) + - ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) * - (Properties::TileSizeDimN + Properties::BC)) - : StorageIndex(1); - - auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); - if (groupSizeK == 1) { - typedef TensorSycl::internal::TensorContractionKernel - ContractKernelName; - device().template binary_kernel_launcher( - lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim); - } else { - typedef TensorSycl::internal::TensorContractionKernel - ContractKernelName; - CoeffReturnType *temp_pointer = static_cast( - device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType))); - EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); - - device().template binary_kernel_launcher( - lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, - triple_dim); - - typedef Eigen::internal::SumReducer Op; - auto op = Op(); - typedef TensorSycl::internal::SecondStepPartialReduction - ReductionKernel; - - device().template unary_kernel_launcher( - tmp_global_accessor, buffer, - cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex( - Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))), - cl::sycl::range<1>(localRange)), - StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK); - - device().deallocate_temp(temp_pointer); - } - } - -#ifndef EIGEN_SYCL_DISABLE_GEMV - template - void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat, - StorageIndex NC, StorageIndex C) const { - const StorageIndex nonContractDim = NC; - EIGEN_CONSTEXPR StorageIndex NCFactor = 1; - EIGEN_CONSTEXPR StorageIndex CFactor = 1; - EIGEN_CONSTEXPR StorageIndex NCWindow = 16; - typedef Eigen::TensorSycl::internal::TVPanelSize - Properties; - const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC); - const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC); - const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC); - const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC); - const StorageIndex globalRange = - (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC)); - const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC; - const StorageIndex scratchSize = - (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC; - auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange)); - if (cNumGroups > 1) { - typedef Eigen::TensorSycl::internal::GeneralVectorTensor - ContractKernelName; - CoeffReturnType *temp_pointer = - static_cast(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType))); - EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); - - device().template binary_kernel_launcher( - vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C); - - typedef Eigen::internal::SumReducer Op; - typedef TensorSycl::internal::SecondStepPartialReduction - ReductionKernel; - - device().template unary_kernel_launcher( - tmp_global_accessor, buffer, - cl::sycl::nd_range<1>(cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)), - cl::sycl::range<1>(localRange)), - StorageIndex(1), Op(), nonContractDim, cNumGroups); - - device().deallocate_temp(temp_pointer); - } else { - typedef Eigen::TensorSycl::internal::GeneralVectorTensor - ContractKernelName; - device().template binary_kernel_launcher( - vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C); - } - } -#endif - -#ifndef EIGEN_SYCL_DISABLE_SCALAR - template - EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs, - StorageIndex K) const { - EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) & - (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)), - "The Local thread size must be a power of 2 for the reduction " - "operation"); - EIGEN_CONSTEXPR StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1; - - // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread - // reduces at least 512 elementss individually, we get better performance. - const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1); - const StorageIndex global_range = num_work_group * local_range; - - typedef Eigen::TensorSycl::internal::GeneralScalarContraction< - CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false> - ContractKernelName; - auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range)); - if (num_work_group > 1) { - CoeffReturnType *temp_pointer = - static_cast(device().allocate_temp(num_work_group * sizeof(CoeffReturnType))); - EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer); - device().template binary_kernel_launcher(lhs, rhs, tmp_global_accessor, - thread_range, local_range, K); - typedef Eigen::internal::SumReducer Op; - typedef TensorSycl::internal::SecondStepFullReducer - GenericRKernel; - device().template unary_kernel_launcher( - tmp_global_accessor, buffer, - cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range, Op()); - - device().deallocate_temp(temp_pointer); - } else { - device().template binary_kernel_launcher(lhs, rhs, buffer, thread_range, - local_range, K); - } - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - this->m_leftImpl.cleanup(); - this->m_rightImpl.cleanup(); - - if (this->m_result) { - this->m_device.deallocate_temp(this->m_result); - this->m_result = NULL; - } - } - // The placeholder accessors must bound to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - this->m_leftImpl.bind(cgh); - this->m_rightImpl.bind(cgh); - this->m_result.bind(cgh); - } -}; -} // namespace Eigen -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 21be6ea42..c70dea053 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -15,16 +15,57 @@ namespace Eigen { -template -struct TensorEvaluator, ThreadPoolDevice> : - public TensorContractionEvaluatorBase, ThreadPoolDevice> > { +#ifdef EIGEN_USE_SIMPLE_THREAD_POOL +namespace internal { + +template +struct packLhsArg { + LhsScalar* blockA; + const LhsMapper& lhs; + const Index m_start; + const Index k_start; + const Index mc; + const Index kc; +}; + +template +struct packRhsAndKernelArg { + const MaxSizeVector* blockAs; + RhsScalar* blockB; + const RhsMapper& rhs; + OutputMapper& output; + const Index m; + const Index k; + const Index n; + const Index mc; + const Index kc; + const Index nc; + const Index num_threads; + const Index num_blockAs; + const Index max_m; + const Index k_block_idx; + const Index m_block_idx; + const Index n_block_idx; + const Index m_blocks; + const Index n_blocks; + MaxSizeVector* kernel_notifications; + const MaxSizeVector* lhs_notifications; + const bool need_to_pack; +}; + +} // end namespace internal +#endif // EIGEN_USE_SIMPLE_THREAD_POOL + +template +struct TensorEvaluator, ThreadPoolDevice> : + public TensorContractionEvaluatorBase, ThreadPoolDevice> > { typedef ThreadPoolDevice Device; - typedef TensorEvaluator, Device> Self; + typedef TensorEvaluator, Device> Self; typedef TensorContractionEvaluatorBase Base; - typedef TensorContractionOp XprType; + typedef TensorContractionOp XprType; typedef typename internal::remove_const::type Scalar; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; @@ -71,35 +112,31 @@ struct TensorEvaluator +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL + template void evalProduct(Scalar* buffer) const { - evalProductImpl(buffer, NoCallback()); - } - - template - void evalProductAsync(Scalar* buffer, EvalToCallback done) const { - evalProductImpl(buffer, std::move(done)); - } - - template - void evalProductImpl(Scalar* buffer, DoneCallback done) const { - // This function computes a lot of heuristics in multiple steps, and it - // also has multiple exit points. To keep it sane, readable and all in one - // place, sync/async execution decision is made at runtime at the very end. - // - // (1) In sync mode we allocate Context on the stack, submit computations - // to the device thread pool, and block on a barrier until it is - // completed. - // - // (2) In async mode we allocate Context on the heap, and after all tasks - // are finished, we call provided the done callback, and delete a - // context from the heap. - // - // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state - // and temporary buffers, requried for executing the tensor contraction. - // They are responsible for cleaning it up after contraction is done. - static const bool IsEvalInSyncMode = - std::is_same::value; + typedef internal::TensorContractionInputMapper< + LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, + contract_t, internal::packet_traits::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper< + RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, + contract_t, internal::packet_traits::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + typedef internal::blas_data_mapper OutputMapper; + typedef internal::gemm_pack_lhs + LhsPacker; + typedef internal::gemm_pack_rhs< + RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> + RhsPacker; + typedef internal::gebp_kernel + GebpKernel; const Index m = this->m_i_size; const Index n = this->m_j_size; @@ -135,14 +172,14 @@ struct TensorEvaluator blocking(k, m, n, 2); bm = blocking.mc(); bn = blocking.nc(); bk = blocking.kc(); } else { - internal::TensorContractionBlocking blocking(k, m, n, 2); bm = blocking.mc(); @@ -158,45 +195,35 @@ struct TensorEvaluator::numThreads( static_cast(n) * m, cost, this->m_device.numThreads()); - int num_threads_by_k = numThreadsInnerDim(m, n, k); - if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) { - // We are in the scenario where it is more effective to shard by the - // inner dimension. - if (IsEvalInSyncMode) { - EvalShardedByInnerDimContext ctx( - this, num_threads_by_k, buffer, m, n, k, std::move(done)); - ctx.template run(); - } else { - auto* ctx = new EvalShardedByInnerDimContext( - this, num_threads_by_k, buffer, m, n, k, std::move(done)); - ctx->template runAsync(); - } - - return; - } // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost // model is not tuned. Remove this when the cost model is tuned. if (n == 1) num_threads = 1; if (num_threads == 1) { - TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, - Unaligned, (buffer)); - if (!IsEvalInSyncMode) done(); + // The single-threaded algorithm should be faster in this case. + if (n == 1) + this->template evalGemv(buffer); + else + this->template evalGemm(buffer); return; } // Now that we know number of threads, recalculate sharding and blocking. shard_by_col = shardByCol(m, n, num_threads); if (shard_by_col) { - internal::TensorContractionBlocking blocking(k, m, n, num_threads); bm = blocking.mc(); bn = blocking.nc(); bk = blocking.kc(); } else { - internal::TensorContractionBlocking blocking(k, m, n, num_threads); bm = blocking.mc(); @@ -228,26 +255,6 @@ struct TensorEvaluatorm_device.numThreadsInPool(); - - // With small number of threads we want to make sure that we do not reduce - // parallelism too much. With large number of threads we trade maximum - // parallelism for better memory locality. - const float oversharding_factor = - num_worker_threads <= 4 ? 8.0 : - num_worker_threads <= 8 ? 4.0 : - num_worker_threads <= 16 ? 2.0 : - num_worker_threads <= 32 ? 1.0 : - num_worker_threads <= 64 ? 0.8 : /* num_worker_threads > 64 */ 0.6; - - const bool parallelize_by_sharding_dim_only = - sharding_dim_tasks >= oversharding_factor * num_worker_threads; - // Last by not least, decide whether we want to issue both lhs and rhs // packing in parallel; or issue lhs packing first, and then issue rhs // packing when lhs packing completes (for !shard_by_col lhs and rhs are @@ -263,139 +270,40 @@ struct TensorEvaluatorm_leftImpl, this->m_left_nocontract_strides, + this->m_i_strides, this->m_left_contracting_strides, + this->m_k_strides); - } else { -#define CONTEXT_ARGS \ - (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, \ - nn0, shard_by_col, parallel_pack, parallelize_by_sharding_dim_only, \ - std::move(done)) - TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback, - Alignment, CONTEXT_ARGS, run()); -#undef CONTEXT_ARGS - } + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, + this->m_j_strides, this->m_right_contracting_strides, + this->m_k_strides); + + Context(this->m_device, num_threads, lhs, rhs, buffer, m, n, + k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, + shard_by_col, parallel_pack) + .run(); } - // ------------------------------------------------------------------------ // - - // Dummy struct to represent an empty DoneCallback. - - struct NoCallback { - void operator()() { - eigen_assert(false && "NoCallback should never be called"); - } - }; - - // ------------------------------------------------------------------------ // - - template - class EvalParallelNotification; - - // Synchronous evaluation notification that blocks caller thread in Wait(). - template - class EvalParallelNotification { + // Context coordinates a single parallel gemm operation. + template + class Context { public: - EvalParallelNotification(Context*, NoCallback) {} - void Notify() { done_.Notify(); } - void Wait() { done_.Wait(); } - private: - Eigen::Notification done_; - }; - - // Asynchronous evaluation notification that does not block in Wait(). - template - class EvalParallelNotification { - public: - EvalParallelNotification(Context* ctx, DoneCallback done) - : ctx_(ctx), done_(std::move(done)) {} - - void Notify() { - // Make a copy of done callback, because it will be destructed when we - // will delete context in the next line (EvalParallelNotification is a - // data member of EvalParallelContext class). - DoneCallback done_copy = std::move(done_); - - // Delete parallel evaluation context. - delete ctx_; - - // Now safely call the done callback. - done_copy(); - } - - void Wait() {} - - private: - Context* ctx_; - DoneCallback done_; - }; - - // Context orchestrates sync/async parallel contraction evaluation. When it is - // executed in asynchronous mode, it owns all the shared state that might be - // accessible by block packing and kernel tasks. - - template - class EvalParallelContext { - public: - typedef internal::TensorContractionInputMapper< - LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, - contract_t, internal::packet_traits::size, - lhs_inner_dim_contiguous, false, Unaligned> - LhsMapper; - typedef internal::TensorContractionInputMapper< - RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, - contract_t, internal::packet_traits::size, - rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> - RhsMapper; - - typedef internal::blas_data_mapper OutputMapper; - - typedef internal::TensorContractionKernel< - Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper> - TensorContractionKernel; - - typedef typename TensorContractionKernel::LhsBlock LhsBlock; - typedef typename TensorContractionKernel::RhsBlock RhsBlock; - typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle; - - EvalParallelContext(const Self* self, int num_threads, Scalar* buffer, - Index tm, Index tn, Index tk, Index bm, Index bn, - Index bk, Index nm, Index nn, Index nk, Index gm, - Index gn, Index nm0, Index nn0, bool shard_by_col, - bool parallel_pack, - bool parallelize_by_sharding_dim_only, - DoneCallback done) - : created_by_thread_id_(std::this_thread::get_id()), - done_(this, std::move(done)), - device_(self->m_device), - lhs_(self->m_leftImpl, self->m_left_nocontract_strides, - self->m_i_strides, self->m_left_contracting_strides, - self->m_k_strides), - rhs_(self->m_rightImpl, self->m_right_nocontract_strides, - self->m_j_strides, self->m_right_contracting_strides, - self->m_k_strides), + Context(const Device& device, int num_threads, LhsMapper& lhs, + RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, + Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, + Index gn, Index nm0, Index nn0, bool shard_by_col, + bool parallel_pack) + : device_(device), + lhs_(lhs), + rhs_(rhs), buffer_(buffer), output_(buffer, tm), - output_kernel_(self->m_output_kernel), - tensor_contraction_params_(self->m_tensor_contraction_params), num_threads_(num_threads), shard_by_col_(shard_by_col), parallel_pack_(parallel_pack), - parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only), m_(tm), n_(tn), k_(tk), @@ -408,29 +316,13 @@ struct TensorEvaluator(nk_, P - 1), // - packed_lhs_, packed_rhs_); - - if (parallelize_by_sharding_dim_only_) { - const int num_worker_threads = device_.numThreadsInPool(); - - if (shard_by_col) { - can_use_thread_local_packed_ = new std::atomic[nn_]; - for (int i = 0; i < nn_; ++i) - can_use_thread_local_packed_[i].store(true, - std::memory_order_relaxed); - - Index num_blocks = num_worker_threads * gn_; - thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // - device_, // - /*num_lhs=*/0, // - /*num_rhs=*/num_blocks, // - /*num_slices=*/1, // - /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_); - - } else { - can_use_thread_local_packed_ = new std::atomic[nm_]; - for (int i = 0; i < nm_; ++i) - can_use_thread_local_packed_[i].store(true, - std::memory_order_relaxed); - - Index num_blocks = num_worker_threads * gm_; - thread_local_pre_alocated_mem_ = kernel_.allocateSlices( // - device_, // - /*num_lhs=*/num_blocks, // - /*num_rhs=*/0, // - /*num_slices=*/1, &lhs_thread_local_pre_allocated_, // - /*rhs_blocks=*/nullptr); + size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + size_t lhs_size = + divup(bm_ * bk_ * sizeof(LhsScalar), align) * align; + size_t rhs_size = + divup(bn_ * bk_ * sizeof(RhsScalar), align) * align; + packed_mem_ = static_cast(internal::aligned_malloc( + (nm0_ * lhs_size + nn0_ * rhs_size) * std::min(nk_, P - 1))); + char* mem = static_cast(packed_mem_); + for (Index x = 0; x < numext::mini(nk_, P - 1); x++) { + packed_lhs_[x].resize(nm0_); + for (Index m = 0; m < nm0_; m++) { + packed_lhs_[x][m] = reinterpret_cast(mem); + mem += lhs_size; + } + packed_rhs_[x].resize(nn0_); + for (Index n = 0; n < nn0_; n++) { + packed_rhs_[x][n] = reinterpret_cast(mem); + mem += rhs_size; } } } - ~EvalParallelContext() { + ~Context() { for (Index x = 0; x < P; x++) { for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; delete[] state_kernel_[x]; } - kernel_.deallocate(device_, packed_mem_); - if (parallelize_by_sharding_dim_only_) { - kernel_.deallocate(device_, thread_local_pre_alocated_mem_); - delete[] can_use_thread_local_packed_; - } + internal::aligned_free(packed_mem_); } void run() { // Kick off packing of the first slice. signal_switch(0, 1); - // Wait for overall completion. - // - // If parallel evaluation is executed in async mode, this is a no-op, and - // Wait() will return immediately. In synchronous mode it will block the - // caller thread until it will receive notification from last task. - // - // In async mode, last task when completed will call done callback from - // the same thread, and will delete this context. - // - // TODO(dvyukov): This wait can lead to deadlock if contraction is - // evaluated in synchronous mode. If nthreads contractions are - // concurrently submitted from worker threads, this wait will block all - // worker threads and the system will deadlock. + // TODO(dvyukov): this wait can lead to deadlock. + // If nthreads contractions are concurrently submitted from worker + // threads, this wait will block all worker threads and the system will + // deadlock. done_.Wait(); } private: - std::thread::id created_by_thread_id_; - - // This notification is specialized on the type of DoneCallback and can be - // blocking or non-blocking. - EvalParallelNotification done_; - + Notification done_; const Device& device_; - LhsMapper lhs_; - RhsMapper rhs_; + LhsMapper& lhs_; + RhsMapper& rhs_; Scalar* const buffer_; OutputMapper output_; - OutputKernelType output_kernel_; - TensorContractionParams tensor_contraction_params_; const int num_threads_; const bool shard_by_col_; const bool parallel_pack_; - const bool parallelize_by_sharding_dim_only_; // Matrix sizes. const Index m_; const Index n_; @@ -562,8 +414,6 @@ struct TensorEvaluator packed_lhs_[P - 1]; - std::vector packed_rhs_[P - 1]; - - // If we choose to parallelize only by the sharding dimension, each thread - // will have it's own "thead local" (not a c++ thread local storage) memory - // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory - // can't be passed to a kernel that might execute on a different thread. - // - // In practice when we are ready to pack memory for the sharding dimension - // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice - // already computed (99% of the time), and we can pack data into the thread - // local storage, and guarantee that all the kernels will be executed - // immediately in the same thread. This significantly increases L1 cache hit - // ratio and reduces pressure on the memory bus. - // - // It's still possible that kernel for the K-th slice will be ready before - // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_ - // and packed_rhs_ to allow kernels to be executed later on a thread - // different from the thread that was used for packing. - - // Handle for pre-allocated thread local memory buffers. - BlockMemHandle thread_local_pre_alocated_mem_; - - // Only one of these will be initialized depending on shard_by_col value - // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`). - std::vector lhs_thread_local_pre_allocated_; - std::vector rhs_thread_local_pre_allocated_; - - // How many thread local blocks were already allocated. - std::atomic num_thread_local_allocations_; - const int thread_local_capacity; - - // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of - // unique threads in a system is below or equal to the number of threads in - // a thread pool. We will fallback on dynamic memory allocation after that. - - // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its - // size is equal to the grain size in Lhs/Rhs sharding dimension. - template - class ThreadLocalBlocks { - public: - ThreadLocalBlocks() = default; - - ThreadLocalBlocks(BlockType* base, size_t grain_size) - : is_pre_allocated_(true), - thread_local_pre_allocated_base_(base), - grain_size_(grain_size) {} - - ThreadLocalBlocks(BlockMemHandle mem_handle, - std::vector blocks) - : is_pre_allocated_(false), - mem_handle_(std::move(mem_handle)), - blocks_(std::move(blocks)) {} - - BlockType& block(int grain_index) { - eigen_assert(grain_index >= 0); - eigen_assert(static_cast(grain_index) < size()); - return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index] - : blocks_[grain_index]; - } - - void Release(EvalParallelContext& ctx) const { - if (!is_pre_allocated_) { - ctx.kernel_.deallocate(ctx.device_, mem_handle_); - } - } - - size_t size() const { - return is_pre_allocated_ ? grain_size_ : blocks_.size(); - } - - private: - bool is_pre_allocated_; - - // Reuse pre-allocated thread local buffers. - BlockType* thread_local_pre_allocated_base_ = nullptr; - size_t grain_size_ = 0; - - // These will be initialized only if `is_pre_allocated == false`. - BlockMemHandle mem_handle_{}; - std::vector blocks_; - }; - - // ThreadLocalBlocksInitialize callable does custom thread local blocks - // initialization, and will reuse pre-allocated buffers if possible, or will - // dynamically allocate new memory. - // - // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly - // for what side do we plan to do block allocation. - template - class ThreadLocalBlocksInitialize { - static constexpr bool kIsLhs = - !is_rhs && std::is_same::value; - static const bool kIsRhs = - is_rhs && std::is_same::value; - static_assert(kIsLhs || kIsRhs, "Unkown block type"); - - using Blocks = ThreadLocalBlocks; - - public: - ThreadLocalBlocksInitialize(EvalParallelContext& ctx) - : ctx_(ctx), - num_worker_threads_(ctx_.device_.numThreadsInPool()) {} - - void operator()(Blocks& blocks) { - const int n = ctx_.num_thread_local_allocations_.fetch_add( - 1, std::memory_order_relaxed); - - if (n >= num_worker_threads_) { - ThreadLocalBlocksAllocator::allocate(ctx_, blocks); - } else { - ThreadLocalBlocksAllocator::reuse(ctx_, n, blocks); - } - } - - private: - // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to - // TensorContractionKernel::allocateSlices into template specializations. - // Also explicit specializations are not allowed at class scope in C++03, - // EvalCtx type parameter is just a workaround for that limitation. - template - struct ThreadLocalBlocksAllocator; - - template - struct ThreadLocalBlocksAllocator { - static void allocate(EvalCtx& ctx, Blocks& blocks) { - std::vector rhs_blocks; - BlockMemHandle mem_handle = ctx.kernel_.allocateSlices( - ctx.device_, - /*num_lhs=*/0, - /*num_rhs=*/ctx.gn_, - /*num_slices=*/1, - /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks); - - blocks = ThreadLocalBlocks(std::move(mem_handle), - std::move(rhs_blocks)); - } - - static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { - RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index]; - blocks = ThreadLocalBlocks(ptr, ctx.gn_); - } - }; - - template - struct ThreadLocalBlocksAllocator { - static void allocate(EvalCtx& ctx, Blocks& blocks) { - std::vector lhs_blocks; - BlockMemHandle mem_handle = ctx.kernel_.allocateSlices( - ctx.device_, - /*num_lhs=*/ctx.gm_, - /*num_rhs=*/0, - /*num_slices=*/1, - /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr); - - blocks = ThreadLocalBlocks(std::move(mem_handle), - std::move(lhs_blocks)); - } - - static void reuse(EvalCtx& ctx, int index, Blocks& blocks) { - LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index]; - blocks = ThreadLocalBlocks(ptr, ctx.gm_); - } - }; - - EvalParallelContext& ctx_; - const int num_worker_threads_; - }; - - template - class ThreadLocalBlocksRelease { - public: - using Blocks = ThreadLocalBlocks; - ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {} - void operator()(Blocks& blocks) { blocks.Release(ctx_); } - - private: - EvalParallelContext& ctx_; - }; - - // ThreadLocalBlocks initialization callables. - using ThreadLocalLhsInit = - ThreadLocalBlocksInitialize; - using ThreadLocalRhsInit = - ThreadLocalBlocksInitialize; - - // ThreadLocalBlocks release callables. - using ThreadLocalLhsRelease = ThreadLocalBlocksRelease; - using ThreadLocalRhsRelease = ThreadLocalBlocksRelease; - - // Thread local containers for Lhs/Rhs block packs. In practice only one of - // them will be used, depending on the shard_by_col value. - Eigen::ThreadLocal, ThreadLocalLhsInit, - ThreadLocalLhsRelease> - lhs_thread_local_blocks_; - Eigen::ThreadLocal, ThreadLocalRhsInit, - ThreadLocalRhsRelease> - rhs_thread_local_blocks_; - - // After a particular shard for Kth slice missed thread local execution - // opportunity (K-1 slice didn't complete kernels execution), we can no - // longer schedule K+1 and following slices in thread local mode, because - // there is no more guarantee that previous kernels were executed - // sequentially in the same thread (size is nn_ or nm_). - std::atomic* can_use_thread_local_packed_; - + void* packed_mem_; + std::vector packed_lhs_[P - 1]; + std::vector packed_rhs_[P - 1]; std::atomic** state_kernel_[P]; // state_switch_ is frequently modified by worker threads, while other // fields are read-only after constructor. Let's move it to a separate cache @@ -817,168 +461,69 @@ struct TensorEvaluator state_packing_ready_[P]; std::atomic state_switch_[P]; - LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) { - if (use_thread_local) { - eigen_assert(!shard_by_col_); - ThreadLocalBlocks& blocks = lhs_thread_local_blocks_.local(); - - Index grain_index = m1 - m * gm_; - return blocks.block(internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? - } else { - return packed_lhs_[k % (P - 1)][m1]; - } - } - - RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) { - if (use_thread_local) { - eigen_assert(shard_by_col_); - ThreadLocalBlocks& blocks = rhs_thread_local_blocks_.local(); - - Index grain_index = n1 - n * gn_; - return blocks.block(internal::convert_index(grain_index)); // FIXME better make ThreadLocalBlocks use Eigen::Index? - } else { - return packed_rhs_[k % (P - 1)][n1]; - } - } - - // In following two methods (pack_lhs and pack_rhs), if we know for sure - // that we'll be able to immediately call a kernel with packed data, and do - // not submit it to the thread pool, we can use thread local memory for - // packed data. - // - // We can only reliably check it if we are running all kernels in sync mode - // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to - // run, it's guaranteed that all kernels with larger values of m (n) are - // also ready, because we execute them in the same order for all K slices. - void pack_lhs(Index m, Index k) { - bool use_thread_local = false; - - if (parallelize_by_sharding_dim_only_ && !shard_by_col_ && - can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) { - if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) { - use_thread_local = true; - } else { - // If we can't guarantee that all kernels in `k` slice will be - // executed sequentially in current thread, it's no longer safe to use - // thread local memory in following slices along the k dimensions. - eigen_assert(k > 0); - can_use_thread_local_packed_[m].store(false, - std::memory_order_relaxed); - } - } - const Index mend = m * gm_ + gm(m); for (Index m1 = m * gm_; m1 < mend; m1++) - kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local), - lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + LhsPacker()(packed_lhs_[k % (P - 1)][m1], + lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); if (!parallel_pack_ && shard_by_col_) { - assert(!use_thread_local); signal_packing(k); } else { signal_switch(k + 1); - for (Index n = nn_ - 1; n >= 0; n--) { - bool sync = parallelize_by_sharding_dim_only_ || n == 0; - signal_kernel(m, n, k, sync, use_thread_local); - } + for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); } } void pack_rhs(Index n, Index k) { - bool use_thread_local = false; - - if (parallelize_by_sharding_dim_only_ && shard_by_col_ && - can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) { - if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) { - use_thread_local = true; - } else { - // If we can't guarantee that all kernels in `k` slice will be - // executed sequentially in current thread, it's no longer safe to use - // thread local memory in followig slices along the k dimensions. - eigen_assert(k > 0); - can_use_thread_local_packed_[n].store(false, - std::memory_order_relaxed); - } - } - const Index nend = n * gn_ + gn(n); for (Index n1 = n * gn_; n1 < nend; n1++) { - if (!TensorContractionKernel::HasBeta && k == 0) { - // Zero the output memory in parallel, only if contraction kernel does - // not support `beta`. Otherwise we will pass beta 0.0 to the first - // call to the `TensorContractionKernel::invoke()`. - // - // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn - // x m) row. Safe to do here because all kernels that will write to - // this memory depend on completion of this task. Note: don't call - // device_.memset() here. device_.memset() blocks on thread pool - // worker thread, which can lead to underutilization and deadlocks. + if (k == 0) { + // Zero the output memory in parallel. + // On 10000x2x10000 mm zeroing can easily take half of time. + // Zero (bn x m) row. Safe to do here because all kernels that will + // write to this memory depend on completion of this task. + // Note: don't call device_.memset() here. device_.memset() blocks on + // thread pool worker thread, which can lead to underutilization and + // deadlocks. memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); } - kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local), - rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + RhsPacker()(packed_rhs_[k % (P - 1)][n1], + rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); } if (parallel_pack_ || shard_by_col_) { signal_switch(k + 1); - for (Index m = nm_ - 1; m >= 0; m--) { - bool sync = parallelize_by_sharding_dim_only_ || m == 0; - signal_kernel(m, n, k, sync, use_thread_local); - } + for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); } else { - assert(!use_thread_local); signal_packing(k); } } - void kernel(Index m, Index n, Index k, bool use_thread_local) { + void kernel(Index m, Index n, Index k) { // Note: order of iteration matters here. Iteration over m is innermost - // because we want to reuse the same packed rhs in consecutive tasks + // because we want to reuse the same packed rhs in consequetive tasks // (rhs fits into L2$ while lhs only into L3$). const Index nend = n * gn_ + gn(n); const Index mend = m * gm_ + gm(m); - - // NOTE: output = alpha * LHS * RHS + beta * output. - const Scalar alpha = Scalar(1); - const Scalar beta = - (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1); - if (shard_by_col_) { for (Index n1 = n * gn_; n1 < nend; n1++) { - for (Index m1 = m * gm_; m1 < mend; m1++) { - const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); - kernel_.invoke( - output_mapper, - packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), - packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), - bk(k), bn(n1), alpha, beta); - - // We are done with the last task for the [m1, n1] block. - if (k + 1 == nk_) { - output_kernel_(output_mapper, tensor_contraction_params_, - m1 * bm_, n1 * bn_, bm(m1), bn(n1)); - } - } + for (Index m1 = m * gm_; m1 < mend; m1++) + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); } } else { for (Index m1 = m * gm_; m1 < mend; m1++) for (Index n1 = n * gn_; n1 < nend; n1++) { - const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_); - kernel_.invoke( - output_mapper, - packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local), - packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), - bk(k), bn(n1), alpha, beta); - - // We are done with the last task for the [m1, n1] block. - if (k + 1 == nk_) { - output_kernel_(output_mapper, tensor_contraction_params_, - m1 * bm_, n1 * bn_, bm(m1), bn(n1)); - } + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); } } - signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false); + signal_kernel(m, n, k + 1, false); signal_switch(k + 2); } @@ -991,23 +536,16 @@ struct TensorEvaluator* state = &state_kernel_[k % P][m][n]; Index s = state->load(); eigen_assert(s > 0); - if (s != 1 && state->fetch_sub(1) != 1) { - eigen_assert(!use_thread_local); - return; - } + if (s != 1 && state->fetch_sub(1) != 1) return; state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); - if (sync) { - kernel(m, n, k, use_thread_local); - } else { - eigen_assert(!use_thread_local); - device_.enqueueNoNotification( - [=]() { kernel(m, n, k, use_thread_local); }); - } + if (sync) + kernel(m, n, k); + else + device_.enqueueNoNotification([=]() { kernel(m, n, k); }); } void signal_switch(Index k, Index v = 1) { @@ -1057,32 +595,11 @@ struct TensorEvaluator 1) { - Index mid = (start + end) / 2; - device_.enqueueNoNotification( - [=]() { enqueue_packing_helper(mid, end, k, rhs); }); - end = mid; - } - - // Decide if we want to run first packing task (start == 0) in - // async mode if we parallelize only by sharding dim: - // (1) pack_lhs and pack_rhs call signal_switch before completing - // all calls to signal_kernel, which in sync mode might lead - // to the execution of the first kernel of the k+1 slice, before - // completing a call to the last kernel of the k slice. - // (2) all pack tasks for sharded dim must be executed in a thread - // pool to get pre-allocated thead local buffers. - bool pack_async = - (start == 0) && - (parallelize_by_sharding_dim_only_&& shard_by_col_ == rhs) && - (k > 0 || std::this_thread::get_id() == created_by_thread_id_); - - if (pack_async) { - device_.enqueueNoNotification( - [=]() { enqueue_packing_helper(start, end, k, rhs); }); - } else { - enqueue_packing_helper(start, end, k, rhs); - } + Index mid = (start + end) / 2; + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(mid, end, k, rhs); }); + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(start, mid, k, rhs); }); } } @@ -1094,364 +611,10 @@ struct TensorEvaluator - using SyncEvalParallelContext = - EvalParallelContext; - - // ------------------------------------------------------------------------ // - - // EvalShardedByInnerDimContext orchestrates sync/async contraction - // evaluation, when we shard by inner dimension. When it is executed in - // asynchronous mode, it owns all the shared state that might be accessible by - // block processing tasks. - - template - struct EvalShardedByInnerDimContext { - EvalShardedByInnerDimContext(const Self* self, int num_threads, - Scalar* result_buffer, - Index m_size, Index n_size, Index k_size, - DoneCallback done_callback) - : evaluator(self), - m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous), - m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous), - m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered), - result(result_buffer), - m(m_size), - n(n_size), - k(k_size), - done(std::move(done_callback)), - buffer_size_bytes(m * n * sizeof(Scalar)), - block_size(blockSize(k, num_threads)), - num_blocks(divup(k, block_size)), - num_pending_blocks(internal::convert_index(num_blocks)), - l0_ranges(divup(num_blocks, l0_size)), - l0_state(l0_ranges), - block_buffers(num_blocks) { - // Keep count of pending gemm tasks for each l0 range. - for (int i = 0; i < l0_ranges; ++i) { - const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i); - l0_state.emplace_back(internal::convert_index(num_pending_tasks)); - } - - // Allocate temporary buffers for each block. - for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) { - Scalar* buf = block_idx == 0 - ? result - : static_cast(evaluator->m_device.allocate( - buffer_size_bytes)); - block_buffers.emplace_back(buf); - } - } - - ~EvalShardedByInnerDimContext() { - for (Index i = 1; i < num_blocks; ++i) { - evaluator->m_device.deallocate(block_buffers[i]); - } - } - - template - void run() { - Barrier barrier(internal::convert_index(num_blocks)); - eval(barrier, 0, num_blocks); - barrier.Wait(); - - // Aggregate partial sums from l0 ranges. - aggregateL0Blocks(); - - // Apply output kernel. - applyOutputKernel(); - } - - template - void runAsync() { - evalAsync(0, num_blocks); - } - - private: - // The underlying GEMM kernel assumes that k is a multiple of - // the packet size and subtle breakage occurs if this is violated. - static const Index packet_size = internal::packet_traits::size; - - const Self* evaluator; // TensorContraction evaluator - - // These fields required fromTENSOR_CONTRACTION_DISPATCH macro. - bool m_lhs_inner_dim_contiguous; - bool m_rhs_inner_dim_contiguous; - bool m_rhs_inner_dim_reordered; - - Scalar* result; - - Index m; - Index n; - Index k; - - DoneCallback done; - - // ----------------------------------------------------------------------// - // Algorithm parameters. - - // We will compute partial results into the buffers of this size. - Index buffer_size_bytes; - - Index block_size; - Index num_blocks; - - // Keep track of pending tasks when evaluate in async mode. - std::atomic num_pending_blocks; - - // We compute partial gemm results in parallel, and to get the final result - // we need to add them all together. For the large number of threads (>= 48) - // this adds a very expensive sequential step at the end. - // - // We split the [0, num_blocks) into small ranges, and when a task for the - // block finishes its partial gemm computation, it checks if it was the last - // gemm in the range, and if so, it will add all blocks of the range. - // - // After all tasks done, we need to add only these pre-aggregated blocks. - - // For now we use just a single level of ranges to compute pre-aggregated - // partial sums, but in general we can use more layers to compute tree - // aggregation in parallel and reduce the size of the sequential step. - // - // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make - // sense only if number of threads >= ~128? - static const Index l0_size = 4; - Index l0_ranges; - - // Keep count of pending gemm tasks for each l0 range. - MaxSizeVector> l0_state; // [0, l0_ranges) - - // Buffers allocated for each temporary block computation. - MaxSizeVector block_buffers; // [0, num_blocks) - - template - void processBlock(Index block_idx, Index begin, Index end) { - Scalar* buf = block_buffers[block_idx]; - - TENSOR_CONTRACTION_DISPATCH( - evaluator->template evalGemmPartialWithoutOutputKernel, Alignment, - (buf, begin, end, - /*num_threads=*/internal::convert_index(num_blocks))); - - // Check if it was the last task in l0 range. - const Index l0_index = block_idx / l0_size; - const int v = l0_state[l0_index].fetch_sub(1); - eigen_assert(v >= 1); - - // If we processed the last block of the range, we can aggregate all - // partial results into the first block of the range. - if (v == 1) { - const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index); - const Index dst_block_idx = l0_index * l0_size; - - if (rng_size == l0_size) { - addAllToBuffer( - m * n, - /*src_buf0=*/block_buffers[dst_block_idx + 1], - /*src_buf1=*/block_buffers[dst_block_idx + 2], - /*src_buf2=*/block_buffers[dst_block_idx + 3], - /*dst_buf= */ block_buffers[dst_block_idx]); - } else { - // Aggregate blocks of potentially incomplete last range. - for (int i = 1; i < rng_size; ++i) { - addToBuffer(m * n, - /*src_buf=*/block_buffers[dst_block_idx + i], - /*dst_buf=*/block_buffers[dst_block_idx]); - } - } - } - } - - // Aggregate partial sums from l0 ranges. - template - void aggregateL0Blocks() const { - Index l0_index = 1; - - for (; l0_index + 2 < l0_ranges; l0_index += 3) { - addAllToBuffer( - m * n, - /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size], - /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size], - /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size], - /*dst_buf= */ block_buffers[0]); - } - - for (; l0_index < l0_ranges; ++l0_index) { - addToBuffer(m * n, block_buffers[l0_index * l0_size], - block_buffers[0]); - } - } - - void applyOutputKernel() const { - typedef internal::blas_data_mapper OutputMapper; - evaluator->m_output_kernel( - OutputMapper(result, m), evaluator->m_tensor_contraction_params, - static_cast(0), static_cast(0), m, n); - } - - // Compute block size with accounting for potentially incomplete last block. - Index actualBlockSize(Index block_idx) const { - return block_idx + 1 < num_blocks - ? block_size - : k + block_size - block_size * num_blocks; - }; - - // Compute range size with accounting for potentially incomplete last range. - Index actualRangeSize(Index num_ranges, Index range_size, - Index range_idx) const { - eigen_assert(range_idx < num_ranges); - return range_idx + 1 < num_ranges - ? range_size - : num_blocks + range_size - range_size * num_ranges; - }; - - template - EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf, - Scalar* tgt_buf) { - const int output_packet_size = - internal::unpacket_traits::size; - size_t i = 0; - const size_t num_packets = n / output_packet_size; - for (; i < output_packet_size * num_packets; i += output_packet_size) { - const PacketReturnType src_val = - internal::pload(src_buf + i); - const PacketReturnType tgt_val = - internal::ploadt(tgt_buf + i); - const PacketReturnType sum = internal::padd(src_val, tgt_val); - internal::pstoret(tgt_buf + i, - sum); - } - for (; i < n; ++i) { - tgt_buf[i] += src_buf[i]; - } - } - - template - EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n, - const Scalar* src_buf0, - const Scalar* src_buf1, - const Scalar* src_buf2, - Scalar* dst_buf) { - using ::Eigen::internal::padd; - using ::Eigen::internal::pload; - using ::Eigen::internal::ploadt; - using ::Eigen::internal::pstoret; - - const int output_packet_size = - internal::unpacket_traits::size; - - size_t i = 0; - const size_t num_packets = n / output_packet_size; - for (; i < output_packet_size * num_packets; i += output_packet_size) { - const auto src_val0 = pload(src_buf0 + i); - const auto src_val1 = pload(src_buf1 + i); - const auto src_val2 = pload(src_buf2 + i); - - const auto dst_val = ploadt(dst_buf + i); - const auto sum = - padd(padd(dst_val, src_val0), padd(src_val1, src_val2)); - - pstoret(dst_buf + i, sum); - } - for (; i < n; ++i) { - dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i]; - } - } - - template - void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) { - while (end_block_idx - start_block_idx > 1) { - Index mid_block_idx = (start_block_idx + end_block_idx) / 2; - evaluator->m_device.enqueueNoNotification( - [this, &barrier, mid_block_idx, end_block_idx]() { - eval(barrier, mid_block_idx, end_block_idx); - }); - end_block_idx = mid_block_idx; - } - - Index block_idx = start_block_idx; - Index block_start = block_idx * block_size; - Index block_end = block_start + actualBlockSize(block_idx); - - processBlock(block_idx, block_start, block_end); - barrier.Notify(); - } - - template - void evalAsync(Index start_block_idx, Index end_block_idx) { - while (end_block_idx - start_block_idx > 1) { - Index mid_block_idx = (start_block_idx + end_block_idx) / 2; - evaluator->m_device.enqueueNoNotification( - [this, mid_block_idx, end_block_idx]() { - evalAsync(mid_block_idx, end_block_idx); - }); - end_block_idx = mid_block_idx; - } - - Index block_idx = start_block_idx; - - Index block_start = block_idx * block_size; - Index block_end = block_start + actualBlockSize(block_idx); - - processBlock(block_idx, block_start, block_end); - - int v = num_pending_blocks.fetch_sub(1); - eigen_assert(v >= 1); - - if (v == 1) { - // Aggregate partial sums from l0 ranges. - aggregateL0Blocks(); - - // Apply output kernel. - applyOutputKernel(); - - // NOTE: If we call `done` callback before deleting this (context), - // it might deallocate Self* pointer captured by context, and we'll - // fail in destructor trying to deallocate temporary buffers. - - // Move done call back from context before it will be destructed. - DoneCallback done_copy = std::move(done); - - // We are confident that we are the last one who touches context. - delete this; - - // Now safely call the done callback. - done_copy(); - } - } - - // Cost model doesn't capture well the cost associated with constructing - // tensor contraction mappers and computing loop bounds in gemm_pack_lhs - // and gemm_pack_rhs, so we specify minimum desired block size. - static Index blockSize(Index k, int num_threads) { - const auto round_up = [=](Index index) -> Index { - const Index kmultiple = packet_size <= 8 ? 8 : packet_size; - return divup(index, kmultiple) * kmultiple; - }; - - const Index target_block_size = round_up(divup(k, num_threads)); - const Index desired_min_block_size = 12 * packet_size; - - return numext::mini( - k, numext::maxi(desired_min_block_size, target_block_size)); - } - - EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete; - void operator=(const EvalShardedByInnerDimContext&) = delete; - }; - - // ------------------------------------------------------------------------ // - - // Below are the function used by evalProductImpl heuristics, trying to select - // optimcal parameters for parallelization algorithm. - // Decide whether we want to shard m x n contraction by columns or by rows. static bool shardByCol(Index m, Index n, Index num_threads) { // Note: we are comparing both n and m against Traits::nr, it is not @@ -1555,15 +718,304 @@ struct TensorEvaluator + void evalProduct(Scalar* buffer) const { + if (this->m_j_size == 1) { + this->template evalGemv(buffer); + return; + } + + evalGemm(buffer); + } + + template + void evalGemm(Scalar* buffer) const { + // columns in left side, rows in right side + const Index k = this->m_k_size; + + // rows in left side + const Index m = this->m_i_size; + + // columns in right side + const Index n = this->m_j_size; + + // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar) + this->m_device.memset(buffer, 0, m * n * sizeof(Scalar)); + + + const int lhs_packet_size = internal::unpacket_traits::size; + const int rhs_packet_size = internal::unpacket_traits::size; + + typedef internal::TensorContractionInputMapper LhsMapper; + + typedef internal::TensorContractionInputMapper RhsMapper; + + typedef internal::blas_data_mapper OutputMapper; + + // TODO: packing could be faster sometimes if we supported row major tensor mappers + typedef internal::gemm_pack_lhs LhsPacker; + typedef internal::gemm_pack_rhs RhsPacker; + + // TODO: replace false, false with conjugate values? + typedef internal::gebp_kernel GebpKernel; + + typedef internal::packLhsArg packLArg; + typedef internal::packRhsAndKernelArg packRKArg; + + // initialize data mappers + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides, + this->m_left_contracting_strides, this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides, + this->m_right_contracting_strides, this->m_k_strides); + + OutputMapper output(buffer, m); + + // compute block sizes (which depend on number of threads) + const Index num_threads = this->m_device.numThreads(); + internal::TensorContractionBlocking blocking(k, m, n, num_threads); + Index mc = blocking.mc(); + Index nc = blocking.nc(); + Index kc = blocking.kc(); + eigen_assert(mc <= m); + eigen_assert(nc <= n); + eigen_assert(kc <= k); + +#define CEIL_DIV(a, b) (((a) + (b) - 1) / (b)) + const Index k_blocks = CEIL_DIV(k, kc); + const Index n_blocks = CEIL_DIV(n, nc); + const Index m_blocks = CEIL_DIV(m, mc); + const Index sizeA = mc * kc; + const Index sizeB = kc * nc; + + /* cout << "m: " << m << " n: " << n << " k: " << k << endl; + cout << "mc: " << mc << " nc: " << nc << " kc: " << kc << endl; + cout << "m_blocks: " << m_blocks << " n_blocks: " << n_blocks << " k_blocks: " << k_blocks << endl; + cout << "num threads: " << num_threads << endl; + */ + + // note: m_device.allocate should return 16 byte aligned pointers, but if blockA and blockB + // aren't 16 byte aligned segfaults will happen due to SIMD instructions + // note: You can get away with allocating just a single blockA and offsets and meet the + // the alignment requirements with the assumption that + // (Traits::mr * sizeof(ResScalar)) % 16 == 0 + const Index numBlockAs = numext::mini(num_threads, m_blocks); + MaxSizeVector blockAs(num_threads); + for (int i = 0; i < num_threads; i++) { + blockAs.push_back(static_cast(this->m_device.allocate(sizeA * sizeof(LhsScalar)))); + } + + // To circumvent alignment issues, I'm just going to separately allocate the memory for each thread + // TODO: is this too much memory to allocate? This simplifies coding a lot, but is wasteful. + // Other options: (1) reuse memory when a thread finishes. con: tricky + // (2) allocate block B memory in each thread. con: overhead + MaxSizeVector blockBs(n_blocks); + for (int i = 0; i < n_blocks; i++) { + blockBs.push_back(static_cast(this->m_device.allocate(sizeB * sizeof(RhsScalar)))); + } + + // lhs_notifications starts with all null Notifications + MaxSizeVector lhs_notifications(num_threads, nullptr); + + // this should really be numBlockAs * n_blocks; + const Index num_kernel_notifications = num_threads * n_blocks; + MaxSizeVector kernel_notifications(num_kernel_notifications, + nullptr); + + for (Index k_block_idx = 0; k_block_idx < k_blocks; k_block_idx++) { + const Index k_start = k_block_idx * kc; + // make sure we don't overshoot right edge of left matrix + const Index actual_kc = numext::mini(k_start + kc, k) - k_start; + + for (Index m_block_idx = 0; m_block_idx < m_blocks; m_block_idx += numBlockAs) { + const Index num_blocks = numext::mini(m_blocks-m_block_idx, numBlockAs); + + for (Index mt_block_idx = m_block_idx; mt_block_idx < m_block_idx+num_blocks; mt_block_idx++) { + const Index m_start = mt_block_idx * mc; + const Index actual_mc = numext::mini(m_start + mc, m) - m_start; + eigen_assert(actual_mc > 0); + + Index blockAId = (k_block_idx * m_blocks + mt_block_idx) % num_threads; + + for (int i = 0; i < n_blocks; ++i) { + Index notification_id = (blockAId * n_blocks + i); + // Wait for any current kernels using this slot to complete + // before using it. + if (kernel_notifications[notification_id]) { + wait_until_ready(kernel_notifications[notification_id]); + delete kernel_notifications[notification_id]; + } + kernel_notifications[notification_id] = new Notification(); + } + const packLArg arg = { + blockAs[blockAId], // blockA + lhs, // lhs + m_start, // m + k_start, // k + actual_mc, // mc + actual_kc, // kc + }; + + // Delete any existing notification since we may be + // replacing it. The algorithm should ensure that there are + // no existing waiters on this notification. + delete lhs_notifications[blockAId]; + lhs_notifications[blockAId] = + this->m_device.enqueue(&Self::packLhs, arg); + } + + // now start kernels. + const Index m_base_start = m_block_idx * mc; + const bool need_to_pack = m_block_idx == 0; + + for (Index n_block_idx = 0; n_block_idx < n_blocks; n_block_idx++) { + const Index n_start = n_block_idx * nc; + const Index actual_nc = numext::mini(n_start + nc, n) - n_start; + + // first make sure the previous kernels are all done before overwriting rhs. Also wait if + // we're going to start new k. In both cases need_to_pack is true. + if (need_to_pack) { + for (Index i = num_blocks; i < num_threads; ++i) { + Index blockAId = (k_block_idx * m_blocks + i + m_block_idx) % num_threads; + Index future_id = (blockAId * n_blocks + n_block_idx); + wait_until_ready(kernel_notifications[future_id]); + } + } + + packRKArg arg = { + &blockAs, // blockA + blockBs[n_block_idx], // blockB + rhs, // rhs + output, // output + m_base_start, // m + k_start, // k + n_start, // n + mc, // mc + actual_kc, // kc + actual_nc, // nc + num_threads, + numBlockAs, + m, + k_block_idx, + m_block_idx, + n_block_idx, // n_block_idx + m_blocks, // m_blocks + n_blocks, // n_blocks + &kernel_notifications, // kernel notifications + &lhs_notifications, // lhs notifications + need_to_pack, // need_to_pack + }; + + // We asynchronously kick off this function, which ends up + // notifying the appropriate kernel_notifications objects, + // which this thread waits on before exiting. + this->m_device.enqueueNoNotification(&Self::packRhsAndKernel, arg); + } + } + } + + // Make sure all the kernels are done. + for (size_t i = 0; i < kernel_notifications.size(); ++i) { + wait_until_ready(kernel_notifications[i]); + delete kernel_notifications[i]; + } + + // No need to wait for lhs notifications since they should have + // already been waited on. Just clean them up. + for (size_t i = 0; i < lhs_notifications.size(); ++i) { + delete lhs_notifications[i]; + } + + // deallocate all of the memory for both A and B's + for (size_t i = 0; i < blockAs.size(); i++) { + this->m_device.deallocate(blockAs[i]); + } + for (size_t i = 0; i < blockBs.size(); i++) { + this->m_device.deallocate(blockBs[i]); + } + +#undef CEIL_DIV + } + + /* + * Packs a LHS block of size (mt, kc) starting at lhs(m, k). Before packing + * the LHS block, check that all of the kernels that worked on the same + * mt_block_idx in the previous m_block are done. + */ + template + static void packLhs(const packLArg arg) { + // perform actual packing + LhsPacker pack_lhs; + pack_lhs(arg.blockA, arg.lhs.getSubMapper(arg.m_start, arg.k_start), arg.kc, arg.mc); + } + + /* + * Packs a RHS block of size (kc, nc) starting at (k, n) after checking that + * all kernels in the previous block are done. + * Then for each LHS future, we wait on the future and then call GEBP + * on the area packed by the future (which starts at + * blockA + future_idx * mt * kc) on the LHS and with the full packed + * RHS block. + * The output of this GEBP is written to output(m + i * mt, n). + */ + template + static void packRhsAndKernel(packRKArg arg) { + if (arg.need_to_pack) { + RhsPacker pack_rhs; + pack_rhs(arg.blockB, arg.rhs.getSubMapper(arg.k, arg.n), arg.kc, arg.nc); + } + + GebpKernel gebp; + for (Index mt_block_idx = 0; mt_block_idx < arg.num_blockAs; mt_block_idx++) { + const Index m_base_start = arg.m + arg.mc*mt_block_idx; + if (m_base_start < arg.max_m) { + Index blockAId = (arg.k_block_idx * arg.m_blocks + mt_block_idx + arg.m_block_idx) % arg.num_threads; + wait_until_ready((*arg.lhs_notifications)[blockAId]); + const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; + gebp(arg.output.getSubMapper(m_base_start, arg.n), + (*arg.blockAs)[blockAId], arg.blockB, + actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); + + // Notify that the kernel is done. + const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; + (*arg.kernel_notifications)[set_idx]->Notify(); + } + } + } +#endif // EIGEN_USE_SIMPLE_THREAD_POOL + TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, bool shard_by_col, bool prepacked) const { const int packed_size = std::min(PacketType::size, PacketType::size); const int output_packet_size = internal::unpacket_traits::size; const double kd = static_cast(bk); - double compute_bandwidth = computeBandwidth(false, bm, bn, bk); + // Peak VFMA bandwidth is 0.5. However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = bk == 1 ? 4.0 : + (shard_by_col ? bn : bm) < Traits::nr || + (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such instructions, + // so overall bandwidth is 1.0. + if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif // Computations. - TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size); + TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size); // Output stores. cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); if (prepacked) { @@ -1583,94 +1035,6 @@ struct TensorEvaluator l3CacheSize() / num_threads_by_k || // need more buffer space - // than L3 cache or... - k / num_threads_by_k < 2 * Traits::nr) { // k per thread is tiny. - shard_by_k = false; - } else if (numext::maxi(m, n) / num_threads < - Traits::nr || // both other dimensions are tiny or... - // k per thread is not small and... - (k / num_threads_by_k > 8 * Traits::nr && - // one of the outer dimensions is tiny or sharding by k offers - // more parallelism. - (numext::mini(m, n) < 2 * Traits::nr || - num_threads_by_k > num_threads))) { - shard_by_k = true; - } - return shard_by_k; - } - - TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const { - // Compute cost. - const int output_packet_size = internal::unpacket_traits::size; - TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size); - // Output stores. - cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); - TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m; - TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n; - // Since the inner gemm kernel is always sharded by column, the lhs - // load cost is negligible. - lhsCost.dropMemoryCost(); - return cost + lhsCost + rhsCost; - } - - int numThreadsInnerDim(Index m, Index n, Index k) const { - const int output_packet_size = internal::unpacket_traits::size; - TensorOpCost cost = contractionCostPerInnerDim(m, n, k); - double total_parallel_cost = - TensorCostModel::totalCost(k, cost); - // Cost of reduction step accumulating the m*n per-thread buffers into the - // result. - double reduction_cost = TensorCostModel::totalCost( - m * n, TensorOpCost(2, 1, 1, true, output_packet_size)); - int num_threads = 1; - double min_cost = total_parallel_cost; - double kPerThreadOverHead = 3000; - double kFixedOverHead = 100000; - for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) { - double sequential_cost = - kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead); - double parallel_cost = total_parallel_cost / nt + sequential_cost; - if (parallel_cost < min_cost) { - num_threads = nt; - min_cost = parallel_cost; - } - } - return num_threads; - } - - double computeBandwidth(bool shard_by_col, Index bm, Index bn, - Index bk) const { - // Peak VFMA bandwidth is 0.5. However if we have not enough data for - // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined - // experimentally. - double computeBandwidth = - bk == 1 ? 4.0 - : (shard_by_col ? bn : bm) < Traits::nr || - (shard_by_col ? bm : bn) < Traits::mr - ? 2.0 - : 0.5; -#ifndef EIGEN_VECTORIZE_FMA - // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. - // However for MULPS/ADDPS we have dependent sequence of 2 such - // instructions, - // so overall bandwidth is 1.0. - if (computeBandwidth == 0.5) computeBandwidth = 1.0; -#endif - return computeBandwidth; - } - }; } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index cdbafbbb1..860a6949a 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -32,7 +32,6 @@ struct traits > static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; enum { Flags = 0 }; - typedef typename TypeConversion::PointerType>::type PointerType; }; template @@ -129,7 +128,6 @@ struct PacketConverter { typedef typename internal::unpacket_traits::type TgtType; internal::scalar_cast_op converter; EIGEN_ALIGN_MAX typename internal::unpacket_traits::type values[TgtPacketSize]; - EIGEN_UNROLL_LOOP for (int i = 0; i < TgtPacketSize; ++i) { values[i] = converter(m_impl.coeff(index+i)); } @@ -165,116 +163,19 @@ class TensorConversionOp : public TensorBase struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { +template struct ConversionSubExprEval { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { impl.evalSubExprsIfNeeded(NULL); return true; } }; -template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { +template struct ConversionSubExprEval { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { return impl.evalSubExprsIfNeeded(data); } }; -#ifdef EIGEN_USE_THREADS -template -struct ConversionSubExprEvalAsync { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run( - Eval& impl, EvalPointerType, EvalSubExprsCallback done) { - impl.evalSubExprsIfNeededAsync(nullptr, std::move(done)); - } -}; - -template -struct ConversionSubExprEvalAsync { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run( - Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { - impl.evalSubExprsIfNeededAsync(data, std::move(done)); - } -}; -#endif - -namespace internal { - -template -struct CoeffConv { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, Index index) { - internal::scalar_cast_op converter; - return converter(impl.coeff(index)); - } -}; - -template -struct CoeffConv { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator& impl, Index index) { - return impl.coeff(index); - } -}; - -template -struct PacketConv { - typedef typename internal::unpacket_traits::type SrcType; - typedef typename internal::unpacket_traits::type TargetType; - - static const int PacketSize = internal::unpacket_traits::size; - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { - internal::scalar_cast_op converter; - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - EIGEN_UNROLL_LOOP - for (int i = 0; i < PacketSize; ++i) { - values[i] = converter(impl.coeff(index+i)); - } - TargetPacket rslt = internal::pload(values); - return rslt; - } -}; - -template -struct PacketConv { - typedef typename internal::unpacket_traits::type SrcType; - typedef typename internal::unpacket_traits::type TargetType; - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { - const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; - const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; - PacketConverter, SrcPacket, TargetPacket, - SrcCoeffRatio, TgtCoeffRatio> converter(impl); - return converter.template packet(index); - } -}; - -template -struct PacketConv { - typedef typename internal::unpacket_traits::type TargetType; - static const int PacketSize = internal::unpacket_traits::size; - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index+i); - return internal::pload(values); - } -}; - -template -struct PacketConv { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator& impl, Index index) { - return impl.template packet(index); - } -}; - -} // namespace internal // Eval as rvalue template @@ -288,52 +189,15 @@ struct TensorEvaluator, Device> typedef typename internal::remove_all::Scalar>::type SrcType; typedef typename PacketType::type PacketReturnType; typedef typename PacketType::type PacketSourceType; - static const int PacketSize = PacketType::size; - static const bool IsSameType = internal::is_same::value; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, - PacketAccess = - #ifndef EIGEN_USE_SYCL - true, - #else - TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast, - #endif - BlockAccess = TensorEvaluator::BlockAccess, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess, - Layout = TensorEvaluator::Layout, - RawAccess = false + IsAligned = false, + PacketAccess = true, + Layout = TensorEvaluator::Layout, + RawAccess = false }; - static const int NumDims = internal::array_size::value; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename TensorEvaluator::TensorBlock - ArgTensorBlock; - - struct TensorConversionOpBlockFactory { - template - struct XprType { - typedef TensorConversionOp type; - }; - - template - typename XprType::type expr(const ArgXprType& expr) const { - return typename XprType::type(expr); - } - }; - - typedef internal::TensorUnaryExprBlock - TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { @@ -341,21 +205,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { - return ConversionSubExprEval, EvaluatorPointerType>::run(m_impl, data); + return ConversionSubExprEval::value, TensorEvaluator, Scalar>::run(m_impl, data); } -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType data, EvalSubExprsCallback done) { - ConversionSubExprEvalAsync, - EvaluatorPointerType, - EvalSubExprsCallback>::run(m_impl, data, std::move(done)); - } -#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); @@ -363,23 +217,16 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return internal::CoeffConv::run(m_impl,index); + internal::scalar_cast_op converter; + return converter(m_impl.coeff(index)); } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType - packet(Index index) const { - // If we are not going to do the cast, we just need to check that base - // TensorEvaluator has packet access. Otherwise we also need to make sure, - // that we have an implementation of vectorized cast. - const bool Vectorizable = - IsSameType - ? TensorEvaluator::PacketAccess - : TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast; - - return internal::PacketConv::run(m_impl, index); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + const bool Vectorizable = TensorEvaluator::PacketAccess & + internal::type_casting_traits::VectorizedCast; + return PacketConv::run(m_impl, index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -397,30 +244,33 @@ struct TensorEvaluator, Device> } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return m_impl.getResourceRequirements(); - } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - return TensorBlock(m_impl.block(desc, scratch), - TensorConversionOpBlockFactory()); - } + protected: + template + struct PacketConv { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { + internal::scalar_cast_op converter; + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index+i)); + } + PacketReturnType rslt = internal::pload(values); + return rslt; + } + }; - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + template + struct PacketConv { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator& impl, Index index) { + const int SrcCoeffRatio = internal::type_casting_traits::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits::TgtCoeffRatio; + PacketConverter, PacketSourceType, PacketReturnType, + SrcCoeffRatio, TgtCoeffRatio> converter(impl); + return converter.template packet(index); + } + }; - /// required by sycl in order to extract the sycl accessor - const TensorEvaluator& impl() const { return m_impl; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - - protected: TensorEvaluator m_impl; }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 27ad9f147..abdf742c6 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -54,8 +54,8 @@ class IndexMapper { } } - array gpuInputDimensions; - array gpuOutputDimensions; + array cudaInputDimensions; + array cudaOutputDimensions; array tmp = dimensions; array ordering; const size_t offset = static_cast(Layout) == static_cast(ColMajor) @@ -65,8 +65,8 @@ class IndexMapper { const Index index = i + offset; ordering[index] = indices[i]; tmp[indices[i]] = -1; - gpuInputDimensions[index] = input_dims[indices[i]]; - gpuOutputDimensions[index] = dimensions[indices[i]]; + cudaInputDimensions[index] = input_dims[indices[i]]; + cudaOutputDimensions[index] = dimensions[indices[i]]; } int written = static_cast(Layout) == static_cast(ColMajor) @@ -75,8 +75,8 @@ class IndexMapper { for (int i = 0; i < NumDims; ++i) { if (tmp[i] >= 0) { ordering[written] = i; - gpuInputDimensions[written] = input_dims[i]; - gpuOutputDimensions[written] = dimensions[i]; + cudaInputDimensions[written] = input_dims[i]; + cudaOutputDimensions[written] = dimensions[i]; ++written; } } @@ -89,37 +89,37 @@ class IndexMapper { if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims; ++i) { if (i > NumKernelDims) { - m_gpuInputStrides[i] = - m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1]; - m_gpuOutputStrides[i] = - m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1]; + m_cudaInputStrides[i] = + m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1]; } else { - m_gpuInputStrides[i] = 1; - m_gpuOutputStrides[i] = 1; + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; } } } else { for (int i = NumDims - 1; i >= 0; --i) { - if (static_cast(i + 1) < offset) { - m_gpuInputStrides[i] = - m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1]; - m_gpuOutputStrides[i] = - m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1]; + if (i + 1 < offset) { + m_cudaInputStrides[i] = + m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1]; + m_cudaOutputStrides[i] = + m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1]; } else { - m_gpuInputStrides[i] = 1; - m_gpuOutputStrides[i] = 1; + m_cudaInputStrides[i] = 1; + m_cudaOutputStrides[i] = 1; } } } } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_gpuInputStrides[d]; + const Index idx = p / m_cudaInputStrides[d]; inputIndex += idx * m_inputStrides[d]; - p -= idx * m_gpuInputStrides[d]; + p -= idx * m_cudaInputStrides[d]; } inputIndex += p * m_inputStrides[NumKernelDims]; } else { @@ -128,22 +128,22 @@ class IndexMapper { limit = NumDims - NumKernelDims - 1; } for (int d = 0; d < limit; ++d) { - const Index idx = p / m_gpuInputStrides[d]; + const Index idx = p / m_cudaInputStrides[d]; inputIndex += idx * m_inputStrides[d]; - p -= idx * m_gpuInputStrides[d]; + p -= idx * m_cudaInputStrides[d]; } inputIndex += p * m_inputStrides[limit]; } return inputIndex; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const { Index outputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { for (int d = NumDims - 1; d > NumKernelDims; --d) { - const Index idx = p / m_gpuOutputStrides[d]; + const Index idx = p / m_cudaOutputStrides[d]; outputIndex += idx * m_outputStrides[d]; - p -= idx * m_gpuOutputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; } outputIndex += p * m_outputStrides[NumKernelDims]; } else { @@ -152,44 +152,44 @@ class IndexMapper { limit = NumDims - NumKernelDims - 1; } for (int d = 0; d < limit; ++d) { - const Index idx = p / m_gpuOutputStrides[d]; + const Index idx = p / m_cudaOutputStrides[d]; outputIndex += idx * m_outputStrides[d]; - p -= idx * m_gpuOutputStrides[d]; + p -= idx * m_cudaOutputStrides[d]; } outputIndex += p * m_outputStrides[limit]; } return outputIndex; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_inputStrides[offset]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_outputStrides[offset]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; @@ -197,7 +197,7 @@ class IndexMapper { k * m_inputStrides[offset + 2]; } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const { const size_t offset = static_cast(Layout) == static_cast(ColMajor) ? 0 : NumDims - NumKernelDims; @@ -209,8 +209,8 @@ class IndexMapper { static const int NumDims = internal::array_size::value; array m_inputStrides; array m_outputStrides; - array m_gpuInputStrides; - array m_gpuOutputStrides; + array m_cudaInputStrides; + array m_cudaOutputStrides; }; @@ -231,8 +231,6 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; - typedef typename conditional::val, - typename traits::PointerType, typename traits::PointerType>::type PointerType; enum { Flags = 0 @@ -302,24 +300,16 @@ struct TensorEvaluator::type PacketReturnType; - static const int PacketSize = PacketType::size; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, - BlockAccess = false, - PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device) { @@ -475,7 +465,7 @@ struct TensorEvaluator EvalTo; EvalTo evalToTmp(local, m_kernelArg); - const bool Vectorize = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::run(evalToTmp, m_device); m_kernel = local; m_local_kernel = true; @@ -554,14 +544,14 @@ struct TensorEvaluator struct GetKernelSize { @@ -584,11 +574,7 @@ __global__ void EigenConvolutionKernel1D( indexMapper, const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize, float* buffer) { -#if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) -#else extern __shared__ float s[]; -#endif const int first_x = blockIdx.x * maxX; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; @@ -600,18 +586,18 @@ __global__ void EigenConvolutionKernel1D( for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) { // Load inputs to shared memory - const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = threadIdx.y * num_x_input; #pragma unroll for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x); + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x); s[i + plane_kernel_offset] = eval.coeff(tensor_index); } __syncthreads(); // Compute the convolution - const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); #pragma unroll for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) { @@ -621,7 +607,7 @@ __global__ void EigenConvolutionKernel1D( for (int k = 0; k < GetKernelSize()(kernelSize); ++k) { result += s[k + kernel_offset] * kernel[k]; } - const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x); + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x); buffer[tensor_index] = result; } __syncthreads(); @@ -637,11 +623,7 @@ __global__ void EigenConvolutionKernel2D( const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY, const int kernelSizeX, const int kernelSizeY, float* buffer) { -#if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) -#else extern __shared__ float s[]; -#endif const int first_x = blockIdx.x * maxX; const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1; @@ -658,7 +640,7 @@ __global__ void EigenConvolutionKernel2D( for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) { - const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = threadIdx.z * num_y_input; // Load inputs to shared memory @@ -667,7 +649,7 @@ __global__ void EigenConvolutionKernel2D( const int input_offset = num_x_input * (j + plane_kernel_offset); #pragma unroll for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y); + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y); s[i + input_offset] = eval.coeff(tensor_index); } } @@ -675,7 +657,7 @@ __global__ void EigenConvolutionKernel2D( __syncthreads(); // Convolution - const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); #pragma unroll for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { @@ -691,7 +673,7 @@ __global__ void EigenConvolutionKernel2D( result += s[k + input_offset] * kernel[k + kernel_offset]; } } - const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y); + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y); buffer[tensor_index] = result; } } @@ -709,11 +691,7 @@ __global__ void EigenConvolutionKernel3D( const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY, const size_t kernelSizeZ, float* buffer) { -#if defined(EIGEN_HIPCC) - HIP_DYNAMIC_SHARED(float, s) -#else extern __shared__ float s[]; -#endif // Load inputs to shared memory const int first_x = blockIdx.x * maxX; @@ -730,13 +708,13 @@ __global__ void EigenConvolutionKernel3D( for (int p = 0; p < numPlanes; ++p) { - const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); + const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p); const int plane_kernel_offset = 0; for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) { for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) { for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) { - const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); + const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z); s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index); } } @@ -748,7 +726,7 @@ __global__ void EigenConvolutionKernel3D( const int num_z_output = last_z - first_z + 1; const int num_y_output = last_y - first_y + 1; const int num_x_output = last_x - first_x + 1; - const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p); + const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p); for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) { for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) { @@ -761,7 +739,7 @@ __global__ void EigenConvolutionKernel3D( } } } - const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); + const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z); buffer[tensor_index] = result; } } @@ -786,19 +764,13 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, PacketAccess = false, - BlockAccess = false, - PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device) - : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) + : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device) { EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -880,9 +852,9 @@ struct TensorEvaluator::Dimensions InputDims; const int maxSharedMem = m_device.sharedMemPerBlock(); - const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock(); - const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock; - const int numMultiProcessors = m_device.getNumGpuMultiProcessors(); + const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock(); + const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock; + const int numMultiProcessors = m_device.getNumCudaMultiProcessors(); const int warpSize = 32; switch (NumKernelDims) { @@ -917,7 +889,7 @@ struct TensorEvaluator, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data); break; } case 7: { - LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data); break; } default: { - LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data); } } break; @@ -974,7 +946,7 @@ struct TensorEvaluator(1024/(block_size.x*block_size.y), maxP); const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar); - gpu_assert(shared_mem <= maxSharedMem); + assert(shared_mem <= maxSharedMem); const int num_x_blocks = ceil(numX, maxX); const int num_y_blocks = ceil(numY, maxY); @@ -995,11 +967,11 @@ struct TensorEvaluator, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data); break; } default: { - LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data); break; } } @@ -1008,18 +980,18 @@ struct TensorEvaluator, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data); break; } default: { - LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data); break; } } break; } default: { - LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data); break; } } @@ -1054,7 +1026,7 @@ struct TensorEvaluator indices(m_indices[idxX], m_indices[idxY], @@ -1065,7 +1037,7 @@ struct TensorEvaluator indexMapper( m_inputImpl.dimensions(), kernel_dims, indices); - LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); + LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data); break; } diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h deleted file mode 100644 index 92003c766..000000000 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ /dev/null @@ -1,544 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Mehdi Goli Codeplay Software Ltd. -// Ralph Potter Codeplay Software Ltd. -// Luke Iwanski Codeplay Software Ltd. -// Contact: -// Copyright (C) 2016 Benoit Steiner - -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H -#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H - -namespace Eigen { - -/** \class TensorConvolution - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor convolution class. - * - * - */ - -enum class convolution_type { CONV1D, CONV2D, CONV3D }; -template -struct EigenConvolutionKernel; -template -struct EigenConvolutionKernel { - typedef cl::sycl::accessor - Local_accessor; - Local_accessor local_acc; - Evaluator device_evaluator; - Kernel_accessor kernel_filter; - Buffer_accessor buffer_acc; - internal::IndexMapper indexMapper; - const size_t kernelSize; - const cl::sycl::range<2> input_range; - EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, - Buffer_accessor buffer_acc_, - internal::IndexMapper indexMapper_, - const size_t kernelSize_, const cl::sycl::range<2> input_range_) - : local_acc(local_acc_), - device_evaluator(device_evaluator_), - kernel_filter(kernel_filter_), - buffer_acc(buffer_acc_), - indexMapper(indexMapper_), - kernelSize(kernelSize_), - input_range(input_range_) {} - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) { - return (boolean_check[0] && boolean_check[1]); - } - void operator()(cl::sycl::nd_item<2> itemID) { - auto buffer_ptr = buffer_acc.get_pointer(); - auto kernel_ptr = kernel_filter.get_pointer(); - // the required row to be calculated for the for each plane in shered memory - const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1); - const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input; - const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0]; - const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1)); - /// fill the shared memory - for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) { - const size_t local_index = i + plane_kernel_offset; - const size_t tensor_index = - plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset); - - local_acc[local_index] = - (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1]) - ? device_evaluator.coeff(tensor_index) - : CoeffReturnType(0); - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - - // calculate the convolution // output start x - const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]); - if (boundary_check(itemID.get_global_id() < input_range)) { - CoeffReturnType result = static_cast(0); - const size_t index = plane_kernel_offset + itemID.get_local_id(0); - for (size_t k = 0; k < kernelSize; ++k) { - result += (local_acc[k + index] * kernel_ptr[k]); - } - const size_t tensor_index = - indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) + - indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start); - buffer_ptr[tensor_index] = result; - } - } -}; - -template -struct EigenConvolutionKernel { - typedef cl::sycl::accessor - Local_accessor; - Local_accessor local_acc; - Evaluator device_evaluator; - Kernel_accessor kernel_filter; - Buffer_accessor buffer_acc; - internal::IndexMapper indexMapper; - const cl::sycl::range<2> kernel_size; - const cl::sycl::range<3> input_range; - EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, - Buffer_accessor buffer_acc_, - internal::IndexMapper indexMapper_, - const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_) - : local_acc(local_acc_), - device_evaluator(device_evaluator_), - kernel_filter(kernel_filter_), - buffer_acc(buffer_acc_), - indexMapper(indexMapper_), - kernel_size(kernel_size_), - input_range(input_range_) {} - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { - return (boolean_check[0] && boolean_check[1] && boolean_check[2]); - } - - void operator()(cl::sycl::nd_item<3> itemID) { - auto buffer_ptr = buffer_acc.get_pointer(); - auto kernel_ptr = kernel_filter.get_pointer(); - // the required row to be calculated for the for each plane in shered memory - const auto num_input = cl::sycl::range<2>{ - (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)}; - - const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2)); - const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1]; - - const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], - itemID.get_group(1) * itemID.get_local_range()[1]}; - - // fill the local memory - bool in_range_dim2 = itemID.get_global_id(2) < input_range[2]; - for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { - const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset); - bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1)); - for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { - const size_t local_index = i + local_input_offset; - const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( - i + input_offset[0], j + input_offset[1]); - local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && - in_range_dim1 && in_range_dim2) - ? device_evaluator.coeff(tensor_index) - : CoeffReturnType(0); - } - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - - // output offset start for each thread - const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0], - itemID.get_group(1) * itemID.get_local_range()[1]}; - - if (boundary_check(itemID.get_global_id() < input_range)) { - CoeffReturnType result = static_cast(0); - - for (size_t j = 0; j < kernel_size[1]; j++) { - size_t kernel_offset = kernel_size[0] * j; - const size_t index = - (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0); - for (size_t i = 0; i < kernel_size[0]; i++) { - result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]); - } - } - const size_t tensor_index = - indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) + - indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0], - itemID.get_local_id(1) + output_offset[1]); - - buffer_ptr[tensor_index] = result; - } - } -}; - -template -struct EigenConvolutionKernel { - typedef cl::sycl::accessor - Local_accessor; - Local_accessor local_acc; - Evaluator device_evaluator; - Kernel_accessor kernel_filter; - Buffer_accessor buffer_acc; - internal::IndexMapper indexMapper; - const cl::sycl::range<3> kernel_size; - const cl::sycl::range<3> input_range; - const size_t numP; - - EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_, - Buffer_accessor buffer_acc_, - internal::IndexMapper indexMapper_, - const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_, - const size_t numP_) - : local_acc(local_acc_), - device_evaluator(device_evaluator_), - kernel_filter(kernel_filter_), - buffer_acc(buffer_acc_), - indexMapper(indexMapper_), - kernel_size(kernel_size_), - input_range(input_range_), - numP(numP_) {} - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) { - return (boolean_check[0] && boolean_check[1] && boolean_check[2]); - } - void operator()(cl::sycl::nd_item<3> itemID) { - auto buffer_ptr = buffer_acc.get_pointer(); - auto kernel_ptr = kernel_filter.get_pointer(); - const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1}; - - const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()}; - - const auto output_offset = - cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()}; - - for (size_t p = 0; p < numP; p++) { - /// fill the shared memory - const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p); - for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) { - size_t local_index_dim2 = num_input[0] * num_input[1] * k; - bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1)); - for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) { - bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1)); - size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2; - for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) { - bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1)); - const size_t local_index = local_index_dim1 + i; - const size_t tensor_index = - plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset( - i + input_offset[0], j + input_offset[1], k + input_offset[2]); - local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0); - } - } - } - itemID.barrier(cl::sycl::access::fence_space::local_space); - - // calculate the convolution - - if (boundary_check(itemID.get_global_id() < input_range)) { - CoeffReturnType result = static_cast(0); - for (size_t k = 0; k < kernel_size[2]; k++) { - for (size_t j = 0; j < kernel_size[1]; j++) { - for (size_t i = 0; i < kernel_size[0]; i++) { - const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k); - const size_t local_index = - ((i + itemID.get_local_id(0)) + - num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2)))); - - result += (local_acc[local_index] * kernel_ptr[kernel_index]); - } - } - } - const size_t tensor_index = - indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) + - indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]); - buffer_ptr[tensor_index] = result; - } - - itemID.barrier(cl::sycl::access::fence_space::local_space); - } - } -}; - -template -struct TensorEvaluator, Eigen::SyclDevice> { - typedef TensorConvolutionOp XprType; - - static const int NumDims = - internal::array_size::Dimensions>::value; - static const int NumKernelDims = internal::array_size::value; - typedef typename XprType::Index Index; - typedef DSizes Dimensions; - typedef typename TensorEvaluator::Dimensions KernelDimensions; - typedef const Eigen::SyclDevice Device; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef typename InputArgType::Scalar Scalar; - static const int PacketSize = PacketType::size; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - typedef StorageMemory KernelStorage; - - enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = false, - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device) - : m_inputImpl(op.inputExpression(), device), - m_kernelArg(op.kernelExpression()), - m_kernelImpl(op.kernelExpression(), device), - m_indices(op.indices()), - m_buf(NULL), - m_kernel(NULL), - m_local_kernel(false), - m_device(device) { - EIGEN_STATIC_ASSERT((static_cast(TensorEvaluator::Layout) == - static_cast(TensorEvaluator::Layout)), - YOU_MADE_A_PROGRAMMING_MISTAKE); - - const typename TensorEvaluator::Dimensions &input_dims = m_inputImpl.dimensions(); - const typename TensorEvaluator::Dimensions &kernel_dims = - m_kernelImpl.dimensions(); - - m_dimensions = m_inputImpl.dimensions(); - for (int i = 0; i < NumKernelDims; ++i) { - const Index index = op.indices()[i]; - const Index input_dim = input_dims[index]; - const Index kernel_dim = kernel_dims[i]; - const Index result_dim = input_dim - kernel_dim + 1; - m_dimensions[index] = result_dim; - } - } - - EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - preloadKernel(); - m_inputImpl.evalSubExprsIfNeeded(NULL); - if (data) { - executeEval(data); - return false; - } else { - m_buf = (EvaluatorPointerType)m_device.get( - (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))); - executeEval(m_buf); - return true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_inputImpl.cleanup(); - if (m_buf) { - m_device.deallocate_temp(m_buf); - m_buf = NULL; - } - if (m_local_kernel) { - m_device.deallocate_temp(m_kernel); - m_local_kernel = false; - } - m_kernel = NULL; - } - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; } - /// used by sycl in order to build the sycl buffer - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() { - // Don't make a local copy of the kernel unless we have to (i.e. it's an - // expression that needs to be evaluated) - typename KernelStorage::Type in_place = m_kernelImpl.data(); - if (in_place) { - m_kernel = in_place; - m_local_kernel = false; - } else { - ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar); - EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz)); - typedef TensorEvalToOp EvalTo; - EvalTo evalToTmp(m_device.get(local), m_kernelArg); - const bool PacketAccess = internal::IsVectorizable::value; - internal::TensorExecutor::run(evalToTmp, m_device); - m_kernel = local; - m_local_kernel = true; - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const { - typedef TensorEvaluator InputEvaluator; - typedef typename InputEvaluator::Dimensions InputDims; - switch (NumKernelDims) { - case 1: { - const size_t numX = dimensions()[m_indices[0]]; - const size_t numP = dimensions().TotalSize() / numX; - const auto input_dim = std::array{numX, numP}; - auto global_range = cl::sycl::range<2>{}; - auto local_range = cl::sycl::range<2>{}; - const size_t kernel_size = m_kernelImpl.dimensions().TotalSize(); - - m_device.parallel_for_setup(input_dim, global_range, local_range); - const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]); - gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); - const array indices{{m_indices[0]}}; - const array kernel_dims{{m_kernelImpl.dimensions()[0]}}; - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - - typedef EigenConvolutionKernel - ConvKernel; - - m_device.template binary_kernel_launcher( - m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size, - indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1])); - break; - } - - case 2: { - auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 1, - static_cast(Layout) == static_cast(ColMajor) ? 1 : 0}; - auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], - (size_t)m_kernelImpl.dimensions()[kernel_index[1]]}; - const size_t numX = dimensions()[m_indices[kernel_index[0]]]; - const size_t numY = dimensions()[m_indices[kernel_index[1]]]; - const size_t numP = dimensions().TotalSize() / (numX * numY); - auto input_dim = std::array{numX, numY, numP}; - - auto global_range = cl::sycl::range<3>{}; - auto local_range = cl::sycl::range<3>{}; - - m_device.parallel_for_setup(input_dim, global_range, local_range); - - const size_t local_memory_size = - (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2]; - gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); - const array indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}}; - const array kernel_dims{ - {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}}; - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - typedef EigenConvolutionKernel - ConvKernel; - m_device.template binary_kernel_launcher( - m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, - indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]}); - break; - } - - case 3: { - auto kernel_index = std::array{static_cast(Layout) == static_cast(ColMajor) ? 0 : 2, - static_cast(Layout) == static_cast(ColMajor) ? 1 : 1, - static_cast(Layout) == static_cast(ColMajor) ? 2 : 0}; - - auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]], - (size_t)m_kernelImpl.dimensions()[kernel_index[1]], - (size_t)m_kernelImpl.dimensions()[kernel_index[2]]}; - - const size_t numX = dimensions()[m_indices[kernel_index[0]]]; - const size_t numY = dimensions()[m_indices[kernel_index[1]]]; - const size_t numZ = dimensions()[m_indices[kernel_index[2]]]; - auto input_dim = std::array{numX, numY, numZ}; - const size_t numP = dimensions().TotalSize() / (numX * numY * numZ); - - const array indices{ - {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}}; - const array kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]], - m_kernelImpl.dimensions()[kernel_index[1]], - m_kernelImpl.dimensions()[kernel_index[2]]}}; - - internal::IndexMapper indexMapper(m_inputImpl.dimensions(), kernel_dims, indices); - - auto global_range = cl::sycl::range<3>{}; - auto local_range = cl::sycl::range<3>{}; - - m_device.parallel_for_setup(input_dim, global_range, local_range); - auto local_memory_range = (local_range + kernel_size - 1); - const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2]; - - gpu_assert(static_cast(local_memory_size) <= m_device.sharedMemPerBlock()); - typedef EigenConvolutionKernel - ConvKernel; - m_device.template binary_kernel_launcher( - m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size, - indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP); - break; - } - - default: { - EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), - THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE); - } - } - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_buf != NULL); - eigen_assert(index < m_dimensions.TotalSize()); - return m_buf[index]; - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const { - eigen_assert(m_buf != NULL); - eigen_assert(index < m_dimensions.TotalSize()); - return internal::ploadt(m_buf + index); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost - // model. - const double kernel_size = m_kernelImpl.dimensions().TotalSize(); - // We ignore the use of fused multiply-add. - const double convolve_compute_cost = TensorOpCost::AddCost() + TensorOpCost::MulCost(); - const double firstIndex_compute_cost = - NumDims * - (2 * TensorOpCost::AddCost() + 2 * TensorOpCost::MulCost() + TensorOpCost::DivCost()); - return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) + - kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) + - TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize)); - } - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_kernelImpl.bind(cgh); - m_inputImpl.bind(cgh); - m_buf.bind(cgh); - m_kernel.bind(cgh); - } - - private: - // No assignment (copies are needed by the kernels) - TensorEvaluator &operator=(const TensorEvaluator &); - TensorEvaluator m_inputImpl; - KernelArgType m_kernelArg; - TensorEvaluator m_kernelImpl; - Indices m_indices; - Dimensions m_dimensions; - EvaluatorPointerType m_buf; - typename KernelStorage::Type m_kernel; - bool m_local_kernel; - const Eigen::SyclDevice EIGEN_DEVICE_REF m_device; -}; // namespace Eigen - -} // end namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 195267ce8..83c449cf1 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -174,11 +174,8 @@ class TensorCostModel { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads( double output_size, const TensorOpCost& cost_per_coeff, int max_threads) { double cost = totalCost(output_size, cost_per_coeff); - double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; - // Make sure we don't invoke undefined behavior when we convert to an int. - threads = numext::mini(threads, GenericNumTraits::highest()); - return numext::mini(max_threads, - numext::maxi(1, static_cast(threads))); + int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9; + return numext::mini(max_threads, numext::maxi(1, threads)); } // taskSize assesses parallel task size. @@ -189,13 +186,14 @@ class TensorCostModel { return totalCost(output_size, cost_per_coeff) / kTaskSize; } + private: static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost( double output_size, const TensorOpCost& cost_per_coeff) { // Cost of memory fetches from L2 cache. 64 is typical cache line size. // 11 is L2 cache latency on Haswell. // We don't know whether data is in L1, L2 or L3. But we are most interested // in single-threaded computational time around 100us-10ms (smaller time - // is too small for parallelization, larger time is not interesting + // is too small for parallelization, larger time is not intersting // either because we are probably using all available threads already). // And for the target time range, L2 seems to be what matters. Data set // fitting into L1 is too small to take noticeable time. Data set fitting diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h index 476b2282a..e020d076f 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h @@ -30,13 +30,12 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; - typedef typename traits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorCustomUnaryOpEIGEN_DEVICE_REF type; + typedef const TensorCustomUnaryOp& type; }; template @@ -87,25 +86,17 @@ struct TensorEvaluator, Devi typedef typename internal::remove_const::type Scalar; typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; - typedef typename Eigen::internal::traits::PointerType TensorPointerType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, - PacketAccess = (PacketType::size > 1), + PacketAccess = (internal::packet_traits::size > 1), BlockAccess = false, - PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) : m_op(op), m_device(device), m_result(NULL) { @@ -114,21 +105,21 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { if (data) { evalTo(data); return false; } else { - m_result = static_cast(m_device.get( (CoeffReturnType*) - m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)))); + m_result = static_cast( + m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); evalTo(m_result); return true; } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - if (m_result) { - m_device.deallocate_temp(m_result); + if (m_result != NULL) { + m_device.deallocate(m_result); m_result = NULL; } } @@ -147,25 +138,19 @@ struct TensorEvaluator, Devi return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_result.bind(cgh); - } -#endif + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return m_result; } protected: - EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) { - TensorMap > result(m_device.get(data), m_dimensions); + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap > result( + data, m_dimensions); m_op.func().eval(m_op.expression(), result, m_device); } Dimensions m_dimensions; const ArgType m_op; - const Device EIGEN_DEVICE_REF m_device; - EvaluatorPointerType m_result; + const Device& m_device; + CoeffReturnType* m_result; }; @@ -195,8 +180,6 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = traits::NumDimensions; static const int Layout = traits::Layout; - typedef typename conditional::val, - typename traits::PointerType, typename traits::PointerType>::type PointerType; }; template @@ -259,26 +242,17 @@ struct TensorEvaluator::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; - - typedef typename Eigen::internal::traits::PointerType TensorPointerType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, - PacketAccess = (PacketType::size > 1), + PacketAccess = (internal::packet_traits::size > 1), BlockAccess = false, - PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_op(op), m_device(device), m_result(NULL) { @@ -287,13 +261,12 @@ struct TensorEvaluator(m_device.get( (CoeffReturnType*) - m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType)))); + m_result = static_cast(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); evalTo(m_result); return true; } @@ -301,7 +274,7 @@ struct TensorEvaluator > result(m_device.get(data), m_dimensions); + EIGEN_DEVICE_FUNC void evalTo(Scalar* data) { + TensorMap > result(data, m_dimensions); m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); } Dimensions m_dimensions; const XprType m_op; - const Device EIGEN_DEVICE_REF m_device; - EvaluatorPointerType m_result; + const Device& m_device; + CoeffReturnType* m_result; }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h index 804a16cc5..29e50a3b2 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h @@ -63,73 +63,6 @@ template class TensorDevice { ExpressionType& m_expression; }; -/** \class TensorAsyncDevice - * \ingroup CXX11_Tensor_Module - * - * \brief Pseudo expression providing an operator = that will evaluate its - * argument asynchronously on the specified device. Currently only - * ThreadPoolDevice implements proper asynchronous execution, while the default - * and GPU devices just run the expression synchronously and call m_done() on - * completion.. - * - * Example: - * auto done = []() { ... expression evaluation done ... }; - * C.device(thread_pool_device, std::move(done)) = A + B; - */ - -template -class TensorAsyncDevice { - public: - TensorAsyncDevice(const DeviceType& device, ExpressionType& expression, - DoneCallback done) - : m_device(device), m_expression(expression), m_done(std::move(done)) {} - - template - EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { - typedef TensorAssignOp Assign; - typedef internal::TensorExecutor Executor; - - Assign assign(m_expression, other); - Executor::run(assign, m_device); - m_done(); - - return *this; - } - - protected: - const DeviceType& m_device; - ExpressionType& m_expression; - DoneCallback m_done; -}; - - -#ifdef EIGEN_USE_THREADS -template -class TensorAsyncDevice { - public: - TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression, - DoneCallback done) - : m_device(device), m_expression(expression), m_done(std::move(done)) {} - - template - EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) { - typedef TensorAssignOp Assign; - typedef internal::TensorAsyncExecutor Executor; - - // WARNING: After assignment 'm_done' callback will be in undefined state. - Assign assign(m_expression, other); - Executor::runAsync(assign, m_device, std::move(m_done)); - - return *this; - } - - protected: - const ThreadPoolDevice& m_device; - ExpressionType& m_expression; - DoneCallback m_done; -}; -#endif - } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index f77923933..4f5767bc7 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -1,6 +1,337 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#if defined(__clang__) || defined(__GNUC__) -#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file" +#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H) +#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H + +namespace Eigen { + +static const int kCudaScratchSize = 1024; + +// This defines an interface that GPUDevice can take to use +// CUDA streams underneath. +class StreamInterface { + public: + virtual ~StreamInterface() {} + + virtual const cudaStream_t& stream() const = 0; + virtual const cudaDeviceProp& deviceProperties() const = 0; + + // Allocate memory on the actual device where the computation will run + virtual void* allocate(size_t num_bytes) const = 0; + virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. + virtual unsigned int* semaphore() const = 0; +}; + +static cudaDeviceProp* m_deviceProperties; +static bool m_devicePropInitialized = false; + +static void initializeDeviceProp() { + if (!m_devicePropInitialized) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. +#if __cplusplus >= 201103L + static std::atomic first(true); + if (first.exchange(false)) { +#else + static bool first = true; + if (first) { + first = false; +#endif + // We're the first thread to reach this point. + int num_devices; + cudaError_t status = cudaGetDeviceCount(&num_devices); + if (status != cudaSuccess) { + std::cerr << "Failed to get the number of CUDA devices: " + << cudaGetErrorString(status) + << std::endl; + assert(status == cudaSuccess); + } + m_deviceProperties = new cudaDeviceProp[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = cudaGetDeviceProperties(&m_deviceProperties[i], i); + if (status != cudaSuccess) { + std::cerr << "Failed to initialize CUDA device #" + << i + << ": " + << cudaGetErrorString(status) + << std::endl; + assert(status == cudaSuccess); + } + } + +#if __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_release); +#endif + m_devicePropInitialized = true; + } else { + // Wait for the other thread to inititialize the properties. + while (!m_devicePropInitialized) { +#if __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_acquire); +#endif + sleep(1); + } + } + } +} + +static const cudaStream_t default_stream = cudaStreamDefault; + +class CudaStreamDevice : public StreamInterface { + public: + // Use the default stream on the current device + CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { + cudaGetDevice(&device_); + initializeDeviceProp(); + } + // Use the default stream on the specified device + CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { + initializeDeviceProp(); + } + // Use the specified stream. Note that it's the + // caller responsibility to ensure that the stream can run on + // the specified device. If no device is specified the code + // assumes that the stream is associated to the current gpu device. + CudaStreamDevice(const cudaStream_t* stream, int device = -1) + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { + if (device < 0) { + cudaGetDevice(&device_); + } else { + int num_devices; + cudaError_t err = cudaGetDeviceCount(&num_devices); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + assert(device < num_devices); + device_ = device; + } + initializeDeviceProp(); + } + + virtual ~CudaStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + + const cudaStream_t& stream() const { return *stream_; } + const cudaDeviceProp& deviceProperties() const { + return m_deviceProperties[device_]; + } + virtual void* allocate(size_t num_bytes) const { + cudaError_t err = cudaSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + void* result; + err = cudaMalloc(&result, num_bytes); + assert(err == cudaSuccess); + assert(result != NULL); + return result; + } + virtual void deallocate(void* buffer) const { + cudaError_t err = cudaSetDevice(device_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + assert(buffer != NULL); + err = cudaFree(buffer); + assert(err == cudaSuccess); + } + + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + kCudaScratchSize; + semaphore_ = reinterpret_cast(scratch); + cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + } + return semaphore_; + } + + private: + const cudaStream_t* stream_; + int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; +}; + +struct GpuDevice { + // The StreamInterface is not owned: the caller is + // responsible for its initialization and eventual destruction. + explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { + eigen_assert(stream); + } + explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { + eigen_assert(stream); + } + // TODO(bsteiner): This is an internal API, we should not expose it. + EIGEN_STRONG_INLINE const cudaStream_t& stream() const { + return stream_->stream(); + } + + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { + return stream_->allocate(num_bytes); + } + + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { + stream_->deallocate(buffer); + } + + EIGEN_STRONG_INLINE void* scratchpad() const { + return stream_->scratchpad(); + } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { + return stream_->semaphore(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice, + stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + } + + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { + cudaError_t err = + cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { +#ifndef __CUDA_ARCH__ + cudaError_t err = cudaMemsetAsync(buffer, c, n, stream_->stream()); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); +#else + eigen_assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE size_t numThreads() const { + // FIXME + return 32; + } + + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + // FIXME + return 48*1024; + } + + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + // We won't try to take advantage of the l2 cache for the time being, and + // there is no l3 cache on cuda devices. + return firstLevelCacheSize(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { +#if defined(__CUDACC__) && !defined(__CUDA_ARCH__) + cudaError_t err = cudaStreamSynchronize(stream_->stream()); + if (err != cudaSuccess) { + std::cerr << "Error detected in CUDA stream: " + << cudaGetErrorString(err) + << std::endl; + assert(err == cudaSuccess); + } +#else + assert(false && "The default device should be used instead to generate kernel code"); +#endif + } + + EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { + return stream_->deviceProperties().multiProcessorCount; + } + EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { + return stream_->deviceProperties().maxThreadsPerBlock; + } + EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { + return stream_->deviceProperties().maxThreadsPerMultiProcessor; + } + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { + return stream_->deviceProperties().sharedMemPerBlock; + } + EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return stream_->deviceProperties().major; + } + EIGEN_STRONG_INLINE int minorDeviceVersion() const { + return stream_->deviceProperties().minor; + } + + EIGEN_STRONG_INLINE int maxBlocks() const { + return max_blocks_; + } + + // This function checks if the CUDA runtime recorded an error for the + // underlying stream device. + inline bool ok() const { +#ifdef __CUDACC__ + cudaError_t error = cudaStreamQuery(stream_->stream()); + return (error == cudaSuccess) || (error == cudaErrorNotReady); +#else + return false; +#endif + } + + private: + const StreamInterface* stream_; + int max_blocks_; +}; + +#define LAUNCH_CUDA_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ + (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ + assert(cudaGetLastError() == cudaSuccess); + + +// FIXME: Should be device and kernel specific. +#ifdef __CUDACC__ +static EIGEN_DEVICE_FUNC inline void setCudaSharedMemConfig(cudaSharedMemConfig config) { +#ifndef __CUDA_ARCH__ + cudaError_t status = cudaDeviceSetSharedMemConfig(config); + EIGEN_UNUSED_VARIABLE(status) + assert(status == cudaSuccess); +#else + EIGEN_UNUSED_VARIABLE(config) +#endif +} #endif -#include "TensorDeviceGpu.h" +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_CUDA_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h index 46b9d3ab2..9d141395b 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h @@ -20,12 +20,6 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { - return allocate(num_bytes); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { - deallocate(buffer); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { ::memcpy(dst, src, n); @@ -39,18 +33,11 @@ struct DefaultDevice { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { - return data; - } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) +#ifndef __CUDA_ARCH__ // Running on the host CPU return 1; -#elif defined(EIGEN_HIP_DEVICE_COMPILE) - // Running on a HIP device - return 64; #else // Running on a CUDA device return 32; @@ -58,12 +45,9 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) +#ifndef __CUDA_ARCH__ // Running on the host CPU return l1CacheSize(); -#elif defined(EIGEN_HIP_DEVICE_COMPILE) - // Running on a HIP device - return 48*1024; // FIXME : update this number for HIP #else // Running on a CUDA device, return the amount of shared memory available. return 48*1024; @@ -71,12 +55,9 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) +#ifndef __CUDA_ARCH__ // Running single threaded on the host CPU return l3CacheSize(); -#elif defined(EIGEN_HIP_DEVICE_COMPILE) - // Running on a HIP device - return firstLevelCacheSize(); // FIXME : update this number for HIP #else // Running on a CUDA device return firstLevelCacheSize(); @@ -84,17 +65,13 @@ struct DefaultDevice { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#if !defined(EIGEN_GPU_COMPILE_PHASE) +#ifndef __CUDA_ARCH__ // Running single threaded on the host CPU // Should return an enum that encodes the ISA supported by the CPU return 1; -#elif defined(EIGEN_HIP_DEVICE_COMPILE) - // Running on a HIP device - // return 1 as major for HIP - return 1; #else // Running on a CUDA device - return EIGEN_CUDA_ARCH / 100; + return __CUDA_ARCH__ / 100; #endif } }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h deleted file mode 100644 index 7f3394438..000000000 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +++ /dev/null @@ -1,360 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H) -#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H - -// This header file container defines fo gpu* macros which will resolve to -// their equivalent hip* or cuda* versions depending on the compiler in use -// A separate header (included at the end of this file) will undefine all -#include "TensorGpuHipCudaDefines.h" - -namespace Eigen { - -static const int kGpuScratchSize = 1024; - -// This defines an interface that GPUDevice can take to use -// HIP / CUDA streams underneath. -class StreamInterface { - public: - virtual ~StreamInterface() {} - - virtual const gpuStream_t& stream() const = 0; - virtual const gpuDeviceProp_t& deviceProperties() const = 0; - - // Allocate memory on the actual device where the computation will run - virtual void* allocate(size_t num_bytes) const = 0; - virtual void deallocate(void* buffer) const = 0; - - // Return a scratchpad buffer of size 1k - virtual void* scratchpad() const = 0; - - // Return a semaphore. The semaphore is initially initialized to 0, and - // each kernel using it is responsible for resetting to 0 upon completion - // to maintain the invariant that the semaphore is always equal to 0 upon - // each kernel start. - virtual unsigned int* semaphore() const = 0; -}; - -static gpuDeviceProp_t* m_deviceProperties; -static bool m_devicePropInitialized = false; - -static void initializeDeviceProp() { - if (!m_devicePropInitialized) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. This would be trivial to - // implement if we could use std::mutex, but unfortunately mutex don't - // compile with nvcc, so we resort to atomics and thread fences instead. - // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. - static std::atomic first(true); - if (first.exchange(false)) { - // We're the first thread to reach this point. - int num_devices; - gpuError_t status = gpuGetDeviceCount(&num_devices); - if (status != gpuSuccess) { - std::cerr << "Failed to get the number of GPU devices: " - << gpuGetErrorString(status) - << std::endl; - gpu_assert(status == gpuSuccess); - } - m_deviceProperties = new gpuDeviceProp_t[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = gpuGetDeviceProperties(&m_deviceProperties[i], i); - if (status != gpuSuccess) { - std::cerr << "Failed to initialize GPU device #" - << i - << ": " - << gpuGetErrorString(status) - << std::endl; - gpu_assert(status == gpuSuccess); - } - } - - std::atomic_thread_fence(std::memory_order_release); - m_devicePropInitialized = true; - } else { - // Wait for the other thread to inititialize the properties. - while (!m_devicePropInitialized) { - std::atomic_thread_fence(std::memory_order_acquire); - EIGEN_SLEEP(1000); - } - } - } -} - -static const gpuStream_t default_stream = gpuStreamDefault; - -class GpuStreamDevice : public StreamInterface { - public: - // Use the default stream on the current device - GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { - gpuGetDevice(&device_); - initializeDeviceProp(); - } - // Use the default stream on the specified device - GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { - initializeDeviceProp(); - } - // Use the specified stream. Note that it's the - // caller responsibility to ensure that the stream can run on - // the specified device. If no device is specified the code - // assumes that the stream is associated to the current gpu device. - GpuStreamDevice(const gpuStream_t* stream, int device = -1) - : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { - if (device < 0) { - gpuGetDevice(&device_); - } else { - int num_devices; - gpuError_t err = gpuGetDeviceCount(&num_devices); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - gpu_assert(device < num_devices); - device_ = device; - } - initializeDeviceProp(); - } - - virtual ~GpuStreamDevice() { - if (scratch_) { - deallocate(scratch_); - } - } - - const gpuStream_t& stream() const { return *stream_; } - const gpuDeviceProp_t& deviceProperties() const { - return m_deviceProperties[device_]; - } - virtual void* allocate(size_t num_bytes) const { - gpuError_t err = gpuSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - void* result; - err = gpuMalloc(&result, num_bytes); - gpu_assert(err == gpuSuccess); - gpu_assert(result != NULL); - return result; - } - virtual void deallocate(void* buffer) const { - gpuError_t err = gpuSetDevice(device_); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - gpu_assert(buffer != NULL); - err = gpuFree(buffer); - gpu_assert(err == gpuSuccess); - } - - virtual void* scratchpad() const { - if (scratch_ == NULL) { - scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int)); - } - return scratch_; - } - - virtual unsigned int* semaphore() const { - if (semaphore_ == NULL) { - char* scratch = static_cast(scratchpad()) + kGpuScratchSize; - semaphore_ = reinterpret_cast(scratch); - gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - } - return semaphore_; - } - - private: - const gpuStream_t* stream_; - int device_; - mutable void* scratch_; - mutable unsigned int* semaphore_; -}; - -struct GpuDevice { - // The StreamInterface is not owned: the caller is - // responsible for its initialization and eventual destruction. - explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { - eigen_assert(stream); - } - explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) { - eigen_assert(stream); - } - // TODO(bsteiner): This is an internal API, we should not expose it. - EIGEN_STRONG_INLINE const gpuStream_t& stream() const { - return stream_->stream(); - } - - EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - stream_->deallocate(buffer); - } - - EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { - return stream_->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { - stream_->deallocate(buffer); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { - return data; - } - - EIGEN_STRONG_INLINE void* scratchpad() const { - return stream_->scratchpad(); - } - - EIGEN_STRONG_INLINE unsigned int* semaphore() const { - return stream_->semaphore(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifndef EIGEN_GPU_COMPILE_PHASE - gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, - stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); -#else - EIGEN_UNUSED_VARIABLE(dst); - EIGEN_UNUSED_VARIABLE(src); - EIGEN_UNUSED_VARIABLE(n); - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { - gpuError_t err = - gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - } - - EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { - gpuError_t err = - gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { -#ifndef EIGEN_GPU_COMPILE_PHASE - gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream()); - EIGEN_UNUSED_VARIABLE(err) - gpu_assert(err == gpuSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE size_t numThreads() const { - // FIXME - return 32; - } - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { - // FIXME - return 48*1024; - } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on hip/cuda devices. - return firstLevelCacheSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const { -#ifndef EIGEN_GPU_COMPILE_PHASE - gpuError_t err = gpuStreamSynchronize(stream_->stream()); - if (err != gpuSuccess) { - std::cerr << "Error detected in GPU stream: " - << gpuGetErrorString(err) - << std::endl; - gpu_assert(err == gpuSuccess); - } -#else - gpu_assert(false && "The default device should be used instead to generate kernel code"); -#endif - } - - EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { - return stream_->deviceProperties().multiProcessorCount; - } - EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { - return stream_->deviceProperties().maxThreadsPerBlock; - } - EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const { - return stream_->deviceProperties().maxThreadsPerMultiProcessor; - } - EIGEN_STRONG_INLINE int sharedMemPerBlock() const { - return stream_->deviceProperties().sharedMemPerBlock; - } - EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return stream_->deviceProperties().major; - } - EIGEN_STRONG_INLINE int minorDeviceVersion() const { - return stream_->deviceProperties().minor; - } - - EIGEN_STRONG_INLINE int maxBlocks() const { - return max_blocks_; - } - - // This function checks if the GPU runtime recorded an error for the - // underlying stream device. - inline bool ok() const { -#ifdef EIGEN_GPUCC - gpuError_t error = gpuStreamQuery(stream_->stream()); - return (error == gpuSuccess) || (error == gpuErrorNotReady); -#else - return false; -#endif - } - - private: - const StreamInterface* stream_; - int max_blocks_; -}; - -#if defined(EIGEN_HIPCC) - -#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \ - gpu_assert(hipGetLastError() == hipSuccess); - -#else - -#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...) \ - (kernel) <<< (gridsize), (blocksize), (sharedmem), (device).stream() >>> (__VA_ARGS__); \ - gpu_assert(cudaGetLastError() == cudaSuccess); - -#endif - -// FIXME: Should be device and kernel specific. -#ifdef EIGEN_GPUCC -static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) { -#ifndef EIGEN_GPU_COMPILE_PHASE - gpuError_t status = gpuDeviceSetSharedMemConfig(config); - EIGEN_UNUSED_VARIABLE(status) - gpu_assert(status == gpuSuccess); -#else - EIGEN_UNUSED_VARIABLE(config) -#endif -} -#endif - -} // end namespace Eigen - -// undefine all the gpu* macros we defined at the beginning of the file -#include "TensorGpuHipCudaUndefines.h" - -#endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h index df591c21d..7c039890e 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h @@ -14,1035 +14,109 @@ #if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H) #define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H -#include namespace Eigen { - -namespace TensorSycl { -namespace internal { - -/// Cache all the device information needed -struct SyclDeviceInfo { - SyclDeviceInfo(cl::sycl::queue queue) - : local_mem_type( - queue.get_device() - .template get_info()), - max_work_item_sizes( - queue.get_device() - .template get_info< - cl::sycl::info::device::max_work_item_sizes>()), - max_mem_alloc_size( - queue.get_device() - .template get_info< - cl::sycl::info::device::max_mem_alloc_size>()), - max_compute_units(queue.get_device() - .template get_info< - cl::sycl::info::device::max_compute_units>()), - max_work_group_size( - queue.get_device() - .template get_info< - cl::sycl::info::device::max_work_group_size>()), - local_mem_size( - queue.get_device() - .template get_info()), - platform_name(queue.get_device() - .get_platform() - .template get_info()), - device_name(queue.get_device() - .template get_info()), - device_vendor( - queue.get_device() - .template get_info()) {} - - cl::sycl::info::local_mem_type local_mem_type; - cl::sycl::id<3> max_work_item_sizes; - unsigned long max_mem_alloc_size; - unsigned long max_compute_units; - unsigned long max_work_group_size; - size_t local_mem_size; - std::string platform_name; - std::string device_name; - std::string device_vendor; -}; - -} // end namespace internal -} // end namespace TensorSycl - -typedef TensorSycl::internal::buffer_data_type_t buffer_scalar_t; -// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and -// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently -// TensorFlow via the Eigen SYCL Backend. -EIGEN_STRONG_INLINE auto get_sycl_supported_devices() - -> decltype(cl::sycl::device::get_devices()) { -#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR - return {cl::sycl::device(cl::sycl::default_selector())}; -#else - std::vector supported_devices; - auto platform_list = cl::sycl::platform::get_platforms(); - for (const auto &platform : platform_list) { - auto device_list = platform.get_devices(); - auto platform_name = - platform.template get_info(); - std::transform(platform_name.begin(), platform_name.end(), - platform_name.begin(), ::tolower); - for (const auto &device : device_list) { - auto vendor = device.template get_info(); - std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower); - bool unsupported_condition = - (device.is_cpu() && platform_name.find("amd") != std::string::npos && - vendor.find("apu") == std::string::npos) || - (platform_name.find("experimental") != std::string::npos) || - device.is_host(); - if (!unsupported_condition) { - supported_devices.push_back(device); - } - } - } - return supported_devices; -#endif -} - -class QueueInterface { - public: - /// Creating device by using cl::sycl::selector or cl::sycl::device. - template - explicit QueueInterface( - const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler, - unsigned num_threads = std::thread::hardware_concurrency()) - : m_queue(dev_or_sel, handler), -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - m_prog(m_queue.get_context(), get_sycl_supported_devices()), -#endif - m_thread_pool(num_threads), - m_device_info(m_queue) { -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - m_prog.build_with_kernel_type(); - auto f = [&](cl::sycl::handler &cgh) { - cgh.single_task(m_prog.get_kernel(), - [=]() {}) - }; - EIGEN_SYCL_TRY_CATCH(m_queue.submit(f)); -#endif - } - - template - explicit QueueInterface( - const DeviceOrSelector &dev_or_sel, - unsigned num_threads = std::thread::hardware_concurrency()) - : QueueInterface(dev_or_sel, - [this](cl::sycl::exception_list l) { - this->exception_caught_ = this->sycl_async_handler(l); - }, - num_threads) {} - -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - EIGEN_STRONG_INLINE cl::sycl::program &program() const { return m_prog; } -#endif - - /// Attach an existing buffer to the pointer map, Eigen will not reuse it - EIGEN_STRONG_INLINE void *attach_buffer( - cl::sycl::buffer &buf) const { - std::lock_guard lock(pmapper_mutex_); - return static_cast(pMapper.add_pointer(buf)); - } - - /// Detach previously attached buffer - EIGEN_STRONG_INLINE void detach_buffer(void *p) const { - std::lock_guard lock(pmapper_mutex_); - TensorSycl::internal::SYCLfree(p, pMapper); - } - - /// Allocating device pointer. This pointer is actually an 8 bytes host - /// pointer used as key to access the sycl device buffer. The reason is that - /// we cannot use device buffer as a pointer as a m_data in Eigen leafNode - /// expressions. So we create a key pointer to be used in Eigen expression - /// construction. When we convert the Eigen construction into the sycl - /// construction we use this pointer as a key in our buffer_map and we make - /// sure that we dedicate only one buffer only for this pointer. The device - /// pointer would be deleted by calling deallocate function. - EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { -#if EIGEN_MAX_ALIGN_BYTES > 0 - size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; - if (align > 0) { - num_bytes += EIGEN_MAX_ALIGN_BYTES - align; - } -#endif - std::lock_guard lock(pmapper_mutex_); - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); - } - - EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { -#if EIGEN_MAX_ALIGN_BYTES > 0 - size_t align = num_bytes % EIGEN_MAX_ALIGN_BYTES; - if (align > 0) { - num_bytes += EIGEN_MAX_ALIGN_BYTES - align; - } -#endif - std::lock_guard lock(pmapper_mutex_); -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - if (scratch_buffers.empty()) { - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); - ; - } else { - for (auto it = scratch_buffers.begin(); it != scratch_buffers.end();) { - auto buff = pMapper.get_buffer(*it); - if (buff.get_size() >= num_bytes) { - auto ptr = *it; - scratch_buffers.erase(it); - return ptr; - } else { - ++it; - } - } - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); - } -#else - return TensorSycl::internal::SYCLmalloc(num_bytes, pMapper); -#endif - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, data_t> - get(data_t *data) const { - return get_range_accessor(data); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( - TensorSycl::internal::RangeAccess - data) const { - return static_cast(data.get_virtual_pointer()); - } - - EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { - std::lock_guard lock(pmapper_mutex_); -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - scratch_buffers.insert(p); -#else - TensorSycl::internal::SYCLfree(p, pMapper); -#endif - } - template - EIGEN_STRONG_INLINE void deallocate_temp( - const TensorSycl::internal::RangeAccess &p) const { - deallocate_temp(p.get_virtual_pointer()); - } - - /// This is used to deallocate the device pointer. p is used as a key inside - /// the map to find the device buffer and delete it. - EIGEN_STRONG_INLINE void deallocate(void *p) const { - std::lock_guard lock(pmapper_mutex_); - TensorSycl::internal::SYCLfree(p, pMapper); - } - - EIGEN_STRONG_INLINE void deallocate_all() const { - std::lock_guard lock(pmapper_mutex_); - TensorSycl::internal::SYCLfreeAll(pMapper); -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - scratch_buffers.clear(); -#endif - } - - /// The memcpyHostToDevice is used to copy the data from host to device - /// The destination pointer could be deleted before the copy happend which is - /// why a callback function is needed. By default if none is provided, the - /// function is blocking. - EIGEN_STRONG_INLINE void memcpyHostToDevice( - void *dst, const void *src, size_t n, - std::function callback) const { - static const auto write_mode = cl::sycl::access::mode::discard_write; - static const auto global_access = cl::sycl::access::target::global_buffer; - typedef cl::sycl::accessor - write_accessor; - if (n == 0) { - if (callback) callback(); - return; - } - n /= sizeof(buffer_scalar_t); - auto f = [&](cl::sycl::handler &cgh) { - write_accessor dst_acc = get_range_accessor(cgh, dst, n); - buffer_scalar_t const *ptr = static_cast(src); - auto non_deleter = [](buffer_scalar_t const *) {}; - std::shared_ptr s_ptr(ptr, non_deleter); - cgh.copy(s_ptr, dst_acc); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); - synchronize_and_callback(e, callback); - } - - /// The memcpyDeviceToHost is used to copy the data from device to host. - /// The source pointer could be deleted before the copy happend which is - /// why a callback function is needed. By default if none is provided, the - /// function is blocking. - EIGEN_STRONG_INLINE void memcpyDeviceToHost( - void *dst, const void *src, size_t n, - std::function callback) const { - static const auto read_mode = cl::sycl::access::mode::read; - static const auto global_access = cl::sycl::access::target::global_buffer; - typedef cl::sycl::accessor - read_accessor; - if (n == 0) { - if (callback) callback(); - return; - } - n /= sizeof(buffer_scalar_t); - auto f = [&](cl::sycl::handler &cgh) { - read_accessor src_acc = get_range_accessor(cgh, src, n); - buffer_scalar_t *ptr = static_cast(dst); - auto non_deleter = [](buffer_scalar_t *) {}; - std::shared_ptr s_ptr(ptr, non_deleter); - cgh.copy(src_acc, s_ptr); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); - synchronize_and_callback(e, callback); - } - - /// The memcpy function. - /// No callback is required here as both arguments are on the device - /// and SYCL can handle the dependency. - EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { - static const auto read_mode = cl::sycl::access::mode::read; - static const auto write_mode = cl::sycl::access::mode::discard_write; - if (n == 0) { - return; - } - n /= sizeof(buffer_scalar_t); - auto f = [&](cl::sycl::handler &cgh) { - auto src_acc = get_range_accessor(cgh, src, n); - auto dst_acc = get_range_accessor(cgh, dst, n); - cgh.copy(src_acc, dst_acc); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); - async_synchronize(e); - } - - /// the memset function. - /// No callback is required here as both arguments are on the device - /// and SYCL can handle the dependency. - EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - static const auto write_mode = cl::sycl::access::mode::discard_write; - if (n == 0) { - return; - } - n /= sizeof(buffer_scalar_t); - auto f = [&](cl::sycl::handler &cgh) { - auto dst_acc = get_range_accessor(cgh, data, n); - // The cast to uint8_t is here to match the behaviour of the standard - // memset. The cast to buffer_scalar_t is needed to match the type of the - // accessor (in case buffer_scalar_t is not uint8_t) - cgh.fill(dst_acc, static_cast(static_cast(c))); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(f)); - async_synchronize(e); - } - - /// Get a range accessor to the virtual pointer's device memory. This range - /// accessor will allow access to the memory from the pointer to the end of - /// the buffer. - /// - /// NOTE: Inside a kernel the range accessor will always be indexed from the - /// start of the buffer, so the offset in the accessor is only used by - /// methods like handler::copy and will not be available inside a kernel. - template - EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess - get_range_accessor(const void *ptr) const { - static const auto global_access = cl::sycl::access::target::global_buffer; - static const auto is_place_holder = cl::sycl::access::placeholder::true_t; - typedef TensorSycl::internal::RangeAccess ret_type; - typedef const TensorSycl::internal::buffer_data_type_t *internal_ptr_t; - - std::lock_guard lock(pmapper_mutex_); - - auto original_buffer = pMapper.get_buffer(ptr); - const ptrdiff_t offset = pMapper.get_offset(ptr); - const ptrdiff_t typed_offset = offset / sizeof(T); - eigen_assert(typed_offset >= 0); - const auto typed_size = original_buffer.get_size() / sizeof(T); - auto buffer = original_buffer.template reinterpret< - typename Eigen::internal::remove_const::type>( - cl::sycl::range<1>(typed_size)); - const ptrdiff_t size = buffer.get_count() - typed_offset; - eigen_assert(size >= 0); - typedef cl::sycl::accessor::type, - 1, AcMd, global_access, is_place_holder> - placeholder_accessor_t; - const auto start_ptr = static_cast(ptr) - offset; - return ret_type(placeholder_accessor_t(buffer, cl::sycl::range<1>(size), - cl::sycl::id<1>(typed_offset)), - static_cast(typed_offset), - reinterpret_cast(start_ptr)); - } - - /// Get a range accessor to the virtual pointer's device memory with a - /// specified size. - template - EIGEN_STRONG_INLINE cl::sycl::accessor< - buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_range_accessor(cl::sycl::handler &cgh, const void *ptr, - const Index n_bytes) const { - static const auto global_access = cl::sycl::access::target::global_buffer; - eigen_assert(n_bytes >= 0); - std::lock_guard lock(pmapper_mutex_); - auto buffer = pMapper.get_buffer(ptr); - const ptrdiff_t offset = pMapper.get_offset(ptr); - eigen_assert(offset >= 0); - eigen_assert(offset + n_bytes <= buffer.get_size()); - return buffer.template get_access( - cgh, cl::sycl::range<1>(n_bytes), cl::sycl::id<1>(offset)); - } - - /// Creation of sycl accessor for a buffer. This function first tries to find - /// the buffer in the buffer_map. If found it gets the accessor from it, if - /// not, the function then adds an entry by creating a sycl buffer for that - /// particular pointer. - template - EIGEN_STRONG_INLINE cl::sycl::accessor< - buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { - std::lock_guard lock(pmapper_mutex_); - return pMapper.get_buffer(ptr) - .template get_access( - cgh); - } - - EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( - const void *ptr) const { - std::lock_guard lock(pmapper_mutex_); - return pMapper.get_buffer(ptr); - } - - EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - std::lock_guard lock(pmapper_mutex_); - return pMapper.get_offset(ptr); - } - - template - EIGEN_ALWAYS_INLINE void binary_kernel_launcher(const Lhs &lhs, - const Rhs &rhs, OutPtr outptr, - Range thread_range, - Index scratchSize, - T... var) const { - auto kernel_functor = [=](cl::sycl::handler &cgh) { - // binding the placeholder accessors to a commandgroup handler - lhs.bind(cgh); - rhs.bind(cgh); - outptr.bind(cgh); - typedef cl::sycl::accessor - LocalAccessor; - - LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); - cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - program().template get_kernel(), -#endif - thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...)); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); - async_synchronize(e); - } - - template - EIGEN_ALWAYS_INLINE void unary_kernel_launcher(const InPtr &inptr, - OutPtr &outptr, - Range thread_range, - Index scratchSize, - T... var) const { - auto kernel_functor = [=](cl::sycl::handler &cgh) { - // binding the placeholder accessors to a commandgroup handler - inptr.bind(cgh); - outptr.bind(cgh); - typedef cl::sycl::accessor - LocalAccessor; - - LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); - cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - program().template get_kernel(), -#endif - thread_range, sycl_kernel(scratch, inptr, outptr, var...)); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); - async_synchronize(e); - } - - template - EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(const InPtr &inptr, - Range thread_range, - Index scratchSize, - T... var) const { - auto kernel_functor = [=](cl::sycl::handler &cgh) { - // binding the placeholder accessors to a commandgroup handler - inptr.bind(cgh); - typedef cl::sycl::accessor - LocalAccessor; - - LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh); - cgh.parallel_for( -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - program().template get_kernel(), -#endif - thread_range, sycl_kernel(scratch, inptr, var...)); - }; - cl::sycl::event e; - EIGEN_SYCL_TRY_CATCH(e = m_queue.submit(kernel_functor)); - async_synchronize(e); - } - - - EIGEN_STRONG_INLINE void synchronize() const { -#ifdef EIGEN_EXCEPTIONS - m_queue.wait_and_throw(); -#else - m_queue.wait(); -#endif - } - - - EIGEN_STRONG_INLINE void async_synchronize(cl::sycl::event e) const { - set_latest_event(e); -#ifndef EIGEN_SYCL_ASYNC_EXECUTION - synchronize(); -#endif - } - - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, - Index &rng, Index &GRange) const { - tileSize = static_cast(getNearestPowerOfTwoWorkGroupSize()); - tileSize = std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * - EIGEN_SYCL_LOCAL_THREAD_DIM1), - static_cast(tileSize)); - rng = n; - if (rng == 0) rng = static_cast(1); - GRange = rng; - if (tileSize > GRange) - tileSize = GRange; - else if (GRange > tileSize) { - Index xMode = static_cast(GRange % tileSize); - if (xMode != 0) GRange += static_cast(tileSize - xMode); - } - } - - /// This is used to prepare the number of threads and also the number of - /// threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup( - const std::array &input_dim, cl::sycl::range<2> &global_range, - cl::sycl::range<2> &local_range) const { - std::array input_range = input_dim; - Index max_workgroup_Size = - static_cast(getNearestPowerOfTwoWorkGroupSize()); - max_workgroup_Size = - std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * - EIGEN_SYCL_LOCAL_THREAD_DIM1), - static_cast(max_workgroup_Size)); - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - local_range[1] = - static_cast(std::pow(2, static_cast(pow_of_2 / 2))); - input_range[1] = input_dim[1]; - if (input_range[1] == 0) input_range[1] = static_cast(1); - global_range[1] = input_range[1]; - if (local_range[1] > global_range[1]) - local_range[1] = global_range[1]; - else if (global_range[1] > local_range[1]) { - Index xMode = static_cast(global_range[1] % local_range[1]); - if (xMode != 0) - global_range[1] += static_cast(local_range[1] - xMode); - } - local_range[0] = static_cast(max_workgroup_Size / local_range[1]); - input_range[0] = input_dim[0]; - if (input_range[0] == 0) input_range[0] = static_cast(1); - global_range[0] = input_range[0]; - if (local_range[0] > global_range[0]) - local_range[0] = global_range[0]; - else if (global_range[0] > local_range[0]) { - Index xMode = static_cast(global_range[0] % local_range[0]); - if (xMode != 0) - global_range[0] += static_cast(local_range[0] - xMode); - } - } - - /// This is used to prepare the number of threads and also the number of - /// threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup( - const std::array &input_dim, cl::sycl::range<3> &global_range, - cl::sycl::range<3> &local_range) const { - std::array input_range = input_dim; - Index max_workgroup_Size = - static_cast(getNearestPowerOfTwoWorkGroupSize()); - max_workgroup_Size = - std::min(static_cast(EIGEN_SYCL_LOCAL_THREAD_DIM0 * - EIGEN_SYCL_LOCAL_THREAD_DIM1), - static_cast(max_workgroup_Size)); - Index pow_of_2 = static_cast(std::log2(max_workgroup_Size)); - local_range[2] = - static_cast(std::pow(2, static_cast(pow_of_2 / 3))); - input_range[2] = input_dim[2]; - if (input_range[2] == 0) input_range[1] = static_cast(1); - global_range[2] = input_range[2]; - if (local_range[2] > global_range[2]) - local_range[2] = global_range[2]; - else if (global_range[2] > local_range[2]) { - Index xMode = static_cast(global_range[2] % local_range[2]); - if (xMode != 0) - global_range[2] += static_cast(local_range[2] - xMode); - } - pow_of_2 = static_cast( - std::log2(static_cast(max_workgroup_Size / local_range[2]))); - local_range[1] = - static_cast(std::pow(2, static_cast(pow_of_2 / 2))); - input_range[1] = input_dim[1]; - if (input_range[1] == 0) input_range[1] = static_cast(1); - global_range[1] = input_range[1]; - if (local_range[1] > global_range[1]) - local_range[1] = global_range[1]; - else if (global_range[1] > local_range[1]) { - Index xMode = static_cast(global_range[1] % local_range[1]); - if (xMode != 0) - global_range[1] += static_cast(local_range[1] - xMode); - } - local_range[0] = static_cast(max_workgroup_Size / - (local_range[1] * local_range[2])); - input_range[0] = input_dim[0]; - if (input_range[0] == 0) input_range[0] = static_cast(1); - global_range[0] = input_range[0]; - if (local_range[0] > global_range[0]) - local_range[0] = global_range[0]; - else if (global_range[0] > local_range[0]) { - Index xMode = static_cast(global_range[0] % local_range[0]); - if (xMode != 0) - global_range[0] += static_cast(local_range[0] - xMode); - } - } - - EIGEN_STRONG_INLINE bool has_local_memory() const { -#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) - return false; -#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) - return true; -#else - return m_device_info.local_mem_type == - cl::sycl::info::local_mem_type::local; -#endif - } - - EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { - return m_device_info.max_mem_alloc_size; - } - - EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return m_device_info.max_compute_units; - } - - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return m_device_info.max_work_group_size; - } - - EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { - return m_device_info.max_work_item_sizes; - } - - /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; } - - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesnot have such concept - return 2; - } - - EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return m_device_info.local_mem_size; - } - - // This function returns the nearest power of 2 Work-group size which is <= - // maximum device workgroup size. - EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { - return getPowerOfTwo(m_device_info.max_work_group_size, false); - } - - EIGEN_STRONG_INLINE std::string getPlatformName() const { - return m_device_info.platform_name; - } - - EIGEN_STRONG_INLINE std::string getDeviceName() const { - return m_device_info.device_name; - } - - EIGEN_STRONG_INLINE std::string getDeviceVendor() const { - return m_device_info.device_vendor; - } - - // This function returns the nearest power of 2 - // if roundup is true returns result>=wgsize - // else it return result <= wgsize - EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const { - if (roundUp) --wGSize; - wGSize |= (wGSize >> 1); - wGSize |= (wGSize >> 2); - wGSize |= (wGSize >> 4); - wGSize |= (wGSize >> 8); - wGSize |= (wGSize >> 16); -#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64 - wGSize |= (wGSize >> 32); -#endif - return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize); - } - - EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; } - - // This function checks if the runtime recorded an error for the - // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { - if (!exception_caught_) { - synchronize(); - } - return !exception_caught_; - } - - EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { -#ifdef EIGEN_SYCL_STORE_LATEST_EVENT - std::lock_guard lock(event_mutex_); - return latest_events_[std::this_thread::get_id()]; -#else - eigen_assert(false); - return cl::sycl::event(); -#endif - } - - // destructor - ~QueueInterface() { - pMapper.clear(); -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - scratch_buffers.clear(); -#endif - } - - protected: - EIGEN_STRONG_INLINE void set_latest_event(cl::sycl::event e) const { -#ifdef EIGEN_SYCL_STORE_LATEST_EVENT - std::lock_guard lock(event_mutex_); - latest_events_[std::this_thread::get_id()] = e; -#else - EIGEN_UNUSED_VARIABLE(e); -#endif - } - - void synchronize_and_callback(cl::sycl::event e, - const std::function &callback) const { - set_latest_event(e); - if (callback) { - auto callback_ = [=]() { -#ifdef EIGEN_EXCEPTIONS - cl::sycl::event(e).wait_and_throw(); -#else - cl::sycl::event(e).wait(); -#endif - callback(); - }; - m_thread_pool.Schedule(std::move(callback_)); - } else { -#ifdef EIGEN_EXCEPTIONS - m_queue.wait_and_throw(); -#else - m_queue.wait(); -#endif - } - } - - bool sycl_async_handler(cl::sycl::exception_list exceptions) const { - bool exception_caught = false; - for (const auto &e : exceptions) { - if (e) { - exception_caught = true; - EIGEN_THROW_X(e); - } - } - return exception_caught; - } - - /// class members: - bool exception_caught_ = false; - - mutable std::mutex pmapper_mutex_; - -#ifdef EIGEN_SYCL_STORE_LATEST_EVENT - mutable std::mutex event_mutex_; - mutable std::unordered_map latest_events_; -#endif - - /// std::map is the container used to make sure that we create only one buffer - /// per pointer. The lifespan of the buffer now depends on the lifespan of - /// SyclDevice. If a non-read-only pointer is needed to be accessed on the - /// host we should manually deallocate it. - mutable TensorSycl::internal::PointerMapper pMapper; -#ifndef EIGEN_SYCL_NO_REUSE_BUFFERS - mutable std::unordered_set scratch_buffers; -#endif +struct SyclDevice { + /// class members /// sycl queue mutable cl::sycl::queue m_queue; -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - mutable cl::sycl::program m_prog; + /// std::map is the container used to make sure that we create only one buffer + /// per pointer. The lifespan of the buffer now depends on the lifespan of SyclDevice. + /// If a non-read-only pointer is needed to be accessed on the host we should manually deallocate it. + mutable std::map> buffer_map; + /// creating device by using selector + template SyclDevice(dev_Selector s) + : +#ifdef EIGEN_EXCEPTIONS + m_queue(cl::sycl::queue(s, [=](cl::sycl::exception_list l) { + for (const auto& e : l) { + try { + std::rethrow_exception(e); + } catch (cl::sycl::exception e) { + std::cout << e.what() << std::endl; + } + } + })) +#else + m_queue(cl::sycl::queue(s)) #endif + {} + // destructor + ~SyclDevice() { deallocate_all(); } - /// The thread pool is used to wait on events and call callbacks - /// asynchronously - mutable Eigen::ThreadPool m_thread_pool; - - const TensorSycl::internal::SyclDeviceInfo m_device_info; -}; - -struct SyclDeviceBase { - /// QueueInterface is not owned. it is the caller's responsibility to destroy - /// it - const QueueInterface *m_queue_stream; - explicit SyclDeviceBase(const QueueInterface *queue_stream) - : m_queue_stream(queue_stream) {} - EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { - return m_queue_stream; + template void deallocate(T *p) const { + auto it = buffer_map.find(p); + if (it != buffer_map.end()) { + buffer_map.erase(it); + internal::aligned_free(p); + } } -}; - -// Here is a sycl device struct which accept the sycl queue interface -// as an input -struct SyclDevice : public SyclDeviceBase { - explicit SyclDevice(const QueueInterface *queue_stream) - : SyclDeviceBase(queue_stream) {} - - // this is the accessor used to construct the evaluator - template - EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess - get_range_accessor(const void *ptr) const { - return queue_stream()->template get_range_accessor(ptr); + void deallocate_all() const { + std::map>::iterator it=buffer_map.begin(); + while (it!=buffer_map.end()) { + auto p=it->first; + buffer_map.erase(it); + internal::aligned_free(const_cast(p)); + it=buffer_map.begin(); + } + buffer_map.clear(); } - // get sycl accessor - template - EIGEN_STRONG_INLINE cl::sycl::accessor< - buffer_scalar_t, 1, AcMd, cl::sycl::access::target::global_buffer> - get_sycl_accessor(cl::sycl::handler &cgh, const void *ptr) const { - return queue_stream()->template get_sycl_accessor(cgh, ptr); + /// creation of sycl accessor for a buffer. This function first tries to find + /// the buffer in the buffer_map. If found it gets the accessor from it, if not, + ///the function then adds an entry by creating a sycl buffer for that particular pointer. + template inline cl::sycl::accessor + get_sycl_accessor(size_t num_bytes, cl::sycl::handler &cgh, const T * ptr) const { + return (get_sycl_buffer(num_bytes, ptr)->template get_access(cgh)); } - /// Accessing the created sycl device buffer for the device pointer - EIGEN_STRONG_INLINE cl::sycl::buffer get_sycl_buffer( - const void *ptr) const { - return queue_stream()->get_sycl_buffer(ptr); + template inline std::pair>::iterator,bool> add_sycl_buffer(const T *ptr, size_t num_bytes) const { + using Type = cl::sycl::buffer; + std::pair>::iterator,bool> ret = buffer_map.insert(std::pair>(ptr, std::shared_ptr(new Type(cl::sycl::range<1>(num_bytes)), + [](void *dataMem) { delete static_cast(dataMem); }))); + (static_cast(buffer_map.at(ptr).get()))->set_final_data(nullptr); + return ret; } - /// This is used to prepare the number of threads and also the number of - /// threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, - Index &rng, Index &GRange) const { - queue_stream()->parallel_for_setup(n, tileSize, rng, GRange); + template inline cl::sycl::buffer* get_sycl_buffer(size_t num_bytes,const T * ptr) const { + return static_cast*>(add_sycl_buffer(ptr, num_bytes).first->second.get()); } - /// This is used to prepare the number of threads and also the number of - /// threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup( - const std::array &input_dim, cl::sycl::range<2> &global_range, - cl::sycl::range<2> &local_range) const { - queue_stream()->parallel_for_setup(input_dim, global_range, local_range); - } - - /// This is used to prepare the number of threads and also the number of - /// threads per block for sycl kernels - template - EIGEN_STRONG_INLINE void parallel_for_setup( - const std::array &input_dim, cl::sycl::range<3> &global_range, - cl::sycl::range<3> &local_range) const { - queue_stream()->parallel_for_setup(input_dim, global_range, local_range); - } - - /// allocate device memory - EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { - return queue_stream()->allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { - return queue_stream()->allocate_temp(num_bytes); - } - - /// deallocate device memory - EIGEN_STRONG_INLINE void deallocate(void *p) const { - queue_stream()->deallocate(p); - } - - EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { - queue_stream()->deallocate_temp(buffer); - } - template - EIGEN_STRONG_INLINE void deallocate_temp( - const TensorSycl::internal::RangeAccess &buffer) const { - queue_stream()->deallocate_temp(buffer); - } - EIGEN_STRONG_INLINE void deallocate_all() const { - queue_stream()->deallocate_all(); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSycl::internal::RangeAccess< - cl::sycl::access::mode::read_write, data_t> - get(data_t *data) const { - return queue_stream()->get(data); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get( - TensorSycl::internal::RangeAccess - data) const { - return queue_stream()->get(data); - } - - /// attach existing buffer - EIGEN_STRONG_INLINE void *attach_buffer( - cl::sycl::buffer &buf) const { - return queue_stream()->attach_buffer(buf); - } - /// detach buffer - EIGEN_STRONG_INLINE void detach_buffer(void *p) const { - queue_stream()->detach_buffer(p); - } - EIGEN_STRONG_INLINE ptrdiff_t get_offset(const void *ptr) const { - return queue_stream()->get_offset(ptr); + /// allocating memory on the cpu + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void *allocate(size_t) const { + return internal::aligned_malloc(8); } // some runtime conditions that can be applied here - EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; } + bool isDeviceSuitable() const { return true; } - /// memcpyHostToDevice - template - EIGEN_STRONG_INLINE void memcpyHostToDevice( - Index *dst, const Index *src, size_t n, - std::function callback = {}) const { - queue_stream()->memcpyHostToDevice(dst, src, n, callback); - } - /// memcpyDeviceToHost - template - EIGEN_STRONG_INLINE void memcpyDeviceToHost( - void *dst, const Index *src, size_t n, - std::function callback = {}) const { - queue_stream()->memcpyDeviceToHost(dst, src, n, callback); - } - /// the memcpy function - template - EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const { - queue_stream()->memcpy(dst, src, n); - } - /// the memset function - EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { - queue_stream()->memset(data, c, n); - } - /// returning the sycl queue - EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { - return queue_stream()->sycl_queue(); - } -#ifdef EIGEN_SYCL_USE_PROGRAM_CLASS - EIGEN_STRONG_INLINE cl::sycl::program &program() const { - return queue_stream()->program(); - } -#endif - - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; } - - EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { - // We won't try to take advantage of the l2 cache for the time being, and - // there is no l3 cache on sycl devices. - return firstLevelCacheSize(); - } - EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { - return queue_stream()->getNumSyclMultiProcessors(); - } - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { - return queue_stream()->maxSyclThreadsPerBlock(); - } - EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { - return queue_stream()->maxWorkItemSizes(); - } - EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const { - // OpenCL doesnot have such concept - return queue_stream()->maxSyclThreadsPerMultiProcessor(); - } - EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { - return queue_stream()->sharedMemPerBlock(); - } - EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const { - return queue_stream()->getNearestPowerOfTwoWorkGroupSize(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const { + ::memcpy(dst, src, n); } - EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const { - return queue_stream()->getPowerOfTwo(val, roundUp); + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(T *dst, const T *src, size_t n) const { + auto host_acc= (static_cast*>(add_sycl_buffer(dst, n).first->second.get()))-> template get_access(); + memcpy(host_acc.get_pointer(), src, n); } - /// No need for sycl it should act the same as CPU version - EIGEN_STRONG_INLINE int majorDeviceVersion() const { - return queue_stream()->majorDeviceVersion(); + /// whith the current implementation of sycl, the data is copied twice from device to host. This will be fixed soon. + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(T *dst, const T *src, size_t n) const { + auto it = buffer_map.find(src); + if (it != buffer_map.end()) { + auto host_acc= (static_cast*>(it->second.get()))-> template get_access(); + memcpy(dst,host_acc.get_pointer(), n); + } else{ + eigen_assert("no device memory found. The memory might be destroyed before creation"); + } } - EIGEN_STRONG_INLINE void synchronize() const { - queue_stream()->synchronize(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void *buffer, int c, size_t n) const { + ::memset(buffer, c, n); } - EIGEN_STRONG_INLINE void async_synchronize( - cl::sycl::event e = cl::sycl::event()) const { - queue_stream()->async_synchronize(e); - } - EIGEN_STRONG_INLINE cl::sycl::event get_latest_event() const { - return queue_stream()->get_latest_event(); - } - - // This function checks if the runtime recorded an error for the - // underlying stream device. - EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); } - - EIGEN_STRONG_INLINE bool has_local_memory() const { - return queue_stream()->has_local_memory(); - } - EIGEN_STRONG_INLINE long max_buffer_size() const { - return queue_stream()->max_buffer_size(); - } - EIGEN_STRONG_INLINE std::string getPlatformName() const { - return queue_stream()->getPlatformName(); - } - EIGEN_STRONG_INLINE std::string getDeviceName() const { - return queue_stream()->getDeviceName(); - } - EIGEN_STRONG_INLINE std::string getDeviceVendor() const { - return queue_stream()->getDeviceVendor(); - } - template - EIGEN_ALWAYS_INLINE void binary_kernel_launcher(T... var) const { - queue_stream()->template binary_kernel_launcher( - var...); - } - template - EIGEN_ALWAYS_INLINE void unary_kernel_launcher(T... var) const { - queue_stream()->template unary_kernel_launcher( - var...); - } - - template - EIGEN_ALWAYS_INLINE void nullary_kernel_launcher(T... var) const { - queue_stream()->template nullary_kernel_launcher( - var...); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { + return 1; } }; + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index e524b535a..a5e084a24 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -12,6 +12,67 @@ namespace Eigen { +// Use the SimpleThreadPool by default. We'll switch to the new non blocking +// thread pool later. +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL +template using ThreadPoolTempl = NonBlockingThreadPoolTempl; +typedef NonBlockingThreadPool ThreadPool; +#else +template using ThreadPoolTempl = SimpleThreadPoolTempl; +typedef SimpleThreadPool ThreadPool; +#endif + + +// Barrier is an object that allows one or more threads to wait until +// Notify has been called a specified number of times. +class Barrier { + public: + Barrier(unsigned int count) : state_(count << 1), notified_(false) { + eigen_assert(((count << 1) >> 1) == count); + } + ~Barrier() { + eigen_plain_assert((state_>>1) == 0); + } + + void Notify() { + unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2; + if (v != 1) { + eigen_assert(((v + 2) & ~1) != 0); + return; // either count has not dropped to 0, or waiter is not waiting + } + std::unique_lock l(mu_); + eigen_assert(!notified_); + notified_ = true; + cv_.notify_all(); + } + + void Wait() { + unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); + if ((v >> 1) == 0) return; + std::unique_lock l(mu_); + while (!notified_) { + cv_.wait(l); + } + } + + private: + std::mutex mu_; + std::condition_variable cv_; + std::atomic state_; // low bit is waiter flag + bool notified_; +}; + + +// Notification is an object that allows a user to to wait for another +// thread to signal a notification that an event has occurred. +// +// Multiple threads can wait on the same Notification object, +// but only one caller must call Notify() on the object. +struct Notification : Barrier { + Notification() : Barrier(1) {}; +}; + + // Runs an arbitrary function and then calls Notify() on the passed in // Notification. template struct FunctionWrapperWithNotification @@ -41,75 +102,22 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { } } -// An abstract interface to a device specific memory allocator. -class Allocator { - public: - virtual ~Allocator() {} - virtual void* allocate(size_t num_bytes) const = 0; - virtual void deallocate(void* buffer) const = 0; -}; // Build a thread pool device on top the an existing pool of threads. struct ThreadPoolDevice { // The ownership of the thread pool remains with the caller. - ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr) - : pool_(pool), num_threads_(num_cores), allocator_(allocator) { } + ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { - return allocator_ ? allocator_->allocate(num_bytes) - : internal::aligned_malloc(num_bytes); + return internal::aligned_malloc(num_bytes); } EIGEN_STRONG_INLINE void deallocate(void* buffer) const { - if (allocator_) { - allocator_->deallocate(buffer); - } else { - internal::aligned_free(buffer); - } - } - - EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { - return allocate(num_bytes); - } - - EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { - deallocate(buffer); - } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const { - return data; + internal::aligned_free(buffer); } EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { -#ifdef __ANDROID__ ::memcpy(dst, src, n); -#else - // TODO(rmlarsen): Align blocks on cache lines. - // We have observed that going beyond 4 threads usually just wastes - // CPU cycles due to the threads competing for memory bandwidth, so we - // statically schedule at most 4 block copies here. - const size_t kMinBlockSize = 32768; - const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4); - if (n <= kMinBlockSize || num_threads < 2) { - ::memcpy(dst, src, n); - } else { - const char* src_ptr = static_cast(src); - char* dst_ptr = static_cast(dst); - const size_t blocksize = (n + (num_threads - 1)) / num_threads; - Barrier barrier(static_cast(num_threads - 1)); - // Launch the last 3 blocks on worker threads. - for (size_t i = 1; i < num_threads; ++i) { - enqueue_with_barrier(&barrier, [n, i, src_ptr, dst_ptr, blocksize] { - ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize, - numext::mini(blocksize, n - (i * blocksize))); - }); - } - // Launch the first block on the main thread. - ::memcpy(dst_ptr, src_ptr, blocksize); - barrier.Wait(); - } -#endif } EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); @@ -126,12 +134,6 @@ struct ThreadPoolDevice { return num_threads_; } - // Number of theads available in the underlying thread pool. This number can - // be different from the value returned by numThreads(). - EIGEN_STRONG_INLINE int numThreadsInPool() const { - return pool_->NumThreads(); - } - EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return l1CacheSize(); } @@ -147,31 +149,23 @@ struct ThreadPoolDevice { } template - EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, - Args&&... args) const { + EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { Notification* n = new Notification(); - pool_->Schedule( - std::bind(&FunctionWrapperWithNotification::run, n, - std::move(f), args...)); + pool_->Schedule(std::bind(&FunctionWrapperWithNotification::run, n, f, args...)); return n; } template - EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, + EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, + Function&& f, Args&&... args) const { - pool_->Schedule( - std::bind(&FunctionWrapperWithBarrier::run, b, - std::move(f), args...)); + pool_->Schedule(std::bind( + &FunctionWrapperWithBarrier::run, b, f, args...)); } template - EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, - Args&&... args) const { - if (sizeof...(args) > 0) { - pool_->Schedule(std::bind(std::move(f), args...)); - } else { - pool_->Schedule(std::move(f)); - } + EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { + pool_->Schedule(std::bind(f, args...)); } // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if @@ -180,189 +174,44 @@ struct ThreadPoolDevice { return pool_->CurrentThreadId(); } - // WARNING: This function is synchronous and will block the calling thread. - // - // Synchronous parallelFor executes f with [0, n) arguments in parallel and - // waits for completion. F accepts a half-open interval [first, last). Block - // size is chosen based on the iteration cost and resulting parallel + // parallelFor executes f with [0, n) arguments in parallel and waits for + // completion. F accepts a half-open interval [first, last). + // Block size is choosen based on the iteration cost and resulting parallel // efficiency. If block_align is not nullptr, it is called to round up the // block size. void parallelFor(Index n, const TensorOpCost& cost, std::function block_align, std::function f) const { - if (EIGEN_PREDICT_FALSE(n <= 0)){ - return; - // Compute small problems directly in the caller thread. - } else if (n == 1 || numThreads() == 1 || - CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { - f(0, n); - return; - } - - // Compute block size and total count of blocks. - ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); - - // Recursively divide size into halves until we reach block_size. - // Division code rounds mid to block_size, so we are guaranteed to get - // block_count leaves that do actual computations. - Barrier barrier(static_cast(block.count)); - std::function handleRange; - handleRange = [=, &handleRange, &barrier, &f](Index firstIdx, - Index lastIdx) { - while (lastIdx - firstIdx > block.size) { - // Split into halves and schedule the second half on a different thread. - const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; - pool_->Schedule([=, &handleRange]() { handleRange(midIdx, lastIdx); }); - lastIdx = midIdx; - } - // Single block or less, execute directly. - f(firstIdx, lastIdx); - barrier.Notify(); - }; - - if (block.count <= numThreads()) { - // Avoid a thread hop by running the root of the tree and one block on the - // main thread. - handleRange(0, n); - } else { - // Execute the root in the thread pool to avoid running work on more than - // numThreads() threads. - pool_->Schedule([=, &handleRange]() { handleRange(0, n); }); - } - - barrier.Wait(); - } - - // Convenience wrapper for parallelFor that does not align blocks. - void parallelFor(Index n, const TensorOpCost& cost, - std::function f) const { - parallelFor(n, cost, nullptr, std::move(f)); - } - - // WARNING: This function is asynchronous and will not block the calling thread. - // - // Asynchronous parallelFor executes f with [0, n) arguments in parallel - // without waiting for completion. When the last block finished, it will call - // 'done' callback. F accepts a half-open interval [first, last). Block size - // is chosen based on the iteration cost and resulting parallel efficiency. If - // block_align is not nullptr, it is called to round up the block size. - void parallelForAsync(Index n, const TensorOpCost& cost, - std::function block_align, - std::function f, - std::function done) const { - // Compute small problems directly in the caller thread. + typedef TensorCostModel CostModel; if (n <= 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast(numThreads())) == 1) { f(0, n); - done(); return; } - // Compute block size and total count of blocks. - ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align); + // Calculate block size based on (1) the iteration cost and (2) parallel + // efficiency. We want blocks to be not too small to mitigate + // parallelization overheads; not too large to mitigate tail + // effect and potential load imbalance and we also want number + // of blocks to be evenly dividable across threads. - ParallelForAsyncContext* const ctx = - new ParallelForAsyncContext(block.count, std::move(f), std::move(done)); - - // Recursively divide size into halves until we reach block_size. - // Division code rounds mid to block_size, so we are guaranteed to get - // block_count leaves that do actual computations. - ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) { - while (lastIdx - firstIdx > block.size) { - // Split into halves and schedule the second half on a different thread. - const Index midIdx = firstIdx + divup((lastIdx - firstIdx) / 2, block.size) * block.size; - pool_->Schedule( - [ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); }); - lastIdx = midIdx; - } - - // Single block or less, execute directly. - ctx->f(firstIdx, lastIdx); - - // Delete async context if it was the last block. - if (ctx->count.fetch_sub(1) == 1) delete ctx; - }; - - if (block.count <= numThreads()) { - // Avoid a thread hop by running the root of the tree and one block on the - // main thread. - ctx->handle_range(0, n); - } else { - // Execute the root in the thread pool to avoid running work on more than - // numThreads() threads. - pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); }); - } - } - - // Convenience wrapper for parallelForAsync that does not align blocks. - void parallelForAsync(Index n, const TensorOpCost& cost, - std::function f, - std::function done) const { - parallelForAsync(n, cost, nullptr, std::move(f), std::move(done)); - } - - // Thread pool accessor. - ThreadPoolInterface* getPool() const { return pool_; } - - // Allocator accessor. - Allocator* allocator() const { return allocator_; } - - private: - typedef TensorCostModel CostModel; - - // For parallelForAsync we must keep passed in closures on the heap, and - // delete them only after `done` callback finished. - struct ParallelForAsyncContext { - ParallelForAsyncContext(Index block_count, - std::function block_f, - std::function done_callback) - : count(block_count), - f(std::move(block_f)), - done(std::move(done_callback)) {} - ~ParallelForAsyncContext() { done(); } - - std::atomic count; - std::function f; - std::function done; - - std::function handle_range; - }; - - struct ParallelForBlock { - Index size; // block size - Index count; // number of blocks - }; - - // Calculates block size based on (1) the iteration cost and (2) parallel - // efficiency. We want blocks to be not too small to mitigate parallelization - // overheads; not too large to mitigate tail effect and potential load - // imbalance and we also want number of blocks to be evenly dividable across - // threads. - ParallelForBlock CalculateParallelForBlock( - const Index n, const TensorOpCost& cost, - std::function block_align) const { - const double block_size_f = 1.0 / CostModel::taskSize(1, cost); + double block_size_f = 1.0 / CostModel::taskSize(1, cost); const Index max_oversharding_factor = 4; Index block_size = numext::mini( - n, numext::maxi( - divup(n, max_oversharding_factor * numThreads()), - block_size_f)); + n, numext::maxi(divup(n, max_oversharding_factor * numThreads()), + block_size_f)); const Index max_block_size = numext::mini(n, 2 * block_size); - if (block_align) { Index new_block_size = block_align(block_size); eigen_assert(new_block_size >= block_size); block_size = numext::mini(n, new_block_size); } - Index block_count = divup(n, block_size); - // Calculate parallel efficiency as fraction of total CPU time used for // computations: double max_efficiency = static_cast(block_count) / (divup(block_count, numThreads()) * numThreads()); - // Now try to increase block size up to max_block_size as long as it // doesn't decrease parallel efficiency. for (Index prev_block_count = block_count; @@ -395,12 +244,36 @@ struct ThreadPoolDevice { } } - return {block_size, block_count}; + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + Barrier barrier(static_cast(block_count)); + std::function handleRange; + handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { + if (last - first <= block_size) { + // Single block or less, execute directly. + f(first, last); + barrier.Notify(); + return; + } + // Split into halves and submit to the pool. + Index mid = first + divup((last - first) / 2, block_size) * block_size; + pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); + pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); + }; + handleRange(0, n); + barrier.Wait(); } + // Convenience wrapper for parallelFor that does not align blocks. + void parallelFor(Index n, const TensorOpCost& cost, + std::function f) const { + parallelFor(n, cost, nullptr, std::move(f)); + } + + private: ThreadPoolInterface* pool_; int num_threads_; - Allocator* allocator_; }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index 132458a20..451940de3 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -32,16 +32,16 @@ namespace Eigen { // Boilerplate code namespace internal { -template struct dget { - static const std::ptrdiff_t value = get::value; +template struct dget { + static const std::size_t value = get::value; }; -template +template struct fixed_size_tensor_index_linearization_helper { template EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Index run(array const& indices, + static inline Index run(array const& indices, const Dimensions& dimensions) { return array_get(indices) + @@ -50,21 +50,21 @@ struct fixed_size_tensor_index_linearization_helper } }; -template +template struct fixed_size_tensor_index_linearization_helper { template EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Index run(array const&, const Dimensions&) + static inline Index run(array const&, const Dimensions&) { return 0; } }; -template +template struct fixed_size_tensor_index_extraction_helper { template EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Index run(const Index index, + static inline Index run(const Index index, const Dimensions& dimensions) { const Index mult = (index == n-1) ? 1 : 0; @@ -77,7 +77,7 @@ template struct fixed_size_tensor_index_extraction_helper { template EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE Index run(const Index, + static inline Index run(const Index, const Dimensions&) { return 0; @@ -90,11 +90,9 @@ struct fixed_size_tensor_index_extraction_helper // Fixed size #ifndef EIGEN_EMULATE_CXX11_META_H template -struct Sizes { +struct Sizes : internal::numeric_list { typedef internal::numeric_list Base; - const Base t = Base(); static const std::ptrdiff_t total_size = internal::arg_prod(Indices...); - static const ptrdiff_t count = Base::count; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { return Base::count; @@ -121,17 +119,17 @@ struct Sizes { return *this; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::ptrdiff_t index) const { - return internal::fixed_size_tensor_index_extraction_helper::run(index, t); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[] (const std::size_t index) const { + return internal::fixed_size_tensor_index_extraction_helper::run(index, *this); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - ptrdiff_t IndexOfColMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, t); + size_t IndexOfColMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - ptrdiff_t IndexOfRowMajor(const array& indices) const { - return internal::fixed_size_tensor_index_linearization_helper::run(indices, t); + size_t IndexOfRowMajor(const array& indices) const { + return internal::fixed_size_tensor_index_linearization_helper::run(indices, *static_cast(this)); } }; @@ -144,25 +142,25 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes +template struct non_zero_size { - typedef internal::type2val type; + typedef internal::type2val type; }; template <> struct non_zero_size<0> { typedef internal::null_type type; }; -template struct Sizes { +template struct Sizes { typedef typename internal::make_type_list::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type, typename non_zero_size::type >::type Base; - static const std::ptrdiff_t count = Base::count; - static const std::ptrdiff_t total_size = internal::arg_prod::value; + static const size_t count = Base::count; + static const std::size_t total_size = internal::arg_prod::value; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t rank() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { return count; } - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t TotalSize() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t TotalSize() { return internal::arg_prod::value; } @@ -178,7 +176,7 @@ template Sizes(DenseIndex... /*indices*/) { } - explicit Sizes(std::initializer_list) { + explicit Sizes(std::initializer_list) { // todo: add assertion } #else @@ -213,18 +211,18 @@ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - ptrdiff_t IndexOfColMajor(const array& indices) const { + size_t IndexOfColMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - ptrdiff_t IndexOfRowMajor(const array& indices) const { + size_t IndexOfRowMajor(const array& indices) const { return internal::fixed_size_tensor_index_linearization_helper::run(indices, *reinterpret_cast(this)); } }; namespace internal { -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes&) { +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_prod(const Sizes&) { return Sizes::total_size; } } @@ -233,7 +231,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes +template struct tensor_index_linearization_helper { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -245,7 +243,7 @@ struct tensor_index_linearization_helper } }; -template +template struct tensor_index_linearization_helper { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -264,7 +262,7 @@ struct DSizes : array { typedef array Base; static const int count = NumDims; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t rank() const { return NumDims; } @@ -284,57 +282,6 @@ struct DSizes : array { (*this)[0] = i0; } - EIGEN_DEVICE_FUNC DSizes(const DimensionList& a) { - for (int i = 0 ; i < NumDims; ++i) { - (*this)[i] = a[i]; - } - } - - // Enable DSizes index type promotion only if we are promoting to the - // larger type, e.g. allow to promote dimensions of type int to long. - template - EIGEN_DEVICE_FUNC - explicit DSizes(const array& other, - // Default template parameters require c++11. - typename internal::enable_if< - internal::is_same< - DenseIndex, - typename internal::promote_index_type< - DenseIndex, - OtherIndex - >::type - >::value, void*>::type = 0) { - for (int i = 0; i < NumDims; ++i) { - (*this)[i] = static_cast(other[i]); - } - } - -#ifdef EIGEN_HAS_INDEX_LIST - template - EIGEN_DEVICE_FUNC - explicit DSizes(const Eigen::IndexList& dimensions) { - for (int i = 0; i < dimensions.count; ++i) { - (*this)[i] = dimensions[i]; - } - } -#endif - -#ifndef EIGEN_EMULATE_CXX11_META_H - template - EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { - for (int i = 0 ; i < NumDims; ++i) { - (*this)[i] = a[i]; - } - } -#else - template - EIGEN_DEVICE_FUNC DSizes(const Sizes& a) { - for (int i = 0 ; i < NumDims; ++i) { - (*this)[i] = a[i]; - } - } -#endif - #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { @@ -383,21 +330,12 @@ struct DSizes : array { } }; -template -std::ostream& operator<<(std::ostream& os, - const DSizes& dims) { - os << "["; - for (int i = 0; i < NumDims; ++i) { - if (i > 0) os << ", "; - os << dims[i]; - } - os << "]"; - return os; -} + + // Boilerplate namespace internal { -template +template struct tensor_vsize_index_linearization_helper { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -409,7 +347,7 @@ struct tensor_vsize_index_linearization_helper } }; -template +template struct tensor_vsize_index_linearization_helper { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -424,10 +362,10 @@ struct tensor_vsize_index_linearization_helper namespace internal { template struct array_size > { - static const ptrdiff_t value = NumDims; + static const size_t value = NumDims; }; template struct array_size > { - static const ptrdiff_t value = NumDims; + static const size_t value = NumDims; }; #ifndef EIGEN_EMULATE_CXX11_META_H template struct array_size > { @@ -437,42 +375,42 @@ template struct array_size::count; }; template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { - return get >::value; + return get >::value; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) { eigen_assert(false && "should never be called"); return -1; } #else -template struct array_size > { - static const ptrdiff_t value = Sizes::count; +template struct array_size > { + static const size_t value = Sizes::count; }; -template struct array_size > { - static const ptrdiff_t value = Sizes::count; +template struct array_size > { + static const size_t value = Sizes::count; }; -template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes&) { +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t array_get(const Sizes&) { return get::Base>::value; } #endif -template +template struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { return false; } }; -template +template struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1& dims1, Dims2& dims2) { return (array_get(dims1) == array_get(dims2)) & sizes_match_below_dim::run(dims1, dims2); } }; template struct sizes_match_below_dim { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { + static EIGEN_DEVICE_FUNC inline bool run(Dims1&, Dims2&) { return true; } }; @@ -481,7 +419,7 @@ struct sizes_match_below_dim { template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) { +EIGEN_DEVICE_FUNC bool dimensions_match(Dims1& dims1, Dims2& dims2) { return internal::sizes_match_below_dim::value, internal::array_size::value>::run(dims1, dims2); } diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 4689b0230..06987132b 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -32,7 +32,6 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename MakePointer_::Type PointerType; enum { Flags = 0 @@ -42,8 +41,6 @@ struct traits > // Intermediate typedef to workaround MSVC issue. typedef MakePointer_ MakePointerT; typedef typename MakePointerT::Type Type; - - }; }; @@ -76,8 +73,6 @@ class TensorEvalToOp : public TensorBase, typedef typename Eigen::internal::traits::StorageKind StorageKind; typedef typename Eigen::internal::traits::Index Index; - static const int NumDims = Eigen::internal::traits::NumDimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr) : m_xpr(expr), m_buffer(buffer) {} @@ -103,60 +98,38 @@ struct TensorEvaluator, Device> typedef typename XprType::Index Index; typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; - typedef typename Eigen::internal::traits::PointerType TensorPointerType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; + enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = true, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = true + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = true }; - static const int NumDims = internal::traits::NumDimensions; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename TensorEvaluator::TensorBlock - ArgTensorBlock; - - typedef internal::TensorBlockAssignment< - CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index> - TensorBlockAssignment; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){} - + : m_impl(op.expression(), device), m_device(device), + m_buffer(op.buffer()), m_op(op), m_expression(op.expression()) + { } + // Used for accessor extraction in SYCL Managed TensorMap: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& op() const { + return m_op; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { } - + typedef typename internal::traits >::template MakePointer::Type DevicePointer; EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(DevicePointer scalar) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); return m_impl.evalSubExprsIfNeeded(m_buffer); } -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType scalar, EvalSubExprsCallback done) { - EIGEN_UNUSED_VARIABLE(scalar); - eigen_assert(scalar == NULL); - m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done)); - } -#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) { m_buffer[i] = m_impl.coeff(i); } @@ -164,33 +137,6 @@ struct TensorEvaluator, Device> internal::pstoret(m_buffer + i, m_impl.template packet::IsAligned ? Aligned : Unaligned>(i)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return m_impl.getResourceRequirements(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock( - TensorBlockDesc& desc, TensorBlockScratch& scratch) { - // Add `m_buffer` as destination buffer to the block descriptor. - desc.template AddDestinationBuffer( - /*dst_base=*/m_buffer + desc.offset(), - /*dst_strides=*/internal::strides(m_impl.dimensions())); - - ArgTensorBlock block = - m_impl.block(desc, scratch, /*root_of_expr_ast=*/true); - - // If block was evaluated into a destination buffer, there is no need to do - // an assignment. - if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) { - TensorBlockAssignment::Run( - TensorBlockAssignment::target( - desc.dimensions(), internal::strides(m_impl.dimensions()), - m_buffer, desc.offset()), - block.expr()); - } - block.cleanup(); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -213,20 +159,19 @@ struct TensorEvaluator, Device> TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; } + EIGEN_DEVICE_FUNC DevicePointer data() const { return m_buffer; } ArgType expression() const { return m_expression; } - #ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - m_buffer.bind(cgh); - } - #endif + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_impl; } + /// added for sycl in order to construct the buffer from the sycl device + const Device& device() const{return m_device;} private: TensorEvaluator m_impl; - EvaluatorPointerType m_buffer; + const Device& m_device; + DevicePointer m_buffer; + const XprType& m_op; const ArgType m_expression; }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d4532b72c..834ce07df 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -32,72 +32,44 @@ struct TensorEvaluator typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; - typedef Derived XprType; - static const int PacketSize = PacketType::size; - typedef typename internal::traits::template MakePointer::Type TensorPointerType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? internal::traits::NumDimensions : 0; enum { - IsAligned = Derived::IsAligned, - PacketAccess = (PacketType::size > 1), - BlockAccess = internal::is_arithmetic::type>::value, - PreferBlockAccess = false, - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true + IsAligned = Derived::IsAligned, + PacketAccess = (internal::unpacket_traits::size > 1), + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true }; - typedef typename internal::remove_const::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename internal::TensorMaterializedBlock - TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(device.get((const_cast(m.data())))), - m_dims(m.dimensions()), - m_device(device) + : m_data(const_cast::template MakePointer::Type>(m.data())), m_dims(m.dimensions()), m_device(device), m_impl(m) { } - + // Used for accessor extraction in SYCL Managed TensorMap: + const Derived& derived() const { return m_impl; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { - if (!NumTraits::type>::RequireInitialization && dest) { - m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* dest) { + if (dest) { + m_device.memcpy((void*)dest, m_data, sizeof(Scalar) * m_dims.TotalSize()); return false; } return true; } -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType dest, EvalSubExprsCallback done) { - // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. - done(evalSubExprsIfNeeded(dest)); - } -#endif // EIGEN_USE_THREADS - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data != NULL); + eigen_assert(m_data); return m_data[index]; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - eigen_assert(m_data != NULL); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { + eigen_assert(m_data); return m_data[index]; } @@ -107,18 +79,6 @@ struct TensorEvaluator return internal::ploadt(m_data + index); } - // Return a packet starting at `index` where `umask` specifies which elements - // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for - // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding - // float element will be loaded, otherwise 0 will be loaded. - // Function has been templatized to enable Sfinae. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if::masked_load_available, PacketReturnTypeT>::type - partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const - { - return internal::ploadu(m_data + index, umask); - } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { @@ -126,7 +86,7 @@ struct TensorEvaluator } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data != NULL); + eigen_assert(m_data); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -134,9 +94,8 @@ struct TensorEvaluator } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& - coeffRef(const array& coords) { - eigen_assert(m_data != NULL); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array& coords) { + eigen_assert(m_data); if (static_cast(Layout) == static_cast(ColMajor)) { return m_data[m_dims.IndexOfColMajor(coords)]; } else { @@ -146,50 +105,19 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - PacketType::size); + internal::unpacket_traits::size); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return internal::TensorBlockResourceRequirements::any(); - } + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - assert(m_data != NULL); - return TensorBlock::materialize(m_data, m_dims, desc, scratch); - } + /// required by sycl in order to construct sycl buffer from raw pointer + const Device& device() const{return m_device;} - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( - const TensorBlockDesc& desc, const TensorBlock& block) { - assert(m_data != NULL); - - typedef typename TensorBlock::XprType TensorBlockExpr; - typedef internal::TensorBlockAssignment - TensorBlockAssign; - - TensorBlockAssign::Run( - TensorBlockAssign::target(desc.dimensions(), - internal::strides(m_dims), m_data, - desc.offset()), - block.expr()); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_data.bind(cgh); - } -#endif protected: - EvaluatorPointerType m_data; + typename internal::traits::template MakePointer::Type m_data; Dimensions m_dims; - const Device EIGEN_DEVICE_REF m_device; + const Device& m_device; + const Derived& m_impl; }; namespace { @@ -198,7 +126,7 @@ T loadConstant(const T* address) { return *address; } // Use the texture cache on CUDA devices whenever possible -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) { return __ldg(address); @@ -212,13 +140,6 @@ Eigen::half loadConstant(const Eigen::half* address) { return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); } #endif -#ifdef EIGEN_USE_SYCL -// overload of load constant should be implemented here based on range access -template -T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess &address) { - return *address; -} -#endif } @@ -231,64 +152,40 @@ struct TensorEvaluator typedef typename Derived::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef typename Derived::Dimensions Dimensions; - typedef const Derived XprType; - typedef typename internal::traits::template MakePointer::Type TensorPointerType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - - typedef typename internal::remove_const::type ScalarNoConst; // NumDimensions is -1 for variable dim tensors static const int NumCoords = internal::traits::NumDimensions > 0 ? internal::traits::NumDimensions : 0; - static const int PacketSize = PacketType::size; enum { - IsAligned = Derived::IsAligned, - PacketAccess = (PacketType::size > 1), - BlockAccess = internal::is_arithmetic::value, - PreferBlockAccess = false, - Layout = Derived::Layout, - CoordAccess = NumCoords > 0, - RawAccess = true + IsAligned = Derived::IsAligned, + PacketAccess = (internal::unpacket_traits::size > 1), + Layout = Derived::Layout, + CoordAccess = NumCoords > 0, + RawAccess = true }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename internal::TensorMaterializedBlock - TensorBlock; - //===--------------------------------------------------------------------===// + // Used for accessor extraction in SYCL Managed TensorMap: + const Derived& derived() const { return m_impl; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) - : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) + : m_data(m.data()), m_dims(m.dimensions()), m_device(device), m_impl(m) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { if (!NumTraits::type>::RequireInitialization && data) { - m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); + m_device.memcpy((void*)data, m_data, m_dims.TotalSize() * sizeof(Scalar)); return false; } return true; } -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType dest, EvalSubExprsCallback done) { - // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. - done(evalSubExprsIfNeeded(dest)); - } -#endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - eigen_assert(m_data != NULL); + eigen_assert(m_data); return loadConstant(m_data+index); } @@ -298,20 +195,8 @@ struct TensorEvaluator return internal::ploadt_ro(m_data + index); } - // Return a packet starting at `index` where `umask` specifies which elements - // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for - // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding - // float element will be loaded, otherwise 0 will be loaded. - // Function has been templatized to enable Sfinae. - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - typename internal::enable_if::masked_load_available, PacketReturnTypeT>::type - partialPacket(Index index, typename internal::unpacket_traits::mask_t umask) const - { - return internal::ploadu(m_data + index, umask); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array& coords) const { - eigen_assert(m_data != NULL); + eigen_assert(m_data); const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); return loadConstant(m_data+index); @@ -319,32 +204,19 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - PacketType::size); + internal::unpacket_traits::size); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return internal::TensorBlockResourceRequirements::any(); - } + EIGEN_DEVICE_FUNC typename internal::traits::template MakePointer::Type data() const { return m_data; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - assert(m_data != NULL); - return TensorBlock::materialize(m_data, m_dims, desc, scratch); - } + /// added for sycl in order to construct the buffer from the sycl device + const Device& device() const{return m_device;} - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_data.bind(cgh); - } -#endif protected: - EvaluatorPointerType m_data; + typename internal::traits::template MakePointer::Type m_data; Dimensions m_dims; - const Device EIGEN_DEVICE_REF m_device; + const Device& m_device; + const Derived& m_impl; }; @@ -357,6 +229,14 @@ struct TensorEvaluator, Device> { typedef TensorCwiseNullaryOp XprType; + enum { + IsAligned = true, + PacketAccess = internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() @@ -366,41 +246,12 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - - enum { - IsAligned = true, - PacketAccess = internal::functor_traits::PacketAccess - #ifdef EIGEN_USE_SYCL - && (PacketType::size >1) - #endif - , - BlockAccess = false, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false - }; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } - -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - done(true); - } -#endif // EIGEN_USE_THREADS - + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const @@ -417,17 +268,16 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, - PacketType::size); + internal::unpacket_traits::size); } - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_argImpl; } + /// required by sycl in order to extract the accessor + NullaryOp functor() const { return m_functor; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_argImpl.bind(cgh); - } -#endif private: const NullaryOp m_functor; @@ -445,59 +295,31 @@ struct TensorEvaluator, Device> typedef TensorCwiseUnaryOp XprType; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_device(device), - m_functor(op.functor()), + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; - typedef typename internal::remove_const::type ScalarNoConst; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - static const int NumDims = internal::array_size::value; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename TensorEvaluator::TensorBlock - ArgTensorBlock; - - typedef internal::TensorCwiseUnaryBlock - TensorBlock; - //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { m_argImpl.evalSubExprsIfNeeded(NULL); return true; } - -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } @@ -519,31 +341,15 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - static const double functor_cost = internal::functor_traits::Cost; - return m_argImpl.getResourceRequirements().addCostPerCoeff( - {0, 0, functor_cost / PacketSize}); - } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - return TensorBlock(m_argImpl.block(desc, scratch), m_functor); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const{ - m_argImpl.bind(cgh); - } -#endif + /// required by sycl in order to extract the accessor + const TensorEvaluator & impl() const { return m_argImpl; } + /// added for sycl in order to construct the buffer from sycl device + UnaryOp functor() const { return m_functor; } private: - const Device EIGEN_DEVICE_REF m_device; const UnaryOp m_functor; TensorEvaluator m_argImpl; }; @@ -557,23 +363,16 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess | - TensorEvaluator::PreferBlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_device(device), - m_functor(op.functor()), + : m_functor(op.functor()), m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { @@ -585,27 +384,8 @@ struct TensorEvaluator::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - - static const int NumDims = internal::array_size< - typename TensorEvaluator::Dimensions>::value; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename TensorEvaluator::TensorBlock - LeftTensorBlock; - typedef typename TensorEvaluator::TensorBlock - RightTensorBlock; - - typedef internal::TensorCwiseBinaryBlock - TensorBlock; - //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -613,24 +393,11 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - // TODO(ezhulenev): Evaluate two expression in parallel? - m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { - m_rightImpl.evalSubExprsIfNeededAsync(nullptr, - [done](bool) { done(true); }); - }); - } -#endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -654,34 +421,15 @@ struct TensorEvaluator::Cost; - return internal::TensorBlockResourceRequirements::merge( - m_leftImpl.getResourceRequirements(), - m_rightImpl.getResourceRequirements()) - .addCostPerCoeff({0, 0, functor_cost / PacketSize}); - } + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& left_impl() const { return m_leftImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& right_impl() const { return m_rightImpl; } + /// required by sycl in order to extract the accessor + BinaryOp functor() const { return m_functor; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - desc.DropDestinationBuffer(); - return TensorBlock(m_leftImpl.block(desc, scratch), - m_rightImpl.block(desc, scratch), m_functor); - } - - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - - #ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_leftImpl.bind(cgh); - m_rightImpl.bind(cgh); - } - #endif private: - const Device EIGEN_DEVICE_REF m_device; const BinaryOp m_functor; TensorEvaluator m_leftImpl; TensorEvaluator m_rightImpl; @@ -696,17 +444,11 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess && - TensorEvaluator::PacketAccess && - TensorEvaluator::PacketAccess && - internal::functor_traits::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess || - TensorEvaluator::PreferBlockAccess || - TensorEvaluator::PreferBlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::functor_traits::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -737,14 +479,8 @@ struct TensorEvaluator::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -752,7 +488,7 @@ struct TensorEvaluator & arg1Impl() const { return m_arg1Impl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& arg2Impl() const { return m_arg2Impl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& arg3Impl() const { return m_arg3Impl; } private: const TernaryOp m_functor; @@ -813,20 +547,12 @@ struct TensorEvaluator typedef typename XprType::Scalar Scalar; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess & - PacketType::HasBlend, - BlockAccess = TensorEvaluator::BlockAccess && - TensorEvaluator::BlockAccess && - TensorEvaluator::BlockAccess, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess || - TensorEvaluator::PreferBlockAccess || - TensorEvaluator::PreferBlockAccess, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess & + internal::packet_traits::HasBlend, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) @@ -843,42 +569,8 @@ struct TensorEvaluator typedef typename XprType::Index Index; typedef typename internal::traits::Scalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; + static const int PacketSize = internal::unpacket_traits::size; typedef typename TensorEvaluator::Dimensions Dimensions; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - - static const int NumDims = internal::array_size::value; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename TensorEvaluator::TensorBlock - IfArgTensorBlock; - typedef typename TensorEvaluator::TensorBlock - ThenArgTensorBlock; - typedef typename TensorEvaluator::TensorBlock - ElseArgTensorBlock; - - struct TensorSelectOpBlockFactory { - template - struct XprType { - typedef TensorSelectOp type; - }; - - template - typename XprType::type expr( - const IfArgXprType& if_expr, const ThenArgXprType& then_expr, const ElseArgXprType& else_expr) const { - return typename XprType::type(if_expr, then_expr, else_expr); - } - }; - - typedef internal::TensorTernaryExprBlock - TensorBlock; - //===--------------------------------------------------------------------===// EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { @@ -886,25 +578,12 @@ struct TensorEvaluator return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { m_condImpl.evalSubExprsIfNeeded(NULL); m_thenImpl.evalSubExprsIfNeeded(NULL); m_elseImpl.evalSubExprsIfNeeded(NULL); return true; } - -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { - m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { - m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); }); - }); - }); - } -#endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_condImpl.cleanup(); m_thenImpl.cleanup(); @@ -918,15 +597,13 @@ struct TensorEvaluator template EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { - internal::Selector select; - EIGEN_UNROLL_LOOP - for (Index i = 0; i < PacketSize; ++i) { - select.select[i] = m_condImpl.coeff(index+i); - } - return internal::pblend(select, - m_thenImpl.template packet(index), - m_elseImpl.template packet(index)); - + internal::Selector select; + for (Index i = 0; i < PacketSize; ++i) { + select.select[i] = m_condImpl.coeff(index+i); + } + return internal::pblend(select, + m_thenImpl.template packet(index), + m_elseImpl.template packet(index)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -936,42 +613,14 @@ struct TensorEvaluator .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - auto then_req = m_thenImpl.getResourceRequirements(); - auto else_req = m_elseImpl.getResourceRequirements(); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } + /// required by sycl in order to extract the accessor + const TensorEvaluator & cond_impl() const { return m_condImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& then_impl() const { return m_thenImpl; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& else_impl() const { return m_elseImpl; } - auto merged_req = - internal::TensorBlockResourceRequirements::merge(then_req, else_req); - merged_req.cost_per_coeff = - then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff); - - return internal::TensorBlockResourceRequirements::merge( - m_condImpl.getResourceRequirements(), merged_req); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - // It's unsafe to pass destination buffer to underlying expressions, because - // output might be aliased with one of the inputs. - desc.DropDestinationBuffer(); - - return TensorBlock( - m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch), - m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_condImpl.bind(cgh); - m_thenImpl.bind(cgh); - m_elseImpl.bind(cgh); - } -#endif private: TensorEvaluator m_condImpl; TensorEvaluator m_thenImpl; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index c52fb77dc..f01d77c0a 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -12,94 +12,31 @@ namespace Eigen { -/** - * \class TensorExecutor - * \ingroup CXX11_Tensor_Module - * - * \brief The tensor executor class. - * - * This class is responsible for launch the evaluation of the expression on - * the specified computing device. - * - * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and - * instructions) - * @tparam Tiling can use block based tensor evaluation - * (see TensorBlock.h) - */ +/** \class TensorExecutor + * \ingroup CXX11_Tensor_Module + * + * \brief The tensor executor class. + * + * This class is responsible for launch the evaluation of the expression on + * the specified computing device. + */ namespace internal { -/** - * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely - * expensive. If expression has at least one broadcast op in it, and it supports - * block based evaluation, we always prefer it, even for the small tensors. For - * all other tileable ops, block evaluation overhead for small tensors (fits - * into L1) is too large, and we fallback on vectorized evaluation. - */ - -// TODO(ezhulenev): Add specializations for all other types of Tensor ops. - -template -struct ExpressionHasTensorBroadcastingOp { - enum { value = false }; -}; - -template -struct ExpressionHasTensorBroadcastingOp< - const TensorAssignOp > { - enum { value = ExpressionHasTensorBroadcastingOp::value }; -}; - -template -struct ExpressionHasTensorBroadcastingOp< - const TensorCwiseUnaryOp > { - enum { value = ExpressionHasTensorBroadcastingOp::value }; -}; - -template -struct ExpressionHasTensorBroadcastingOp< - const TensorCwiseBinaryOp > { - enum { - value = ExpressionHasTensorBroadcastingOp::value || - ExpressionHasTensorBroadcastingOp::value - }; -}; - -template -struct ExpressionHasTensorBroadcastingOp< - const TensorBroadcastingOp > { - enum { value = true }; -}; - -// -------------------------------------------------------------------------- // - -/** - * Default strategy: the expression is evaluated sequentially with a single cpu - * thread, without vectorization and block evaluation. - */ -template -class TensorExecutor { +// Default strategy: the expression is evaluated with a single cpu thread. +template +class TensorExecutor +{ public: - typedef typename Expression::Index StorageIndex; - - // Including `unsupported/Eigen/CXX11/Tensor` in different translation units - // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR - // violation. If this template is instantiated with a non-default device, it - // means that this header file was included without defining - // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`. - static_assert(std::is_same::value, - "Default executor instantiated with non-default device. " - "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or " - "EIGEN_USE_SYCL before including Eigen headers."); - + typedef typename Expression::Index Index; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const Device& device = Device()) { + static inline void run(const Expression& expr, const Device& device = Device()) + { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) { - const StorageIndex size = array_prod(evaluator.dimensions()); - for (StorageIndex i = 0; i < size; ++i) { + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + for (Index i = 0; i < size; ++i) { evaluator.evalScalar(i); } } @@ -107,48 +44,35 @@ class TensorExecutor { } }; -/** - * Default async execution strategy is not implemented. Currently it's only - * available for ThreadPoolDevice (see definition below). - */ -template -class TensorAsyncExecutor {}; -/** - * Process all the data with a single cpu thread, using vectorized instructions. - */ -template -class TensorExecutor { +template +class TensorExecutor +{ public: - typedef typename Expression::Index StorageIndex; - + typedef typename Expression::Index Index; EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE void run( - const Expression& expr, const DefaultDevice& device = DefaultDevice()) { + static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) + { TensorEvaluator evaluator(expr, device); const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) { - const StorageIndex size = array_prod(evaluator.dimensions()); - const int PacketSize = unpacket_traits::PacketReturnType>::size; - - // Give compiler a strong possibility to unroll the loop. But don't insist - // on unrolling, because if the function is expensive compiler should not + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); + const int PacketSize = unpacket_traits::PacketReturnType>::size; + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not // unroll the loop at the expense of inlining. - const StorageIndex UnrolledSize = - (size / (4 * PacketSize)) * 4 * PacketSize; - for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) { - for (StorageIndex j = 0; j < 4; j++) { + const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; + for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { + for (Index j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } - const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize; - for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) { + const Index VectorizedSize = (size / PacketSize) * PacketSize; + for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { evaluator.evalPacket(i); } - for (StorageIndex i = VectorizedSize; i < size; ++i) { + for (Index i = VectorizedSize; i < size; ++i) { evaluator.evalScalar(i); } } @@ -156,162 +80,55 @@ class TensorExecutor -class TensorExecutor { - public: - typedef typename traits::Scalar Scalar; - typedef typename remove_const::type ScalarNoConst; - typedef TensorEvaluator Evaluator; - typedef typename traits::Index StorageIndex; - static const int NumDims = traits::NumDimensions; - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const DefaultDevice& device = DefaultDevice()) { - typedef TensorBlockMapper - TensorBlockMapper; - - typedef internal::TensorBlockDescriptor - TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator - TensorBlockScratch; - - Evaluator evaluator(expr, device); - - // TODO(ezhulenev): Do not use tiling for small tensors? - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - - if (needs_assign) { - // Query expression tree for desired block size/shape. - const TensorBlockResourceRequirements requirements = - evaluator.getResourceRequirements(); - - const TensorBlockMapper block_mapper( - typename TensorBlockDesc::Dimensions(evaluator.dimensions()), - requirements); - - // Share scratch memory allocator between all blocks. - TensorBlockScratch scratch(device); - - const StorageIndex total_block_count = block_mapper.blockCount(); - for (StorageIndex i = 0; i < total_block_count; ++i) { - TensorBlockDesc desc = block_mapper.blockDescriptor(i); - evaluator.evalBlock(desc, scratch); - scratch.reset(); - } - } - evaluator.cleanup(); - } -}; - -/** - * Multicore strategy: the index space is partitioned and each partition is - * executed on a single core. - * - * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread - * pool, and will block the caller thread until all tasks are finished. - * - * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to - * the ThreadPoolDevice managed thread pool, and will return immediately. - * It will call 'done' callback after all tasks are finished. - */ +// Multicore strategy: the index space is partitioned and each partition is executed on a single core #ifdef EIGEN_USE_THREADS - -template -struct TensorExecutorTilingContext { - TensorExecutorTilingContext() = default; - TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, - const TensorOpCost& b_cost, size_t b_aligned_size) - : block_mapper(b_mapper), - cost(b_cost), - aligned_blocksize(b_aligned_size) {} - - TensorBlockMapper block_mapper; // navigate through blocks - TensorOpCost cost; // cost of computing a single block - size_t aligned_blocksize; // block size after memory alignment -}; - -// Computes a block evaluation parameters, and allocates temporary memory buffer -// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below. -template -TensorExecutorTilingContext GetTensorExecutorTilingContext( - const Evaluator& evaluator) { - // Query expression tree for desired block size/shape. - TensorBlockResourceRequirements requirements = - evaluator.getResourceRequirements(); - - // Update target block size based on cost model. - double taskSize = TensorCostModel::taskSize( - 1, requirements.cost_per_coeff); - requirements.size = static_cast(1.0 / taskSize); - - TensorBlockMapper block_mapper( - typename TensorBlockMapper::Dimensions(evaluator.dimensions()), - requirements); - - size_t block_size = block_mapper.blockTotalSize(); - const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); - const size_t aligned_blocksize = - align * - divup(block_size * sizeof(typename Evaluator::Scalar), align); - - return {block_mapper, requirements.cost_per_coeff * block_size, - aligned_blocksize}; -} - -template +template struct EvalRange { - static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, - const StorageIndex lastIdx) { + static void run(Evaluator* evaluator_in, const Index first, const Index last) { Evaluator evaluator = *evaluator_in; - eigen_assert(lastIdx >= firstIdx); - for (StorageIndex i = firstIdx; i < lastIdx; ++i) { + eigen_assert(last >= first); + for (Index i = first; i < last; ++i) { evaluator.evalScalar(i); } } - static StorageIndex alignBlockSize(StorageIndex size) { return size; } + static Index alignBlockSize(Index size) { + return size; + } }; -template -struct EvalRange { - static const int PacketSize = - unpacket_traits::size; +template +struct EvalRange { + static const int PacketSize = unpacket_traits::size; - static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, - const StorageIndex lastIdx) { + static void run(Evaluator* evaluator_in, const Index first, const Index last) { Evaluator evaluator = *evaluator_in; - eigen_assert(lastIdx >= firstIdx); - StorageIndex i = firstIdx; - if (lastIdx - firstIdx >= PacketSize) { - eigen_assert(firstIdx % PacketSize == 0); - StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize; - // Give compiler a strong possibility to unroll the loop. But don't insist - // on unrolling, because if the function is expensive compiler should not + eigen_assert(last >= first); + Index i = first; + if (last - first >= PacketSize) { + eigen_assert(first % PacketSize == 0); + Index last_chunk_offset = last - 4 * PacketSize; + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not // unroll the loop at the expense of inlining. - for (; i <= last_chunk_offset; i += 4 * PacketSize) { - for (StorageIndex j = 0; j < 4; j++) { + for (; i <= last_chunk_offset; i += 4*PacketSize) { + for (Index j = 0; j < 4; j++) { evaluator.evalPacket(i + j * PacketSize); } } - last_chunk_offset = lastIdx - PacketSize; + last_chunk_offset = last - PacketSize; for (; i <= last_chunk_offset; i += PacketSize) { evaluator.evalPacket(i); } } - for (; i < lastIdx; ++i) { + for (; i < last; ++i) { evaluator.evalScalar(i); } } - static StorageIndex alignBlockSize(StorageIndex size) { + static Index alignBlockSize(Index size) { // Align block size to packet size and account for unrolling in run above. if (size >= 16 * PacketSize) { return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); @@ -321,376 +138,144 @@ struct EvalRange { } }; -template -class TensorExecutor { - public: - typedef typename Expression::Index StorageIndex; - - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const ThreadPoolDevice& device) { - typedef TensorEvaluator Evaluator; - typedef EvalRange EvalRange; - - Evaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); - if (needs_assign) { - const StorageIndex size = array_prod(evaluator.dimensions()); - device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), - EvalRange::alignBlockSize, - [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { - EvalRange::run(&evaluator, firstIdx, lastIdx); - }); - } - evaluator.cleanup(); - } -}; - template -class TensorExecutor { +class TensorExecutor { public: - typedef typename traits::Index IndexType; - typedef typename traits::Scalar Scalar; - typedef typename remove_const::type ScalarNoConst; - - static const int NumDims = traits::NumDimensions; - - typedef TensorEvaluator Evaluator; - typedef TensorBlockMapper BlockMapper; - typedef TensorExecutorTilingContext TilingContext; - - typedef internal::TensorBlockDescriptor - TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator - TensorBlockScratch; - - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const ThreadPoolDevice& device) { + typedef typename Expression::Index Index; + static inline void run(const Expression& expr, const ThreadPoolDevice& device) + { + typedef TensorEvaluator Evaluator; Evaluator evaluator(expr, device); - - const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); - if (needs_assign) { - const TilingContext tiling = - internal::GetTensorExecutorTilingContext(evaluator); - - auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, - IndexType lastBlockIdx) { - TensorBlockScratch scratch(device); - - for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; - ++block_idx) { - TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx); - evaluator.evalBlock(desc, scratch); - scratch.reset(); - } - }; - - // Evaluate small expressions directly as a single block. - if (tiling.block_mapper.blockCount() == 1) { - TensorBlockScratch scratch(device); - TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions()); - evaluator.evalBlock(desc, scratch); - } else { - device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, - eval_block); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); + if (needs_assign) + { + const Index size = array_prod(evaluator.dimensions()); +#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange::alignBlockSize, + [&evaluator](Index first, Index last) { + EvalRange::run(&evaluator, first, last); + }); +#else + size_t num_threads = device.numThreads(); + if (num_threads > 1) { + num_threads = TensorCostModel::numThreads( + size, evaluator.costPerCoeff(Vectorizable), num_threads); } + if (num_threads == 1) { + EvalRange::run(&evaluator, 0, size); + } else { + const Index PacketSize = Vectorizable ? unpacket_traits::size : 1; + Index blocksz = std::ceil(static_cast(size)/num_threads) + PacketSize - 1; + const Index blocksize = numext::maxi(PacketSize, (blocksz - (blocksz % PacketSize))); + const Index numblocks = size / blocksize; + + Barrier barrier(numblocks); + for (int i = 0; i < numblocks; ++i) { + device.enqueue_with_barrier( + &barrier, &EvalRange::run, + &evaluator, i * blocksize, (i + 1) * blocksize); + } + if (numblocks * blocksize < size) { + EvalRange::run( + &evaluator, numblocks * blocksize, size); + } + barrier.Wait(); + } +#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) } evaluator.cleanup(); } }; - -template -class TensorAsyncExecutor { - public: - typedef typename Expression::Index StorageIndex; - typedef TensorEvaluator Evaluator; - - static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, - const ThreadPoolDevice& device, - DoneCallback done) { - TensorAsyncExecutorContext* const ctx = - new TensorAsyncExecutorContext(expr, device, std::move(done)); - - const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void { - if (!need_assign) { - delete ctx; - return; - } - - typedef EvalRange EvalRange; - const StorageIndex size = array_prod(ctx->evaluator.dimensions()); - device.parallelForAsync( - size, ctx->evaluator.costPerCoeff(Vectorizable), - EvalRange::alignBlockSize, - [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { - EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); - }, - [ctx]() { delete ctx; }); - }; - - ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); - } - - private: - struct TensorAsyncExecutorContext { - TensorAsyncExecutorContext(const Expression& expr, - const ThreadPoolDevice& thread_pool, - DoneCallback done) - : evaluator(expr, thread_pool), on_done(std::move(done)) {} - - ~TensorAsyncExecutorContext() { - evaluator.cleanup(); - on_done(); - } - - Evaluator evaluator; - - private: - DoneCallback on_done; - }; -}; - -template -class TensorAsyncExecutor { - public: - typedef typename traits::Index IndexType; - typedef typename traits::Scalar Scalar; - typedef typename remove_const::type ScalarNoConst; - - static const int NumDims = traits::NumDimensions; - - typedef TensorEvaluator Evaluator; - typedef TensorBlockMapper BlockMapper; - typedef TensorExecutorTilingContext TilingContext; - - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator - TensorBlockScratch; - - static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, - const ThreadPoolDevice& device, - DoneCallback done) { - - TensorAsyncExecutorContext* const ctx = - new TensorAsyncExecutorContext(expr, device, std::move(done)); - - const auto on_eval_subexprs = [ctx](bool need_assign) -> void { - if (!need_assign) { - delete ctx; - return; - } - - ctx->tiling = internal::GetTensorExecutorTilingContext< - Evaluator, BlockMapper, Vectorizable>(ctx->evaluator); - - auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) { - TensorBlockScratch scratch(ctx->device); - - for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; - ++block_idx) { - TensorBlockDesc desc = - ctx->tiling.block_mapper.blockDescriptor(block_idx); - ctx->evaluator.evalBlock(desc, scratch); - scratch.reset(); - } - }; - - // Evaluate small expressions directly as a single block. - if (ctx->tiling.block_mapper.blockCount() == 1) { - TensorBlockScratch scratch(ctx->device); - TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions()); - ctx->evaluator.evalBlock(desc, scratch); - delete ctx; - } else { - ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), - ctx->tiling.cost, eval_block, - [ctx]() { delete ctx; }); - } - }; - - ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs); - } - - private: - struct TensorAsyncExecutorContext { - TensorAsyncExecutorContext(const Expression& expr, - const ThreadPoolDevice& thread_pool, - DoneCallback done) - : device(thread_pool), - evaluator(expr, thread_pool), - on_done(std::move(done)) {} - - ~TensorAsyncExecutorContext() { - evaluator.cleanup(); - on_done(); - } - - const ThreadPoolDevice& device; - Evaluator evaluator; - TilingContext tiling; - - private: - DoneCallback on_done; - }; -}; - #endif // EIGEN_USE_THREADS + // GPU: the evaluation of the expression is offloaded to a GPU. #if defined(EIGEN_USE_GPU) -template -class TensorExecutor { +template +class TensorExecutor { public: - typedef typename Expression::Index StorageIndex; + typedef typename Expression::Index Index; static void run(const Expression& expr, const GpuDevice& device); }; -#if defined(EIGEN_GPUCC) -template + +#if defined(__CUDACC__) +template struct EigenMetaKernelEval { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { - for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + for (Index i = first; i < last; i += step_size) { eval.evalScalar(i); } } }; -template -struct EigenMetaKernelEval { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) { - const StorageIndex PacketSize = unpacket_traits::size; - const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize; - const StorageIndex vectorized_step_size = step_size * PacketSize; +template +struct EigenMetaKernelEval { + static __device__ EIGEN_ALWAYS_INLINE + void run(Evaluator& eval, Index first, Index last, Index step_size) { + const Index PacketSize = unpacket_traits::size; + const Index vectorized_size = (last / PacketSize) * PacketSize; + const Index vectorized_step_size = step_size * PacketSize; // Use the vector path - for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; + for (Index i = first * PacketSize; i < vectorized_size; i += vectorized_step_size) { eval.evalPacket(i); } - for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) { + for (Index i = vectorized_size + first; i < last; i += step_size) { eval.evalScalar(i); } } }; -template +template __global__ void __launch_bounds__(1024) -EigenMetaKernel(Evaluator eval, StorageIndex size) { +EigenMetaKernel(Evaluator eval, Index size) { - const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x; - const StorageIndex step_size = blockDim.x * gridDim.x; + const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; + const Index step_size = blockDim.x * gridDim.x; const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; - EigenMetaKernelEval::run(eval, first_index, size, step_size); + EigenMetaKernelEval::run(eval, first_index, size, step_size); } /*static*/ -template -EIGEN_STRONG_INLINE void TensorExecutor::run( +template +inline void TensorExecutor::run( const Expression& expr, const GpuDevice& device) { TensorEvaluator evaluator(expr, device); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr); + const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - - const int block_size = device.maxGpuThreadsPerBlock(); - const int max_blocks = device.getNumGpuMultiProcessors() * - device.maxGpuThreadsPerMultiProcessor() / block_size; - const StorageIndex size = array_prod(evaluator.dimensions()); + const int block_size = device.maxCudaThreadsPerBlock(); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; + const Index size = array_prod(evaluator.dimensions()); // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0. const int num_blocks = numext::maxi(numext::mini(max_blocks, divup(size, block_size)), 1); - LAUNCH_GPU_KERNEL( - (EigenMetaKernel, StorageIndex>), + LAUNCH_CUDA_KERNEL( + (EigenMetaKernel, Index>), num_blocks, block_size, 0, device, evaluator, size); } evaluator.cleanup(); } -#endif // EIGEN_GPUCC +#endif // __CUDACC__ #endif // EIGEN_USE_GPU // SYCL Executor policy #ifdef EIGEN_USE_SYCL -template -struct ExecExprFunctorKernel { - typedef typename Evaluator::Index Index; - Evaluator evaluator; - const Index range; - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel( - const Scratch, Evaluator evaluator_, const Index range_) - : evaluator(evaluator_), range(range_) {} - - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()( - cl::sycl::nd_item<1> itemID) { - compute(itemID); - } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type - compute(const cl::sycl::nd_item<1>& itemID) { - Index gId = static_cast(itemID.get_global_linear_id()); - Index total_threads = itemID.get_global_range(0); - - for (Index i = gId; i < range; i += total_threads) { - evaluator.evalScalar(i); - } - } - template - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if::type - compute(const cl::sycl::nd_item<1>& itemID) { - const Index vectorizedRange = - (range / Evaluator::PacketSize) * Evaluator::PacketSize; - Index gId = static_cast(itemID.get_global_linear_id()); - const Index step = Evaluator::PacketSize * itemID.get_global_range(0); - const Index start = Evaluator::PacketSize * gId; - for (Index i = start; i < vectorizedRange; i += step) { - evaluator.evalPacket(i); - } - gId += vectorizedRange; - for (Index i = gId; i < range; i += itemID.get_global_range(0)) { - evaluator.evalScalar(i); - } - } -}; - -template -class TensorExecutor { - public: - typedef typename Expression::Index Index; - static EIGEN_STRONG_INLINE void run(const Expression& expr, - const Eigen::SyclDevice& dev) { - typedef Eigen::TensorEvaluator Evaluator; - Evaluator evaluator(expr, dev); - const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); - if (needs_assign) { - Index range, GRange, tileSize; - Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions()); - total_size = (total_size == 0) ? 1 : total_size; - const int PacketSize = - Eigen::PacketType::size; - Index vectorizable_threads = static_cast(total_size / PacketSize); - dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange); - range = total_size; - - dev.template nullary_kernel_launcher< - typename Evaluator::CoeffReturnType, - ExecExprFunctorKernel >( - evaluator, - cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), - cl::sycl::range<1>(tileSize)), - Index(1), range); - } - evaluator.cleanup(); +template +class TensorExecutor { +public: + static inline void run(const Expression &expr, const SyclDevice &device) { + // call TensorSYCL module + TensorSycl::run(expr, device); } }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index c9bccfc66..85dfc7a69 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -38,7 +38,7 @@ struct traits > typedef typename remove_reference::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; + enum { Flags = 0 }; @@ -89,10 +89,6 @@ struct traits > typedef typename remove_reference::type _XprTypeNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename TypeConversion::type - PointerType; }; template @@ -165,12 +161,7 @@ struct traits > typedef typename remove_reference::type _RhsNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename TypeConversion::val, - typename traits::PointerType, - typename traits::PointerType>::type - >::type - PointerType; + enum { Flags = 0 }; @@ -247,12 +238,7 @@ struct traits::type _Arg3Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename TypeConversion::val, - typename traits::PointerType, - typename traits::PointerType>::type - >::type - PointerType; + enum { Flags = 0 }; @@ -328,9 +314,6 @@ struct traits > typedef typename ElseXprType::Nested ElseNested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename conditional::val, - typename traits::PointerType, - typename traits::PointerType>::type PointerType; }; template diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index c62bc5fa9..08eb5595a 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -10,6 +10,10 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H #define EIGEN_CXX11_TENSOR_TENSOR_FFT_H +// This code requires the ability to initialize arrays of constant +// values directly inside a class. +#if __cplusplus >= 201103L || EIGEN_COMP_MSVC >= 1900 + namespace Eigen { /** \class TensorFFT @@ -67,7 +71,6 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename traits::PointerType PointerType; }; template @@ -127,23 +130,16 @@ struct TensorEvaluator, D typedef OutputScalar CoeffReturnType; typedef typename PacketType::type PacketReturnType; static const int PacketSize = internal::unpacket_traits::size; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; enum { IsAligned = false, PacketAccess = true, BlockAccess = false, - PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { @@ -169,13 +165,13 @@ struct TensorEvaluator, D return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(OutputScalar* data) { m_impl.evalSubExprsIfNeeded(NULL); if (data) { evalToBuf(data); return false; } else { - m_data = (EvaluatorPointerType)m_device.get((CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size))); + m_data = (CoeffReturnType*)m_device.allocate(sizeof(CoeffReturnType) * m_size); evalToBuf(m_data); return true; } @@ -204,16 +200,11 @@ struct TensorEvaluator, D return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_data.bind(cgh); - } -#endif + EIGEN_DEVICE_FUNC Scalar* data() const { return m_data; } + private: - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(OutputScalar* data) { const bool write_to_out = internal::is_same::value; ComplexScalar* buf = write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size); @@ -239,32 +230,20 @@ struct TensorEvaluator, D // t_n = exp(sqrt(-1) * pi * n^2 / line_len) // for n = 0, 1,..., line_len-1. // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2 - - // The recurrence is correct in exact arithmetic, but causes - // numerical issues for large transforms, especially in - // single-precision floating point. - // - // pos_j_base_powered[0] = ComplexScalar(1, 0); - // if (line_len > 1) { - // const ComplexScalar pos_j_base = ComplexScalar( - // numext::cos(M_PI / line_len), numext::sin(M_PI / line_len)); - // pos_j_base_powered[1] = pos_j_base; - // if (line_len > 2) { - // const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; - // for (int i = 2; i < line_len + 1; ++i) { - // pos_j_base_powered[i] = pos_j_base_powered[i - 1] * - // pos_j_base_powered[i - 1] / - // pos_j_base_powered[i - 2] * - // pos_j_base_sq; - // } - // } - // } - // TODO(rmlarsen): Find a way to use Eigen's vectorized sin - // and cosine functions here. - for (int j = 0; j < line_len + 1; ++j) { - double arg = ((EIGEN_PI * j) * j) / line_len; - std::complex tmp(numext::cos(arg), numext::sin(arg)); - pos_j_base_powered[j] = static_cast(tmp); + pos_j_base_powered[0] = ComplexScalar(1, 0); + if (line_len > 1) { + const RealScalar pi_over_len(EIGEN_PI / line_len); + const ComplexScalar pos_j_base = ComplexScalar( + std::cos(pi_over_len), std::sin(pi_over_len)); + pos_j_base_powered[1] = pos_j_base; + if (line_len > 2) { + const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base; + for (int j = 2; j < line_len + 1; ++j) { + pos_j_base_powered[j] = pos_j_base_powered[j - 1] * + pos_j_base_powered[j - 1] / + pos_j_base_powered[j - 2] * pos_j_base_sq; + } + } } } @@ -274,7 +253,7 @@ struct TensorEvaluator, D // get data into line_buf const Index stride = m_strides[dim]; if (stride == 1) { - m_device.memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); + memcpy(line_buf, &buf[base_offset], line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; for (int j = 0; j < line_len; ++j, offset += stride) { @@ -282,7 +261,7 @@ struct TensorEvaluator, D } } - // process the line + // processs the line if (is_power_of_two) { processDataLineCooleyTukey(line_buf, line_len, log_len); } @@ -292,7 +271,7 @@ struct TensorEvaluator, D // write back if (FFTDir == FFT_FORWARD && stride == 1) { - m_device.memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); + memcpy(&buf[base_offset], line_buf, line_len*sizeof(ComplexScalar)); } else { Index offset = base_offset; const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0); @@ -583,12 +562,12 @@ struct TensorEvaluator, D protected: Index m_size; - const FFT EIGEN_DEVICE_REF m_fft; + const FFT& m_fft; Dimensions m_dimensions; array m_strides; TensorEvaluator m_impl; - EvaluatorPointerType m_data; - const Device EIGEN_DEVICE_REF m_device; + CoeffReturnType* m_data; + const Device& m_device; // This will support a maximum FFT size of 2^32 for each dimension // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(M_PI / std::pow(2,i)) ^ 2; @@ -666,4 +645,7 @@ struct TensorEvaluator, D } // end namespace Eigen +#endif // EIGEN_HAS_CONSTEXPR + + #endif // EIGEN_CXX11_TENSOR_TENSOR_FFT_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index a5be54bcd..fcee5f60d 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -20,7 +20,7 @@ namespace Eigen { * The fixed sized equivalent of * Eigen::Tensor t(3, 5, 7); * is - * Eigen::TensorFixedSize> t; + * Eigen::TensorFixedSize> t; */ template @@ -40,18 +40,11 @@ class TensorFixedSize : public TensorBase0), - PacketAccess = (internal::packet_traits::size > 1), - BlockAccess = false, - PreferBlockAccess = false, Layout = Options_ & RowMajor ? RowMajor : ColMajor, CoordAccess = true, RawAccess = true }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - typedef Dimensions_ Dimensions; static const std::size_t NumIndices = Dimensions::count; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 14020aa68..8bece4e65 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -12,16 +12,9 @@ namespace Eigen { -/** \class TensorForcedEval - * \ingroup CXX11_Tensor_Module - * - * \brief Tensor reshaping class. - * - * - */ namespace internal { -template -struct traits > +template class MakePointer_> +struct traits > { // Type promotion to handle the case where the types of the lhs and the rhs are different. typedef typename XprType::Scalar Scalar; @@ -32,31 +25,50 @@ struct traits > typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; enum { Flags = 0 }; + template struct MakePointer { + // Intermediate typedef to workaround MSVC issue. + typedef MakePointer_ MakePointerT; + typedef typename MakePointerT::Type Type; + }; }; -template -struct eval, Eigen::Dense> +template class MakePointer_> +struct eval, Eigen::Dense> { - typedef const TensorForcedEvalOp& type; + typedef const TensorForcedEvalOp& type; }; -template -struct nested, 1, typename eval >::type> +template class MakePointer_> +struct nested, 1, typename eval >::type> { - typedef TensorForcedEvalOp type; + typedef TensorForcedEvalOp type; }; } // end namespace internal -template -class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> +// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_) + +/** \class TensorForcedEvalOp + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor reshaping class. + * + * + */ +/// `template class MakePointer_` is added to convert the host pointer to the device pointer. +/// It is added due to the fact that for our device compiler `T*` is not allowed. +/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`. +/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_` is `T*` . +/// Therefore, by adding the default value, we managed to convert the type and it does not break any +/// existing code as its default value is `T*`. +template class MakePointer_> +class TensorForcedEvalOp : public TensorBase, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits::Scalar Scalar; @@ -77,116 +89,49 @@ class TensorForcedEvalOp : public TensorBase, ReadOn typename XprType::Nested m_xpr; }; -namespace internal { -template -struct non_integral_type_placement_new{ - template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) { - // Initialize non-trivially constructible types. - if (!internal::is_arithmetic::value) { - for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType(); - } -} -}; -// SYCL does not support non-integral types -// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices -// no matching function for call to 'operator new' -template -struct non_integral_type_placement_new { - template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) { -} -}; -} // end namespace internal - -template -struct TensorEvaluator, Device> +template class MakePointer_> +struct TensorEvaluator, Device> { - typedef const typename internal::remove_all::type ArgType; - typedef TensorForcedEvalOp XprType; + typedef TensorForcedEvalOp XprType; typedef typename ArgType::Scalar Scalar; typedef typename TensorEvaluator::Dimensions Dimensions; typedef typename XprType::Index Index; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; - typedef typename Eigen::internal::traits::PointerType TensorPointerType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = true, - PacketAccess = (PacketType::size > 1), - BlockAccess = internal::is_arithmetic::value, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - RawAccess = true + IsAligned = true, + PacketAccess = (PacketSize > 1), + Layout = TensorEvaluator::Layout, + RawAccess = true }; - static const int NumDims = internal::traits::NumDimensions; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename internal::TensorMaterializedBlock - TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_op(op.expression()), - m_device(device), m_buffer(NULL) + /// op_ is used for sycl + : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif - EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); - - internal::non_integral_type_placement_new()(numValues, m_buffer); - + m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); + // Should initialize the memory in case we're dealing with non POD types. + if (NumTraits::RequireInitialization) { + for (Index i = 0; i < numValues; ++i) { + new(m_buffer+i) CoeffReturnType(); + } + } typedef TensorEvalToOp< const typename internal::remove_const::type > EvalTo; - EvalTo evalToTmp(m_device.get(m_buffer), m_op); - - internal::TensorExecutor< - const EvalTo, typename internal::remove_const::type, - /*Vectorizable=*/internal::IsVectorizable::value, - /*Tiling=*/internal::IsTileable::value>:: - run(evalToTmp, m_device); - + EvalTo evalToTmp(m_buffer, m_op); + const bool PacketAccess = internal::IsVectorizable::value; + internal::TensorExecutor::type, PacketAccess>::run(evalToTmp, m_device); return true; } - -#ifdef EIGEN_USE_THREADS - template - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - const Index numValues = internal::array_prod(m_impl.dimensions()); - m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp( - numValues * sizeof(CoeffReturnType))); - typedef TensorEvalToOp::type> - EvalTo; - EvalTo evalToTmp(m_device.get(m_buffer), m_op); - - auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); }, - std::move(done)); - internal::TensorAsyncExecutor< - const EvalTo, typename internal::remove_const::type, - decltype(on_done), - /*Vectorizable=*/internal::IsVectorizable::value, - /*Tiling=*/internal::IsTileable::value>:: - runAsync(evalToTmp, m_device, std::move(on_done)); - } -#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { - m_device.deallocate_temp(m_buffer); + m_device.deallocate(m_buffer); m_buffer = NULL; } @@ -201,37 +146,21 @@ struct TensorEvaluator, Device> return internal::ploadt(m_buffer + index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return internal::TensorBlockResourceRequirements::any(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - assert(m_buffer != NULL); - return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - EvaluatorPointerType data() const { return m_buffer; } + EIGEN_DEVICE_FUNC typename MakePointer::Type data() const { return m_buffer; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_buffer.bind(cgh); - m_impl.bind(cgh); - } -#endif + /// required by sycl in order to extract the sycl accessor + const TensorEvaluator& impl() { return m_impl; } + /// used by sycl in order to build the sycl buffer + const Device& device() const{return m_device;} private: TensorEvaluator m_impl; const ArgType m_op; - const Device EIGEN_DEVICE_REF m_device; - EvaluatorPointerType m_buffer; + const Device& m_device; + typename MakePointer::Type m_buffer; }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index 246ebe44e..52b803d7f 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -12,7 +12,7 @@ namespace Eigen { -// MakePointer class is used as a container of the address space of the pointer +// MakePointer class is used as a container of the adress space of the pointer // on the host and on the device. From the host side it generates the T* pointer // and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to // T* m_data on the host. It is always called on the device. @@ -20,35 +20,8 @@ namespace Eigen { // map_allocator. template struct MakePointer { typedef T* Type; - typedef const T* ConstType; }; -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) { - return const_cast(data); -} - -// The StorageMemory class is a container of the device specific pointer -// used for refering to a Pointer on TensorEvaluator class. While the TensorExpression -// is a device-agnostic type and need MakePointer class for type conversion, -// the TensorEvaluator class can be specialized for a device, hence it is possible -// to construct different types of temproray storage memory in TensorEvaluator -// for different devices by specializing the following StorageMemory class. -template struct StorageMemory: MakePointer {}; - -namespace internal{ -template struct Pointer_type_promotion { - static const bool val=false; -}; -template struct Pointer_type_promotion { - static const bool val = true; -}; -template struct TypeConversion { - typedef A* type; -}; -} - - template class MakePointer_ = MakePointer> class TensorMap; template class Tensor; template class TensorFixedSize; @@ -64,7 +37,7 @@ template class Ma template class TensorIndexTupleOp; template class TensorTupleReducerOp; template class TensorConcatenationOp; -template class TensorContractionOp; +template class TensorContractionOp; template class TensorConversionOp; template class TensorConvolutionOp; template class TensorFFTOp; @@ -85,50 +58,21 @@ template class TensorInflationOp; template class TensorGeneratorOp; template class TensorAssignOp; template class TensorScanOp; -template class TensorTraceOp; template class TensorCustomUnaryOp; template class TensorCustomBinaryOp; template class MakePointer_ = MakePointer> class TensorEvalToOp; -template class TensorForcedEvalOp; +template class MakePointer_ = MakePointer> class TensorForcedEvalOp; template class TensorDevice; -template class TensorAsyncDevice; template struct TensorEvaluator; -struct NoOpOutputKernel; - struct DefaultDevice; struct ThreadPoolDevice; struct GpuDevice; struct SyclDevice; -#ifdef EIGEN_USE_SYCL - -template struct MakeSYCLPointer { - typedef Eigen::TensorSycl::internal::RangeAccess Type; -}; - -template -EIGEN_STRONG_INLINE const Eigen::TensorSycl::internal::RangeAccess& -constCast(const Eigen::TensorSycl::internal::RangeAccess& data) { - return data; -} - -template -struct StorageMemory : MakeSYCLPointer {}; -template -struct StorageMemory : StorageMemory {}; - -namespace TensorSycl { -namespace internal{ -template class GenericNondeterministicReducer; -} -} -#endif - - enum FFTResultType { RealPart = 0, ImagPart = 1, @@ -154,36 +98,10 @@ struct IsVectorizable { TensorEvaluator::IsAligned; }; -// Tiled evaluation strategy. -enum TiledEvaluation { - Off = 0, // tiled evaluation is not supported - On = 1, // still work in progress (see TensorBlock.h) -}; - -template -struct IsTileable { - // Check that block evaluation is supported and it's a preferred option (at - // least one sub-expression has much faster block evaluation, e.g. - // broadcasting). - static const bool BlockAccess = - TensorEvaluator::BlockAccess && - TensorEvaluator::PreferBlockAccess; - - static const TiledEvaluation value = - BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off; -}; - template ::value, - TiledEvaluation Tiling = IsTileable::value> + bool Vectorizable = IsVectorizable::value> class TensorExecutor; -template ::value, - TiledEvaluation Tiling = IsTileable::value> -class TensorAsyncExecutor; - - } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 2edc45f1a..d73f6dc68 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -20,7 +20,7 @@ namespace internal { template struct scalar_mod_op { EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a % m_divisor; } + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a) const { return a % m_divisor; } const Scalar m_divisor; }; template @@ -33,8 +33,8 @@ struct functor_traits > */ template struct scalar_mod2_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } + EIGEN_EMPTY_STRUCT_CTOR(scalar_mod2_op); + EIGEN_DEVICE_FUNC inline Scalar operator() (const Scalar& a, const Scalar& b) const { return a % b; } }; template struct functor_traits > @@ -42,7 +42,7 @@ struct functor_traits > template struct scalar_fmod_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op) + EIGEN_EMPTY_STRUCT_CTOR(scalar_fmod_op); EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { return numext::fmod(a, b); @@ -54,19 +54,50 @@ struct functor_traits > { PacketAccess = false }; }; + +/** \internal + * \brief Template functor to compute the sigmoid of a scalar + * \sa class CwiseUnaryOp, ArrayBase::sigmoid() + */ +template +struct scalar_sigmoid_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_sigmoid_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { + const T one = T(1); + return one / (one + numext::exp(-x)); + } + + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(const Packet& x) const { + const Packet one = pset1(T(1)); + return pdiv(one, padd(one, pexp(pnegate(x)))); + } +}; + +template +struct functor_traits > { + enum { + Cost = NumTraits::AddCost * 2 + NumTraits::MulCost * 6, + PacketAccess = packet_traits::HasAdd && packet_traits::HasDiv && + packet_traits::HasNegate && packet_traits::HasExp + }; +}; + + template struct reducer_traits { enum { Cost = 1, - PacketAccess = false, - IsStateful = false, - IsExactlyAssociative = true + PacketAccess = false }; }; // Standard reduction functors template struct SumReducer { + static const bool PacketAccess = packet_traits::HasAdd; + static const bool IsStateful = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { internal::scalar_sum_op sum_op; *accum = sum_op(*accum, t); @@ -102,14 +133,16 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd, - IsStateful = false, - IsExactlyAssociative = NumTraits::IsInteger + PacketAccess = PacketType::HasAdd }; }; + template struct MeanReducer { + static const bool PacketAccess = packet_traits::HasAdd && !NumTraits::IsInteger; + static const bool IsStateful = true; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MeanReducer() : scalarCount_(0), packetCount_(0) { } @@ -133,20 +166,16 @@ template struct MeanReducer return pset1(initialize()); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { - internal::scalar_quotient_op quotient_op; - return quotient_op(accum, T(scalarCount_)); + return accum / scalarCount_; } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const { - return pdiv(vaccum, pset1(T(packetCount_))); + return pdiv(vaccum, pset1(packetCount_)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const { internal::scalar_sum_op sum_op; - internal::scalar_quotient_op quotient_op; - return quotient_op( - sum_op(saccum, predux(vaccum)), - T(scalarCount_ + packetCount_ * unpacket_traits::size)); + return sum_op(saccum, predux(vaccum)) / (scalarCount_ + packetCount_ * unpacket_traits::size); } protected: @@ -158,10 +187,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasAdd && - PacketType::HasDiv && !NumTraits::IsInteger, - IsStateful = true, - IsExactlyAssociative = NumTraits::IsInteger + PacketAccess = PacketType::HasAdd }; }; @@ -194,6 +220,9 @@ struct MinMaxBottomValue { template struct MaxReducer { + static const bool PacketAccess = packet_traits::HasMax; + static const bool IsStateful = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { if (t > *accum) { *accum = t; } } @@ -225,15 +254,16 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMax, - IsStateful = false, - IsExactlyAssociative = true + PacketAccess = PacketType::HasMax }; }; template struct MinReducer { + static const bool PacketAccess = packet_traits::HasMin; + static const bool IsStateful = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { if (t < *accum) { *accum = t; } } @@ -265,15 +295,16 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = PacketType::HasMin, - IsStateful = false, - IsExactlyAssociative = true + PacketAccess = PacketType::HasMin }; }; template struct ProdReducer { + static const bool PacketAccess = packet_traits::HasMul; + static const bool IsStateful = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { internal::scalar_product_op prod_op; (*accum) = prod_op(*accum, t); @@ -309,15 +340,16 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::MulCost, - PacketAccess = PacketType::HasMul, - IsStateful = false, - IsExactlyAssociative = true + PacketAccess = PacketType::HasMul }; }; struct AndReducer { + static const bool PacketAccess = false; + static const bool IsStateful = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum && t; } @@ -333,14 +365,15 @@ template struct reducer_traits { enum { Cost = 1, - PacketAccess = false, - IsStateful = false, - IsExactlyAssociative = true + PacketAccess = false }; }; struct OrReducer { + static const bool PacketAccess = false; + static const bool IsStateful = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum || t; } @@ -356,9 +389,7 @@ template struct reducer_traits { enum { Cost = 1, - PacketAccess = false, - IsStateful = false, - IsExactlyAssociative = true + PacketAccess = false }; }; @@ -366,6 +397,9 @@ struct reducer_traits { // Argmin/Argmax reducers template struct ArgMaxTupleReducer { + static const bool PacketAccess = false; + static const bool IsStateful = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { if (t.second > accum->second) { *accum = t; } } @@ -381,15 +415,16 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = false, - IsStateful = false, - IsExactlyAssociative = true + PacketAccess = false }; }; template struct ArgMinTupleReducer { + static const bool PacketAccess = false; + static const bool IsStateful = false; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { if (t.second < accum->second) { *accum = t; } } @@ -405,9 +440,7 @@ template struct reducer_traits, Device> { enum { Cost = NumTraits::AddCost, - PacketAccess = false, - IsStateful = false, - IsExactlyAssociative = true + PacketAccess = false }; }; @@ -421,7 +454,6 @@ class GaussianGenerator { const array& std_devs) : m_means(means) { - EIGEN_UNROLL_LOOP for (size_t i = 0; i < NumDims; ++i) { m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2; } @@ -429,7 +461,6 @@ class GaussianGenerator { EIGEN_DEVICE_FUNC T operator()(const array& coordinates) const { T tmp = T(0); - EIGEN_UNROLL_LOOP for (size_t i = 0; i < NumDims; ++i) { T offset = coordinates[i] - m_means[i]; tmp += offset * offset / m_two_sigmas[i]; @@ -452,25 +483,6 @@ struct functor_traits > { }; }; -template -struct scalar_clamp_op { - EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar - operator()(const Scalar& x) const { - return numext::mini(numext::maxi(x, m_min), m_max); - } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet - packetOp(const Packet& x) const { - return internal::pmin(internal::pmax(x, pset1(m_min)), pset1(m_max)); - } - const Scalar m_min; - const Scalar m_max; -}; -template -struct functor_traits > -{ enum { Cost = 2 * NumTraits::AddCost, PacketAccess = (packet_traits::HasMin && packet_traits::HasMax)}; }; - } // end namespace internal } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index b1ff1d8b1..e27753b19 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -31,7 +31,6 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; }; template @@ -88,55 +87,37 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; enum { - IsAligned = false, - PacketAccess = (PacketType::size > 1), - BlockAccess = true, - PreferBlockAccess = true, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = false + IsAligned = false, + PacketAccess = (internal::unpacket_traits::size > 1), + BlockAccess = false, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = false }; - typedef internal::TensorIntDivisor IndexDivisor; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename internal::TensorMaterializedBlock - TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_device(device), m_generator(op.generator()) + : m_generator(op.generator()) { - TensorEvaluator argImpl(op.expression(), device); - m_dimensions = argImpl.dimensions(); + TensorEvaluator impl(op.expression(), device); + m_dimensions = impl.dimensions(); if (static_cast(Layout) == static_cast(ColMajor)) { m_strides[0] = 1; - EIGEN_UNROLL_LOOP for (int i = 1; i < NumDims; ++i) { m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1]; - if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } else { m_strides[NumDims - 1] = 1; - EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i >= 0; --i) { m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1]; - if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]); } } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { return true; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -152,7 +133,7 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = PacketType::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); @@ -164,97 +145,6 @@ struct TensorEvaluator, Device> return rslt; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - const size_t target_size = m_device.firstLevelCacheSize(); - // TODO(ezhulenev): Generator should have a cost. - return internal::TensorBlockResourceRequirements::skewed( - target_size); - } - - struct BlockIteratorState { - Index stride; - Index span; - Index size; - Index count; - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - static const bool is_col_major = - static_cast(Layout) == static_cast(ColMajor); - - // Compute spatial coordinates for the first block element. - array coords; - extract_coordinates(desc.offset(), coords); - array initial_coords = coords; - - // Offset in the output block buffer. - Index offset = 0; - - // Initialize output block iterator state. Dimension in this array are - // always in inner_most -> outer_most order (col major layout). - array it; - for (int i = 0; i < NumDims; ++i) { - const int dim = is_col_major ? i : NumDims - 1 - i; - it[i].size = desc.dimension(dim); - it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride); - it[i].span = it[i].stride * (it[i].size - 1); - it[i].count = 0; - } - eigen_assert(it[0].stride == 1); - - // Prepare storage for the materialized generator result. - const typename TensorBlock::Storage block_storage = - TensorBlock::prepareStorage(desc, scratch); - - CoeffReturnType* block_buffer = block_storage.data(); - - static const int packet_size = PacketType::size; - - static const int inner_dim = is_col_major ? 0 : NumDims - 1; - const Index inner_dim_size = it[0].size; - const Index inner_dim_vectorized = inner_dim_size - packet_size; - - while (it[NumDims - 1].count < it[NumDims - 1].size) { - Index i = 0; - // Generate data for the vectorized part of the inner-most dimension. - for (; i <= inner_dim_vectorized; i += packet_size) { - for (Index j = 0; j < packet_size; ++j) { - array j_coords = coords; // Break loop dependence. - j_coords[inner_dim] += j; - *(block_buffer + offset + i + j) = m_generator(j_coords); - } - coords[inner_dim] += packet_size; - } - // Finalize non-vectorized part of the inner-most dimension. - for (; i < inner_dim_size; ++i) { - *(block_buffer + offset + i) = m_generator(coords); - coords[inner_dim]++; - } - coords[inner_dim] = initial_coords[inner_dim]; - - // For the 1d tensor we need to generate only one inner-most dimension. - if (NumDims == 1) break; - - // Update offset. - for (i = 1; i < NumDims; ++i) { - if (++it[i].count < it[i].size) { - offset += it[i].stride; - coords[is_col_major ? i : NumDims - 1 - i]++; - break; - } - if (i != NumDims - 1) it[i].count = 0; - coords[is_col_major ? i : NumDims - 1 - i] = - initial_coords[is_col_major ? i : NumDims - 1 - i]; - offset -= it[i].span; - } - } - - return block_storage.AsTensorMaterializedBlock(); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { // TODO(rmlarsen): This is just a placeholder. Define interface to make @@ -263,26 +153,21 @@ struct TensorEvaluator, Device> TensorOpCost::MulCost()); } - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler&) const {} -#endif + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array& coords) const { if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { - const Index idx = index / m_fast_strides[i]; + const Index idx = index / m_strides[i]; index -= idx * m_strides[i]; coords[i] = idx; } coords[0] = index; } else { for (int i = 0; i < NumDims - 1; ++i) { - const Index idx = index / m_fast_strides[i]; + const Index idx = index / m_strides[i]; index -= idx * m_strides[i]; coords[i] = idx; } @@ -290,10 +175,8 @@ struct TensorEvaluator, Device> } } - const Device EIGEN_DEVICE_REF m_device; Dimensions m_dimensions; array m_strides; - array m_fast_strides; Generator m_generator; }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h deleted file mode 100644 index f32ce27e9..000000000 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +++ /dev/null @@ -1,93 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// Copyright (C) 2018 Deven Desai -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) -#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H - -// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design -// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but -// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler -// When compiling such files, gcc will end up trying to pick up the CUDA headers by -// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU) -// This will obsviously not work when trying to compile tensorflow on a system with no CUDA -// To work around this issue for HIP systems (and leave the default behaviour intact), the -// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and -// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is -// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well - -#if defined(EIGEN_USE_HIP) - -#define gpuStream_t hipStream_t -#define gpuDeviceProp_t hipDeviceProp_t -#define gpuError_t hipError_t -#define gpuSuccess hipSuccess -#define gpuErrorNotReady hipErrorNotReady -#define gpuGetDeviceCount hipGetDeviceCount -#define gpuGetErrorString hipGetErrorString -#define gpuGetDeviceProperties hipGetDeviceProperties -#define gpuStreamDefault hipStreamDefault -#define gpuGetDevice hipGetDevice -#define gpuSetDevice hipSetDevice -#define gpuMalloc hipMalloc -#define gpuFree hipFree -#define gpuMemsetAsync hipMemsetAsync -#define gpuMemcpyAsync hipMemcpyAsync -#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice -#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost -#define gpuMemcpyHostToDevice hipMemcpyHostToDevice -#define gpuStreamQuery hipStreamQuery -#define gpuSharedMemConfig hipSharedMemConfig -#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig -#define gpuStreamSynchronize hipStreamSynchronize -#define gpuDeviceSynchronize hipDeviceSynchronize -#define gpuMemcpy hipMemcpy - -#else - -#define gpuStream_t cudaStream_t -#define gpuDeviceProp_t cudaDeviceProp -#define gpuError_t cudaError_t -#define gpuSuccess cudaSuccess -#define gpuErrorNotReady cudaErrorNotReady -#define gpuGetDeviceCount cudaGetDeviceCount -#define gpuGetErrorString cudaGetErrorString -#define gpuGetDeviceProperties cudaGetDeviceProperties -#define gpuStreamDefault cudaStreamDefault -#define gpuGetDevice cudaGetDevice -#define gpuSetDevice cudaSetDevice -#define gpuMalloc cudaMalloc -#define gpuFree cudaFree -#define gpuMemsetAsync cudaMemsetAsync -#define gpuMemcpyAsync cudaMemcpyAsync -#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice -#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost -#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice -#define gpuStreamQuery cudaStreamQuery -#define gpuSharedMemConfig cudaSharedMemConfig -#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig -#define gpuStreamSynchronize cudaStreamSynchronize -#define gpuDeviceSynchronize cudaDeviceSynchronize -#define gpuMemcpy cudaMemcpy - -#endif - -// gpu_assert can be overridden -#ifndef gpu_assert - -#if defined(EIGEN_HIP_DEVICE_COMPILE) -// HIPCC do not support the use of assert on the GPU side. -#define gpu_assert(COND) -#else -#define gpu_assert(COND) assert(COND) -#endif - -#endif // gpu_assert - -#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h deleted file mode 100644 index db394bcbb..000000000 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h +++ /dev/null @@ -1,40 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// Copyright (C) 2018 Deven Desai -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) - -#undef gpuStream_t -#undef gpuDeviceProp_t -#undef gpuError_t -#undef gpuSuccess -#undef gpuErrorNotReady -#undef gpuGetDeviceCount -#undef gpuGetErrorString -#undef gpuGetDeviceProperties -#undef gpuStreamDefault -#undef gpuGetDevice -#undef gpuSetDevice -#undef gpuMalloc -#undef gpuFree -#undef gpuMemsetAsync -#undef gpuMemcpyAsync -#undef gpuMemcpyDeviceToDevice -#undef gpuMemcpyDeviceToHost -#undef gpuMemcpyHostToDevice -#undef gpuStreamQuery -#undef gpuSharedMemConfig -#undef gpuDeviceSetSharedMemConfig -#undef gpuStreamSynchronize -#undef gpuDeviceSynchronize -#undef gpuMemcpy - -#undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H - -#endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 49d1004f3..566856ed2 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -27,7 +27,6 @@ namespace Eigen { * patch_cols, and 1 for all the additional dimensions. */ namespace internal { - template struct traits > : public traits { @@ -39,7 +38,6 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; }; template @@ -54,66 +52,6 @@ struct nested, 1, typename eval type; }; -template -struct ImagePatchCopyOp { - typedef typename Self::Index Index; - typedef typename Self::Scalar Scalar; - typedef typename Self::Impl Impl; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Self& self, const Index num_coeff_to_copy, const Index dst_index, - Scalar* dst_data, const Index src_index) { - const Impl& impl = self.impl(); - for (Index i = 0; i < num_coeff_to_copy; ++i) { - dst_data[dst_index + i] = impl.coeff(src_index + i); - } - } -}; - -template -struct ImagePatchCopyOp { - typedef typename Self::Index Index; - typedef typename Self::Scalar Scalar; - typedef typename Self::Impl Impl; - typedef typename packet_traits::type Packet; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Self& self, const Index num_coeff_to_copy, const Index dst_index, - Scalar* dst_data, const Index src_index) { - const Impl& impl = self.impl(); - const Index packet_size = internal::unpacket_traits::size; - const Index vectorized_size = - (num_coeff_to_copy / packet_size) * packet_size; - for (Index i = 0; i < vectorized_size; i += packet_size) { - Packet p = impl.template packet(src_index + i); - internal::pstoret(dst_data + dst_index + i, p); - } - for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) { - dst_data[dst_index + i] = impl.coeff(src_index + i); - } - } -}; - -template -struct ImagePatchPaddingOp { - typedef typename Self::Index Index; - typedef typename Self::Scalar Scalar; - typedef typename packet_traits::type Packet; - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Index num_coeff_to_pad, const Scalar padding_value, - const Index dst_index, Scalar* dst_data) { - const Index packet_size = internal::unpacket_traits::size; - const Packet padded_packet = internal::pset1(padding_value); - const Index vectorized_size = - (num_coeff_to_pad / packet_size) * packet_size; - for (Index i = 0; i < vectorized_size; i += packet_size) { - internal::pstoret(dst_data + dst_index + i, - padded_packet); - } - for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) { - dst_data[dst_index + i] = padding_value; - } - } -}; - } // end namespace internal template @@ -132,12 +70,12 @@ class TensorImagePatchOp : public TensorBase, Device> typedef TensorEvaluator Impl; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - PreferBlockAccess = true, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) - : m_device(device), m_impl(op.expression(), device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device) { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -309,15 +238,9 @@ struct TensorEvaluator, Device> // Calculate the padding m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2; m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2; - // The padding size calculation for PADDING_SAME has been updated to - // be consistent with how TensorFlow extracts its paddings. - m_rowPaddingTop = numext::maxi(0, m_rowPaddingTop); - m_colPaddingLeft = numext::maxi(0, m_colPaddingLeft); break; default: eigen_assert(false && "unexpected padding"); - m_outputCols=0; // silence the uninitialised warning; - m_outputRows=0; //// silence the uninitialised warning; } } eigen_assert(m_outputRows > 0); @@ -389,19 +312,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -503,16 +418,9 @@ struct TensorEvaluator, Device> return packetWithPossibleZero(index); } - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator& impl() const { return m_impl; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif + const TensorEvaluator& impl() const { return m_impl; } Index rowPaddingTop() const { return m_rowPaddingTop; } Index colPaddingLeft() const { return m_colPaddingLeft; } @@ -541,7 +449,6 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -593,7 +500,6 @@ struct TensorEvaluator, Device> Scalar m_paddingValue; - const Device EIGEN_DEVICE_REF m_device; TensorEvaluator m_impl; }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 0e9133c49..3209fecd3 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -37,36 +37,36 @@ namespace Eigen { * \sa Tensor */ -template +template struct type2index { - static const Index value = n; - EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; } - EIGEN_DEVICE_FUNC void set(Index val) { + static const DenseIndex value = n; + EIGEN_DEVICE_FUNC constexpr operator DenseIndex() const { return n; } + EIGEN_DEVICE_FUNC void set(DenseIndex val) { eigen_assert(val == n); } }; // This can be used with IndexPairList to get compile-time constant pairs, // such as IndexPairList, type2indexpair<3,4>>(). -template +template struct type2indexpair { - static const Index first = f; - static const Index second = s; + static const DenseIndex first = f; + static const DenseIndex second = s; - constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { - return IndexPair(f, s); + constexpr EIGEN_DEVICE_FUNC operator IndexPair() const { + return IndexPair(f, s); } - EIGEN_DEVICE_FUNC void set(const IndexPair& val) { + EIGEN_DEVICE_FUNC void set(const IndexPair& val) { eigen_assert(val.first == f); eigen_assert(val.second == s); } }; -template struct NumTraits > +template struct NumTraits > { - typedef Index Real; + typedef DenseIndex Real; enum { IsComplex = 0, RequireInitialization = false, @@ -75,28 +75,28 @@ template struct NumTraits > MulCost = 1 }; - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real epsilon() { return 0; } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real dummy_precision() { return 0; } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real highest() { return n; } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Real lowest() { return n; } + EIGEN_DEVICE_FUNC static inline Real epsilon() { return 0; } + EIGEN_DEVICE_FUNC static inline Real dummy_precision() { return 0; } + EIGEN_DEVICE_FUNC static inline Real highest() { return n; } + EIGEN_DEVICE_FUNC static inline Real lowest() { return n; } }; namespace internal { template -EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) { - val = internal::convert_index(new_val); +EIGEN_DEVICE_FUNC void update_value(T& val, DenseIndex new_val) { + val = new_val; } -template -EIGEN_DEVICE_FUNC void update_value(type2index& val, Index new_val) { +template +EIGEN_DEVICE_FUNC void update_value(type2index& val, DenseIndex new_val) { val.set(new_val); } template -EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { +EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair new_val) { val = new_val; } -template -EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { +template +EIGEN_DEVICE_FUNC void update_value(type2indexpair& val, IndexPair new_val) { val.set(new_val); } @@ -106,36 +106,36 @@ struct is_compile_time_constant { static constexpr bool value = false; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; -template +template struct is_compile_time_constant& > { static constexpr bool value = true; }; @@ -228,15 +228,15 @@ template -template +template struct tuple_coeff { template - EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple& t) { // return array_get(t) * (i == Idx) + tuple_coeff::get(i, t) * (i != Idx); return (i == Idx ? array_get(t) : tuple_coeff::get(i, t)); } template - EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT& value) { + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT& value) { if (i == Idx) { update_value(array_get(t), value); } else { @@ -245,7 +245,7 @@ struct tuple_coeff { } template - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple& t) { return ((i == Idx) & is_compile_time_constant::ValType>::value) || tuple_coeff::value_known_statically(i, t); } @@ -268,18 +268,18 @@ struct tuple_coeff { template struct tuple_coeff<0, ValueT> { template - EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple& t) { + EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple& t) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr return array_get<0>(t)/* * (i == 0)*/; } template - EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple& t, const ValueT value) { + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple& t, const ValueT value) { eigen_assert (i == 0); update_value(array_get<0>(t), value); } template - EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple&) { - return is_compile_time_constant::ValType>::value && (i == 0); + EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple&) { + return is_compile_time_constant::ValType>::value & (i == 0); } template @@ -298,43 +298,32 @@ struct tuple_coeff<0, ValueT> { template struct IndexList : internal::IndexTuple { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[] (const Index i) const { - return internal::tuple_coeff >::value-1, Index>::get(i, *this); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const { - return internal::tuple_coeff >::value-1, Index>::get(i, *this); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::get(i, *this); } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) { - return internal::tuple_coeff >::value-1, Index>::set(i, *this, value); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { + return internal::tuple_coeff >::value-1, DenseIndex>::set(i, *this, value); } EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other) : internal::IndexTuple(first, other...) { } EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple() { } - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { - return internal::tuple_coeff >::value-1, Index>::value_known_statically(i, *this); + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); } EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { - return internal::tuple_coeff >::value-1, Index>::values_up_to_known_statically(*this); + return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_known_statically(*this); } EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { - return internal::tuple_coeff >::value-1, Index>::values_up_to_statically_known_to_increase(*this); + return internal::tuple_coeff >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this); } }; -template -std::ostream& operator<<(std::ostream& os, - const IndexList& dims) { - os << "["; - for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) { - if (i > 0) os << ", "; - os << dims[i]; - } - os << "]"; - return os; -} template constexpr IndexList make_index_list(FirstType val1, OtherTypes... other_vals) { @@ -344,28 +333,26 @@ constexpr IndexList make_index_list(FirstType val1, Ot template struct IndexPairList : internal::IndexTuple { - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[] (const Index i) const { - return internal::tuple_coeff >::value-1, IndexPair>::get(i, *this); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair operator[] (const DenseIndex i) const { + return internal::tuple_coeff >::value-1, IndexPair>::get(i, *this); } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair value) { - return internal::tuple_coeff>::value-1, IndexPair >::set(i, *this, value); + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair value) { + return internal::tuple_coeff>::value-1, IndexPair >::set(i, *this, value); } EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple& other) : internal::IndexTuple(other) { } EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple() { } - EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const { - return internal::tuple_coeff >::value-1, Index>::value_known_statically(i, *this); + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff >::value-1, DenseIndex>::value_known_statically(i, *this); } }; namespace internal { -template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList& sizes) { - Index result = 1; - EIGEN_UNROLL_LOOP - for (size_t i = 0; i < array_size >::value; ++i) { +template size_t array_prod(const IndexList& sizes) { + size_t result = 1; + for (int i = 0; i < array_size >::value; ++i) { result *= sizes[i]; } return result; @@ -385,30 +372,30 @@ template struct array_size >::value; }; -template EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList& a) { +template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList& a) { return IndexTupleExtractor::get_val(a); } -template EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList& a) { +template EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(const IndexList& a) { return IndexTupleExtractor::get_val(a); } template struct index_known_statically_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return false; } }; template struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { return IndexList().value_known_statically(i); } }; template struct index_known_statically_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i) { return IndexList().value_known_statically(i); } }; @@ -460,14 +447,14 @@ template template struct index_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { return false; } }; template struct index_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & (IndexList().get(i) == value); } @@ -475,7 +462,7 @@ struct index_statically_eq_impl > { template struct index_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & (IndexList().get(i) == value); } @@ -484,14 +471,14 @@ struct index_statically_eq_impl > { template struct index_statically_ne_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { return false; } }; template struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & (IndexList().get(i) != value); } @@ -499,7 +486,7 @@ struct index_statically_ne_impl > { template struct index_statically_ne_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & (IndexList().get(i) != value); } @@ -508,14 +495,14 @@ struct index_statically_ne_impl > { template struct index_statically_gt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { return false; } }; template struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & (IndexList().get(i) > value); } @@ -523,7 +510,7 @@ struct index_statically_gt_impl > { template struct index_statically_gt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & (IndexList().get(i) > value); } @@ -533,14 +520,14 @@ struct index_statically_gt_impl > { template struct index_statically_lt_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { return false; } }; template struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & (IndexList().get(i) < value); } @@ -548,7 +535,7 @@ struct index_statically_lt_impl > { template struct index_statically_lt_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexList().value_known_statically(i) & (IndexList().get(i) < value); } @@ -558,14 +545,14 @@ struct index_statically_lt_impl > { template struct index_pair_first_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { return false; } }; template struct index_pair_first_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).first == value); } @@ -573,7 +560,7 @@ struct index_pair_first_statically_eq_impl struct index_pair_first_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).first == value); } @@ -583,14 +570,14 @@ struct index_pair_first_statically_eq_impl struct index_pair_second_statically_eq_impl { - EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { return false; } }; template struct index_pair_second_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).second == value); } @@ -598,7 +585,7 @@ struct index_pair_second_statically_eq_impl struct index_pair_second_statically_eq_impl > { - EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return IndexPairList().value_known_statically(i) & (IndexPairList().operator[](i).second == value); } @@ -615,7 +602,7 @@ namespace internal { template struct index_known_statically_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const Index) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { return false; } }; @@ -636,42 +623,42 @@ struct indices_statically_known_to_increase_impl { template struct index_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; template struct index_statically_ne_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; template struct index_statically_gt_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; template struct index_statically_lt_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; template struct index_pair_first_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; template struct index_pair_second_statically_eq_impl { - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(Index, Index) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; @@ -687,7 +674,7 @@ struct index_pair_second_statically_eq_impl { namespace Eigen { namespace internal { template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(Index i) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_known_statically(DenseIndex i) { return index_known_statically_impl::run(i); } @@ -702,32 +689,32 @@ static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool indices_statically_known_to_increa } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(Index i, Index value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_eq(DenseIndex i, DenseIndex value) { return index_statically_eq_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(Index i, Index value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_ne(DenseIndex i, DenseIndex value) { return index_statically_ne_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(Index i, Index value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_gt(DenseIndex i, DenseIndex value) { return index_statically_gt_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(Index i, Index value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_statically_lt(DenseIndex i, DenseIndex value) { return index_statically_lt_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(Index i, Index value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) { return index_pair_first_statically_eq_impl::run(i, value); } template -static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(Index i, Index value) { +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) { return index_pair_second_statically_eq_impl::run(i, value); } diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 7dadec7fb..f391fb9ee 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -31,7 +31,6 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; }; template @@ -85,24 +84,17 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = /*TensorEvaluator::IsAligned*/ false, PacketAccess = TensorEvaluator::PacketAccess, BlockAccess = false, - PreferBlockAccess = false, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_strides(op.strides()) { @@ -137,7 +129,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -152,7 +144,6 @@ struct TensorEvaluator, Device> eigen_assert(index < dimensions().TotalSize()); *inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { - EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (idx != idx / m_fastStrides[i] * m_strides[i]) { @@ -167,7 +158,6 @@ struct TensorEvaluator, Device> *inputIndex += index / m_strides[0]; return true; } else { - EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i]; if (idx != idx / m_fastStrides[i] * m_strides[i]) { @@ -203,7 +193,6 @@ struct TensorEvaluator, Device> eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -224,14 +213,7 @@ struct TensorEvaluator, Device> compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: Dimensions m_dimensions; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h index 26a3818f3..33edc49e3 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -32,7 +32,7 @@ struct Initializer { Eigen::array::Index, traits::NumDimensions>* indices, const InitList& vals) { int i = 0; - for (const auto& v : vals) { + for (auto v : vals) { (*indices)[traits::NumDimensions - N] = i++; Initializer::run(tensor, indices, v); } @@ -48,7 +48,7 @@ struct Initializer { const InitList& vals) { int i = 0; // There is likely a faster way to do that than iterating. - for (const auto& v : vals) { + for (auto v : vals) { (*indices)[traits::NumDimensions - 1] = i++; tensor.coeffRef(*indices) = v; } diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 6d5cce4aa..ede3939c2 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -21,7 +21,7 @@ namespace Eigen { * \brief Fast integer division by a constant. * * See the paper from Granlund and Montgomery for explanation. - * (at https://doi.org/10.1145/773473.178249) + * (at http://dx.doi.org/10.1145/773473.178249) * * \sa Tensor */ @@ -35,10 +35,8 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef EIGEN_GPU_COMPILE_PHASE +#ifdef __CUDA_ARCH__ return __clz(val); -#elif defined(SYCL_DEVICE_ONLY) - return cl::sycl::clz(val); #elif EIGEN_COMP_MSVC unsigned long index; _BitScanReverse(&index, val); @@ -53,10 +51,8 @@ namespace { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename internal::enable_if::type count_leading_zeros(const T val) { -#ifdef EIGEN_GPU_COMPILE_PHASE +#ifdef __CUDA_ARCH__ return __clzll(val); -#elif defined(SYCL_DEVICE_ONLY) - return static_cast(cl::sycl::clz(val)); #elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 unsigned long index; _BitScanReverse64(&index, val); @@ -90,10 +86,8 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) { -#if defined(EIGEN_GPU_COMPILE_PHASE) +#if defined(__CUDA_ARCH__) return __umulhi(a, b); -#elif defined(SYCL_DEVICE_ONLY) - return cl::sycl::mul_hi(a, static_cast(b)); #else return (static_cast(a) * b) >> 32; #endif @@ -101,11 +95,9 @@ namespace { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) { -#if defined(EIGEN_GPU_COMPILE_PHASE) +#if defined(__CUDA_ARCH__) return __umul64hi(a, b); -#elif defined(SYCL_DEVICE_ONLY) - return cl::sycl::mul_hi(a, static_cast(b)); -#elif EIGEN_HAS_BUILTIN_INT128 +#elif defined(__SIZEOF_INT128__) __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b); return static_cast(v >> 64); #else @@ -124,7 +116,7 @@ namespace { template struct DividerHelper<64, T> { static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) { -#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY) +#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__) return static_cast((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; @@ -167,7 +159,7 @@ struct TensorIntDivisor { shift2 = log_div > 1 ? log_div-1 : 0; } - // Must have 0 <= numerator. On platforms that don't support the __uint128_t + // Must have 0 <= numerator. On platforms that dont support the __uint128_t // type numerator should also be less than 2^32-1. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const { eigen_assert(static_cast::type>(numerator) < NumTraits::highest()/2); @@ -203,10 +195,8 @@ class TensorIntDivisor { } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const { -#ifdef EIGEN_GPU_COMPILE_PHASE +#ifdef __CUDA_ARCH__ return (__umulhi(magic, n) >> shift); -#elif defined(SYCL_DEVICE_ONLY) - return (cl::sycl::mul_hi(magic, static_cast(n)) >> shift); #else uint64_t v = static_cast(magic) * static_cast(n); return (static_cast(v >> 32) >> shift); diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index 05fa80e59..cd0109ef4 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -46,7 +46,6 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = traits::NumDimensions; static const int Layout = (traits::Layout == ColMajor) ? RowMajor : ColMajor; - typedef typename XprTraits::PointerType PointerType; }; template @@ -119,17 +118,11 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false, // to be implemented RawAccess = TensorEvaluator::RawAccess }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { @@ -138,22 +131,13 @@ struct TensorEvaluator, Device> } } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif - typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -175,9 +159,7 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC typename Storage::Type data() const { - return constCast(m_impl.data()); - } + EIGEN_DEVICE_FUNC Scalar* data() const { return m_impl.data(); } const TensorEvaluator& impl() const { return m_impl; } @@ -198,20 +180,14 @@ template enum { IsAligned = TensorEvaluator::IsAligned, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = (static_cast(TensorEvaluator::Layout) == static_cast(ColMajor)) ? RowMajor : ColMajor, CoordAccess = false // to be implemented }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } - + typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index af9e5db70..ee0078bbc 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -27,7 +27,7 @@ */ // SFINAE requires variadic templates -#if !defined(EIGEN_GPUCC) +#ifndef __CUDACC__ #if EIGEN_HAS_VARIADIC_TEMPLATES // SFINAE doesn't work for gcc <= 4.7 #ifdef EIGEN_COMP_GNUC @@ -51,43 +51,4 @@ #endif -#if EIGEN_OS_WIN || EIGEN_OS_WIN64 -#define EIGEN_SLEEP(n) Sleep(n) -#elif EIGEN_OS_GNULINUX -#define EIGEN_SLEEP(n) usleep(n * 1000); -#else -#define EIGEN_SLEEP(n) sleep(std::max(1, n/1000)) -#endif - -// Define a macro to use a reference on the host but a value on the device -#if defined(SYCL_DEVICE_ONLY) - #define EIGEN_DEVICE_REF -#else - #define EIGEN_DEVICE_REF & -#endif - -// Define a macro for catching SYCL exceptions if exceptions are enabled -#define EIGEN_SYCL_TRY_CATCH(X) \ - do { \ - EIGEN_TRY {X;} \ - EIGEN_CATCH(const cl::sycl::exception& e) { \ - EIGEN_THROW_X(std::runtime_error("SYCL exception at " + \ - std::string(__FILE__) + ":" + \ - std::to_string(__LINE__) + "\n" + \ - e.what())); \ - } \ - } while (false) - -// Define a macro if local memory flags are unset or one of them is set -// Setting both flags is the same as unsetting them -#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \ - (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)) - #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 - #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 -#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM) - #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1 -#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM) - #define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1 -#endif - #endif diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 172a6bab8..e4fc86a40 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -31,38 +31,20 @@ template class MakePoin public: typedef TensorMap Self; typedef typename PlainObjectType::Base Base; - #ifdef EIGEN_USE_SYCL - typedef typename Eigen::internal::remove_reference::type>::type Nested; - #else - typedef typename Eigen::internal::nested::type Nested; - #endif - typedef typename internal::traits::StorageKind StorageKind; + typedef typename Eigen::internal::nested::type Nested; + typedef typename internal::traits::StorageKind StorageKind; typedef typename internal::traits::Index Index; typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef typename Base::CoeffReturnType CoeffReturnType; + /* typedef typename internal::conditional< + bool(internal::is_lvalue::value), + Scalar *, + const Scalar *>::type + PointerType;*/ typedef typename MakePointer_::Type PointerType; - typedef typename MakePointer_::ConstType PointerConstType; - - // WARN: PointerType still can be a pointer to const (const Scalar*), for - // example in TensorMap> expression. This type of - // expression should be illegal, but adding this restriction is not possible - // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488). - typedef typename internal::conditional< - bool(internal::is_lvalue::value), - PointerType, // use simple pointer in lvalue expressions - PointerConstType // use const pointer in rvalue expressions - >::type StoragePointerType; - - // If TensorMap was constructed over rvalue expression (e.g. const Tensor), - // we should return a reference to const from operator() (and others), even - // if TensorMap itself is not const. - typedef typename internal::conditional< - bool(internal::is_lvalue::value), - Scalar&, - const Scalar& - >::type StorageRefType; + typedef PointerType PointerArgType; static const int Options = Options_; @@ -77,47 +59,47 @@ template class MakePoin }; EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr) : m_data(dataPtr), m_dimensions() { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension) : m_data(dataPtr), m_dimensions(firstDimension) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. EIGEN_STATIC_ASSERT((1 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2) : m_data(dataPtr), m_dimensions(dim1, dim2) { EIGEN_STATIC_ASSERT(2 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3) { EIGEN_STATIC_ASSERT(3 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4) { EIGEN_STATIC_ASSERT(4 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { + EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_data(dataPtr), m_dimensions(dim1, dim2, dim3, dim4, dim5) { EIGEN_STATIC_ASSERT(5 == NumIndices || NumIndices == Dynamic, YOU_MADE_A_PROGRAMMING_MISTAKE) } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const array& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const array& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, const Dimensions& dimensions) : m_data(dataPtr), m_dimensions(dimensions) { } @@ -134,12 +116,12 @@ template class MakePoin EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; } + EIGEN_STRONG_INLINE PointerType data() { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; } + EIGEN_STRONG_INLINE const PointerType data() const { return m_data; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) const + EIGEN_STRONG_INLINE const Scalar& operator()(const array& indices) const { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -152,14 +134,14 @@ template class MakePoin } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()() const + EIGEN_STRONG_INLINE const Scalar& operator()() const { EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) return m_data[0]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const + EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const { eigen_internal_assert(index >= 0 && index < size()); return m_data[index]; @@ -167,10 +149,9 @@ template class MakePoin #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const + EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); if (PlainObjectType::Options&RowMajor) { const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); return m_data[index]; @@ -181,7 +162,7 @@ template class MakePoin } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) const + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1) const { if (PlainObjectType::Options&RowMajor) { const Index index = i1 + i0 * m_dimensions[1]; @@ -192,7 +173,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) const + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2) const { if (PlainObjectType::Options&RowMajor) { const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); @@ -203,7 +184,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) const + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3) const { if (PlainObjectType::Options&RowMajor) { const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); @@ -214,7 +195,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const + EIGEN_STRONG_INLINE const Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) const { if (PlainObjectType::Options&RowMajor) { const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); @@ -227,7 +208,7 @@ template class MakePoin #endif EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(const array& indices) + EIGEN_STRONG_INLINE Scalar& operator()(const array& indices) { // eigen_assert(checkIndexRange(indices)); if (PlainObjectType::Options&RowMajor) { @@ -240,14 +221,14 @@ template class MakePoin } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()() + EIGEN_STRONG_INLINE Scalar& operator()() { EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE) return m_data[0]; } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index index) + EIGEN_STRONG_INLINE Scalar& operator()(Index index) { eigen_internal_assert(index >= 0 && index < size()); return m_data[index]; @@ -255,10 +236,9 @@ template class MakePoin #if EIGEN_HAS_VARIADIC_TEMPLATES template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) + EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic, "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor."); - eigen_assert(internal::all((Eigen::NumTraits::highest() >= otherIndices)...)); const std::size_t NumDims = sizeof...(otherIndices) + 2; if (PlainObjectType::Options&RowMajor) { const Index index = m_dimensions.IndexOfRowMajor(array{{firstIndex, secondIndex, otherIndices...}}); @@ -270,7 +250,7 @@ template class MakePoin } #else EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1) + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1) { if (PlainObjectType::Options&RowMajor) { const Index index = i1 + i0 * m_dimensions[1]; @@ -281,7 +261,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2) + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2) { if (PlainObjectType::Options&RowMajor) { const Index index = i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0); @@ -292,7 +272,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3) + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3) { if (PlainObjectType::Options&RowMajor) { const Index index = i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0)); @@ -303,7 +283,7 @@ template class MakePoin } } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE StorageRefType operator()(Index i0, Index i1, Index i2, Index i3, Index i4) + EIGEN_STRONG_INLINE Scalar& operator()(Index i0, Index i1, Index i2, Index i3, Index i4) { if (PlainObjectType::Options&RowMajor) { const Index index = i4 + m_dimensions[4] * (i3 + m_dimensions[3] * (i2 + m_dimensions[2] * (i1 + m_dimensions[1] * i0))); @@ -334,7 +314,7 @@ template class MakePoin } private: - StoragePointerType m_data; + typename MakePointer_::Type m_data; Dimensions m_dimensions; }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index a3a750f21..615559d44 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,13 +52,11 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) - -typedef ulonglong2 Packet4h2; -template<> +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) +template <> struct PacketType { - typedef Packet4h2 type; - static const int size = 8; + typedef half2 type; + static const int size = 2; enum { HasAdd = 1, HasSub = 1, @@ -77,7 +75,6 @@ struct PacketType { HasSqrt = 1, HasRsqrt = 1, HasExp = 1, - HasExpm1 = 0, HasLog = 1, HasLog1p = 0, HasLog10 = 0, @@ -87,57 +84,9 @@ struct PacketType { #endif #if defined(EIGEN_USE_SYCL) - -namespace TensorSycl { -namespace internal { - -template struct PlusOp { - static constexpr Index Value = A + B; -}; - -template struct DivOp { - static constexpr Index Value = A / B; -}; - -template class StepOp> -struct static_for { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) { - op(start); - static_for::Value, end, step, - StepOp>::loop(op); - } -}; -template class StepOp> -struct static_for { - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {} -}; - -template -struct Vectorise { - static const int PacketSize = 1; - typedef OutScalar PacketReturnType; -}; - -template -struct Vectorise { - static const int PacketSize = Eigen::PacketType::size; - typedef typename Eigen::PacketType::type PacketReturnType; -}; - -static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) { - return ((((x) + (y)-1) / (y)) * (y)); -} - -} // namespace internal -} // namespace TensorSycl - -template <> - struct PacketType { - typedef half type; +template + struct PacketType { + typedef T type; static const int size = 1; enum { HasAdd = 0, @@ -154,59 +103,9 @@ template <> HasBlend = 0 }; }; -template -struct PacketType : internal::default_packet_traits { - typedef Scalar type; - typedef Scalar half; - enum { - Vectorizable = 0, - size = 1, - AlignedOnScalar = 0, - HasHalfPacket = 0 - }; - enum { - HasAdd = 0, - HasSub = 0, - HasMul = 0, - HasNegate = 0, - HasAbs = 0, - HasAbs2 = 0, - HasMin = 0, - HasMax = 0, - HasConj = 0, - HasSetLinear = 0 - }; - -}; - -template -struct PacketType : PacketType{}; - -#ifndef EIGEN_DONT_VECTORIZE_SYCL -#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)\ -template<> struct PacketType : internal::sycl_packet_traits \ -{\ - typedef typename internal::packet_traits::type type;\ - typedef typename internal::packet_traits::half half;\ -}; - - -PACKET_TYPE(const, float, 1, 4, SyclDevice) -PACKET_TYPE(, float, 1, 4, SyclDevice) -PACKET_TYPE(const, float, 1, 4, const SyclDevice) -PACKET_TYPE(, float, 1, 4, const SyclDevice) - -PACKET_TYPE(const, double, 0, 2, SyclDevice) -PACKET_TYPE(, double, 0, 2, SyclDevice) -PACKET_TYPE(const, double, 0, 2, const SyclDevice) -PACKET_TYPE(, double, 0, 2, const SyclDevice) -#undef PACKET_TYPE - -template<> struct PacketType: PacketType{}; -template<> struct PacketType: PacketType{}; -#endif #endif + // Tuple mimics std::pair but works on e.g. nvcc. template struct Tuple { public: @@ -224,9 +123,7 @@ template struct Tuple { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tuple& operator= (const Tuple& rhs) { - #ifndef SYCL_DEVICE_ONLY if (&rhs == this) return *this; - #endif first = rhs.first; second = rhs.second; return *this; @@ -271,12 +168,12 @@ template struct IndexPair { #ifdef EIGEN_HAS_SFINAE namespace internal { - template + template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType& idx, numeric_list) { return { idx[Is]... }; } - template + template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType&, numeric_list) { return array(); diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index f107d1b19..d34f1e328 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -31,13 +31,12 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorReshapingOpEIGEN_DEVICE_REF type; + typedef const TensorReshapingOp& type; }; template @@ -102,62 +101,14 @@ struct TensorEvaluator, Device> typedef TensorReshapingOp XprType; typedef NewDimensions Dimensions; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - typedef StorageMemory::type, Device> ConstCastStorage; - - static const int NumOutputDims = internal::array_size::value; - static const int NumInputDims = internal::array_size::Dimensions>::value; - - enum ReshapingKind { - // We do not use layout information to determine reshaping kind. - // Depending on the layout `N` can be inner or outer dimension. - OneByN = 0, // expr.reshape(1, N) - NByOne = 1, // expr.reshape(N, 1) - Runtime = 2 // Reshape dimensions are dynamic (specified at runtime). - }; - - // clang-format off - static const ReshapingKind kind = -#if defined(EIGEN_HAS_INDEX_LIST) - (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/0, /*value=*/1)) ? OneByN - : (NumOutputDims == 2 && internal::index_statically_eq(/*index=*/1, /*value=*/1)) ? NByOne - : Runtime; -#else - Runtime; -#endif - // clang-format on - enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - // For trivial reshapes with raw access to underlying data we will provide - // zero overhead block access. - // TODO(ezhulenev): Consider adding block access without raw access? - BlockAccess = TensorEvaluator::RawAccess && - NumInputDims > 0 && NumOutputDims > 0, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; - typedef typename internal::remove_const::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef - typename internal::TensorMaterializedBlock - TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { @@ -166,17 +117,14 @@ struct TensorEvaluator, Device> eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions())); } + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType data, EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(data, std::move(done)); - } -#endif - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { return m_impl.evalSubExprsIfNeeded(data); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { @@ -198,53 +146,10 @@ struct TensorEvaluator, Device> return m_impl.costPerCoeff(vectorized); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - return internal::TensorBlockResourceRequirements::any(); - } - - // required in block(OutputTensorBlock* output_block) const - // For C++03 compatibility this must be defined outside the method - struct BlockIteratorState { - Index stride; - Index span; - Index size; - Index count; - }; - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - eigen_assert(m_impl.data() != NULL); - eigen_assert((kind == Runtime) || - (kind == OneByN && desc.dimensions()[0] == 1) || - (kind == NByOne && desc.dimensions()[1] == 1)); - - if (kind == OneByN || kind == NByOne) { - // We can guarantee at compile time that block is just a contiguous slice - // of the underlying expression memory buffer. - return TensorBlock(internal::TensorBlockKind::kView, - m_impl.data() + desc.offset(), desc.dimensions()); - } else { - // This will do additional runtime checks, and in the end it might be also - // a view, or it might be a block materialized in the temporary buffer. - return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, - scratch); - } - } - - EIGEN_DEVICE_FUNC typename Storage::Type data() const { - return constCast(m_impl.data()); - } + EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast(m_impl.data()); } EIGEN_DEVICE_FUNC const TensorEvaluator& impl() const { return m_impl; } - #ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } - #endif protected: TensorEvaluator m_impl; NewDimensions m_dimensions; @@ -262,13 +167,11 @@ template typedef NewDimensions Dimensions; enum { - IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::RawAccess, - PreferBlockAccess = false, - Layout = TensorEvaluator::Layout, - CoordAccess = false, // to be implemented - RawAccess = TensorEvaluator::RawAccess + IsAligned = TensorEvaluator::IsAligned, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, // to be implemented + RawAccess = TensorEvaluator::RawAccess }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) @@ -280,38 +183,15 @@ template typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor - TensorBlockDesc; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { return this->m_impl.coeffRef(index); } - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { this->m_impl.template writePacket(index, x); } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( - const TensorBlockDesc& desc, const TensorBlock& block) { - assert(this->m_impl.data() != NULL); - - typedef typename TensorBlock::XprType TensorBlockExpr; - typedef internal::TensorBlockAssignment< - Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index> - TensorBlockAssign; - - TensorBlockAssign::Run( - TensorBlockAssign::target(desc.dimensions(), - internal::strides(this->dimensions()), - this->m_impl.data(), desc.offset()), - block.expr()); - } }; @@ -334,13 +214,12 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorSlicingOpEIGEN_DEVICE_REF type; + typedef const TensorSlicingOp& type; }; template @@ -404,12 +283,9 @@ class TensorSlicingOp : public TensorBase struct MemcpyTriggerForSlicing { +template struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } - EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { - const bool prefer_block_evaluation = BlockAccess && total > 32*1024; - return !prefer_block_evaluation && contiguous > threshold_; - } + EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > threshold_; } private: Index threshold_; @@ -418,21 +294,11 @@ template struct MemcpyTrigge // It is very expensive to start the memcpy kernel on GPU: we therefore only // use it for large copies. #ifdef EIGEN_USE_GPU -template struct MemcpyTriggerForSlicing { +template struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } + EIGEN_DEVICE_FUNC bool operator ()(Index val) const { return val > 4*1024*1024; } }; #endif - -// It is very expensive to start the memcpy kernel on GPU: we therefore only -// use it for large copies. -#ifdef EIGEN_USE_SYCL -template struct MemcpyTriggerForSlicing { - EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) { } - EIGEN_DEVICE_FUNC bool operator ()(Index, Index contiguous) const { return contiguous > 4*1024*1024; } -}; -#endif - } // Eval as rvalue @@ -442,60 +308,23 @@ struct TensorEvaluator, Devi typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef Sizes Dimensions; - typedef StorageMemory Storage; - typedef StorageMemory::type, Device> ConstCastStorage; - typedef typename Storage::Type EvaluatorPointerType; - enum { // Alignment can't be guaranteed at compile time since it depends on the // slice offsets and sizes. - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess && - // FIXME: Temporary workaround for bug in slicing of bool tensors. - !internal::is_same::type, bool>::value, - PreferBlockAccess = true, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = false + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false }; - typedef typename internal::remove_const::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - // Tensor slicing does not change the block type. - typedef typename TensorEvaluator::TensorBlock - TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { - for (Index i = 0; i < internal::array_size::value; ++i) { + for (std::size_t i = 0; i < internal::array_size::value; ++i) { eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); } - m_is_identity = true; - for (int i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_impl.dimensions()[i] >= - op.sizes()[i] + op.startIndices()[i]); - if (m_impl.dimensions()[i] != op.sizes()[i] || - op.startIndices()[i] != 0) { - m_is_identity = false; - } - } - - // No strides for scalars. - if (NumDims == 0) return; - const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); const Sizes& output_dims = op.sizes(); if (static_cast(Layout) == static_cast(ColMajor)) { @@ -525,12 +354,18 @@ struct TensorEvaluator, Devi } } + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Sizes Dimensions; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); - if (!NumTraits::type>::RequireInitialization - && data && m_impl.data()) { + if (!NumTraits::type>::RequireInitialization && data && m_impl.data()) { Index contiguous_values = 1; if (static_cast(Layout) == static_cast(ColMajor)) { for (int i = 0; i < NumDims; ++i) { @@ -548,12 +383,12 @@ struct TensorEvaluator, Devi } } // Use memcpy if it's going to be faster than using the regular evaluation. - const MemcpyTriggerForSlicing trigger(m_device); - if (trigger(internal::array_prod(dimensions()), contiguous_values)) { - EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); - for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { + const MemcpyTriggerForSlicing trigger(m_device); + if (trigger(contiguous_values)) { + Scalar* src = (Scalar*)m_impl.data(); + for (int i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { Index offset = srcCoeff(i); - m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src+offset), contiguous_values * sizeof(Scalar)); + m_device.memcpy((void*)(data+i), src+offset, contiguous_values * sizeof(Scalar)); } return false; } @@ -561,42 +396,25 @@ struct TensorEvaluator, Devi return true; } -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType data, EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if (m_is_identity) { - return m_impl.coeff(index); - } else { - return m_impl.coeff(srcCoeff(index)); - } + return m_impl.coeff(srcCoeff(index)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int packetSize = PacketType::size; + const int packetSize = internal::unpacket_traits::size; EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); - if (m_is_identity) { - return m_impl.template packet(index); - } - Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { - EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -608,7 +426,6 @@ struct TensorEvaluator, Devi inputIndices[0] += (indices[0] + m_offsets[0]); inputIndices[1] += (indices[1] + m_offsets[0]); } else { - EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / m_fastOutputStrides[i]; const Index idx1 = indices[1] / m_fastOutputStrides[i]; @@ -628,7 +445,6 @@ struct TensorEvaluator, Devi EIGEN_ALIGN_MAX typename internal::remove_const::type values[packetSize]; values[0] = m_impl.coeff(inputIndices[0]); values[packetSize-1] = m_impl.coeff(inputIndices[1]); - EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { values[i] = coeff(index+i); } @@ -638,28 +454,12 @@ struct TensorEvaluator, Devi } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { - return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims); + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - const size_t target_size = m_device.lastLevelCacheSize(); - return internal::TensorBlockResourceRequirements::merge( - internal::TensorBlockResourceRequirements::skewed(target_size), - m_impl.getResourceRequirements()); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset())); - TensorBlock block = m_impl.block(arg_desc, scratch); - if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer(); - return block; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { - typename Storage::Type result = constCast(m_impl.data()); + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + Scalar* result = m_impl.data(); if (result) { Index offset = 0; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -693,19 +493,12 @@ struct TensorEvaluator, Devi } return NULL; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const { Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { - EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -713,7 +506,6 @@ struct TensorEvaluator, Devi } inputIndex += (index + m_offsets[0]); } else { - EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += (idx + m_offsets[i]) * m_inputStrides[i]; @@ -728,9 +520,8 @@ struct TensorEvaluator, Devi array, NumDims> m_fastOutputStrides; array m_inputStrides; TensorEvaluator m_impl; - const Device EIGEN_DEVICE_REF m_device; + const Device& m_device; Dimensions m_dimensions; - bool m_is_identity; const StartIndices m_offsets; }; @@ -744,55 +535,36 @@ struct TensorEvaluator, Device> typedef TensorSlicingOp XprType; static const int NumDims = internal::array_size::value; + enum { + IsAligned = /*TensorEvaluator::IsAligned*/false, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = false, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Sizes Dimensions; - enum { - IsAligned = false, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess, - PreferBlockAccess = true, - Layout = TensorEvaluator::Layout, - CoordAccess = false, - RawAccess = (NumDims == 1) & TensorEvaluator::RawAccess - }; - - typedef typename internal::remove_const::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - //===--------------------------------------------------------------------===// - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : Base(op, device) - { } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - if (this->m_is_identity) { - return this->m_impl.coeffRef(index); - } else { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } + return this->m_impl.coeffRef(this->srcCoeff(index)); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - if (this->m_is_identity) { - this->m_impl.template writePacket(index, x); - return; - } - - const int packetSize = PacketType::size; + const int packetSize = internal::unpacket_traits::size; Index inputIndices[] = {0, 0}; Index indices[] = {index, index + packetSize - 1}; if (static_cast(Layout) == static_cast(ColMajor)) { - EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -804,7 +576,6 @@ struct TensorEvaluator, Device> inputIndices[0] += (indices[0] + this->m_offsets[0]); inputIndices[1] += (indices[1] + this->m_offsets[0]); } else { - EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx0 = indices[0] / this->m_fastOutputStrides[i]; const Index idx1 = indices[1] / this->m_fastOutputStrides[i]; @@ -824,21 +595,15 @@ struct TensorEvaluator, Device> internal::pstore(values, x); this->m_impl.coeffRef(inputIndices[0]) = values[0]; this->m_impl.coeffRef(inputIndices[1]) = values[packetSize-1]; - EIGEN_UNROLL_LOOP for (int i = 1; i < packetSize-1; ++i) { this->coeffRef(index+i) = values[i]; } } } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock( - const TensorBlockDesc& desc, const TensorBlock& block) { - TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset())); - this->m_impl.writeBlock(arg_desc, block); - } }; + + namespace internal { template struct traits > : public traits @@ -851,13 +616,12 @@ struct traits::type _Nested; static const int NumDimensions = array_size::value; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; }; template struct eval, Eigen::Dense> { - typedef const TensorStridingSlicingOpEIGEN_DEVICE_REF type; + typedef const TensorStridingSlicingOp& type; }; template @@ -930,13 +694,6 @@ struct TensorEvaluator XprType; static const int NumDims = internal::array_size::value; - typedef typename XprType::Index Index; - typedef typename XprType::Scalar Scalar; - typedef typename XprType::CoeffReturnType CoeffReturnType; - typedef typename PacketType::type PacketReturnType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - typedef Strides Dimensions; enum { // Alignment can't be guaranteed at compile time since it depends on the @@ -944,60 +701,43 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), - m_device(device), - m_strides(op.strides()) + : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) { // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero - DSizes startIndicesClamped, stopIndicesClamped; - for (ptrdiff_t i = 0; i < internal::array_size::value; ++i) { + DSizes startIndicesClamped, stopIndicesClamped; + for (size_t i = 0; i < internal::array_size::value; ++i) { eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); - if (m_strides[i] > 0) { - startIndicesClamped[i] = - clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); - stopIndicesClamped[i] = - clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); - } else { - /* implies m_strides[i] < 0 by assert */ - startIndicesClamped[i] = - clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); - stopIndicesClamped[i] = - clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + if(m_strides[i]>0){ + startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + }else{ + /* implies m_strides[i]<0 by assert */ + startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); } m_startIndices[i] = startIndicesClamped[i]; } - typedef typename TensorEvaluator::Dimensions InputDimensions; - const InputDimensions& input_dims = m_impl.dimensions(); + const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); // check for degenerate intervals and compute output tensor shape - bool degenerate = false; - m_is_identity = true; - for (int i = 0; i < NumDims; i++) { + bool degenerate = false;; + for(int i = 0; i < NumDims; i++){ Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; - if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) { + if(interval == 0 || ((interval<0) != (m_strides[i]<0))){ m_dimensions[i] = 0; degenerate = true; - } else { - m_dimensions[i] = - (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0); + }else{ + m_dimensions[i] = interval / m_strides[i] + + (interval % m_strides[i] != 0 ? 1 : 0); eigen_assert(m_dimensions[i] >= 0); } - if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) { - m_is_identity = false; - } } - Strides output_dims = m_dimensions; if (static_cast(Layout) == static_cast(ColMajor)) { @@ -1034,12 +774,22 @@ struct TensorEvaluator(degenerate ? 1 : m_outputStrides[i]); } } + m_block_total_size_max = numext::maxi(static_cast(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); } + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType::type PacketReturnType; + typedef Strides Dimensions; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -1050,39 +800,28 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)) { - EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i >= 0; --i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; index -= idx * m_outputStrides[i]; } } else { - EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims; ++i) { const Index idx = index / m_fastOutputStrides[i]; inputIndex += idx * m_inputStrides[i] + m_offsets[i]; @@ -1092,24 +831,20 @@ struct TensorEvaluator m_outputStrides; array, NumDims> m_fastOutputStrides; array m_inputStrides; - bool m_is_identity; TensorEvaluator m_impl; - const Device EIGEN_DEVICE_REF m_device; + const Device& m_device; DSizes m_startIndices; // clamped startIndices DSizes m_dimensions; DSizes m_offsets; // offset in a flattened shape const Strides m_strides; + std::size_t m_block_total_size_max; }; // Eval as lvalue @@ -1125,33 +860,25 @@ struct TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = TensorEvaluator::CoordAccess, RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } typedef typename XprType::Index Index; typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const::type ScalarNonConst; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; typedef Strides Dimensions; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) { - if (this->m_is_identity) { - return this->m_impl.coeffRef(index); - } else { - return this->m_impl.coeffRef(this->srcCoeff(index)); - } + return this->m_impl.coeffRef(this->srcCoeff(index)); } }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 561666c6f..647bcf108 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -31,7 +31,6 @@ struct traits > : public traits::type _Nested; static const int NumDimensions = XprTraits::NumDimensions; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; }; template @@ -91,33 +90,18 @@ struct TensorEvaluator, Device typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; enum { - IsAligned = true, - PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::RawAccess, - PreferBlockAccess = true, - Layout = TensorEvaluator::Layout, - CoordAccess = true, - RawAccess = false + IsAligned = true, + PacketAccess = TensorEvaluator::PacketAccess, + Layout = TensorEvaluator::Layout, + CoordAccess = true, + RawAccess = false }; - typedef typename internal::remove_const::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; - typedef internal::TensorBlockScratchAllocator TensorBlockScratch; - - typedef typename internal::TensorMaterializedBlock - TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) + : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()) { // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector @@ -151,19 +135,10 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - -#ifdef EIGEN_USE_THREADS - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( - EvaluatorPointerType, EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); - } -#endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -173,7 +148,6 @@ struct TensorEvaluator, Device eigen_assert(index < dimensions().TotalSize()); Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { - EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; if (isPaddingAtIndexForDim(idx, i)) { @@ -187,7 +161,6 @@ struct TensorEvaluator, Device } inputIndex += (index - m_padding[0].first); } else { - EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i+1]; if (isPaddingAtIndexForDim(idx, i)) { @@ -216,298 +189,18 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { TensorOpCost cost = m_impl.costPerCoeff(vectorized); if (static_cast(Layout) == static_cast(ColMajor)) { - EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims; ++i) updateCostPerDimension(cost, i, i == 0); } else { - EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i >= 0; --i) updateCostPerDimension(cost, i, i == NumDims - 1); } return cost; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - internal::TensorBlockResourceRequirements getResourceRequirements() const { - const size_t target_size = m_device.lastLevelCacheSize(); - return internal::TensorBlockResourceRequirements::merge( - internal::TensorBlockResourceRequirements::skewed(target_size), - m_impl.getResourceRequirements()); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock - block(TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool /*root_of_expr_ast*/ = false) const { - // If one of the dimensions is zero, return empty block view. - if (desc.size() == 0) { - return TensorBlock(internal::TensorBlockKind::kView, NULL, - desc.dimensions()); - } - - static const bool IsColMajor = Layout == static_cast(ColMajor); - const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1; - - Index offset = desc.offset(); - - // Compute offsets in the output tensor corresponding to the desc.offset(). - DSizes output_offsets; - for (int i = NumDims - 1; i > 0; --i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - const int stride_dim = IsColMajor ? dim : dim + 1; - output_offsets[dim] = offset / m_outputStrides[stride_dim]; - offset -= output_offsets[dim] * m_outputStrides[stride_dim]; - } - output_offsets[inner_dim_idx] = offset; - - // Offsets in the input corresponding to output offsets. - DSizes input_offsets = output_offsets; - for (int i = 0; i < NumDims; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - input_offsets[dim] = input_offsets[dim] - m_padding[dim].first; - } - - // Compute offset in the input buffer (at this point it might be illegal and - // point outside of the input buffer, because we don't check for negative - // offsets, it will be autocorrected in the block iteration loop below). - Index input_offset = 0; - for (int i = 0; i < NumDims; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - input_offset += input_offsets[dim] * m_inputStrides[dim]; - } - - // Destination buffer and scratch buffer both indexed from 0 and have the - // same dimensions as the requested block (for destination buffer this - // property is guaranteed by `desc.destination()`). - Index output_offset = 0; - const DSizes output_strides = - internal::strides(desc.dimensions()); - - // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1` - // dimensions, skipping innermost dimension. In theory it should be possible - // to squeeze matching innermost dimensions, however in practice that did - // not show any improvements in benchmarks. Also in practice first outer - // dimension usually has padding, and will prevent squeezing. - - // Initialize output block iterator state. Dimension in this array are - // always in inner_most -> outer_most order (col major layout). - array it; - for (int i = 0; i < NumDims - 1; ++i) { - const int dim = IsColMajor ? i + 1 : NumDims - i - 2; - it[i].count = 0; - it[i].size = desc.dimension(dim); - - it[i].input_stride = m_inputStrides[dim]; - it[i].input_span = it[i].input_stride * (it[i].size - 1); - - it[i].output_stride = output_strides[dim]; - it[i].output_span = it[i].output_stride * (it[i].size - 1); - } - - const Index input_inner_dim_size = - static_cast(m_impl.dimensions()[inner_dim_idx]); - - // Total output size. - const Index output_size = desc.size(); - - // We will fill inner dimension of this size in the output. It might be - // larger than the inner dimension in the input, so we might have to pad - // before/after we copy values from the input inner dimension. - const Index output_inner_dim_size = desc.dimension(inner_dim_idx); - - // How many values to fill with padding BEFORE reading from the input inner - // dimension. - const Index output_inner_pad_before_size = - input_offsets[inner_dim_idx] < 0 - ? numext::mini(numext::abs(input_offsets[inner_dim_idx]), - output_inner_dim_size) - : 0; - - // How many values we can actually copy from the input inner dimension. - const Index output_inner_copy_size = numext::mini( - // Want to copy from input. - (output_inner_dim_size - output_inner_pad_before_size), - // Can copy from input. - numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] + - output_inner_pad_before_size), - Index(0))); - - eigen_assert(output_inner_copy_size >= 0); - - // How many values to fill with padding AFTER reading from the input inner - // dimension. - const Index output_inner_pad_after_size = - (output_inner_dim_size - output_inner_copy_size - - output_inner_pad_before_size); - - // Sanity check, sum of all sizes must be equal to the output size. - eigen_assert(output_inner_dim_size == - (output_inner_pad_before_size + output_inner_copy_size + - output_inner_pad_after_size)); - - // Keep track of current coordinates and padding in the output. - DSizes output_coord = output_offsets; - DSizes output_padded; - for (int i = 0; i < NumDims; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); - } - - typedef internal::StridedLinearBufferCopy LinCopy; - - // Prepare storage for the materialized padding result. - const typename TensorBlock::Storage block_storage = - TensorBlock::prepareStorage(desc, scratch); - - // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a - // single logical inner dimension. - - // When possible we squeeze writes for the innermost (only if non-padded) - // dimension with the first padded dimension. This allows to reduce the - // number of calls to LinCopy and better utilize vector instructions. - const bool squeeze_writes = - NumDims > 1 && - // inner dimension is not padded - (input_inner_dim_size == m_dimensions[inner_dim_idx]) && - // and equal to the block inner dimension - (input_inner_dim_size == output_inner_dim_size); - - const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1; - - // Maximum coordinate on a squeeze dimension that we can write to. - const Index squeeze_max_coord = - squeeze_writes ? numext::mini( - // max non-padded element in the input - static_cast(m_dimensions[squeeze_dim] - - m_padding[squeeze_dim].second), - // max element in the output buffer - static_cast(output_offsets[squeeze_dim] + - desc.dimension(squeeze_dim))) - : static_cast(0); - - // Iterate copying data from `m_impl.data()` to the output buffer. - for (Index size = 0; size < output_size;) { - // Detect if we are in the padded region (exclude innermost dimension). - bool is_padded = false; - for (int j = 1; j < NumDims; ++j) { - const int dim = IsColMajor ? j : NumDims - j - 1; - is_padded = output_padded[dim]; - if (is_padded) break; - } - - if (is_padded) { - // Fill single innermost dimension with padding value. - size += output_inner_dim_size; - - LinCopy::template Run( - typename LinCopy::Dst(output_offset, 1, block_storage.data()), - typename LinCopy::Src(0, 0, &m_paddingValue), - output_inner_dim_size); - - - } else if (squeeze_writes) { - // Squeeze multiple reads from innermost dimensions. - const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim]; - size += output_inner_dim_size * squeeze_num; - - // Copy `squeeze_num` inner dimensions from input to output. - LinCopy::template Run( - typename LinCopy::Dst(output_offset, 1, block_storage.data()), - typename LinCopy::Src(input_offset, 1, m_impl.data()), - output_inner_dim_size * squeeze_num); - - // Update iteration state for only `squeeze_num - 1` processed inner - // dimensions, because we have another iteration state update at the end - // of the loop that will update iteration state for the last inner - // processed dimension. - it[0].count += (squeeze_num - 1); - input_offset += it[0].input_stride * (squeeze_num - 1); - output_offset += it[0].output_stride * (squeeze_num - 1); - output_coord[squeeze_dim] += (squeeze_num - 1); - - } else { - // Single read from innermost dimension. - size += output_inner_dim_size; - - { // Fill with padding before copying from input inner dimension. - const Index out = output_offset; - - LinCopy::template Run( - typename LinCopy::Dst(out, 1, block_storage.data()), - typename LinCopy::Src(0, 0, &m_paddingValue), - output_inner_pad_before_size); - } - - { // Copy data from input inner dimension. - const Index out = output_offset + output_inner_pad_before_size; - const Index in = input_offset + output_inner_pad_before_size; - - eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL); - - LinCopy::template Run( - typename LinCopy::Dst(out, 1, block_storage.data()), - typename LinCopy::Src(in, 1, m_impl.data()), - output_inner_copy_size); - } - - { // Fill with padding after copying from input inner dimension. - const Index out = output_offset + output_inner_pad_before_size + - output_inner_copy_size; - - LinCopy::template Run( - typename LinCopy::Dst(out, 1, block_storage.data()), - typename LinCopy::Src(0, 0, &m_paddingValue), - output_inner_pad_after_size); - } - } - - for (int j = 0; j < NumDims - 1; ++j) { - const int dim = IsColMajor ? j + 1 : NumDims - j - 2; - - if (++it[j].count < it[j].size) { - input_offset += it[j].input_stride; - output_offset += it[j].output_stride; - output_coord[dim] += 1; - output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); - break; - } - it[j].count = 0; - input_offset -= it[j].input_span; - output_offset -= it[j].output_span; - output_coord[dim] -= it[j].size - 1; - output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim); - } - } - - return block_storage.AsTensorMaterializedBlock(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: - struct BlockIteratorState { - BlockIteratorState() - : count(0), - size(0), - input_stride(0), - input_span(0), - output_stride(0), - output_span(0) {} - - Index count; - Index size; - Index input_stride; - Index input_span; - Index output_stride; - Index output_span; - }; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( Index index, int dim_index) const { #if defined(EIGEN_HAS_INDEX_LIST) @@ -569,23 +262,22 @@ struct TensorEvaluator, Device const Index initialIndex = index; Index inputIndex = 0; - EIGEN_UNROLL_LOOP for (int i = NumDims - 1; i > 0; --i) { - const Index firstIdx = index; - const Index lastIdx = index + PacketSize - 1; + const Index first = index; + const Index last = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; const Index lastPaddedRight = m_outputStrides[i+1]; - if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. const Index idx = index / m_outputStrides[i]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -597,21 +289,21 @@ struct TensorEvaluator, Device } } - const Index lastIdx = index + PacketSize - 1; - const Index firstIdx = index; + const Index last = index + PacketSize - 1; + const Index first = index; const Index lastPaddedLeft = m_padding[0].first; const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); const Index lastPaddedRight = m_outputStrides[1]; - if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. inputIndex += (index - m_padding[0].first); return m_impl.template packet(inputIndex); @@ -627,23 +319,23 @@ struct TensorEvaluator, Device const Index initialIndex = index; Index inputIndex = 0; - EIGEN_UNROLL_LOOP + for (int i = 0; i < NumDims - 1; ++i) { - const Index firstIdx = index; - const Index lastIdx = index + PacketSize - 1; + const Index first = index; + const Index last = index + PacketSize - 1; const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i+1]; const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; const Index lastPaddedRight = m_outputStrides[i]; - if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. const Index idx = index / m_outputStrides[i+1]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -655,21 +347,21 @@ struct TensorEvaluator, Device } } - const Index lastIdx = index + PacketSize - 1; - const Index firstIdx = index; + const Index last = index + PacketSize - 1; + const Index first = index; const Index lastPaddedLeft = m_padding[NumDims-1].first; const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); const Index lastPaddedRight = m_outputStrides[NumDims-1]; - if (!isLeftPaddingCompileTimeZero(NumDims-1) && lastIdx < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if (!isRightPaddingCompileTimeZero(NumDims-1) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1(m_paddingValue); } - else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) { + else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. inputIndex += (index - m_padding[NumDims-1].first); return m_impl.template packet(inputIndex); @@ -681,7 +373,6 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { values[i] = coeff(index+i); } @@ -696,8 +387,6 @@ struct TensorEvaluator, Device PaddingDimensions m_padding; Scalar m_paddingValue; - - const Device EIGEN_DEVICE_REF m_device; }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 64a436e50..886a254f6 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -31,7 +31,6 @@ struct traits > : public traits typedef typename remove_reference::type _Nested; static const int NumDimensions = XprTraits::NumDimensions + 1; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; }; template @@ -88,25 +87,17 @@ struct TensorEvaluator, Device> typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const int PacketSize = PacketType::size; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, PacketAccess = TensorEvaluator::PacketAccess, - BlockAccess = false, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, RawAccess = false }; - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { @@ -152,7 +143,7 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } @@ -170,7 +161,6 @@ struct TensorEvaluator, Device> Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index]; Index inputIndex = 0; if (static_cast(Layout) == static_cast(ColMajor)) { - EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i > 0; --i) { const Index patchIdx = patchIndex / m_patchStrides[i]; patchIndex -= patchIdx * m_patchStrides[i]; @@ -179,7 +169,6 @@ struct TensorEvaluator, Device> inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i]; } } else { - EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 2; ++i) { const Index patchIdx = patchIndex / m_patchStrides[i]; patchIndex -= patchIdx * m_patchStrides[i]; @@ -207,7 +196,6 @@ struct TensorEvaluator, Device> Index inputIndices[2] = {0, 0}; if (static_cast(Layout) == static_cast(ColMajor)) { - EIGEN_UNROLL_LOOP for (int i = NumDims - 2; i > 0; --i) { const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; @@ -223,7 +211,6 @@ struct TensorEvaluator, Device> inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i]; } } else { - EIGEN_UNROLL_LOOP for (int i = 0; i < NumDims - 2; ++i) { const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]}; @@ -250,7 +237,6 @@ struct TensorEvaluator, Device> EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize]; values[0] = m_impl.coeff(inputIndices[0]); values[PacketSize-1] = m_impl.coeff(inputIndices[1]); - EIGEN_UNROLL_LOOP for (int i = 1; i < PacketSize-1; ++i) { values[i] = coeff(index+i); } @@ -267,14 +253,7 @@ struct TensorEvaluator, Device> TensorOpCost(0, 0, compute_cost, vectorized, PacketSize); } - EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; } - -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - } -#endif + EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: Dimensions m_dimensions; @@ -283,7 +262,6 @@ struct TensorEvaluator, Device> array m_patchStrides; TensorEvaluator m_impl; - }; } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 445248163..1655a813e 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -2,7 +2,6 @@ // for linear algebra. // // Copyright (C) 2016 Benoit Steiner -// Copyright (C) 2018 Mehdi Goli Codeplay Software Ltd. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -17,10 +16,10 @@ namespace internal { namespace { EIGEN_DEVICE_FUNC uint64_t get_random_seed() { -#if defined(EIGEN_GPU_COMPILE_PHASE) +#ifdef __CUDA_ARCH__ // We don't support 3d kernels since we currently only use 1 and // 2d kernels. - gpu_assert(threadIdx.z == 0); + assert(threadIdx.z == 0); return clock64() + blockIdx.x * blockDim.x + threadIdx.x + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); @@ -45,15 +44,6 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { uint64_t rnd = ::random() ^ mach_absolute_time(); return rnd; -#elif defined __native_client__ - // Same approach as for win32, except using clock_gettime - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ ts.tv_nsec; - return rnd; - #else // Augment the current time with pseudo random number generation // to ensure that we get different seeds if we try to generate seeds @@ -65,11 +55,11 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { #endif } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { // TODO: Unify with the implementation in the non blocking thread pool. uint64_t current = *state; // Update the internal state - *state = current * 6364136223846793005ULL + (stream << 1 | 1); + *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; // Generate the random output (using the PCG-XSH-RS scheme) return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); } @@ -83,17 +73,17 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeUniform(uint64_t* state, uint64_t stream) { - unsigned rnd = PCG_XSH_RS_generator(state, stream); +T RandomToTypeUniform(uint64_t* state) { + unsigned rnd = PCG_XSH_RS_generator(state); return static_cast(rnd); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Eigen::half RandomToTypeUniform(uint64_t* state, uint64_t stream) { +Eigen::half RandomToTypeUniform(uint64_t* state) { Eigen::half result; // Generate 10 random bits for the mantissa - unsigned rnd = PCG_XSH_RS_generator(state, stream); + unsigned rnd = PCG_XSH_RS_generator(state); result.x = static_cast(rnd & 0x3ffu); // Set the exponent result.x |= (static_cast(15) << 10); @@ -103,14 +93,14 @@ Eigen::half RandomToTypeUniform(uint64_t* state, uint64_t stream) { template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -float RandomToTypeUniform(uint64_t* state, uint64_t stream) { +float RandomToTypeUniform(uint64_t* state) { typedef union { uint32_t raw; float fp; } internal; internal result; // Generate 23 random bits for the mantissa mantissa - const unsigned rnd = PCG_XSH_RS_generator(state, stream); + const unsigned rnd = PCG_XSH_RS_generator(state); result.raw = rnd & 0x7fffffu; // Set the exponent result.raw |= (static_cast(127) << 23); @@ -119,7 +109,7 @@ float RandomToTypeUniform(uint64_t* state, uint64_t stream) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -double RandomToTypeUniform(uint64_t* state, uint64_t stream) { +double RandomToTypeUniform(uint64_t* state) { typedef union { uint64_t raw; double dp; @@ -128,9 +118,9 @@ double RandomToTypeUniform(uint64_t* state, uint64_t stream) { result.raw = 0; // Generate 52 random bits for the mantissa // First generate the upper 20 bits - unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu; + unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; // The generate the lower 32 bits - unsigned rnd2 = PCG_XSH_RS_generator(state, stream); + unsigned rnd2 = PCG_XSH_RS_generator(state); result.raw = (static_cast(rnd1) << 32) | rnd2; // Set the exponent result.raw |= (static_cast(1023) << 52); @@ -139,14 +129,14 @@ double RandomToTypeUniform(uint64_t* state, uint64_t stream) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state, uint64_t stream) { - return std::complex(RandomToTypeUniform(state, stream), - RandomToTypeUniform(state, stream)); +std::complex RandomToTypeUniform >(uint64_t* state) { + return std::complex(RandomToTypeUniform(state), + RandomToTypeUniform(state)); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeUniform >(uint64_t* state, uint64_t stream) { - return std::complex(RandomToTypeUniform(state, stream), - RandomToTypeUniform(state, stream)); +std::complex RandomToTypeUniform >(uint64_t* state) { + return std::complex(RandomToTypeUniform(state), + RandomToTypeUniform(state)); } template class UniformRandomGenerator { @@ -157,42 +147,17 @@ template class UniformRandomGenerator { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); - #ifdef EIGEN_USE_SYCL - // In SYCL it is not possible to build PCG_XSH_RS_state in one step. - // Therefor, we need two step to initializate the m_state. - // IN SYCL, the constructor of the functor is s called on the CPU - // and we get the clock seed here from the CPU. However, This seed is - //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. - // and only available on the Operator() function (which is called on the GPU). - // Thus for CUDA (((CLOCK + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread - // but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each thread adds - // the (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the construction - // similar to CUDA Therefore, the thread Id injection is not available at this stage. - //However when the operator() is called the thread ID will be avilable. So inside the opeator, - // we add the thrreadID, BlockId,... (which is equivalent of i) - //to the seed and construct the unique m_state per thead similar to cuda. - m_exec_once =false; - #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( const UniformRandomGenerator& other) { m_state = other.m_state; - #ifdef EIGEN_USE_SYCL - m_exec_once =other.m_exec_once; - #endif } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - T result = RandomToTypeUniform(&m_state, i); + uint64_t local_state = m_state + i; + T result = RandomToTypeUniform(&local_state); + m_state = local_state; return result; } @@ -200,25 +165,16 @@ template class UniformRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - EIGEN_UNROLL_LOOP + uint64_t local_state = m_state + i; for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeUniform(&m_state, i); + values[j] = RandomToTypeUniform(&local_state); } + m_state = local_state; return internal::pload(values); } private: mutable uint64_t m_state; - #ifdef EIGEN_USE_SYCL - mutable bool m_exec_once; - #endif }; template @@ -234,14 +190,14 @@ struct functor_traits > { template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -T RandomToTypeNormal(uint64_t* state, uint64_t stream) { +T RandomToTypeNormal(uint64_t* state) { // Use the ratio of uniform method to generate numbers following a normal // distribution. See for example Numerical Recipes chapter 7.3.9 for the // details. T u, v, q; do { - u = RandomToTypeUniform(state, stream); - v = T(1.7156) * (RandomToTypeUniform(state, stream) - T(0.5)); + u = RandomToTypeUniform(state); + v = T(1.7156) * (RandomToTypeUniform(state) - T(0.5)); const T x = u - T(0.449871); const T y = numext::abs(v) + T(0.386595); q = x*x + y * (T(0.196)*y - T(0.25472)*x); @@ -252,14 +208,14 @@ T RandomToTypeNormal(uint64_t* state, uint64_t stream) { } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state, uint64_t stream) { - return std::complex(RandomToTypeNormal(state, stream), - RandomToTypeNormal(state, stream)); +std::complex RandomToTypeNormal >(uint64_t* state) { + return std::complex(RandomToTypeNormal(state), + RandomToTypeNormal(state)); } template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -std::complex RandomToTypeNormal >(uint64_t* state, uint64_t stream) { - return std::complex(RandomToTypeNormal(state, stream), - RandomToTypeNormal(state, stream)); +std::complex RandomToTypeNormal >(uint64_t* state) { + return std::complex(RandomToTypeNormal(state), + RandomToTypeNormal(state)); } @@ -270,38 +226,17 @@ template class NormalRandomGenerator { // Uses the given "seed" if non-zero, otherwise uses a random seed. EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { m_state = PCG_XSH_RS_state(seed); - #ifdef EIGEN_USE_SYCL - // In SYCL it is not possible to build PCG_XSH_RS_state in one step. - // Therefor, we need two steps to initializate the m_state. - // IN SYCL, the constructor of the functor is s called on the CPU - // and we get the clock seed here from the CPU. However, This seed is - //the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function. - // and only available on the Operator() function (which is called on the GPU). - // Therefore, the thread Id injection is not available at this stage. However when the operator() - //is called the thread ID will be avilable. So inside the opeator, - // we add the thrreadID, BlockId,... (which is equivalent of i) - //to the seed and construct the unique m_state per thead similar to cuda. - m_exec_once =false; - #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( const NormalRandomGenerator& other) { m_state = other.m_state; -#ifdef EIGEN_USE_SYCL - m_exec_once=other.m_exec_once; -#endif } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const { - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - T result = RandomToTypeNormal(&m_state, i); + uint64_t local_state = m_state + i; + T result = RandomToTypeNormal(&local_state); + m_state = local_state; return result; } @@ -309,25 +244,16 @@ template class NormalRandomGenerator { Packet packetOp(Index i) const { const int packetSize = internal::unpacket_traits::size; EIGEN_ALIGN_MAX T values[packetSize]; - #ifdef EIGEN_USE_SYCL - if(!m_exec_once) { - // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread - m_state += (i * 6364136223846793005ULL); - m_exec_once =true; - } - #endif - EIGEN_UNROLL_LOOP + uint64_t local_state = m_state + i; for (int j = 0; j < packetSize; ++j) { - values[j] = RandomToTypeNormal(&m_state, i); + values[j] = RandomToTypeNormal(&local_state); } + m_state = local_state; return internal::pload(values); } private: mutable uint64_t m_state; - #ifdef EIGEN_USE_SYCL - mutable bool m_exec_once; - #endif }; diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 8332a9ae0..41d0d0022 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -11,20 +11,8 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H #define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H -// clang is incompatible with the CUDA syntax wrt making a kernel a class friend, -// so we'll use a macro to make clang happy. -#ifndef KERNEL_FRIEND -#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__)) -#define KERNEL_FRIEND friend __global__ -#else -#define KERNEL_FRIEND friend -#endif -#endif - - namespace Eigen { - /** \class TensorReduction * \ingroup CXX11_Tensor_Module * @@ -44,7 +32,6 @@ namespace internal { typedef typename XprType::Nested Nested; static const int NumDimensions = XprTraits::NumDimensions - array_size::value; static const int Layout = XprTraits::Layout; - typedef typename XprTraits::PointerType PointerType; template struct MakePointer { // Intermediate typedef to workaround MSVC issue. @@ -165,9 +152,7 @@ struct GenericDimReducer<-1, Self, Op> { } }; -template +template struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { typename Self::CoeffReturnType accum = reducer.initialize(); @@ -179,88 +164,23 @@ struct InnerMostDimReducer { }; template -struct InnerMostDimReducer { +struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { - const typename Self::Index packetSize = internal::unpacket_traits::size; + const int packetSize = internal::unpacket_traits::size; const typename Self::Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize; - typename Self::PacketReturnType paccum = reducer.template initializePacket(); + typename Self::PacketReturnType p = reducer.template initializePacket(); for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { - reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &paccum); + reducer.reducePacket(self.m_impl.template packet(firstIndex + j), &p); } typename Self::CoeffReturnType accum = reducer.initialize(); for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; ++j) { reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); } - return reducer.finalizeBoth(accum, paccum); + return reducer.finalizeBoth(accum, p); } }; -#if !defined(EIGEN_HIPCC) -static const int kLeafSize = 1024; - -template -struct InnerMostDimReducer { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType - reduce(const Self& self, typename Self::Index firstIndex, - typename Self::Index numValuesToReduce, Op& reducer) { - typename Self::CoeffReturnType accum = reducer.initialize(); - if (numValuesToReduce > kLeafSize) { - const typename Self::Index half = numValuesToReduce / 2; - reducer.reduce(reduce(self, firstIndex, half, reducer), &accum); - reducer.reduce( - reduce(self, firstIndex + half, numValuesToReduce - half, reducer), - &accum); - } else { - for (typename Self::Index j = 0; j < numValuesToReduce; ++j) { - reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); - } - } - return reducer.finalize(accum); - } -}; - -template -struct InnerMostDimReducer { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType - reduce(const Self& self, typename Self::Index firstIndex, - typename Self::Index numValuesToReduce, Op& reducer) { - const typename Self::Index packetSize = - internal::unpacket_traits::size; - typename Self::CoeffReturnType accum = reducer.initialize(); - if (numValuesToReduce > packetSize * kLeafSize) { - // Make sure the split point is aligned on a packet boundary. - const typename Self::Index split = - packetSize * - divup(firstIndex + divup(numValuesToReduce, typename Self::Index(2)), - packetSize); - const typename Self::Index num_left = - numext::mini(split - firstIndex, numValuesToReduce); - reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum); - if (num_left < numValuesToReduce) { - reducer.reduce( - reduce(self, split, numValuesToReduce - num_left, reducer), &accum); - } - return reducer.finalize(accum); - } else { - const typename Self::Index VectorizedSize = - (numValuesToReduce / packetSize) * packetSize; - typename Self::PacketReturnType paccum = - reducer.template initializePacket(); - for (typename Self::Index j = 0; j < VectorizedSize; j += packetSize) { - reducer.reducePacket( - self.m_impl.template packet(firstIndex + j), &paccum); - } - for (typename Self::Index j = VectorizedSize; j < numValuesToReduce; - ++j) { - reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum); - } - return reducer.finalizeBoth(accum, paccum); - } - } -}; -#endif - -template +template struct InnerMostDimPreserver { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&, typename Self::PacketReturnType*) { eigen_assert(false && "should never be called"); @@ -295,11 +215,11 @@ struct InnerMostDimPreserver<-1, Self, Op, true> { }; // Default full reducer -template +template struct FullReducer { static const bool HasOptimizedImplementation = false; - static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::EvaluatorPointerType output) { + static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&, typename Self::CoeffReturnType* output) { const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions()); *output = InnerMostDimReducer::reduce(self, 0, num_coeffs, reducer); } @@ -309,7 +229,7 @@ struct FullReducer { #ifdef EIGEN_USE_THREADS // Multithreaded full reducers template + bool Vectorizable = (Self::InputPacketAccess & Op::PacketAccess)> struct FullReducerShard { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer, @@ -322,8 +242,8 @@ struct FullReducerShard { // Multithreaded full reducer template struct FullReducer { - static const bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful; - static const Index PacketSize = + static const bool HasOptimizedImplementation = !Op::IsStateful; + static const int PacketSize = unpacket_traits::size; // launch one reducer per thread and accumulate the result. @@ -400,58 +320,29 @@ struct OuterReducer { } }; -#ifdef EIGEN_USE_SYCL -// Default Generic reducer -template -struct GenericReducer { - static const bool HasOptimizedImplementation = false; - EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*, typename Self::Index, typename Self::Index) { - eigen_assert(false && "Not implemented"); - return true; - } -}; -#endif - -#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) -template -__global__ void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +template +__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); -#if defined(EIGEN_HAS_GPU_FP16) -template -__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); -template -__global__ void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); -template -__global__ void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); +#ifdef EIGEN_HAS_CUDA_FP16 +template +__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); +template +__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*); +template +__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*); #endif -template -__global__ void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +template +__global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); -template -__global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +template +__global__ void OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif -/** - * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op. - * This allows the reduction to have a different type for the accumulator than the input data type. - * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input - * with the accumulator and the other for reducing two accumulators. - * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for - * some properties of the input. - */ -template -struct ReductionReturnType { -#if defined(EIGEN_USE_SYCL) - typedef typename remove_const().initialize())>::type type; -#else - typedef typename remove_const::type type; -#endif -}; - } // end namespace internal @@ -485,15 +376,11 @@ class TensorReductionOp : public TensorBase -struct TensorReductionEvaluatorBase; // Eval as rvalue template class MakePointer_, typename Device> -struct TensorReductionEvaluatorBase, Device> +struct TensorEvaluator, Device> { - typedef internal::reducer_traits ReducerTraits; - typedef Dims ReducedDims; typedef TensorReductionOp XprType; typedef typename XprType::Index Index; typedef ArgType ChildType; @@ -503,42 +390,26 @@ struct TensorReductionEvaluatorBase, DSizes >::type Dimensions; typedef typename XprType::Scalar Scalar; - typedef TensorReductionEvaluatorBase, Device> Self; + typedef TensorEvaluator, Device> Self; static const bool InputPacketAccess = TensorEvaluator::PacketAccess; - typedef typename internal::ReductionReturnType::type CoeffReturnType; + typedef typename internal::remove_const::type CoeffReturnType; typedef typename PacketType::type PacketReturnType; - static const Index PacketSize = PacketType::size; - - typedef typename Eigen::internal::traits::PointerType TensorPointerType; - typedef StorageMemory Storage; - typedef typename Storage::Type EvaluatorPointerType; - - // Subset of strides of the input tensor for the non-reduced dimensions. - // Indexed by output dimensions. - static const int NumPreservedStrides = max_n_1::size; + static const int PacketSize = internal::unpacket_traits::size; enum { IsAligned = false, - PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess, - BlockAccess = false, - PreferBlockAccess = true, + PacketAccess = Self::InputPacketAccess && Op::PacketAccess, Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - typedef typename internal::remove_const::type ScalarNoConst; - - //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockNotImplemented TensorBlock; - //===--------------------------------------------------------------------===// - static const bool ReducingInnerMostDims = internal::are_inner_most_dims::value; static const bool PreservingInnerMostDims = internal::preserve_inner_most_dims::value; static const bool RunningFullReduction = (NumOutputDims==0); - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) - : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device), m_xpr_dims(op.dims()) { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), @@ -563,13 +434,11 @@ struct TensorReductionEvaluatorBase(m_outputStrides[i]); } } else { - m_outputStrides[NumOutputDims - 1] = 1; + m_outputStrides.back() = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } } @@ -597,7 +466,6 @@ struct TensorReductionEvaluatorBase(Layout) == static_cast(ColMajor)) - ? m_preservedStrides[0] - : m_preservedStrides[NumOutputDims - 1]; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_STRONG_INLINE -#if !defined(EIGEN_HIPCC) - // Marking this as EIGEN_DEVICE_FUNC for HIPCC requires also doing the same - // for all the functions being called within here, which then leads to - // proliferation of EIGEN_DEVICE_FUNC markings, one of which will eventually - // result in an NVCC error - EIGEN_DEVICE_FUNC -#endif - bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(typename MakePointer_::Type data) { + m_impl.evalSubExprsIfNeeded(NULL); + // Use the FullReducer if possible. if ((RunningFullReduction && RunningOnSycl) ||(RunningFullReduction && internal::FullReducer::HasOptimizedImplementation && @@ -634,7 +489,7 @@ struct TensorReductionEvaluatorBase(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType)))); + m_result = static_cast(m_device.allocate(sizeof(CoeffReturnType))); data = m_result; need_assign = true; } @@ -642,9 +497,20 @@ struct TensorReductionEvaluatorBase::run(*this, reducer, m_device, data); return need_assign; } + else if(RunningOnSycl){ + const Index num_values_to_reduce = internal::array_prod(m_reducedDims); + const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } + Op reducer(m_reducer); + internal::InnerReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + return (m_result != NULL); + } // Attempt to use an optimized reduction. - else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) { + else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast(Layout) == static_cast(ColMajor)) { @@ -658,8 +524,8 @@ struct TensorReductionEvaluatorBase num_coeffs_to_preserve && num_values_to_reduce > 128) || (RunningOnSycl)) { - data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { + data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); m_result = data; } else { @@ -667,10 +533,9 @@ struct TensorReductionEvaluatorBase::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { - m_device.deallocate_temp(m_result); + m_device.deallocate(m_result); m_result = NULL; } return true; @@ -692,8 +557,8 @@ struct TensorReductionEvaluatorBase num_coeffs_to_preserve && num_values_to_reduce > 32) || (RunningOnSycl)) { - data = static_cast(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); + if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { + data = static_cast(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); m_result = data; } else { @@ -701,10 +566,9 @@ struct TensorReductionEvaluatorBase::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { if (m_result) { - m_device.deallocate_temp(m_result); + m_device.deallocate(m_result); m_result = NULL; } return true; @@ -712,64 +576,21 @@ struct TensorReductionEvaluatorBase(m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve))); - m_result = data; - } - Op reducer(m_reducer); - internal::GenericReducer::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); - return (m_result != NULL); - } - #endif } return true; } -#ifdef EIGEN_USE_THREADS - template - EIGEN_STRONG_INLINE -#if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC -#endif - void - evalSubExprsIfNeededAsync(EvaluatorPointerType data, - EvalSubExprsCallback done) { - m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { - done(evalSubExprsIfNeededCommon(data)); - }); - } -#endif - - EIGEN_STRONG_INLINE -#if !defined(EIGEN_HIPCC) - // Marking this as EIGEN_DEVICE_FUNC for HIPCC requires also doing the same - // for all the functions being called within here, which then leads to - // proliferation of EIGEN_DEVICE_FUNC markings, one of which will eventually - // result in an NVCC error - EIGEN_DEVICE_FUNC -#endif - bool evalSubExprsIfNeeded(EvaluatorPointerType data) { - m_impl.evalSubExprsIfNeeded(NULL); - return evalSubExprsIfNeededCommon(data); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); if (m_result) { - m_device.deallocate_temp(m_result); + m_device.deallocate(m_result); m_result = NULL; } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if (( RunningFullReduction || RunningOnGPU) && m_result ) { + if ((RunningOnSycl || RunningFullReduction || RunningOnGPU) && m_result) { return *(m_result + index); } Op reducer(m_reducer); @@ -841,52 +662,37 @@ struct TensorReductionEvaluatorBase& impl() const { return m_impl; } - EIGEN_DEVICE_FUNC const Device& device() const { return m_device; } -#ifdef EIGEN_USE_SYCL - // binding placeholder accessors to a command group handler for SYCL - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const { - m_impl.bind(cgh); - m_result.bind(cgh); - } -#endif + EIGEN_DEVICE_FUNC typename MakePointer_::Type data() const { return m_result; } + /// required by sycl in order to extract the accessor + const TensorEvaluator& impl() const { return m_impl; } + /// added for sycl in order to construct the buffer from the sycl device + const Device& device() const{return m_device;} + /// added for sycl in order to re-construct the reduction eval on the device for the sub-kernel + const Dims& xprDims() const {return m_xpr_dims;} + private: template friend struct internal::GenericDimReducer; - template friend struct internal::InnerMostDimReducer; + template friend struct internal::InnerMostDimReducer; template friend struct internal::InnerMostDimPreserver; template friend struct internal::FullReducer; #ifdef EIGEN_USE_THREADS template friend struct internal::FullReducerShard; #endif -#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) - template KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); -#if defined(EIGEN_HAS_GPU_FP16) - template KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); - template KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); - template KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + template friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); +#ifdef EIGEN_HAS_CUDA_FP16 + template friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); + template friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); + template friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); #endif - template KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); + template friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); - template KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); + template friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif -#if defined(EIGEN_USE_SYCL) - template < typename Evaluator_, typename Op__> friend class TensorSycl::internal::GenericNondeterministicReducer; - // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer - template friend struct internal::GenericReducer; -#endif - - template friend struct internal::InnerReducer; - struct BlockIteratorState { - Index input_dim; - Index output_size; - Index output_count; - }; - // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { @@ -935,12 +741,10 @@ struct TensorReductionEvaluatorBase m_outputStrides; - array, NumOutputDims> m_fastOutputStrides; + // Subset of strides of the input tensor for the non-reduced dimensions. + // Indexed by output dimensions. + static const int NumPreservedStrides = max_n_1::size; array m_preservedStrides; - // Map from output to input dimension index. - array m_output_to_input_dim_map; - // How many values go into each reduction - Index m_numValuesToReduce; // Subset of strides of the input tensor for the reduced dimensions. // Indexed by reduced dimensions. @@ -956,7 +760,7 @@ struct TensorReductionEvaluatorBase::value; static const bool RunningOnSycl = false; #elif defined(EIGEN_USE_SYCL) @@ -966,36 +770,10 @@ static const bool RunningOnGPU = false; static const bool RunningOnGPU = false; static const bool RunningOnSycl = false; #endif - EvaluatorPointerType m_result; + typename MakePointer_::Type m_result; - const Device EIGEN_DEVICE_REF m_device; -}; - -template class MakePointer_, typename Device> -struct TensorEvaluator, Device> -: public TensorReductionEvaluatorBase, Device> { - typedef TensorReductionEvaluatorBase, Device> Base; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} -}; - - -template class MakePointer_> -struct TensorEvaluator, Eigen::SyclDevice> -: public TensorReductionEvaluatorBase, Eigen::SyclDevice> { - - typedef TensorReductionEvaluatorBase, Eigen::SyclDevice> Base; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} - // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel - //Therefore the coeff function should be overridden by for SYCL kernel - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const { - return *(this->data() + index); - } - // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel - //Therefore the packet function should be overridden by for SYCL kernel - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const { - return internal::pload(this->data() + index); - } + const Device& m_device; + const Dims& m_xpr_dims; }; } // end namespace Eigen diff --git a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 68780cd3c..65638b6a8 100644 --- a/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/uppsrc/plugin/Eigen/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -1,6 +1,750 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -#if defined(__clang__) || defined(__GNUC__) -#warning "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorReductionGpu.h file" +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_CUDA_H + +namespace Eigen { +namespace internal { + + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple cuda thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another cuda thread +// updated the content of the output address it will try again. +template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if __CUDA_ARCH__ >= 300 + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + assert(0 && "Wordsize not supported"); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +// We extend atomicExch to support extra data types +template +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_CUDA_FP16 +template