mirror of
https://github.com/ultimatepp/ultimatepp.git
synced 2026-06-20 22:04:03 -06:00
288 lines
10 KiB
C++
288 lines
10 KiB
C++
// This file is part of Eigen, a lightweight C++ template library
|
|
// for linear algebra.
|
|
//
|
|
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
|
|
//
|
|
// This Source Code Form is subject to the terms of the Mozilla
|
|
// Public License v. 2.0. If a copy of the MPL was not distributed
|
|
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
|
|
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
|
|
|
|
namespace Eigen {
|
|
|
|
/** \class TensorExecutor
|
|
* \ingroup CXX11_Tensor_Module
|
|
*
|
|
* \brief The tensor executor class.
|
|
*
|
|
* This class is responsible for launch the evaluation of the expression on
|
|
* the specified computing device.
|
|
*/
|
|
namespace internal {
|
|
|
|
// Default strategy: the expression is evaluated with a single cpu thread.
|
|
template<typename Expression, typename Device, bool Vectorizable>
|
|
class TensorExecutor
|
|
{
|
|
public:
|
|
typedef typename Expression::Index Index;
|
|
EIGEN_DEVICE_FUNC
|
|
static inline void run(const Expression& expr, const Device& device = Device())
|
|
{
|
|
TensorEvaluator<Expression, Device> evaluator(expr, device);
|
|
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
|
if (needs_assign)
|
|
{
|
|
const Index size = array_prod(evaluator.dimensions());
|
|
for (Index i = 0; i < size; ++i) {
|
|
evaluator.evalScalar(i);
|
|
}
|
|
}
|
|
evaluator.cleanup();
|
|
}
|
|
};
|
|
|
|
|
|
template<typename Expression>
|
|
class TensorExecutor<Expression, DefaultDevice, true>
|
|
{
|
|
public:
|
|
typedef typename Expression::Index Index;
|
|
EIGEN_DEVICE_FUNC
|
|
static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice())
|
|
{
|
|
TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
|
|
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
|
if (needs_assign)
|
|
{
|
|
const Index size = array_prod(evaluator.dimensions());
|
|
const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
|
|
// Give the compiler a strong hint to unroll the loop. But don't insist
|
|
// on unrolling, because if the function is expensive the compiler should not
|
|
// unroll the loop at the expense of inlining.
|
|
const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
|
|
for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) {
|
|
for (Index j = 0; j < 4; j++) {
|
|
evaluator.evalPacket(i + j * PacketSize);
|
|
}
|
|
}
|
|
const Index VectorizedSize = (size / PacketSize) * PacketSize;
|
|
for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
|
|
evaluator.evalPacket(i);
|
|
}
|
|
for (Index i = VectorizedSize; i < size; ++i) {
|
|
evaluator.evalScalar(i);
|
|
}
|
|
}
|
|
evaluator.cleanup();
|
|
}
|
|
};
|
|
|
|
|
|
|
|
// Multicore strategy: the index space is partitioned and each partition is executed on a single core
|
|
#ifdef EIGEN_USE_THREADS
|
|
template <typename Evaluator, typename Index, bool Vectorizable>
|
|
struct EvalRange {
|
|
static void run(Evaluator* evaluator_in, const Index first, const Index last) {
|
|
Evaluator evaluator = *evaluator_in;
|
|
eigen_assert(last >= first);
|
|
for (Index i = first; i < last; ++i) {
|
|
evaluator.evalScalar(i);
|
|
}
|
|
}
|
|
|
|
static Index alignBlockSize(Index size) {
|
|
return size;
|
|
}
|
|
};
|
|
|
|
template <typename Evaluator, typename Index>
|
|
struct EvalRange<Evaluator, Index, true> {
|
|
static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
|
|
|
|
static void run(Evaluator* evaluator_in, const Index first, const Index last) {
|
|
Evaluator evaluator = *evaluator_in;
|
|
eigen_assert(last >= first);
|
|
Index i = first;
|
|
if (last - first >= PacketSize) {
|
|
eigen_assert(first % PacketSize == 0);
|
|
Index last_chunk_offset = last - 4 * PacketSize;
|
|
// Give the compiler a strong hint to unroll the loop. But don't insist
|
|
// on unrolling, because if the function is expensive the compiler should not
|
|
// unroll the loop at the expense of inlining.
|
|
for (; i <= last_chunk_offset; i += 4*PacketSize) {
|
|
for (Index j = 0; j < 4; j++) {
|
|
evaluator.evalPacket(i + j * PacketSize);
|
|
}
|
|
}
|
|
last_chunk_offset = last - PacketSize;
|
|
for (; i <= last_chunk_offset; i += PacketSize) {
|
|
evaluator.evalPacket(i);
|
|
}
|
|
}
|
|
for (; i < last; ++i) {
|
|
evaluator.evalScalar(i);
|
|
}
|
|
}
|
|
|
|
static Index alignBlockSize(Index size) {
|
|
// Align block size to packet size and account for unrolling in run above.
|
|
if (size >= 16 * PacketSize) {
|
|
return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
|
|
}
|
|
// Aligning to 4 * PacketSize would increase block size by more than 25%.
|
|
return (size + PacketSize - 1) & ~(PacketSize - 1);
|
|
}
|
|
};
|
|
|
|
template <typename Expression, bool Vectorizable>
|
|
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
|
|
public:
|
|
typedef typename Expression::Index Index;
|
|
static inline void run(const Expression& expr, const ThreadPoolDevice& device)
|
|
{
|
|
typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
|
|
Evaluator evaluator(expr, device);
|
|
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
|
if (needs_assign)
|
|
{
|
|
const Index size = array_prod(evaluator.dimensions());
|
|
#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
|
|
device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
|
|
EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
|
|
[&evaluator](Index first, Index last) {
|
|
EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
|
|
});
|
|
#else
|
|
size_t num_threads = device.numThreads();
|
|
if (num_threads > 1) {
|
|
num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
|
|
size, evaluator.costPerCoeff(Vectorizable), num_threads);
|
|
}
|
|
if (num_threads == 1) {
|
|
EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
|
|
} else {
|
|
const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
|
|
Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1;
|
|
const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
|
|
const Index numblocks = size / blocksize;
|
|
|
|
Barrier barrier(numblocks);
|
|
for (int i = 0; i < numblocks; ++i) {
|
|
device.enqueue_with_barrier(
|
|
&barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
|
|
&evaluator, i * blocksize, (i + 1) * blocksize);
|
|
}
|
|
if (numblocks * blocksize < size) {
|
|
EvalRange<Evaluator, Index, Vectorizable>::run(
|
|
&evaluator, numblocks * blocksize, size);
|
|
}
|
|
barrier.Wait();
|
|
}
|
|
#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL)
|
|
}
|
|
evaluator.cleanup();
|
|
}
|
|
};
|
|
#endif // EIGEN_USE_THREADS
|
|
|
|
|
|
// GPU: the evaluation of the expression is offloaded to a GPU.
|
|
#if defined(EIGEN_USE_GPU)
|
|
|
|
template <typename Expression, bool Vectorizable>
|
|
class TensorExecutor<Expression, GpuDevice, Vectorizable> {
|
|
public:
|
|
typedef typename Expression::Index Index;
|
|
static void run(const Expression& expr, const GpuDevice& device);
|
|
};
|
|
|
|
|
|
#if defined(__CUDACC__)
|
|
template <typename Evaluator, typename Index, bool Vectorizable>
|
|
struct EigenMetaKernelEval {
|
|
static __device__ EIGEN_ALWAYS_INLINE
|
|
void run(Evaluator& eval, Index first, Index last, Index step_size) {
|
|
for (Index i = first; i < last; i += step_size) {
|
|
eval.evalScalar(i);
|
|
}
|
|
}
|
|
};
|
|
|
|
template <typename Evaluator, typename Index>
|
|
struct EigenMetaKernelEval<Evaluator, Index, true> {
|
|
static __device__ EIGEN_ALWAYS_INLINE
|
|
void run(Evaluator& eval, Index first, Index last, Index step_size) {
|
|
const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
|
|
const Index vectorized_size = (last / PacketSize) * PacketSize;
|
|
const Index vectorized_step_size = step_size * PacketSize;
|
|
|
|
// Use the vector path
|
|
for (Index i = first * PacketSize; i < vectorized_size;
|
|
i += vectorized_step_size) {
|
|
eval.evalPacket(i);
|
|
}
|
|
for (Index i = vectorized_size + first; i < last; i += step_size) {
|
|
eval.evalScalar(i);
|
|
}
|
|
}
|
|
};
|
|
|
|
template <typename Evaluator, typename Index>
|
|
__global__ void
|
|
__launch_bounds__(1024)
|
|
EigenMetaKernel(Evaluator eval, Index size) {
|
|
|
|
const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
|
|
const Index step_size = blockDim.x * gridDim.x;
|
|
|
|
const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
|
|
EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
|
|
}
|
|
|
|
/*static*/
|
|
template <typename Expression, bool Vectorizable>
|
|
inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
|
|
const Expression& expr, const GpuDevice& device) {
|
|
TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
|
|
const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
|
|
if (needs_assign) {
|
|
const int block_size = device.maxCudaThreadsPerBlock();
|
|
const int max_blocks = device.getNumCudaMultiProcessors() *
|
|
device.maxCudaThreadsPerMultiProcessor() / block_size;
|
|
const Index size = array_prod(evaluator.dimensions());
|
|
// Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
|
|
const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);
|
|
|
|
LAUNCH_CUDA_KERNEL(
|
|
(EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
|
|
num_blocks, block_size, 0, device, evaluator, size);
|
|
}
|
|
evaluator.cleanup();
|
|
}
|
|
|
|
#endif // __CUDACC__
|
|
#endif // EIGEN_USE_GPU
|
|
|
|
// SYCL Executor policy
|
|
#ifdef EIGEN_USE_SYCL
|
|
|
|
template <typename Expression, bool Vectorizable>
|
|
class TensorExecutor<Expression, SyclDevice, Vectorizable> {
|
|
public:
|
|
static inline void run(const Expression &expr, const SyclDevice &device) {
|
|
// call TensorSYCL module
|
|
TensorSycl::run(expr, device);
|
|
}
|
|
};
|
|
|
|
#endif
|
|
|
|
} // end namespace internal
|
|
|
|
} // end namespace Eigen
|
|
|
|
#endif // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
|