508 lines
17 KiB
C++
508 lines
17 KiB
C++
// Copyright 2019 The MediaPipe Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
//
|
|
// Parallel for loop execution.
|
|
// For details adapt parallel_using_* flags defined in parallel_invoker.cc.
|
|
|
|
// Usage example (for 1D):
|
|
|
|
// Define Functor or lambda function that implements:
|
|
// void operator()(const BlockedRange & range) const;
|
|
// (in addition functor needs to be copyable).
|
|
|
|
// Execute a for loop in parallel from 0 to N via:
|
|
// ParallelFor(0, // start_index
|
|
// num_frames, // end_index, exclusive
|
|
// 1, // number of elements processed per iteration
|
|
// [](const BlockedRange& range) {
|
|
// // Process per-thread sub-range
|
|
// for (int i = range.begin(); i < range.end(); ++i) {
|
|
// // Process i'th item.
|
|
// }
|
|
// });
|
|
|
|
// Specific implementation to copy a vector of images in parallel.
|
|
// class CopyInvoker {
|
|
// public:
|
|
// CopyInvoker(const vector<cv::Mat>& inputs,
|
|
// vector<cv::Mat*>* outputs)
|
|
// : inputs_(inputs), outputs_(outputs) {
|
|
// }
|
|
// CopyInvoker(const CopyInvoker& rhs)
|
|
// : inputs_(rhs.inputs_), outputs_(rhs.outputs_) {
|
|
// }
|
|
// void operator()(const BlockedRange& range) const {
|
|
// for (int frame = range.begin(); frame < range.end(); ++frame) {
|
|
// inputs_[frame].copyTo(*(*outputs_)[frame]);
|
|
// }
|
|
// }
|
|
// private:
|
|
// const vector<cv::Mat>& inputs_;
|
|
// vector<cv::Mat*>* outputs_;
|
|
// };
|
|
|
|
// vector<cv::Mat> inputs;
|
|
// vector<cv::Mat*> outputs;
|
|
// ParallelFor(0, num_frames, 1, CopyInvoker(inputs, &outputs));
|
|
//
|
|
// OR (with lambdas):
|
|
// ParallelFor(0, num_frames, 1,
|
|
// [&inputs, &outputs](const BlockedRange& range) {
|
|
// for (int frame = range.begin(); frame < range.end(); ++frame) {
|
|
// inputs[frame].copyTo(*(outputs)[frame]);
|
|
// }
|
|
// });
|
|
|
|
#ifndef MEDIAPIPE_UTIL_TRACKING_PARALLEL_INVOKER_H_
|
|
#define MEDIAPIPE_UTIL_TRACKING_PARALLEL_INVOKER_H_
|
|
|
|
#include <stddef.h>

#include <algorithm>
#include <memory>

#include "absl/synchronization/mutex.h"
#include "mediapipe/framework/port/logging.h"
|
|
|
|
#ifdef PARALLEL_INVOKER_ACTIVE
|
|
#include "mediapipe/framework/port/threadpool.h"
|
|
|
|
#ifdef __APPLE__
|
|
#include <dispatch/dispatch.h>
|
|
#include <stdatomic.h>
|
|
#endif
|
|
|
|
#endif // PARALLEL_INVOKER_ACTIVE
|
|
|
|
// Selects the parallelization implementation used to run iterations.
enum PARALLEL_INVOKER_MODE {
  // Uses single threaded execution.
  PARALLEL_INVOKER_NONE = 0,
  // Uses //thread/threadpool.
  PARALLEL_INVOKER_THREAD_POOL = 1,
  // Uses OpenMP (requires compiler support).
  PARALLEL_INVOKER_OPENMP = 2,
  // Uses GCD (Apple).
  PARALLEL_INVOKER_GCD = 3,
  // One past the last valid mode; increase when adding more modes.
  PARALLEL_INVOKER_MAX_VALUE = 4,
};
|
|
|
|
extern int flags_parallel_invoker_mode;
|
|
extern int flags_parallel_invoker_max_threads;
|
|
|
|
// Note: Parallel processing is only activated if
|
|
// PARALLEL_INVOKER_ACTIVE is defined.
|
|
|
|
namespace mediapipe {
|
|
|
|
// Describes the half-open index range [begin, end) that is to be processed in
// blocks of grain_size elements each (the last block may hold fewer than
// grain_size elements). The class only stores the three values; the actual
// splitting is done by whoever consumes the range.
class BlockedRange {
 public:
  BlockedRange(int begin, int end, int grain_size)
      : begin_{begin}, end_{end}, grain_size_{grain_size} {}

  // First index of the range (inclusive).
  int begin() const { return begin_; }
  // Last index of the range (exclusive).
  int end() const { return end_; }
  // Number of elements per block.
  int grain_size() const { return grain_size_; }

 private:
  int begin_;
  int end_;
  int grain_size_;
};
|
|
|
|
// Partitions the range row_range x col_range into equal
|
|
// blocks of size row_range.grain_size() x col_range.grain_size() each
|
|
// (except last column and row might be of size less than grain_size in one
|
|
// or both of their dimensions).
|
|
class BlockedRange2D {
|
|
public:
|
|
BlockedRange2D(const BlockedRange& rows, const BlockedRange& cols)
|
|
: rows_(rows), cols_(cols) {}
|
|
|
|
const BlockedRange& rows() const { return rows_; }
|
|
const BlockedRange& cols() const { return cols_; }
|
|
|
|
private:
|
|
BlockedRange rows_;
|
|
BlockedRange cols_;
|
|
};
|
|
|
|
#ifdef PARALLEL_INVOKER_ACTIVE
|
|
|
|
// Singleton ThreadPool for parallel invoker.
|
|
ThreadPool* ParallelInvokerThreadPool();
|
|
|
|
#ifdef __APPLE__
|
|
// Enable to allow GCD as an option beside ThreadPool.
|
|
#define USE_PARALLEL_INVOKER_GCD 1
|
|
#define CHECK_GCD_PARALLEL_WORK_COUNT DEBUG
|
|
|
|
template <class Invoker>
|
|
class ParallelInvokerGCDContext {
|
|
public:
|
|
ParallelInvokerGCDContext(const Invoker& invoker, const BlockedRange& rows)
|
|
: local_invoker_(invoker), rows_(rows) {
|
|
#if CHECK_GCD_PARALLEL_WORK_COUNT
|
|
count_ = 0;
|
|
#endif
|
|
}
|
|
|
|
const Invoker& invoker() {
|
|
#if CHECK_GCD_PARALLEL_WORK_COUNT
|
|
// Implicitly tracking the # of launched tasks at invoker retrieval.
|
|
atomic_fetch_add(&count_, 1);
|
|
#endif
|
|
return local_invoker_;
|
|
}
|
|
const BlockedRange& rows() const { return rows_; }
|
|
#if CHECK_GCD_PARALLEL_WORK_COUNT
|
|
const int count() { return atomic_load(&count_); }
|
|
#endif
|
|
|
|
private:
|
|
Invoker local_invoker_;
|
|
const BlockedRange& rows_;
|
|
#if CHECK_GCD_PARALLEL_WORK_COUNT
|
|
_Atomic(int32_t) count_;
|
|
#endif
|
|
};
|
|
|
|
template <class Invoker>
|
|
class ParallelInvokerGCDContext2D : public ParallelInvokerGCDContext<Invoker> {
|
|
public:
|
|
ParallelInvokerGCDContext2D(const Invoker& invoker, const BlockedRange& rows,
|
|
const BlockedRange& cols)
|
|
: ParallelInvokerGCDContext<Invoker>(invoker, rows), cols_(cols) {}
|
|
|
|
const BlockedRange& cols() const { return cols_; }
|
|
|
|
private:
|
|
BlockedRange cols_;
|
|
};
|
|
|
|
template <class Invoker>
|
|
static void ParallelForGCDTask(void* context, size_t index) {
|
|
ParallelInvokerGCDContext<Invoker>* invoker_context =
|
|
static_cast<ParallelInvokerGCDContext<Invoker>*>(context);
|
|
const BlockedRange& all_tasks = invoker_context->rows();
|
|
int start = all_tasks.begin() + index * all_tasks.grain_size();
|
|
int end = std::min(all_tasks.end(), start + all_tasks.grain_size());
|
|
BlockedRange this_task(start, end, all_tasks.grain_size());
|
|
|
|
const Invoker& invoker = invoker_context->invoker();
|
|
invoker(this_task);
|
|
}
|
|
|
|
template <class Invoker>
|
|
static void ParallelForGCDTask2D(void* context, size_t index) {
|
|
ParallelInvokerGCDContext2D<Invoker>* invoker_context =
|
|
static_cast<ParallelInvokerGCDContext2D<Invoker>*>(context);
|
|
// Partitioning across rows.
|
|
const BlockedRange& all_tasks = invoker_context->rows();
|
|
int start = all_tasks.begin() + index * all_tasks.grain_size();
|
|
int end = std::min(all_tasks.end(), start + all_tasks.grain_size());
|
|
BlockedRange this_task(start, end, all_tasks.grain_size());
|
|
|
|
const Invoker& invoker = invoker_context->invoker();
|
|
invoker(BlockedRange2D(this_task, invoker_context->cols()));
|
|
}
|
|
#endif // __APPLE__
|
|
|
|
#endif // PARALLEL_INVOKER_ACTIVE
|
|
// Simple wrapper for compatibility with below ParallelFor function.
|
|
template <class Invoker>
|
|
void SerialFor(size_t start, size_t end, size_t grain_size,
|
|
const Invoker& invoker) {
|
|
invoker(BlockedRange(start, end, 1));
|
|
}
|
|
|
|
inline void CheckAndSetInvokerOptions() {
|
|
#if defined(PARALLEL_INVOKER_ACTIVE)
|
|
#if defined(__ANDROID__)
|
|
// If unsupported option is selected, force usage of OpenMP if detected, and
|
|
// ThreadPool otherwise.
|
|
if (flags_parallel_invoker_mode != PARALLEL_INVOKER_NONE &&
|
|
flags_parallel_invoker_mode != PARALLEL_INVOKER_THREAD_POOL &&
|
|
flags_parallel_invoker_mode != PARALLEL_INVOKER_OPENMP) {
|
|
#if defined(_OPENMP)
|
|
LOG(WARNING) << "Unsupported invoker mode selected on Android. "
|
|
<< "OpenMP linkage detected, so falling back to OpenMP";
|
|
flags_parallel_invoker_mode = PARALLEL_INVOKER_OPENMP;
|
|
#else // _OPENMP
|
|
// Fallback mode for active parallel invoker without OpenMP is ThreadPool.
|
|
LOG(WARNING) << "Unsupported invoker mode selected on Android. "
|
|
<< "Falling back to ThreadPool";
|
|
flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL;
|
|
#endif // _OPENMP
|
|
}
|
|
#endif // __ANDROID__
|
|
|
|
#if defined(__APPLE__) || defined(__EMSCRIPTEN__)
|
|
// Force usage of ThreadPool if unsupported option is selected.
|
|
// (OpenMP is not supported on iOS, due to missing clang support).
|
|
if (flags_parallel_invoker_mode != PARALLEL_INVOKER_NONE &&
|
|
#if defined(USE_PARALLEL_INVOKER_GCD)
|
|
flags_parallel_invoker_mode != PARALLEL_INVOKER_GCD &&
|
|
#endif // USE_PARALLEL_INVOKER_GCD
|
|
flags_parallel_invoker_mode != PARALLEL_INVOKER_THREAD_POOL) {
|
|
LOG(WARNING) << "Unsupported invoker mode selected on iOS. "
|
|
<< "Falling back to ThreadPool mode";
|
|
flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL;
|
|
}
|
|
#endif // __APPLE__ || __EMSCRIPTEN__
|
|
|
|
#if !defined(__APPLE__) && !defined(__EMSCRIPTEN__) && !defined(__ANDROID__)
|
|
flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL;
|
|
#endif // !__APPLE__ && !__EMSCRIPTEN__ && !__ANDROID__
|
|
|
|
// If OpenMP is requested, make sure we can actually use it, and fall back
|
|
// to ThreadPool if not.
|
|
if (flags_parallel_invoker_mode == PARALLEL_INVOKER_OPENMP) {
|
|
#if !defined(_OPENMP)
|
|
LOG(ERROR) << "OpenMP invoker mode selected but not compiling with OpenMP "
|
|
<< "enabled. Falling back to ThreadPool";
|
|
flags_parallel_invoker_mode = PARALLEL_INVOKER_THREAD_POOL;
|
|
#endif // _OPENMP
|
|
}
|
|
|
|
#else // PARALLEL_INVOKER_ACTIVE
|
|
if (flags_parallel_invoker_mode != PARALLEL_INVOKER_NONE) {
|
|
LOG(ERROR) << "Parallel execution requested but PARALLEL_INVOKER_ACTIVE "
|
|
<< "compile flag is not set. Falling back to single threaded "
|
|
<< "execution.";
|
|
flags_parallel_invoker_mode = PARALLEL_INVOKER_NONE;
|
|
}
|
|
#endif // PARALLEL_INVOKER_ACTIVE
|
|
|
|
CHECK_LT(flags_parallel_invoker_mode, PARALLEL_INVOKER_MAX_VALUE)
|
|
<< "Invalid invoker mode specified.";
|
|
CHECK_GE(flags_parallel_invoker_mode, 0) << "Invalid invoker mode specified.";
|
|
}
|
|
|
|
// Performs parallel iteration over [start, end). For each scheduled block
//   invoker(BlockedRange(block_start, block_end, 1))
// is called. The invoker is copied for the workers, i.e. Invoker needs to
// have a copy constructor defined and a const operator().
//
// Behavior per mode (selected via flags_parallel_invoker_mode after
// CheckAndSetInvokerOptions() has validated it):
//  - GCD / THREAD_POOL: the range is split into blocks of grain_size elements
//    (the last block may be smaller). grain_size must be > 0 and the range
//    must be non-empty, otherwise a CHECK fails. A range that fits into a
//    single block is executed directly on the calling thread.
//  - OPENMP: grain_size is ignored; invoker is called once per index with a
//    one-element range.
//  - NONE, or PARALLEL_INVOKER_ACTIVE not defined: SerialFor is used, i.e. a
//    single call covering the whole range.
// The call returns only after all blocks have been processed.
template <class Invoker>
void ParallelFor(size_t start, size_t end, size_t grain_size,
                 const Invoker& invoker) {
#ifdef PARALLEL_INVOKER_ACTIVE
  CheckAndSetInvokerOptions();
  switch (flags_parallel_invoker_mode) {
#if defined(__APPLE__)
    case PARALLEL_INVOKER_GCD: {
      // Guard the division below against a zero grain size.
      CHECK_GT(grain_size, 0u) << "grain_size must be positive.";
      int iterations_remain = (end - start + grain_size - 1) / grain_size;
      CHECK_GT(iterations_remain, 0);
      if (iterations_remain == 1) {
        // Execute invoker serially.
        invoker(BlockedRange(start, std::min(end, start + grain_size), 1));
      } else {
        BlockedRange all_tasks(start, end, grain_size);
        ParallelInvokerGCDContext<Invoker> context(invoker, all_tasks);
        dispatch_queue_t concurrent_queue =
            dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
        dispatch_apply_f(iterations_remain, concurrent_queue, &context,
                         ParallelForGCDTask<Invoker>);
#if CHECK_GCD_PARALLEL_WORK_COUNT
        CHECK_EQ(iterations_remain, context.count());
#endif
      }
      break;
    }
#endif  // __APPLE__

    case PARALLEL_INVOKER_THREAD_POOL: {
      // Guard the division below against a zero grain size.
      CHECK_GT(grain_size, 0u) << "grain_size must be positive.";
      int iterations_remain = (end - start + grain_size - 1) / grain_size;
      CHECK_GT(iterations_remain, 0);
      if (iterations_remain == 1) {
        // Execute invoker serially.
        invoker(BlockedRange(start, std::min(end, start + grain_size), 1));
        break;
      }

      // Shared completion state: number of blocks still outstanding, plus the
      // condition variable the calling thread waits on.
      struct {
        absl::Mutex mutex;
        absl::CondVar completed;
        int iterations_remain ABSL_GUARDED_BY(mutex);
      } loop;
      {
        absl::MutexLock lock(&loop.mutex);
        loop.iterations_remain = iterations_remain;
      }

      for (size_t x = start; x < end; x += grain_size) {
        // `invoker` is captured by value: every task owns its own copy.
        auto loop_func = [x, end, grain_size, &loop, invoker]() {
          // Execute invoker.
          invoker(BlockedRange(x, std::min(end, x + grain_size), 1));

          // Decrement counter and wake the waiting caller after the last
          // block.
          absl::MutexLock lock(&loop.mutex);
          --loop.iterations_remain;
          if (loop.iterations_remain == 0) {
            loop.completed.SignalAll();
          }
        };

        // Hand the block to the shared thread pool.
        // NOTE(review): an earlier comment here described running the block
        // serially when the pool is busy, to avoid deadlocks during nested
        // invocation of ParallelFor. No such fallback exists in this code;
        // nested calls rely on the pool eventually scheduling the task.
        ParallelInvokerThreadPool()->Schedule(loop_func);
      }

      // Wait on termination of all iterations.
      {
        absl::MutexLock lock(&loop.mutex);
        while (loop.iterations_remain > 0) {
          loop.completed.Wait(&loop.mutex);
        }
      }
      break;
    }

    case PARALLEL_INVOKER_OPENMP: {
      // Use thread-local copy of invoker.
      Invoker local_invoker(invoker);
#pragma omp parallel for firstprivate(local_invoker) \
    num_threads(flags_parallel_invoker_max_threads)
      for (int x = start; x < end; ++x) {
        local_invoker(BlockedRange(x, x + 1, 1));
      }
      break;
    }

    case PARALLEL_INVOKER_NONE: {
      SerialFor(start, end, grain_size, invoker);
      break;
    }

    case PARALLEL_INVOKER_MAX_VALUE: {
      LOG(FATAL) << "Impossible.";
      break;
    }
  }
#else
  SerialFor(start, end, grain_size, invoker);
#endif  // PARALLEL_INVOKER_ACTIVE
}
|
|
|
|
// Simple wrapper for compatibility with below ParallelFor2D function.
|
|
template <class Invoker>
|
|
void SerialFor2D(size_t start_row, size_t end_row, size_t start_col,
|
|
size_t end_col, size_t grain_size, const Invoker& invoker) {
|
|
invoker(BlockedRange2D(BlockedRange(start_row, end_row, 1),
|
|
BlockedRange(start_col, end_col, 1)));
|
|
}
|
|
|
|
// 2D variant of ParallelFor: iterates over
// [start_row, end_row) x [start_col, end_col), parallelizing across rows only.
// Every invoker call receives a BlockedRange2D made of a block of rows and
// the complete column range. The invoker is copied for the workers, i.e.
// Invoker needs to have a copy constructor defined and a const operator().
//
// Behavior per mode (selected via flags_parallel_invoker_mode after
// CheckAndSetInvokerOptions() has validated it):
//  - GCD: rows are split into blocks of grain_size rows (the last block may
//    be smaller). grain_size must be > 0 and the row range must be non-empty,
//    otherwise a CHECK fails.
//  - THREAD_POOL: grain_size is ignored; one task is scheduled per row. The
//    row range must be non-empty, otherwise a CHECK fails.
//  - OPENMP: grain_size is ignored; invoker is called once per row.
//  - NONE, or PARALLEL_INVOKER_ACTIVE not defined: SerialFor2D is used, i.e.
//    a single call covering the whole range.
// The call returns only after all rows have been processed.
template <class Invoker>
void ParallelFor2D(size_t start_row, size_t end_row, size_t start_col,
                   size_t end_col, size_t grain_size, const Invoker& invoker) {
#ifdef PARALLEL_INVOKER_ACTIVE
  CheckAndSetInvokerOptions();
  switch (flags_parallel_invoker_mode) {
#if defined(__APPLE__)
    case PARALLEL_INVOKER_GCD: {
      // Guard the division below against a zero grain size.
      CHECK_GT(grain_size, 0u) << "grain_size must be positive.";
      const int iterations_remain =
          (end_row - start_row + grain_size - 1) / grain_size;
      CHECK_GT(iterations_remain, 0);
      if (iterations_remain == 1) {
        // Execute invoker serially.
        invoker(BlockedRange2D(BlockedRange(start_row, end_row, 1),
                               BlockedRange(start_col, end_col, 1)));
      } else {
        BlockedRange all_tasks(start_row, end_row, grain_size);
        ParallelInvokerGCDContext2D<Invoker> context(
            invoker, all_tasks, BlockedRange(start_col, end_col, grain_size));
        dispatch_queue_t concurrent_queue =
            dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0);
        dispatch_apply_f(iterations_remain, concurrent_queue, &context,
                         ParallelForGCDTask2D<Invoker>);
#if CHECK_GCD_PARALLEL_WORK_COUNT
        CHECK_EQ(iterations_remain, context.count());
#endif
      }
      break;
    }
#endif  // __APPLE__

    case PARALLEL_INVOKER_THREAD_POOL: {
      // Number of rows still outstanding. Guarded by loop_mutex once tasks
      // have been scheduled.
      int iterations_remain = end_row - start_row;
      CHECK_GT(iterations_remain, 0);
      if (iterations_remain == 1) {
        // Execute invoker serially.
        invoker(BlockedRange2D(BlockedRange(start_row, end_row, 1),
                               BlockedRange(start_col, end_col, 1)));
        break;
      }

      absl::Mutex loop_mutex;
      absl::CondVar loop_completed;

      for (size_t y = start_row; y < end_row; ++y) {
        // `invoker` is captured by value: every task owns its own copy.
        auto loop_func = [y, start_col, end_col, &loop_mutex, &loop_completed,
                          &iterations_remain, invoker]() {
          // Execute invoker on a single row.
          invoker(BlockedRange2D(BlockedRange(y, y + 1, 1),
                                 BlockedRange(start_col, end_col, 1)));

          // Decrement counter and wake the waiting caller after the last row.
          absl::MutexLock lock(&loop_mutex);
          --iterations_remain;
          if (iterations_remain == 0) {
            loop_completed.Signal();
          }
        };

        // Hand the row to the shared thread pool.
        // NOTE(review): an earlier comment here described running the row
        // serially when the pool is busy to avoid deadlocking. No such
        // fallback exists in this code.
        ParallelInvokerThreadPool()->Schedule(loop_func);
      }

      // Wait on termination of all iterations.
      {
        absl::MutexLock lock(&loop_mutex);
        while (iterations_remain > 0) {
          loop_completed.Wait(&loop_mutex);
        }
      }
      break;
    }

    case PARALLEL_INVOKER_OPENMP: {
      // Use thread-local copy of invoker.
      Invoker local_invoker(invoker);
#pragma omp parallel for firstprivate(local_invoker) \
    num_threads(flags_parallel_invoker_max_threads)
      for (int y = start_row; y < end_row; ++y) {
        local_invoker(BlockedRange2D(BlockedRange(y, y + 1, 1),
                                     BlockedRange(start_col, end_col, 1)));
      }
      break;
    }

    case PARALLEL_INVOKER_NONE: {
      SerialFor2D(start_row, end_row, start_col, end_col, grain_size, invoker);
      break;
    }

    case PARALLEL_INVOKER_MAX_VALUE: {
      LOG(FATAL) << "Impossible.";
      break;
    }
  }
#else
  SerialFor2D(start_row, end_row, start_col, end_col, grain_size, invoker);
#endif  // PARALLEL_INVOKER_ACTIVE
}
|
|
|
|
} // namespace mediapipe
|
|
|
|
#endif // MEDIAPIPE_UTIL_TRACKING_PARALLEL_INVOKER_H_
|