// Copyright 2019 The MediaPipe Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Fits tone models to intensity matches
// (gathered from order statistics of matching patches supplied by
// RegionFlowFeatureList's).

#ifndef MEDIAPIPE_UTIL_TRACKING_TONE_ESTIMATION_H_
#define MEDIAPIPE_UTIL_TRACKING_TONE_ESTIMATION_H_

#include <algorithm>
#include <cmath>
#include <deque>
#include <memory>
#include <vector>

#include "mediapipe/framework/port/integral_types.h"
#include "mediapipe/framework/port/logging.h"
#include "mediapipe/framework/port/opencv_core_inc.h"
#include "mediapipe/framework/port/opencv_imgproc_inc.h"
#include "mediapipe/framework/port/vector.h"
#include "mediapipe/util/tracking/region_flow.h"
#include "mediapipe/util/tracking/region_flow.pb.h"
#include "mediapipe/util/tracking/tone_estimation.pb.h"
#include "mediapipe/util/tracking/tone_models.h"

namespace mediapipe {

class GainBiasModel;
class RegionFlowFeatureList;

typedef std::deque<PatchToneMatch> PatchToneMatches;

// Each vector element represents one channel.
typedef std::vector<PatchToneMatches> ColorToneMatches;

// Clip mask for C channels.
template <int C>
struct ClipMask {
  ClipMask() {
    min_exposure_threshold.resize(C);
    max_exposure_threshold.resize(C);
  }

  cv::Mat mask;
  std::vector<float> min_exposure_threshold;
  std::vector<float> max_exposure_threshold;
};

class ToneEstimation {
 public:
  ToneEstimation(const ToneEstimationOptions& options, int frame_width,
                 int frame_height);
  virtual ~ToneEstimation();

  ToneEstimation(const ToneEstimation&) = delete;
  ToneEstimation& operator=(const ToneEstimation&) = delete;

  // Estimates ToneChange models from matching feature points.
  // The input RegionFlowFeatureList supplies (x, y) matches, where x is a
  // feature point in curr_frame and y is the matching feature point in
  // prev_frame. The estimated ToneChange describes the change from curr_frame
  // to prev_frame in the tone domain, using a variety of linear models as
  // specified by ToneEstimationOptions.
  // If debug_output is not nullptr, it contains a visualization of inlier
  // patches and clip masks for debugging: current and previous frames are
  // rendered side by side (left: current, right: previous).
  // TODO: Parallel estimation across frames.
  void EstimateToneChange(const RegionFlowFeatureList& feature_list,
                          const cv::Mat& curr_frame, const cv::Mat* prev_frame,
                          ToneChange* tone_change,
                          cv::Mat* debug_output = nullptr);

  // Returns mask of clipped pixels for the input frame (pixels that are
  // clipped due to limited dynamic range are set to 1), as well as per-channel
  // values denoting the min and max exposure thresholds.
  template <int C>
  static void ComputeClipMask(const ClipMaskOptions& options,
                              const cv::Mat& frame, ClipMask<C>* clip_mask);
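
  // A minimal usage sketch (illustrative only; `clip_options` is a
  // hypothetical ClipMaskOptions instance, and `frame` is assumed to be an
  // 8-bit, 3-channel cv::Mat):
  //
  //   ClipMask<3> clip_mask;
  //   ToneEstimation::ComputeClipMask<3>(clip_options, frame, &clip_mask);
  //   // clip_mask.mask now holds 1 at clipped pixels, 0 elsewhere.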

  // Returns color tone matches of size C from the passed feature list. Tone
  // matches are obtained from order statistics of sampled patches at feature
  // locations. Matches are obtained for each color channel, with over- and
  // under-exposed patches being discarded.
  // If debug_output is not nullptr, it contains a visualization of inlier
  // patches and clip masks for debugging.
  template <int C>
  static void ComputeToneMatches(const ToneMatchOptions& tone_match_options,
                                 const RegionFlowFeatureList& feature_list,
                                 const cv::Mat& curr_frame,
                                 const cv::Mat& prev_frame,
                                 const ClipMask<C>& curr_clip_mask,
                                 const ClipMask<C>& prev_clip_mask,
                                 ColorToneMatches* color_tone_matches,
                                 cv::Mat* debug_output = nullptr);

  // Can be called with color tone matches for 1 - 3 channels; missing channels
  // in the GainBiasModel are simply set to identity.
  // Returns the estimated gain and bias model for the tone change, which maps
  // colors in the current frame to those in the previous frame, using
  // color_tone_matches as corresponding samples. Modifies the irls_weights of
  // samples in color_tone_matches based on their fit to the estimated
  // gain_bias_model.
  static void EstimateGainBiasModel(int irls_iterations,
                                    ColorToneMatches* color_tone_matches,
                                    GainBiasModel* gain_bias_model);

  // Tests if the estimated gain-bias model is stable based on:
  // (1) whether the model parameters are within gain and bias bounds, and
  // (2) whether there are enough inliers in the tone matches.
  // Some statistics related to the stability analysis are returned in stats
  // (if not nullptr).
  static bool IsStableGainBiasModel(
      const ToneEstimationOptions::GainBiasBounds& gain_bias_bounds,
      const GainBiasModel& model, const ColorToneMatches& tone_matches,
      ToneChange::StabilityStats* stats = nullptr);

 private:
  // Computes normalized intensity percentiles.
  void IntensityPercentiles(const cv::Mat& frame, const cv::Mat& clip_mask,
                            bool log_domain, ToneChange* tone_change) const;

 private:
  ToneEstimationOptions options_;
  int frame_width_;
  int frame_height_;

  int original_width_;
  int original_height_;

  float downsample_scale_ = 1.0f;
  bool use_downsampling_ = false;

  std::unique_ptr<cv::Mat> resized_input_;
  std::unique_ptr<cv::Mat> prev_resized_input_;
};

// Template implementation functions.
template <int C>
void ToneEstimation::ComputeClipMask(const ClipMaskOptions& options,
                                     const cv::Mat& frame,
                                     ClipMask<C>* clip_mask) {
  CHECK(clip_mask != nullptr);
  CHECK_EQ(frame.channels(), C);

  // Over- / under-exposure handling.
  // Masks pixels affected by clipping.
  clip_mask->mask.create(frame.rows, frame.cols, CV_8U);

  const float min_exposure_thresh = options.min_exposure() * 255.0f;
  const float max_exposure_thresh = options.max_exposure() * 255.0f;
  const int max_clipped_channels = options.max_clipped_channels();

  std::vector<cv::Mat> planes;
  cv::split(frame, planes);
  CHECK_EQ(C, planes.size());

  float min_exposure[C];
  float max_exposure[C];
  for (int c = 0; c < C; ++c) {
    min_exposure[c] = min_exposure_thresh;
    max_exposure[c] = max_exposure_thresh;
  }

  for (int c = 0; c < C; ++c) {
    clip_mask->min_exposure_threshold[c] = min_exposure[c];
    clip_mask->max_exposure_threshold[c] = max_exposure[c];
  }

  for (int i = 0; i < frame.rows; ++i) {
    const uint8* img_ptr = frame.ptr<uint8>(i);
    uint8* clip_ptr = clip_mask->mask.template ptr<uint8>(i);
    for (int j = 0; j < frame.cols; ++j) {
      const int idx = C * j;
      int clipped_channels = 0;
      // Count clipped channels.
      for (int p = 0; p < C; ++p) {
        clipped_channels +=
            static_cast<int>(img_ptr[idx + p] < min_exposure[p] ||
                             img_ptr[idx + p] > max_exposure[p]);
      }

      if (clipped_channels > max_clipped_channels) {
        clip_ptr[j] = 1;
      } else {
        clip_ptr[j] = 0;
      }
    }
  }

  // Dilate to address blooming.
  const int dilate_diam = options.clip_mask_diameter();
  const int dilate_rad = std::ceil(dilate_diam * 0.5);

  // Remove border from dilation, as cv::dilate reads out-of-bound values
  // otherwise.
  if (clip_mask->mask.rows > 2 * dilate_rad &&
      clip_mask->mask.cols > 2 * dilate_rad) {
    cv::Mat dilate_domain =
        cv::Mat(clip_mask->mask,
                cv::Range(dilate_rad, clip_mask->mask.rows - dilate_rad),
                cv::Range(dilate_rad, clip_mask->mask.cols - dilate_rad));
    cv::Mat kernel(dilate_diam, dilate_diam, CV_8U);
    kernel.setTo(1.0);
    cv::dilate(dilate_domain, dilate_domain, kernel);
  }
}
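
// A hedged end-to-end sketch of the static matching API (illustrative only;
// `clip_options` and `match_options` are hypothetical ClipMaskOptions /
// ToneMatchOptions instances, and `features`, `curr`, `prev` are assumed to
// be a RegionFlowFeatureList and two 8-bit, 3-channel frames):
//
//   ClipMask<3> curr_mask, prev_mask;
//   ToneEstimation::ComputeClipMask<3>(clip_options, curr, &curr_mask);
//   ToneEstimation::ComputeClipMask<3>(clip_options, prev, &prev_mask);
//
//   ColorToneMatches matches;
//   ToneEstimation::ComputeToneMatches<3>(match_options, features, curr,
//                                         prev, curr_mask, prev_mask,
//                                         &matches);
//
//   GainBiasModel model;
//   ToneEstimation::EstimateGainBiasModel(/*irls_iterations=*/10, &matches,
//                                         &model);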

template <int C>
void ToneEstimation::ComputeToneMatches(
    const ToneMatchOptions& options, const RegionFlowFeatureList& feature_list,
    const cv::Mat& curr_frame, const cv::Mat& prev_frame,
    const ClipMask<C>& curr_clip_mask,  // Optional.
    const ClipMask<C>& prev_clip_mask,  // Optional.
    ColorToneMatches* color_tone_matches, cv::Mat* debug_output) {
  CHECK(color_tone_matches != nullptr);
  CHECK_EQ(curr_frame.channels(), C);
  CHECK_EQ(prev_frame.channels(), C);

  color_tone_matches->clear();
  color_tone_matches->resize(C);

  const int patch_diam = 2 * options.patch_radius() + 1;
  const int patch_area = patch_diam * patch_diam;
  const float patch_denom = 1.0f / patch_area;
  const float log_denom = 1.0f / LogDomainLUT().MaxLogDomainValue();

  int num_matches = 0;
  std::vector<int> curr_intensities[C];
  std::vector<int> prev_intensities[C];
  for (int c = 0; c < C; ++c) {
    curr_intensities[c].resize(256, 0);
    prev_intensities[c].resize(256, 0);
  }

  // Debugging output. Horizontally concatenate current and previous frames.
  cv::Mat curr_debug;
  cv::Mat prev_debug;
  if (debug_output != nullptr) {
    const int rows = std::max(curr_frame.rows, prev_frame.rows);
    const int cols = curr_frame.cols + prev_frame.cols;
    const cv::Rect curr_rect(cv::Point(0, 0), curr_frame.size());
    const cv::Rect prev_rect(cv::Point(curr_frame.cols, 0), prev_frame.size());

    debug_output->create(rows, cols, CV_8UC3);
    debug_output->setTo(cv::Scalar(255, 0, 0));
    curr_debug = (*debug_output)(curr_rect);
    prev_debug = (*debug_output)(prev_rect);

    curr_frame.copyTo(curr_debug, curr_clip_mask.mask ^ cv::Scalar(1));
    prev_frame.copyTo(prev_debug, prev_clip_mask.mask ^ cv::Scalar(1));
  }

  const int patch_radius = options.patch_radius();
  const int frame_width = curr_frame.cols;
  const int frame_height = curr_frame.rows;
  for (const auto& feature : feature_list.feature()) {
    // Extract matching masks at this feature.
    Vector2_i curr_loc = FeatureIntLocation(feature);
    Vector2_i prev_loc = FeatureMatchIntLocation(feature);

    // Start bound inclusive, end bound exclusive.
    Vector2_i curr_start = Vector2_i(std::max(0, curr_loc.x() - patch_radius),
                                     std::max(0, curr_loc.y() - patch_radius));
    Vector2_i curr_end =
        Vector2_i(std::min(frame_width, curr_loc.x() + patch_radius + 1),
                  std::min(frame_height, curr_loc.y() + patch_radius + 1));

    Vector2_i prev_start = Vector2_i(std::max(0, prev_loc.x() - patch_radius),
                                     std::max(0, prev_loc.y() - patch_radius));
    Vector2_i prev_end =
        Vector2_i(std::min(frame_width, prev_loc.x() + patch_radius + 1),
                  std::min(frame_height, prev_loc.y() + patch_radius + 1));

    Vector2_i curr_size = curr_end - curr_start;
    Vector2_i prev_size = prev_end - prev_start;
    if (prev_size != curr_size ||
        (curr_size.x() * curr_size.y()) != patch_area) {
      continue;  // Ignore border patches.
    }

    // TODO: Instead of summing masks, perform box filter over mask
    // and just grab element at feature location.
    cv::Mat curr_patch_mask(curr_clip_mask.mask,
                            cv::Range(curr_start.y(), curr_end.y()),
                            cv::Range(curr_start.x(), curr_end.x()));
    cv::Mat prev_patch_mask(prev_clip_mask.mask,
                            cv::Range(prev_start.y(), prev_end.y()),
                            cv::Range(prev_start.x(), prev_end.x()));
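
    // The steps below first discard patches with too large a fraction of
    // clipped pixels, then match corresponding order statistics (percentiles)
    // of the per-channel patch histograms. For example, with illustrative
    // (not necessarily default) settings min_match_percentile = 0.1,
    // max_match_percentile = 0.9, and match_percentile_steps = 5, each
    // surviving patch contributes up to 5 (curr, prev) intensity pairs per
    // channel to color_tone_matches.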

    // Skip patch if too many clipped pixels.
    if (cv::sum(curr_patch_mask)[0] * patch_denom >
            options.max_frac_clipped() ||
        cv::sum(prev_patch_mask)[0] * patch_denom >
            options.max_frac_clipped()) {
      continue;
    }

    // Extract matching patches at this feature.
    cv::Mat curr_patch(curr_frame, cv::Range(curr_start.y(), curr_end.y()),
                       cv::Range(curr_start.x(), curr_end.x()));
    cv::Mat prev_patch(prev_frame, cv::Range(prev_start.y(), prev_end.y()),
                       cv::Range(prev_start.x(), prev_end.x()));

    // Reset histograms to zero.
    for (int c = 0; c < C; ++c) {
      std::fill(curr_intensities[c].begin(), curr_intensities[c].end(), 0);
      std::fill(prev_intensities[c].begin(), prev_intensities[c].end(), 0);
    }

    // Build histogram (to sidestep sorting).
    // Note: We explicitly add over- and under-exposed pixels to the histogram,
    // as we are going to sample the histograms at specific percentiles,
    // modeling the shift in intensity between frames. (If the average
    // intensity increases by N, the histogram shifts by N bins to the right.)
    // However, matches that are over- or under-exposed are discarded
    // afterwards.
    for (int i = 0; i < patch_diam; ++i) {
      const uint8* prev_ptr = prev_patch.ptr<uint8>(i);
      const uint8* curr_ptr = curr_patch.ptr<uint8>(i);
      for (int j = 0; j < patch_diam; ++j) {
        const int j_c = C * j;
        for (int c = 0; c < C; ++c) {
          ++curr_intensities[c][curr_ptr[j_c + c]];
          ++prev_intensities[c][prev_ptr[j_c + c]];
        }
      }
    }

    // Sample at percentiles.
    for (int c = 0; c < C; ++c) {
      // Accumulate.
      for (int k = 1; k < 256; ++k) {
        curr_intensities[c][k] += curr_intensities[c][k - 1];
        prev_intensities[c][k] += prev_intensities[c][k - 1];
      }

      float percentile = options.min_match_percentile();
      const float percentile_step =
          (options.max_match_percentile() - options.min_match_percentile()) /
          options.match_percentile_steps();

      PatchToneMatch patch_tone_match;
      for (int k = 0; k < options.match_percentile_steps();
           ++k, percentile += percentile_step) {
        const auto curr_iter = std::lower_bound(curr_intensities[c].begin(),
                                                curr_intensities[c].end(),
                                                percentile * patch_area);
        const auto prev_iter = std::lower_bound(prev_intensities[c].begin(),
                                                prev_intensities[c].end(),
                                                percentile * patch_area);
        const int curr_int = curr_iter - curr_intensities[c].begin();
        const int prev_int = prev_iter - prev_intensities[c].begin();

        // If either is clipped, discard the match.
        if (curr_int < curr_clip_mask.min_exposure_threshold[c] ||
            curr_int > curr_clip_mask.max_exposure_threshold[c] ||
            prev_int < prev_clip_mask.min_exposure_threshold[c] ||
            prev_int > prev_clip_mask.max_exposure_threshold[c]) {
          continue;
        }

        ToneMatch* tone_match = patch_tone_match.add_tone_match();
        if (options.log_domain()) {
          tone_match->set_curr_val(LogDomainLUT().Map(curr_int) * log_denom);
          tone_match->set_prev_val(LogDomainLUT().Map(prev_int) * log_denom);
        } else {
          tone_match->set_curr_val(curr_int * (1.0f / 255.0f));
          tone_match->set_prev_val(prev_int * (1.0f / 255.0f));
        }
      }

      (*color_tone_matches)[c].push_back(patch_tone_match);

      // Debug output.
      if (debug_output != nullptr) {
        cv::rectangle(curr_debug, cv::Point(curr_start.x(), curr_start.y()),
                      cv::Point(curr_end.x(), curr_end.y()),
                      cv::Scalar(0, 0, 255));
        cv::rectangle(prev_debug, cv::Point(prev_start.x(), prev_start.y()),
                      cv::Point(prev_end.x(), prev_end.y()),
                      cv::Scalar(0, 0, 255));
      }
    }

    ++num_matches;
  }

  VLOG(1) << "Extracted fraction: "
          << static_cast<float>(num_matches) /
                 std::max(1, feature_list.feature_size());
}

}  // namespace mediapipe

#endif  // MEDIAPIPE_UTIL_TRACKING_TONE_ESTIMATION_H_