software_isp: swstats_cpu: Prepare for multi-threading support

Make the storage used to accumulate the RGB sums and the Y histogram
values a vector of SwIspStats objects instead of a single object. When
using multi-threading, every thread can then collect intermediate stats
in its own storage, which avoids cache-line bouncing.

Benchmarking with the GPU-ISP, which benchmarks swstats separately, on
the Arduino Uno-Q, whose weak CPU makes it well suited to performance
testing, shows 20ms to generate stats for a 3272x2464 frame both before
and after this change.

Reviewed-by: Milan Zamazal <mzamazal@redhat.com>
Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
Signed-off-by: Kieran Bingham <kieran.bingham@ideasonboard.com>
This commit is contained in:
Hans de Goede
2026-03-10 13:01:02 +01:00
committed by Kieran Bingham
parent 2a1c75504c
commit 6e53e72e96
2 changed files with 54 additions and 33 deletions

View File

@@ -12,6 +12,7 @@
#pragma once
#include <stdint.h>
#include <vector>
#include <libcamera/base/signal.h>
@@ -51,13 +52,13 @@ public:
const Size &patternSize() { return patternSize_; }
int configure(const StreamConfiguration &inputCfg);
int configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount = 1);
void setWindow(const Rectangle &window);
void startFrame(uint32_t frame);
void finishFrame(uint32_t frame, uint32_t bufferId);
void processFrame(uint32_t frame, uint32_t bufferId, FrameBuffer *input);
void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[])
void processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
{
if (frame % kStatPerNumFrames)
return;
@@ -66,10 +67,10 @@ public:
y >= (window_.y + window_.height))
return;
(this->*stats0_)(src);
(this->*stats0_)(src, stats_[statsBufferIndex]);
}
void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[])
void processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
{
if (frame % kStatPerNumFrames)
return;
@@ -78,25 +79,25 @@ public:
y >= (window_.y + window_.height))
return;
(this->*stats2_)(src);
(this->*stats2_)(src, stats_[statsBufferIndex]);
}
Signal<uint32_t, uint32_t> statsReady;
private:
using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[]);
using statsProcessFn = void (SwStatsCpu::*)(const uint8_t *src[], SwIspStats &stats);
using processFrameFn = void (SwStatsCpu::*)(MappedFrameBuffer &in);
int setupStandardBayerOrder(BayerFormat::Order order);
/* Bayer 8 bpp unpacked */
void statsBGGR8Line0(const uint8_t *src[]);
void statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats);
/* Bayer 10 bpp unpacked */
void statsBGGR10Line0(const uint8_t *src[]);
void statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats);
/* Bayer 12 bpp unpacked */
void statsBGGR12Line0(const uint8_t *src[]);
void statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats);
/* Bayer 10 bpp packed */
void statsBGGR10PLine0(const uint8_t *src[]);
void statsGBRG10PLine0(const uint8_t *src[]);
void statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats);
void statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats);
void processBayerFrame2(MappedFrameBuffer &in);
@@ -116,8 +117,8 @@ private:
unsigned int xShift_;
unsigned int stride_;
std::vector<SwIspStats> stats_;
SharedMemObject<SwIspStats> sharedStats_;
SwIspStats stats_;
Benchmark bench_;
};

View File

@@ -74,11 +74,12 @@ namespace libcamera {
*/
/**
* \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[])
* \fn void SwStatsCpu::processLine0(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
* \brief Process line 0
* \param[in] frame The frame number
* \param[in] y The y coordinate.
* \param[in] src The input data.
* \param[in] y The y coordinate
* \param[in] src The input data
* \param[in] statsBufferIndex Index of stats buffer to use for multi-threading
*
* This function processes line 0 for input formats with
* patternSize height == 1.
@@ -97,14 +98,18 @@ namespace libcamera {
* to the line in plane 0, etc.
*
* For non Bayer single plane input data only a single src pointer is required.
*
* The statsBufferIndex value must be less than the statsBufferCount value passed
* to configure().
*/
/**
* \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[])
* \fn void SwStatsCpu::processLine2(uint32_t frame, unsigned int y, const uint8_t *src[], unsigned int statsBufferIndex = 0)
* \brief Process line 2 and 3
* \param[in] frame The frame number
* \param[in] y The y coordinate.
* \param[in] src The input data.
* \param[in] y The y coordinate
* \param[in] src The input data
* \param[in] statsBufferIndex Index of stats buffer to use for multi-threading
*
* This function processes line 2 and 3 for input formats with
* patternSize height == 4.
@@ -182,14 +187,14 @@ static constexpr unsigned int kBlueYMul = 29; /* 0.114 * 256 */
yVal = r * kRedYMul; \
yVal += g * kGreenYMul; \
yVal += b * kBlueYMul; \
stats_.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++;
stats.yHistogram[yVal * SwIspStats::kYHistogramSize / (256 * 256 * (div))]++;
#define SWSTATS_FINISH_LINE_STATS() \
stats_.sum_.r() += sumR; \
stats_.sum_.g() += sumG; \
stats_.sum_.b() += sumB;
stats.sum_.r() += sumR; \
stats.sum_.g() += sumG; \
stats.sum_.b() += sumB;
void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[])
void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[], SwIspStats &stats)
{
const uint8_t *src0 = src[1] + window_.x;
const uint8_t *src1 = src[2] + window_.x;
@@ -214,7 +219,7 @@ void SwStatsCpu::statsBGGR8Line0(const uint8_t *src[])
SWSTATS_FINISH_LINE_STATS()
}
void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[])
void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[], SwIspStats &stats)
{
const uint16_t *src0 = (const uint16_t *)src[1] + window_.x;
const uint16_t *src1 = (const uint16_t *)src[2] + window_.x;
@@ -240,7 +245,7 @@ void SwStatsCpu::statsBGGR10Line0(const uint8_t *src[])
SWSTATS_FINISH_LINE_STATS()
}
void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[])
void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[], SwIspStats &stats)
{
const uint16_t *src0 = (const uint16_t *)src[1] + window_.x;
const uint16_t *src1 = (const uint16_t *)src[2] + window_.x;
@@ -266,7 +271,7 @@ void SwStatsCpu::statsBGGR12Line0(const uint8_t *src[])
SWSTATS_FINISH_LINE_STATS()
}
void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[])
void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[], SwIspStats &stats)
{
const uint8_t *src0 = src[1] + window_.x * 5 / 4;
const uint8_t *src1 = src[2] + window_.x * 5 / 4;
@@ -292,7 +297,7 @@ void SwStatsCpu::statsBGGR10PLine0(const uint8_t *src[])
SWSTATS_FINISH_LINE_STATS()
}
void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[])
void SwStatsCpu::statsGBRG10PLine0(const uint8_t *src[], SwIspStats &stats)
{
const uint8_t *src0 = src[1] + window_.x * 5 / 4;
const uint8_t *src1 = src[2] + window_.x * 5 / 4;
@@ -332,8 +337,10 @@ void SwStatsCpu::startFrame(uint32_t frame)
if (window_.width == 0)
LOG(SwStatsCpu, Error) << "Calling startFrame() without setWindow()";
stats_.sum_ = RGB<uint64_t>({ 0, 0, 0 });
stats_.yHistogram.fill(0);
for (auto &s : stats_) {
s.sum_ = RGB<uint64_t>({ 0, 0, 0 });
s.yHistogram.fill(0);
}
}
/**
@@ -345,8 +352,19 @@ void SwStatsCpu::startFrame(uint32_t frame)
*/
void SwStatsCpu::finishFrame(uint32_t frame, uint32_t bufferId)
{
stats_.valid = frame % kStatPerNumFrames == 0;
*sharedStats_ = stats_;
bool valid = frame % kStatPerNumFrames == 0;
if (valid) {
sharedStats_->sum_ = RGB<uint64_t>({ 0, 0, 0 });
sharedStats_->yHistogram.fill(0);
for (const auto &s : stats_) {
sharedStats_->sum_ += s.sum_;
for (unsigned int j = 0; j < SwIspStats::kYHistogramSize; j++)
sharedStats_->yHistogram[j] += s.yHistogram[j];
}
}
sharedStats_->valid = valid;
statsReady.emit(frame, bufferId);
}
@@ -389,12 +407,14 @@ int SwStatsCpu::setupStandardBayerOrder(BayerFormat::Order order)
/**
* \brief Configure the statistics object for the passed in input format
* \param[in] inputCfg The input format
* \param[in] statsBufferCount number of internal stats buffers to use for multi-threading
*
* \return 0 on success, a negative errno value on failure
*/
int SwStatsCpu::configure(const StreamConfiguration &inputCfg)
int SwStatsCpu::configure(const StreamConfiguration &inputCfg, unsigned int statsBufferCount)
{
stride_ = inputCfg.stride;
stats_.resize(statsBufferCount);
BayerFormat bayerFormat =
BayerFormat::fromPixelFormat(inputCfg.pixelFormat);
@@ -504,7 +524,7 @@ void SwStatsCpu::processBayerFrame2(MappedFrameBuffer &in)
/* linePointers[0] is not used by any stats0_ functions */
linePointers[1] = src;
linePointers[2] = src + stride_;
(this->*stats0_)(linePointers);
(this->*stats0_)(linePointers, stats_[0]);
src += stride_ * 2;
}
}