software_isp: debayer_cpu: Add DebayerCpuThread class

Add a DebayerCpuThreadclass and use this in the inner render loop.
This contains data which needs to be separate per thread.

This is a preparation patch for making DebayerCpu support multi-threading.

Benchmarking on the Arduino Uno-Q with a weak CPU which is good for
performance testing, shows 146-147ms per 3272x2464 frame both before and
after this change, with things maybe being 0.5 ms slower after this change.

Tested-by: Barnabás Pőcze <barnabas.pocze@ideasonboard.com> # ThinkPad X1 Yoga Gen 7 + ov2740
Reviewed-by: Milan Zamazal <mzamazal@redhat.com>
Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
Signed-off-by: Kieran Bingham <kieran.bingham@ideasonboard.com>
This commit is contained in:
Hans de Goede
2026-03-10 13:01:03 +01:00
committed by Kieran Bingham
parent 6e53e72e96
commit 41d17f8211
2 changed files with 191 additions and 79 deletions

View File

@@ -18,6 +18,8 @@
#include <linux/dma-buf.h>
#include <libcamera/base/thread.h>
#include <libcamera/formats.h>
#include "libcamera/internal/bayer_format.h"
@@ -27,6 +29,55 @@
namespace libcamera {
/**
* \brief Class representing one CPU debayering thread
*
* Implementation for CPU based debayering threads.
*/
class DebayerCpuThread : public Thread, public Object
{
public:
DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex,
bool enableInputMemcpy);
void configure(unsigned int yStart, unsigned int yEnd);
void process(uint32_t frame, const uint8_t *src, uint8_t *dst);
private:
void setupInputMemcpy(const uint8_t *linePointers[]);
void shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src);
void memcpyNextLine(const uint8_t *linePointers[]);
void process2(uint32_t frame, const uint8_t *src, uint8_t *dst);
void process4(uint32_t frame, const uint8_t *src, uint8_t *dst);
/* Max. supported Bayer pattern height is 4, debayering this requires 5 lines */
static constexpr unsigned int kMaxLineBuffers = 5;
DebayerCpu *debayer_;
unsigned int threadIndex_;
unsigned int yStart_;
unsigned int yEnd_;
unsigned int lineBufferLength_;
unsigned int lineBufferPadding_;
unsigned int lineBufferIndex_;
std::vector<uint8_t> lineBuffers_[kMaxLineBuffers];
bool enableInputMemcpy_;
};
/**
* \brief Construct a DebayerCpuThread object
* \param[in] debayer pointer back to the DebayerCpuObject this thread belongs to
* \param[in] threadIndex 0 .. n thread-index value for the thread
* \param[in] enableInputMemcpy when set copy input data to a heap buffer before use
*/
DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex,
bool enableInputMemcpy)
: Thread("DebayerCpu:" + std::to_string(threadIndex)),
debayer_(debayer), threadIndex_(threadIndex),
enableInputMemcpy_(enableInputMemcpy)
{
}
/**
* \class DebayerCpu
* \brief Class for debayering on the CPU
@@ -53,8 +104,14 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat
* \todo Make memcpy automatic based on runtime detection of platform
* capabilities.
*/
enableInputMemcpy_ =
bool enableInputMemcpy =
configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true);
/* Just one thread object for now, which will be called inline rather than async */
threads_.resize(1);
for (unsigned int i = 0; i < threads_.size(); i++)
threads_[i] = std::make_unique<DebayerCpuThread>(this, i, enableInputMemcpy);
}
DebayerCpu::~DebayerCpu() = default;
@@ -484,7 +541,7 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg,
if (getInputConfig(inputCfg.pixelFormat, inputConfig_) != 0)
return -EINVAL;
if (stats_->configure(inputCfg) != 0)
if (stats_->configure(inputCfg, threads_.size()) != 0)
return -EINVAL;
const Size &statsPatternSize = stats_->patternSize();
@@ -548,17 +605,43 @@ int DebayerCpu::configure(const StreamConfiguration &inputCfg,
*/
stats_->setWindow(Rectangle(window_.size()));
unsigned int yStart = 0;
unsigned int linesPerThread = (window_.height / threads_.size()) &
~(inputConfig_.patternSize.height - 1);
unsigned int i;
for (i = 0; i < (threads_.size() - 1); i++) {
threads_[i]->configure(yStart, yStart + linesPerThread);
yStart += linesPerThread;
}
threads_[i]->configure(yStart, window_.height);
return 0;
}
/**
* \brief Configure thread to process a specific part of the image
* \param[in] yStart y coordinate of first line to process
* \param[in] yEnd y coordinate of the line at which to stop processing
*
* Configure the thread to process lines from yStart to yEnd - 1.
*/
void DebayerCpuThread::configure(unsigned int yStart, unsigned int yEnd)
{
Debayer::DebayerInputConfig &inputConfig = debayer_->inputConfig_;
yStart_ = yStart;
yEnd_ = yEnd;
/* pad with patternSize.Width on both left and right side */
lineBufferPadding_ = inputConfig_.patternSize.width * inputConfig_.bpp / 8;
lineBufferLength_ = window_.width * inputConfig_.bpp / 8 +
lineBufferPadding_ = inputConfig.patternSize.width * inputConfig.bpp / 8;
lineBufferLength_ = debayer_->window_.width * inputConfig.bpp / 8 +
2 * lineBufferPadding_;
if (enableInputMemcpy_) {
for (unsigned int i = 0; i <= inputConfig_.patternSize.height; i++)
for (unsigned int i = 0; i <= inputConfig.patternSize.height; i++)
lineBuffers_[i].resize(lineBufferLength_);
}
return 0;
}
/*
@@ -599,9 +682,9 @@ DebayerCpu::strideAndFrameSize(const PixelFormat &outputFormat, const Size &size
return std::make_tuple(stride, stride * size.height);
}
void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
void DebayerCpuThread::setupInputMemcpy(const uint8_t *linePointers[])
{
const unsigned int patternHeight = inputConfig_.patternSize.height;
const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height;
if (!enableInputMemcpy_)
return;
@@ -617,20 +700,20 @@ void DebayerCpu::setupInputMemcpy(const uint8_t *linePointers[])
lineBufferIndex_ = patternHeight;
}
void DebayerCpu::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src)
void DebayerCpuThread::shiftLinePointers(const uint8_t *linePointers[], const uint8_t *src)
{
const unsigned int patternHeight = inputConfig_.patternSize.height;
const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height;
for (unsigned int i = 0; i < patternHeight; i++)
linePointers[i] = linePointers[i + 1];
linePointers[patternHeight] = src +
(patternHeight / 2) * (int)inputConfig_.stride;
linePointers[patternHeight] =
src + (patternHeight / 2) * (int)debayer_->inputConfig_.stride;
}
void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[])
void DebayerCpuThread::memcpyNextLine(const uint8_t *linePointers[])
{
const unsigned int patternHeight = inputConfig_.patternSize.height;
const unsigned int patternHeight = debayer_->inputConfig_.patternSize.height;
if (!enableInputMemcpy_)
return;
@@ -643,23 +726,48 @@ void DebayerCpu::memcpyNextLine(const uint8_t *linePointers[])
lineBufferIndex_ = (lineBufferIndex_ + 1) % (patternHeight + 1);
}
void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
/**
* \brief Process part of the image assigned to this debayer thread
* \param[in] frame The frame number
* \param[in] src The source buffer
* \param[in] dst The destination buffer
*/
void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst)
{
unsigned int yEnd = window_.height;
Rectangle &window = debayer_->window_;
/* Adjust src to top left corner of the window */
src += (window.y + yStart_) * debayer_->inputConfig_.stride +
window.x * debayer_->inputConfig_.bpp / 8;
/* Adjust dst for yStart_ */
dst += yStart_ * debayer_->outputConfig_.stride;
if (debayer_->inputConfig_.patternSize.height == 2)
process2(frame, src, dst);
else
process4(frame, src, dst);
}
void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
{
unsigned int outputStride = debayer_->outputConfig_.stride;
unsigned int inputStride = debayer_->inputConfig_.stride;
Rectangle &window = debayer_->window_;
unsigned int yEnd = yEnd_;
/* Holds [0] previous- [1] current- [2] next-line */
const uint8_t *linePointers[3];
/* Adjust src to top left corner of the window */
src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
/* [x] becomes [x - 1] after initial shiftLinePointers() call */
if (window_.y) {
linePointers[1] = src - inputConfig_.stride; /* previous-line */
if (window.y + yStart_) {
linePointers[1] = src - inputStride; /* previous-line */
linePointers[2] = src;
} else {
/* window_.y == 0, use the next line as prev line */
linePointers[1] = src + inputConfig_.stride;
/* Top line, use the next line as prev line */
linePointers[1] = src + inputStride;
linePointers[2] = src;
}
if (window.y == 0 && yEnd_ == window.height) {
/*
* Last 2 lines also need special handling.
* (And configure() ensures that yEnd >= 2.)
@@ -669,83 +777,93 @@ void DebayerCpu::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
setupInputMemcpy(linePointers);
for (unsigned int y = 0; y < yEnd; y += 2) {
/*
* Note y is the line-number *inside* the window, since stats_' window
* is the stats window inside/relative to the debayer window. IOW for
* single thread rendering y goes from 0 to window.height.
*/
for (unsigned int y = yStart_; y < yEnd; y += 2) {
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
stats_->processLine0(frame, y, linePointers);
(this->*debayer0_)(dst, linePointers);
src += inputConfig_.stride;
dst += outputConfig_.stride;
debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_);
debayer_->debayer0(dst, linePointers);
src += inputStride;
dst += outputStride;
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
(this->*debayer1_)(dst, linePointers);
src += inputConfig_.stride;
dst += outputConfig_.stride;
debayer_->debayer1(dst, linePointers);
src += inputStride;
dst += outputStride;
}
if (window_.y == 0) {
if (window.y == 0 && yEnd_ == window.height) {
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
stats_->processLine0(frame, yEnd, linePointers);
(this->*debayer0_)(dst, linePointers);
src += inputConfig_.stride;
dst += outputConfig_.stride;
debayer_->stats_->processLine0(frame, yEnd, linePointers, threadIndex_);
debayer_->debayer0(dst, linePointers);
src += inputStride;
dst += outputStride;
shiftLinePointers(linePointers, src);
/* next line may point outside of src, use prev. */
linePointers[2] = linePointers[0];
(this->*debayer1_)(dst, linePointers);
src += inputConfig_.stride;
dst += outputConfig_.stride;
debayer_->debayer1(dst, linePointers);
src += inputStride;
dst += outputStride;
}
}
void DebayerCpu::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
void DebayerCpuThread::process4(uint32_t frame, const uint8_t *src, uint8_t *dst)
{
unsigned int outputStride = debayer_->outputConfig_.stride;
unsigned int inputStride = debayer_->inputConfig_.stride;
/*
* This holds pointers to [0] 2-lines-up [1] 1-line-up [2] current-line
* [3] 1-line-down [4] 2-lines-down.
*/
const uint8_t *linePointers[5];
/* Adjust src to top left corner of the window */
src += window_.y * inputConfig_.stride + window_.x * inputConfig_.bpp / 8;
/* [x] becomes [x - 1] after initial shiftLinePointers() call */
linePointers[1] = src - 2 * inputConfig_.stride;
linePointers[2] = src - inputConfig_.stride;
linePointers[1] = src - 2 * inputStride;
linePointers[2] = src - inputStride;
linePointers[3] = src;
linePointers[4] = src + inputConfig_.stride;
linePointers[4] = src + inputStride;
setupInputMemcpy(linePointers);
for (unsigned int y = 0; y < window_.height; y += 4) {
/*
* Note y is the line-number *inside* the window, since stats_' window
* is the stats window inside/relative to the debayer window. IOW for
* single thread rendering y goes from 0 to window.height.
*/
for (unsigned int y = yStart_; y < yEnd_; y += 4) {
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
stats_->processLine0(frame, y, linePointers);
(this->*debayer0_)(dst, linePointers);
src += inputConfig_.stride;
dst += outputConfig_.stride;
debayer_->stats_->processLine0(frame, y, linePointers, threadIndex_);
debayer_->debayer0(dst, linePointers);
src += inputStride;
dst += outputStride;
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
(this->*debayer1_)(dst, linePointers);
src += inputConfig_.stride;
dst += outputConfig_.stride;
debayer_->debayer1(dst, linePointers);
src += inputStride;
dst += outputStride;
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
stats_->processLine2(frame, y, linePointers);
(this->*debayer2_)(dst, linePointers);
src += inputConfig_.stride;
dst += outputConfig_.stride;
debayer_->stats_->processLine2(frame, y, linePointers, threadIndex_);
debayer_->debayer2(dst, linePointers);
src += inputStride;
dst += outputStride;
shiftLinePointers(linePointers, src);
memcpyNextLine(linePointers);
(this->*debayer3_)(dst, linePointers);
src += inputConfig_.stride;
dst += outputConfig_.stride;
debayer_->debayer3(dst, linePointers);
src += inputStride;
dst += outputStride;
}
}
@@ -867,10 +985,7 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
stats_->startFrame(frame);
if (inputConfig_.patternSize.height == 2)
process2(frame, in.planes()[0].data(), out.planes()[0].data());
else
process4(frame, in.planes()[0].data(), out.planes()[0].data());
threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data());
metadata.planes()[0].bytesused = out.planes()[0].size();