software_isp: debayer_cpu: Add multi-threading support

Add CPU soft ISP multi-threading support.

Benchmark results for the Arduino Uno-Q with a weak CPU which is good for
performance testing, all numbers with an IMX219 running at
3280x2464 -> 3272x2464:

1 thread : 147ms / frame, ~6.5 fps
2 threads:  80ms / frame, ~12.5 fps
3 threads:  65ms / frame, ~15 fps

Adding a 4th thread does not improve performance.

Tested-by: Barnabás Pőcze <barnabas.pocze@ideasonboard.com> # ThinkPad X1 Yoga Gen 7 + ov2740
Reviewed-by: Milan Zamazal <mzamazal@redhat.com>
Signed-off-by: Hans de Goede <johannes.goede@oss.qualcomm.com>
Signed-off-by: Kieran Bingham <kieran.bingham@ideasonboard.com>
This commit is contained in:
Hans de Goede
2026-03-10 13:01:04 +01:00
committed by Kieran Bingham
parent 41d17f8211
commit e948ada380
3 changed files with 62 additions and 3 deletions

View File

@@ -76,6 +76,7 @@ DebayerCpuThread::DebayerCpuThread(DebayerCpu *debayer, unsigned int threadIndex
debayer_(debayer), threadIndex_(threadIndex),
enableInputMemcpy_(enableInputMemcpy)
{
moveToThread(this);
}
/**
@@ -107,11 +108,15 @@ DebayerCpu::DebayerCpu(std::unique_ptr<SwStatsCpu> stats, const GlobalConfigurat
bool enableInputMemcpy =
configuration.option<bool>({ "software_isp", "copy_input_buffer" }).value_or(true);
/* Just one thread object for now, which will be called inline rather than async */
threads_.resize(1);
unsigned int threadCount =
configuration.option<unsigned int>({ "software_isp", "threads" }).value_or(kDefaultThreads);
threadCount = std::clamp(threadCount, kMinThreads, kMaxThreads);
threads_.resize(threadCount);
for (unsigned int i = 0; i < threads_.size(); i++)
threads_[i] = std::make_unique<DebayerCpuThread>(this, i, enableInputMemcpy);
LOG(Debayer, Debug) << "Thread count " << threadCount;
}
DebayerCpu::~DebayerCpu() = default;
@@ -746,6 +751,11 @@ void DebayerCpuThread::process(uint32_t frame, const uint8_t *src, uint8_t *dst)
process2(frame, src, dst);
else
process4(frame, src, dst);
debayer_->workPendingMutex_.lock();
debayer_->workPending_ &= ~(1 << threadIndex_);
debayer_->workPendingMutex_.unlock();
debayer_->workPendingCv_.notify_one();
}
void DebayerCpuThread::process2(uint32_t frame, const uint8_t *src, uint8_t *dst)
@@ -985,7 +995,21 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
stats_->startFrame(frame);
threads_[0]->process(frame, in.planes()[0].data(), out.planes()[0].data());
workPendingMutex_.lock();
workPending_ = (1 << threads_.size()) - 1;
workPendingMutex_.unlock();
for (auto &thread : threads_)
thread->invokeMethod(&DebayerCpuThread::process,
ConnectionTypeQueued, frame,
in.planes()[0].data(), out.planes()[0].data());
{
MutexLocker locker(workPendingMutex_);
workPendingCv_.wait(locker, [&]() LIBCAMERA_TSA_REQUIRES(workPendingMutex_) {
return workPending_ == 0;
});
}
metadata.planes()[0].bytesused = out.planes()[0].size();
@@ -1004,6 +1028,23 @@ void DebayerCpu::process(uint32_t frame, FrameBuffer *input, FrameBuffer *output
inputBufferReady.emit(input);
}
int DebayerCpu::start()
{
for (auto &thread : threads_)
thread->start();
return 0;
}
void DebayerCpu::stop()
{
for (auto &thread : threads_)
thread->exit();
for (auto &thread : threads_)
thread->wait();
}
SizeRange DebayerCpu::sizes(PixelFormat inputFormat, const Size &inputSize)
{
Size patternSize = this->patternSize(inputFormat);