Make sure threads are separated by a cache line
kornelski committed Apr 27, 2024
1 parent f11e783 commit 39cb27e
Showing 4 changed files with 20 additions and 11 deletions.
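
The change targets false sharing: each worker thread accumulates its k-means statistics in a thread-local value, and when those per-thread values sit next to each other in memory they can share a cache line, so every write from one core invalidates the line for the others. The fix wraps the per-thread state in a new 128-byte-aligned newtype, CacheLineAlign, so each thread's accumulator starts on its own line. As a minimal, self-contained sketch of the effect being avoided (illustrative only, not the crate's code), two counters bumped from two threads stay on separate lines once each is wrapped in an aligned newtype:

// Illustrative sketch (not the crate's code): two counters updated from two
// threads. Without alignment they could end up on one cache line and contend;
// the 128-byte-aligned wrapper keeps each on its own line.
use std::sync::atomic::{AtomicU64, Ordering};
use std::thread;

#[repr(align(128))]
struct CacheLineAlign<T>(T);

static A: CacheLineAlign<AtomicU64> = CacheLineAlign(AtomicU64::new(0));
static B: CacheLineAlign<AtomicU64> = CacheLineAlign(AtomicU64::new(0));

fn main() {
    let t1 = thread::spawn(|| for _ in 0..1_000_000 { A.0.fetch_add(1, Ordering::Relaxed); });
    let t2 = thread::spawn(|| for _ in 0..1_000_000 { B.0.fetch_add(1, Ordering::Relaxed); });
    t1.join().unwrap();
    t2.join().unwrap();
    assert_eq!(A.0.load(Ordering::Relaxed) + B.0.load(Ordering::Relaxed), 2_000_000);
}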
10 changes: 6 additions & 4 deletions src/kmeans.rs
@@ -1,3 +1,4 @@
+use crate::CacheLineAlign;
use crate::hist::{HistItem, HistogramInternal};
use crate::nearest::Nearest;
use crate::pal::{f_pixel, PalF, PalIndex, PalPop};
@@ -7,6 +8,8 @@ use rgb::alt::ARGB;
use rgb::ComponentMap;
use std::cell::RefCell;

+/// K-Means iteration: new palette color is computed from weighted average of colors that map best to that palette entry.
+// avoid false sharing
pub(crate) struct Kmeans {
averages: Vec<ColorAvg>,
weighed_diff_sum: f64,
@@ -18,7 +21,6 @@ struct ColorAvg {
pub total: f64,
}

-/// K-Means iteration: new palette color is computed from weighted average of colors that map best to that palette entry.
impl Kmeans {
#[inline]
pub fn new(pal_len: usize) -> Result<Self, Error> {
@@ -65,14 +67,14 @@ impl Kmeans {
// chunk size is a trade-off between parallelization and overhead
hist.items.par_chunks_mut(256).for_each({
let tls = &tls; move |batch| {
-let kmeans = tls.get_or(move || RefCell::new(Kmeans::new(len)));
-if let Ok(ref mut kmeans) = *kmeans.borrow_mut() {
+let kmeans = tls.get_or(move || CacheLineAlign(RefCell::new(Kmeans::new(len))));
+if let Ok(ref mut kmeans) = *kmeans.0.borrow_mut() {
kmeans.iterate_batch(batch, &n, colors, adjust_weight);
}
}});

let diff = tls.into_iter()
-.map(RefCell::into_inner)
+.map(|c| c.0.into_inner())
.reduce(Kmeans::try_merge)
.transpose()?
.map_or(0., |kmeans| {
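
For context on the hunk above: each rayon worker lazily creates its own accumulator through thread_local's get_or, updates it per 256-item chunk of the histogram, and the per-thread results are merged after the parallel loop with Kmeans::try_merge. A simplified sketch of that pattern, with a hypothetical Acc type standing in for Kmeans and assuming the rayon and thread_local crates:

// Simplified sketch of the per-thread accumulation pattern (assumes the rayon
// and thread_local crates; `Acc` is a hypothetical stand-in for `Kmeans`).
use rayon::prelude::*;
use std::cell::RefCell;
use thread_local::ThreadLocal;

#[repr(align(128))]
struct CacheLineAlign<T>(T);

#[derive(Default)]
struct Acc { sum: u64 }

fn main() {
    let tls: ThreadLocal<CacheLineAlign<RefCell<Acc>>> = ThreadLocal::new();
    let items: Vec<u64> = (0..100_000).collect();
    items.par_chunks(256).for_each(|batch| {
        // Each worker gets (or lazily creates) its own cache-line-aligned accumulator.
        let acc = tls.get_or(|| CacheLineAlign(RefCell::new(Acc::default())));
        acc.0.borrow_mut().sum += batch.iter().sum::<u64>();
    });
    // Merge the per-thread results after the parallel loop, like Kmeans::try_merge above.
    let total: u64 = tls.into_iter().map(|a| a.0.into_inner().sum).sum();
    assert_eq!(total, (0..100_000u64).sum::<u64>());
}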
3 changes: 3 additions & 0 deletions src/lib.rs
@@ -44,6 +44,9 @@ mod rayoff {
pub(crate) use thread_local::ThreadLocal;
}

+#[cfg_attr(feature = "threads", repr(align(128)))]
+pub(crate) struct CacheLineAlign<T>(pub T);

/// Use imagequant-sys crate instead
#[cfg(feature = "_internal_c_ffi")]
pub mod capi;
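
The wrapper itself is just an aligned newtype, and the alignment is applied only when the threads feature is enabled, so single-threaded builds don't pay for the padding. The 128-byte figure presumably covers both 64-byte lines with adjacent-line prefetching and platforms with 128-byte cache lines. A quick illustrative check of the layout such an attribute produces:

// Illustrative layout check: the attribute bumps both alignment and size to
// 128 bytes, so consecutive thread-local slots start on separate lines.
#[allow(dead_code)]
#[repr(align(128))]
struct CacheLineAlign<T>(T);

fn main() {
    assert_eq!(std::mem::align_of::<CacheLineAlign<u8>>(), 128);
    assert_eq!(std::mem::size_of::<CacheLineAlign<u8>>(), 128);
}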
3 changes: 3 additions & 0 deletions src/rayoff.rs
@@ -61,15 +61,18 @@ impl<'a, T> FakeRayonIntoIter<T> for Box<[T]> {
pub(crate) struct SpawnMock;

impl SpawnMock {
+#[inline(always)]
pub fn spawn<F, R>(&self, f: F) -> R where F: FnOnce(SpawnMock) -> R {
f(SpawnMock)
}
}

+#[inline(always)]
pub(crate) fn scope<F, R>(f: F) -> R where F: FnOnce(SpawnMock) -> R {
f(SpawnMock)
}

+#[inline(always)]
pub(crate) fn num_cpus() -> usize {
1
}
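
These shims are the single-threaded stand-ins used when the threads feature is disabled: scope and spawn just call the closure immediately, and marking them #[inline(always)] removes the indirection, so the "parallel" call sites compile down to plain sequential loops. A simplified, self-contained sketch of the same pattern with a usage example:

// Simplified sketch of the single-threaded stand-in pattern: scope/spawn run
// the closure immediately, so the same call sites work with or without rayon.
pub struct SpawnMock;

impl SpawnMock {
    #[inline(always)]
    pub fn spawn<F, R>(&self, f: F) -> R where F: FnOnce(SpawnMock) -> R {
        f(SpawnMock)
    }
}

#[inline(always)]
pub fn scope<F, R>(f: F) -> R where F: FnOnce(SpawnMock) -> R {
    f(SpawnMock)
}

fn main() {
    // A "parallel" computation that degrades to a plain sequential call.
    let total = scope(|s| s.spawn(|_| (1..=10u32).sum::<u32>()));
    assert_eq!(total, 55);
}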
15 changes: 8 additions & 7 deletions src/remap.rs
@@ -1,3 +1,4 @@
+use crate::CacheLineAlign;
use crate::error::Error;
use crate::image::Image;
use crate::kmeans::Kmeans;
@@ -35,10 +36,10 @@ pub(crate) fn remap_to_palette<'x, 'b: 'x>(px: &mut DynamicRows, background: Opt

let tls = ThreadLocal::new();
let width = px.width as usize;
-let per_thread_buffers = move || -> Result<_, Error> { Ok(RefCell::new((Kmeans::new(palette_len)?, temp_buf(width)?, temp_buf(width)?, temp_buf(width)?))) };
+let per_thread_buffers = move || -> Result<_, Error> { Ok(CacheLineAlign(RefCell::new((Kmeans::new(palette_len)?, temp_buf(width)?, temp_buf(width)?, temp_buf(width)?)))) };

let tls_tmp1 = tls.get_or_try(per_thread_buffers)?;
-let mut tls_tmp = tls_tmp1.borrow_mut();
+let mut tls_tmp = tls_tmp1.0.borrow_mut();

let input_rows = px.rows_iter(&mut tls_tmp.1)?;
let (background, transparent_index) = background.map(|background| {
@@ -60,7 +61,7 @@ pub(crate) fn remap_to_palette<'x, 'b: 'x>(px: &mut DynamicRows, background: Opt
Ok(res) => res,
Err(_) => return f64::NAN,
};
-let (kmeans, temp_row, temp_row_f, temp_row_f_bg) = &mut *tls_res.borrow_mut();
+let (kmeans, temp_row, temp_row_f, temp_row_f_bg) = &mut *tls_res.0.borrow_mut();

let output_pixels_row = &mut output_pixels_row[..width];
let importance_map = importance_map.and_then(|m| m.get(row * width..)).unwrap_or(&[]);
@@ -96,7 +97,7 @@ pub(crate) fn remap_to_palette<'x, 'b: 'x>(px: &mut DynamicRows, background: Opt
}

if let Some(kmeans) = tls.into_iter()
-.map(|t| RefCell::into_inner(t).0)
+.map(|t| t.0.into_inner().0)
.reduce(Kmeans::merge) { kmeans.finalize(palette); }

let remapping_error = remapping_error / f64::from(px.width * px.height);
@@ -188,11 +189,11 @@ pub(crate) fn remap_to_palette_floyd(input_image: &mut Image, mut output_pixels:

// Chunks have overhead, so should be big (more than 2 bring diminishing results). Chunks risk causing seams, so should be tall.
let num_chunks = if quant.single_threaded_dithering { 1 } else { (width * height / 524_288).min(height / 128).max(if height > 128 {2} else {1}).min(num_cpus()) };
-let chunks = output_pixels.chunks((height + num_chunks - 1) / num_chunks);
+let chunks = output_pixels.chunks((height + num_chunks - 1) / num_chunks).map(CacheLineAlign);
scope(move |s| {
let mut chunk_start_row = 0;
for mut chunk in chunks {
-let chunk_len = chunk.len();
+let chunk_len = chunk.0.len();
let mut temp_row = temp_buf(width)?;
let mut input_image_iter = input_image_px.rows_iter_prepared()?;
let mut background = background.map(|bg| bg.rows_iter_prepared()).transpose()?;
@@ -218,7 +219,7 @@ pub(crate) fn remap_to_palette_floyd(input_image: &mut Image, mut output_pixels:
return Err(Error::Aborted);
}
s.spawn(move |_| {
-for (chunk_row, output_pixels_row) in chunk.rows_mut().enumerate() {
+for (chunk_row, output_pixels_row) in chunk.0.rows_mut().enumerate() {
let row = chunk_start_row + chunk_row;
let row_pixels = input_image_iter.row_f(&mut temp_row, row as _);
let bg_pixels = background.as_mut().map(|b| b.row_f(&mut temp_row, row as _)).unwrap_or(&[]);
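
A worked example of the chunking arithmetic above, using a hypothetical 2000x3000 image on an 8-core machine (ignoring the single_threaded_dithering override): the pixel-count rule yields 11, the height allows up to 23, and the core count caps it at 8 chunks of ceil(3000/8) = 375 rows each.

// Worked example (hypothetical dimensions and core count) of the chunk-count
// formula used above, ignoring the single_threaded_dithering override.
fn main() {
    let (width, height, num_cpus) = (2000usize, 3000usize, 8usize);
    let num_chunks = (width * height / 524_288)  // roughly one chunk per 512 Kpx
        .min(height / 128)                       // keep chunks at least ~128 rows tall
        .max(if height > 128 { 2 } else { 1 })   // prefer some parallelism when tall enough
        .min(num_cpus);                          // never more chunks than cores
    let rows_per_chunk = (height + num_chunks - 1) / num_chunks; // ceiling division
    assert_eq!((num_chunks, rows_per_chunk), (8, 375));
}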
