diff options
Diffstat (limited to 'recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch')
-rw-r--r-- | recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch b/recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch new file mode 100644 index 0000000000..b85f78169c --- /dev/null +++ b/recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch @@ -0,0 +1,156 @@ +From 350029396d911941591149cc82b5e68a78ad6747 Mon Sep 17 00:00:00 2001 +From: Siarhei Siamashka <siarhei.siamashka@nokia.com> +Date: Mon, 21 Feb 2011 20:18:02 +0200 +Subject: [PATCH 09/40] SSE2 optimization for bilinear scaled 'src_8888_8888' + +A primitive naive implementation of bilinear scaling using SSE2 intrinsics, +which only handles one pixel at a time. It is approximately 2x faster than +pixman general compositing path. Single pass processing without intermediate +temporary buffer contributes to ~15% and loop unrolling contributes to ~20% +of this speedup. + +Benchmark on Intel Core i7 (x86-64): + Using cairo-perf-trace: + before: image firefox-planet-gnome 12.566 12.610 0.23% 6/6 + after: image firefox-planet-gnome 10.961 11.013 0.19% 5/6 + + Microbenchmark (scaling 2000x2000 image with scale factor close to 1x): + before: op=1, src=20028888, dst=20028888, speed=70.48 MPix/s + after: op=1, src=20028888, dst=20028888, speed=165.38 MPix/s +--- + pixman/pixman-sse2.c | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++ + 1 files changed, 112 insertions(+), 0 deletions(-) + +diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c +index 88287b4..696005f 100644 +--- a/pixman/pixman-sse2.c ++++ b/pixman/pixman-sse2.c +@@ -5567,6 +5567,114 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, + scaled_nearest_scanline_sse2_8888_n_8888_OVER, + uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) + ++static void ++bilinear_interpolate_line_sse2 (uint32_t * out, ++ const uint32_t * top, ++ const uint32_t * bottom, ++ int wt, ++ int wb, ++ pixman_fixed_t x, ++ pixman_fixed_t ux, ++ int width) ++{ ++ const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); ++ const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); ++ const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff); ++ const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); ++ const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux); ++ const __m128i xmm_zero = _mm_setzero_si128 (); ++ __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x); ++ uint32_t pix1, pix2, pix3, pix4; ++ ++ #define INTERPOLATE_ONE_PIXEL(pix) \ ++ do { \ ++ __m128i xmm_wh, xmm_lo, xmm_hi, a; \ ++ /* fetch 2x2 pixel block into sse2 register */ \ ++ uint32_t tl = top [pixman_fixed_to_int (x)]; \ ++ uint32_t tr = top [pixman_fixed_to_int (x) + 1]; \ ++ uint32_t bl = bottom [pixman_fixed_to_int (x)]; \ ++ uint32_t br = bottom [pixman_fixed_to_int (x) + 1]; \ ++ a = _mm_set_epi32 (tr, tl, br, bl); \ ++ x += ux; \ ++ /* vertical interpolation */ \ ++ a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero), \ ++ xmm_wt), \ ++ _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero), \ ++ xmm_wb)); \ ++ /* calculate horizontal weights */ \ ++ xmm_wh = _mm_add_epi16 (xmm_addc, \ ++ _mm_xor_si128 (xmm_xorc, \ ++ _mm_srli_epi16 (xmm_x, 8))); \ ++ xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ ++ /* horizontal interpolation */ \ ++ xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ ++ xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ ++ a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ ++ _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ ++ /* shift and pack the result */ \ ++ a = _mm_srli_epi32 (a, 16); \ ++ a = _mm_packs_epi32 (a, a); \ ++ a = _mm_packus_epi16 (a, a); \ ++ pix = _mm_cvtsi128_si32 (a); \ ++ } while (0) ++ ++ while ((width -= 4) >= 0) ++ { ++ INTERPOLATE_ONE_PIXEL (pix1); ++ INTERPOLATE_ONE_PIXEL (pix2); ++ INTERPOLATE_ONE_PIXEL (pix3); ++ INTERPOLATE_ONE_PIXEL (pix4); ++ *out++ = pix1; ++ *out++ = pix2; ++ *out++ = pix3; ++ *out++ = pix4; ++ } ++ if (width & 2) ++ { ++ INTERPOLATE_ONE_PIXEL (pix1); ++ INTERPOLATE_ONE_PIXEL (pix2); ++ *out++ = pix1; ++ *out++ = pix2; ++ } ++ if (width & 1) ++ { ++ INTERPOLATE_ONE_PIXEL (pix1); ++ *out = pix1; ++ } ++ ++ #undef INTERPOLATE_ONE_PIXEL ++} ++ ++static force_inline void ++scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, ++ const uint32_t * mask, ++ const uint32_t * src_top, ++ const uint32_t * src_bottom, ++ int32_t w, ++ int wt, ++ int wb, ++ pixman_fixed_t vx, ++ pixman_fixed_t unit_x, ++ pixman_fixed_t max_vx, ++ pixman_bool_t zero_src) ++{ ++ bilinear_interpolate_line_sse2 (dst, src_top, src_bottom, ++ wt, wb, vx, unit_x, w); ++} ++ ++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ COVER, FALSE, FALSE) ++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ PAD, FALSE, FALSE) ++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, ++ scaled_bilinear_scanline_sse2_8888_8888_SRC, ++ uint32_t, uint32_t, uint32_t, ++ NONE, FALSE, FALSE) ++ + static const pixman_fast_path_t sse2_fast_paths[] = + { + /* PIXMAN_OP_OVER */ +@@ -5668,6 +5776,10 @@ static const pixman_fast_path_t sse2_fast_paths[] = + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), + SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), + ++ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), ++ SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), ++ SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), ++ + { PIXMAN_OP_NONE }, + }; + +-- +1.6.6.1 + |