1 files changed, 156 insertions, 0 deletions
diff --git a/recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch b/recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch
new file mode 100644
index 0000000000..b85f78169c
--- /dev/null
+++ b/recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch
@@ -0,0 +1,156 @@
+From 350029396d911941591149cc82b5e68a78ad6747 Mon Sep 17 00:00:00 2001
+From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Mon, 21 Feb 2011 20:18:02 +0200
+Subject: [PATCH 09/40] SSE2 optimization for bilinear scaled 'src_8888_8888'
+
+A primitive, naive implementation of bilinear scaling using SSE2 intrinsics,
+which only handles one pixel at a time. It is approximately 2x faster than
+the pixman general compositing path. Single-pass processing without an
+intermediate temporary buffer contributes ~15% and loop unrolling contributes
+~20% of this speedup.
+
+Benchmark on Intel Core i7 (x86-64):
+ Using cairo-perf-trace:
+  before: image        firefox-planet-gnome   12.566   12.610   0.23%    6/6
+  after:  image        firefox-planet-gnome   10.961   11.013   0.19%    5/6
+
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+  before: op=1, src=20028888, dst=20028888, speed=70.48 MPix/s
+  after:  op=1, src=20028888, dst=20028888, speed=165.38 MPix/s
+---
+ pixman/pixman-sse2.c |  112 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ 1 files changed, 112 insertions(+), 0 deletions(-)
+
+diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
+index 88287b4..696005f 100644
+--- a/pixman/pixman-sse2.c
++++ b/pixman/pixman-sse2.c
+@@ -5567,6 +5567,114 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
+ 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
+ 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
+ 
++static void
++bilinear_interpolate_line_sse2 (uint32_t *       out,
++                                const uint32_t * top,
++                                const uint32_t * bottom,
++                                int              wt,
++                                int              wb,
++                                pixman_fixed_t   x,
++                                pixman_fixed_t   ux,
++                                int              width)
++{
++    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
++    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
++    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
++    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
++    const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
++    const __m128i xmm_zero = _mm_setzero_si128 ();
++    __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
++    uint32_t pix1, pix2, pix3, pix4;
++
++    #define INTERPOLATE_ONE_PIXEL(pix)						\
++    do {									\
++	__m128i xmm_wh, xmm_lo, xmm_hi, a;					\
++	/* fetch 2x2 pixel block into sse2 register */				\
++	uint32_t tl = top [pixman_fixed_to_int (x)];				\
++	uint32_t tr = top [pixman_fixed_to_int (x) + 1];			\
++	uint32_t bl = bottom [pixman_fixed_to_int (x)];				\
++	uint32_t br = bottom [pixman_fixed_to_int (x) + 1];			\
++	a = _mm_set_epi32 (tr, tl, br, bl);					\
++        x += ux;								\
++	/* vertical interpolation */						\
++	a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),	\
++					    xmm_wt),				\
++			   _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),	\
++					    xmm_wb));				\
++	/* calculate horizontal weights */					\
++	xmm_wh = _mm_add_epi16 (xmm_addc,					\
++				_mm_xor_si128 (xmm_xorc,			\
++					       _mm_srli_epi16 (xmm_x, 8)));	\
++	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
++	/* horizontal interpolation */						\
++	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
++	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
++	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
++			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
++	/* shift and pack the result */						\
++	a = _mm_srli_epi32 (a, 16);						\
++	a = _mm_packs_epi32 (a, a);						\
++	a = _mm_packus_epi16 (a, a);						\
++	pix = _mm_cvtsi128_si32 (a);						\
++    } while (0)
++
++    while ((width -= 4) >= 0)
++    {
++	INTERPOLATE_ONE_PIXEL (pix1);
++	INTERPOLATE_ONE_PIXEL (pix2);
++	INTERPOLATE_ONE_PIXEL (pix3);
++	INTERPOLATE_ONE_PIXEL (pix4);
++	*out++ = pix1;
++	*out++ = pix2;
++	*out++ = pix3;
++	*out++ = pix4;
++    }
++    if (width & 2)
++    {
++	INTERPOLATE_ONE_PIXEL (pix1);
++	INTERPOLATE_ONE_PIXEL (pix2);
++	*out++ = pix1;
++	*out++ = pix2;
++    }
++    if (width & 1)
++    {
++	INTERPOLATE_ONE_PIXEL (pix1);
++	*out = pix1;
++    }
++
++    #undef INTERPOLATE_ONE_PIXEL
++}
++
++static force_inline void
++scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
++					     const uint32_t * mask,
++					     const uint32_t * src_top,
++					     const uint32_t * src_bottom,
++					     int32_t          w,
++					     int              wt,
++					     int              wb,
++					     pixman_fixed_t   vx,
++					     pixman_fixed_t   unit_x,
++					     pixman_fixed_t   max_vx,
++					     pixman_bool_t    zero_src)
++{
++    bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
++				    wt, wb, vx, unit_x, w);
++}
++
++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
++			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
++			       uint32_t, uint32_t, uint32_t,
++			       COVER, FALSE, FALSE)
++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
++			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
++			       uint32_t, uint32_t, uint32_t,
++			       PAD, FALSE, FALSE)
++FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
++			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
++			       uint32_t, uint32_t, uint32_t,
++			       NONE, FALSE, FALSE)
++
+ static const pixman_fast_path_t sse2_fast_paths[] =
+ {
+     /* PIXMAN_OP_OVER */
+@@ -5668,6 +5776,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
+     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
+     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
+ 
++    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
++    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
++    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
++
+     { PIXMAN_OP_NONE },
+ };
+ 
+-- 
+1.6.6.1
+