aboutsummaryrefslogtreecommitdiff
path: root/recipes/xorg-lib/pixman-0.21.6/0009-SSE2-optimization-for-bilinear-scaled-src_8888_8888.patch
blob: b85f78169c6eb7ad9d6c4e5fbb39d60f03d35167 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
From 350029396d911941591149cc82b5e68a78ad6747 Mon Sep 17 00:00:00 2001
From: Siarhei Siamashka <siarhei.siamashka@nokia.com>
Date: Mon, 21 Feb 2011 20:18:02 +0200
Subject: [PATCH 09/40] SSE2 optimization for bilinear scaled 'src_8888_8888'

A primitive naive implementation of bilinear scaling using SSE2 intrinsics,
which only handles one pixel at a time. It is approximately 2x faster than
pixman general compositing path. Single pass processing without intermediate
temporary buffer contributes to ~15% and loop unrolling contributes to ~20%
of this speedup.

Benchmark on Intel Core i7 (x86-64):
 Using cairo-perf-trace:
  before: image        firefox-planet-gnome   12.566   12.610   0.23%    6/6
  after:  image        firefox-planet-gnome   10.961   11.013   0.19%    5/6

 Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
  before: op=1, src=20028888, dst=20028888, speed=70.48 MPix/s
  after:  op=1, src=20028888, dst=20028888, speed=165.38 MPix/s
---
 pixman/pixman-sse2.c |  112 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 112 insertions(+), 0 deletions(-)

diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 88287b4..696005f 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5567,6 +5567,114 @@ FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
 
+static void
+bilinear_interpolate_line_sse2 (uint32_t *       out,
+                                const uint32_t * top,
+                                const uint32_t * bottom,
+                                int              wt,
+                                int              wb,
+                                pixman_fixed_t   x,
+                                pixman_fixed_t   ux,
+                                int              width)
+{
+    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);
+    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);
+    const __m128i xmm_xorc = _mm_set_epi16 (0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff);
+    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);
+    const __m128i xmm_ux = _mm_set_epi16 (ux, ux, ux, ux, ux, ux, ux, ux);
+    const __m128i xmm_zero = _mm_setzero_si128 ();
+    __m128i xmm_x = _mm_set_epi16 (x, x, x, x, x, x, x, x);
+    uint32_t pix1, pix2, pix3, pix4;
+
+    #define INTERPOLATE_ONE_PIXEL(pix)						\
+    do {									\
+	__m128i xmm_wh, xmm_lo, xmm_hi, a;					\
+	/* fetch 2x2 pixel block into sse2 register */				\
+	uint32_t tl = top [pixman_fixed_to_int (x)];				\
+	uint32_t tr = top [pixman_fixed_to_int (x) + 1];			\
+	uint32_t bl = bottom [pixman_fixed_to_int (x)];				\
+	uint32_t br = bottom [pixman_fixed_to_int (x) + 1];			\
+	a = _mm_set_epi32 (tr, tl, br, bl);					\
+        x += ux;								\
+	/* vertical interpolation */						\
+	a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpackhi_epi8 (a, xmm_zero),	\
+					    xmm_wt),				\
+			   _mm_mullo_epi16 (_mm_unpacklo_epi8 (a, xmm_zero),	\
+					    xmm_wb));				\
+	/* calculate horizontal weights */					\
+	xmm_wh = _mm_add_epi16 (xmm_addc,					\
+				_mm_xor_si128 (xmm_xorc,			\
+					       _mm_srli_epi16 (xmm_x, 8)));	\
+	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
+	/* horizontal interpolation */						\
+	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
+	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
+	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
+			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
+	/* shift and pack the result */						\
+	a = _mm_srli_epi32 (a, 16);						\
+	a = _mm_packs_epi32 (a, a);						\
+	a = _mm_packus_epi16 (a, a);						\
+	pix = _mm_cvtsi128_si32 (a);						\
+    } while (0)
+
+    while ((width -= 4) >= 0)
+    {
+	INTERPOLATE_ONE_PIXEL (pix1);
+	INTERPOLATE_ONE_PIXEL (pix2);
+	INTERPOLATE_ONE_PIXEL (pix3);
+	INTERPOLATE_ONE_PIXEL (pix4);
+	*out++ = pix1;
+	*out++ = pix2;
+	*out++ = pix3;
+	*out++ = pix4;
+    }
+    if (width & 2)
+    {
+	INTERPOLATE_ONE_PIXEL (pix1);
+	INTERPOLATE_ONE_PIXEL (pix2);
+	*out++ = pix1;
+	*out++ = pix2;
+    }
+    if (width & 1)
+    {
+	INTERPOLATE_ONE_PIXEL (pix1);
+	*out = pix1;
+    }
+
+    #undef INTERPOLATE_ONE_PIXEL
+}
+
+static force_inline void
+scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
+					     const uint32_t * mask,
+					     const uint32_t * src_top,
+					     const uint32_t * src_bottom,
+					     int32_t          w,
+					     int              wt,
+					     int              wb,
+					     pixman_fixed_t   vx,
+					     pixman_fixed_t   unit_x,
+					     pixman_fixed_t   max_vx,
+					     pixman_bool_t    zero_src)
+{
+    bilinear_interpolate_line_sse2 (dst, src_top, src_bottom,
+				    wt, wb, vx, unit_x, w);
+}
+
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       COVER, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       PAD, FALSE, FALSE)
+FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
+			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
+			       uint32_t, uint32_t, uint32_t,
+			       NONE, FALSE, FALSE)
+
 static const pixman_fast_path_t sse2_fast_paths[] =
 {
     /* PIXMAN_OP_OVER */
@@ -5668,6 +5776,10 @@ static const pixman_fast_path_t sse2_fast_paths[] =
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
 
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
+    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
+
     { PIXMAN_OP_NONE },
 };
 
-- 
1.6.6.1