Qt 4.8
qdrawhelper_neon.cpp
Go to the documentation of this file.
1 /****************************************************************************
2 **
3 ** Copyright (C) 2014 Digia Plc and/or its subsidiary(-ies).
4 ** Contact: http://www.qt-project.org/legal
5 **
6 ** This file is part of the QtGui module of the Qt Toolkit.
7 **
8 ** $QT_BEGIN_LICENSE:LGPL$
9 ** Commercial License Usage
10 ** Licensees holding valid commercial Qt licenses may use this file in
11 ** accordance with the commercial license agreement provided with the
12 ** Software or, alternatively, in accordance with the terms contained in
13 ** a written agreement between you and Digia. For licensing terms and
14 ** conditions see http://qt.digia.com/licensing. For further information
15 ** use the contact form at http://qt.digia.com/contact-us.
16 **
17 ** GNU Lesser General Public License Usage
18 ** Alternatively, this file may be used under the terms of the GNU Lesser
19 ** General Public License version 2.1 as published by the Free Software
20 ** Foundation and appearing in the file LICENSE.LGPL included in the
21 ** packaging of this file. Please review the following information to
22 ** ensure the GNU Lesser General Public License version 2.1 requirements
23 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
24 **
25 ** In addition, as a special exception, Digia gives you certain additional
26 ** rights. These rights are described in the Digia Qt LGPL Exception
27 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
28 **
29 ** GNU General Public License Usage
30 ** Alternatively, this file may be used under the terms of the GNU
31 ** General Public License version 3.0 as published by the Free Software
32 ** Foundation and appearing in the file LICENSE.GPL included in the
33 ** packaging of this file. Please review the following information to
34 ** ensure the GNU General Public License version 3.0 requirements will be
35 ** met: http://www.gnu.org/copyleft/gpl.html.
36 **
37 **
38 ** $QT_END_LICENSE$
39 **
40 ****************************************************************************/
41 
42 #include <private/qdrawhelper_p.h>
43 #include <private/qblendfunctions_p.h>
44 #include <private/qmath_p.h>
45 
46 #ifdef QT_HAVE_NEON
47 
48 #include <private/qdrawhelper_neon_p.h>
49 #include <private/qpaintengine_raster_p.h>
50 #include <arm_neon.h>
51 
53 
54 void qt_memfill32_neon(quint32 *dest, quint32 value, int count)
55 {
56  const int epilogueSize = count % 16;
57  if (count >= 16) {
58  quint32 *const neonEnd = dest + count - epilogueSize;
59  register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value);
60  register uint32x4_t valueVector2 asm ("q1") = valueVector1;
61  while (dest != neonEnd) {
62  asm volatile (
63  "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
64  "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t"
65  : [DST]"+r" (dest)
66  : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
67  : "memory"
68  );
69  }
70  }
71 
72  switch (epilogueSize)
73  {
74  case 15: *dest++ = value;
75  case 14: *dest++ = value;
76  case 13: *dest++ = value;
77  case 12: *dest++ = value;
78  case 11: *dest++ = value;
79  case 10: *dest++ = value;
80  case 9: *dest++ = value;
81  case 8: *dest++ = value;
82  case 7: *dest++ = value;
83  case 6: *dest++ = value;
84  case 5: *dest++ = value;
85  case 4: *dest++ = value;
86  case 3: *dest++ = value;
87  case 2: *dest++ = value;
88  case 1: *dest++ = value;
89  }
90 }
91 
92 static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
93 {
94  // result = (x + (x >> 8) + 0x80) >> 8
95 
96  const uint16x8_t temp = vshrq_n_u16(x, 8); // x >> 8
97  const uint16x8_t sum_part = vaddq_u16(x, half); // x + 0x80
98  const uint16x8_t sum = vaddq_u16(temp, sum_part);
99 
100  return vshrq_n_u16(sum, 8);
101 }
102 
103 static inline uint16x8_t qvbyte_mul_u16(uint16x8_t x, uint16x8_t alpha, uint16x8_t half)
104 {
105  // t = qRound(x * alpha / 255.0)
106 
107  const uint16x8_t t = vmulq_u16(x, alpha); // t
108  return qvdiv_255_u16(t, half);
109 }
110 
111 static inline uint16x8_t qvinterpolate_pixel_255(uint16x8_t x, uint16x8_t a, uint16x8_t y, uint16x8_t b, uint16x8_t half)
112 {
113  // t = x * a + y * b
114 
115  const uint16x8_t ta = vmulq_u16(x, a);
116  const uint16x8_t tb = vmulq_u16(y, b);
117 
118  return qvdiv_255_u16(vaddq_u16(ta, tb), half);
119 }
120 
121 static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, uint16x8_t half, uint16x8_t full)
122 {
123  const uint16x4_t alpha16_high = vdup_lane_u16(vget_high_u16(src16), 3);
124  const uint16x4_t alpha16_low = vdup_lane_u16(vget_low_u16(src16), 3);
125 
126  const uint16x8_t alpha16 = vsubq_u16(full, vcombine_u16(alpha16_low, alpha16_high));
127 
128  return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
129 }
130 
// Prototypes of routines with C linkage that are defined outside this file
// (the _asm_neon suffix suggests hand-written NEON assembly -- TODO confirm).
// Every call site in this file passes the stride arguments in pixels, not
// bytes: byte strides are divided by the pixel size before the call.
131 extern "C" void
132 pixman_composite_over_8888_0565_asm_neon (int32_t w,
133  int32_t h,
134  uint16_t *dst,
135  int32_t dst_stride,
136  uint32_t *src,
137  int32_t src_stride);
138 
139 extern "C" void
140 pixman_composite_over_8888_8888_asm_neon (int32_t w,
141  int32_t h,
142  uint32_t *dst,
143  int32_t dst_stride,
144  uint32_t *src,
145  int32_t src_stride);
146 
147 extern "C" void
148 pixman_composite_src_0565_8888_asm_neon (int32_t w,
149  int32_t h,
150  uint32_t *dst,
151  int32_t dst_stride,
152  uint16_t *src,
153  int32_t src_stride);
154 
// Solid colour `src` composited through an 8-bit `mask`; the call site in
// this file passes 0 for `unused`.
155 extern "C" void
156 pixman_composite_over_n_8_0565_asm_neon (int32_t w,
157  int32_t h,
158  uint16_t *dst,
159  int32_t dst_stride,
160  uint32_t src,
161  int32_t unused,
162  uint8_t *mask,
163  int32_t mask_stride);
164 
// Single-scanline variant: takes only a pixel count and two pointers.
// NOTE(review): `dst` is declared const here although it is passed the
// destination buffer by the caller in this file -- confirm against the
// assembly implementation.
165 extern "C" void
166 pixman_composite_scanline_over_asm_neon (int32_t w,
167  const uint32_t *dst,
168  const uint32_t *src);
169 
170 extern "C" void
171 pixman_composite_src_0565_0565_asm_neon (int32_t w,
172  int32_t h,
173  uint16_t *dst,
174  int32_t dst_stride,
175  uint16_t *src,
176  int32_t src_stride);
177 
178 // qblendfunctions.cpp
177 
178 // qblendfunctions.cpp
// Forward declaration only; no call to this function appears in the visible
// part of this file.
179 void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
180  const uchar *srcPixels, int sbpl,
181  int w, int h,
182  int const_alpha);
183 
184 void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
185  const uchar *srcPixels, int sbpl,
186  int w, int h,
187  int const_alpha)
188 {
189  dbpl /= 4;
190  sbpl /= 2;
191 
192  quint32 *dst = (quint32 *) destPixels;
193  quint16 *src = (quint16 *) srcPixels;
194 
195  if (const_alpha != 256) {
196  quint8 a = (255 * const_alpha) >> 8;
197  quint8 ia = 255 - a;
198 
199  while (h--) {
200  for (int x=0; x<w; ++x)
201  dst[x] = INTERPOLATE_PIXEL_255(qt_colorConvert(src[x], dst[x]), a, dst[x], ia);
202  dst += dbpl;
203  src += sbpl;
204  }
205  return;
206  }
207 
208  pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
209 }
210 
211 // Forward declaration; per the original note this lives in qblendfunctions.cpp.
// Used below as the fallback of qt_blend_rgb16_on_rgb16_neon().
212 void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl,
213  const uchar *src, int sbpl,
214  int w, int h,
215  int const_alpha);
216 
217 template <int N>
218 static inline void scanLineBlit16(quint16 *dst, quint16 *src, int dstride)
219 {
220  if (N >= 2) {
221  ((quint32 *)dst)[0] = ((quint32 *)src)[0];
222  __builtin_prefetch(dst + dstride, 1, 0);
223  }
224  for (int i = 1; i < N/2; ++i)
225  ((quint32 *)dst)[i] = ((quint32 *)src)[i];
226  if (N & 1)
227  dst[N-1] = src[N-1];
228 }
229 
230 template <int Width>
231 static inline void blockBlit16(quint16 *dst, quint16 *src, int dstride, int sstride, int h)
232 {
233  union {
234  quintptr address;
235  quint16 *pointer;
236  } u;
237 
238  u.pointer = dst;
239 
240  if (u.address & 2) {
241  while (h--) {
242  // align dst
243  dst[0] = src[0];
244  if (Width > 1)
245  scanLineBlit16<Width-1>(dst + 1, src + 1, dstride);
246  dst += dstride;
247  src += sstride;
248  }
249  } else {
250  while (h--) {
251  scanLineBlit16<Width>(dst, src, dstride);
252 
253  dst += dstride;
254  src += sstride;
255  }
256  }
257 }
258 
259 void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
260  const uchar *srcPixels, int sbpl,
261  int w, int h,
262  int const_alpha)
263 {
264  // testing show that the default memcpy is faster for widths 150 and up
265  if (const_alpha != 256 || w >= 150) {
266  qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
267  return;
268  }
269 
270  int dstride = dbpl / 2;
271  int sstride = sbpl / 2;
272 
273  quint16 *dst = (quint16 *) destPixels;
274  quint16 *src = (quint16 *) srcPixels;
275 
276  switch (w) {
277 #define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return;
278  BLOCKBLIT(1);
279  BLOCKBLIT(2);
280  BLOCKBLIT(3);
281  BLOCKBLIT(4);
282  BLOCKBLIT(5);
283  BLOCKBLIT(6);
284  BLOCKBLIT(7);
285  BLOCKBLIT(8);
286  BLOCKBLIT(9);
287  BLOCKBLIT(10);
288  BLOCKBLIT(11);
289  BLOCKBLIT(12);
290  BLOCKBLIT(13);
291  BLOCKBLIT(14);
292  BLOCKBLIT(15);
293 #undef BLOCKBLIT
294  default:
295  break;
296  }
297 
298  pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride);
299 }
300 
// Defined outside this file with C linkage. Callers in this file always hand
// it buffers holding at least eight pixels (short runs are padded into
// 8-element scratch arrays), so it presumably processes exactly eight pixels
// per call -- TODO confirm against the implementation.
301 extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);
302 
303 void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
304  const uchar *srcPixels, int sbpl,
305  int w, int h,
306  int const_alpha)
307 {
308  quint16 *dst = (quint16 *) destPixels;
309  quint32 *src = (quint32 *) srcPixels;
310 
311  if (const_alpha != 256) {
312  for (int y=0; y<h; ++y) {
313  int i = 0;
314  for (; i < w-7; i += 8)
315  blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha);
316 
317  if (i < w) {
318  int tail = w - i;
319 
320  quint16 dstBuffer[8];
321  quint32 srcBuffer[8];
322 
323  for (int j = 0; j < tail; ++j) {
324  dstBuffer[j] = dst[i + j];
325  srcBuffer[j] = src[i + j];
326  }
327 
328  blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha);
329 
330  for (int j = 0; j < tail; ++j)
331  dst[i + j] = dstBuffer[j];
332  }
333 
334  dst = (quint16 *)(((uchar *) dst) + dbpl);
335  src = (quint32 *)(((uchar *) src) + sbpl);
336  }
337  return;
338  }
339 
340  pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);
341 }
342 
343 void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha)
344 {
345  if (const_alpha == 255) {
346  pixman_composite_scanline_over_asm_neon(length, dest, src);
347  } else {
348  qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, (const_alpha * 256) / 255);
349  }
350 }
351 
352 void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
353  const uchar *srcPixels, int sbpl,
354  int w, int h,
355  int const_alpha)
356 {
357  const uint *src = (const uint *) srcPixels;
358  uint *dst = (uint *) destPixels;
359  uint16x8_t half = vdupq_n_u16(0x80);
360  uint16x8_t full = vdupq_n_u16(0xff);
361  if (const_alpha == 256) {
362  pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4);
363  } else if (const_alpha != 0) {
364  const_alpha = (const_alpha * 255) >> 8;
365  uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
366  for (int y = 0; y < h; ++y) {
367  int x = 0;
368  for (; x < w-3; x += 4) {
369  if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
370  uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
371  uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
372 
373  const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
374  const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
375 
376  const uint8x8_t src8_low = vget_low_u8(src8);
377  const uint8x8_t dst8_low = vget_low_u8(dst8);
378 
379  const uint8x8_t src8_high = vget_high_u8(src8);
380  const uint8x8_t dst8_high = vget_high_u8(dst8);
381 
382  const uint16x8_t src16_low = vmovl_u8(src8_low);
383  const uint16x8_t dst16_low = vmovl_u8(dst8_low);
384 
385  const uint16x8_t src16_high = vmovl_u8(src8_high);
386  const uint16x8_t dst16_high = vmovl_u8(dst8_high);
387 
388  const uint16x8_t srcalpha16_low = qvbyte_mul_u16(src16_low, const_alpha16, half);
389  const uint16x8_t srcalpha16_high = qvbyte_mul_u16(src16_high, const_alpha16, half);
390 
391  const uint16x8_t result16_low = qvsource_over_u16(srcalpha16_low, dst16_low, half, full);
392  const uint16x8_t result16_high = qvsource_over_u16(srcalpha16_high, dst16_high, half, full);
393 
394  const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
395  const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
396 
397  vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
398  }
399  }
400  for (; x<w; ++x) {
401  uint s = src[x];
402  if (s != 0) {
403  s = BYTE_MUL(s, const_alpha);
404  dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
405  }
406  }
407  dst = (quint32 *)(((uchar *) dst) + dbpl);
408  src = (const quint32 *)(((const uchar *) src) + sbpl);
409  }
410  }
411 }
412 
413 // Forward declaration; per the original note this lives in qblendfunctions.cpp.
// Used below by qt_blend_rgb32_on_rgb32_neon() for the const_alpha == 256 case.
414 void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
415  const uchar *srcPixels, int sbpl,
416  int w, int h,
417  int const_alpha);
418 
419 void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
420  const uchar *srcPixels, int sbpl,
421  int w, int h,
422  int const_alpha)
423 {
424  if (const_alpha != 256) {
425  if (const_alpha != 0) {
426  const uint *src = (const uint *) srcPixels;
427  uint *dst = (uint *) destPixels;
428  uint16x8_t half = vdupq_n_u16(0x80);
429  const_alpha = (const_alpha * 255) >> 8;
430  int one_minus_const_alpha = 255 - const_alpha;
431  uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
432  uint16x8_t one_minus_const_alpha16 = vdupq_n_u16(255 - const_alpha);
433  for (int y = 0; y < h; ++y) {
434  int x = 0;
435  for (; x < w-3; x += 4) {
436  uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
437  uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);
438 
439  const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
440  const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);
441 
442  const uint8x8_t src8_low = vget_low_u8(src8);
443  const uint8x8_t dst8_low = vget_low_u8(dst8);
444 
445  const uint8x8_t src8_high = vget_high_u8(src8);
446  const uint8x8_t dst8_high = vget_high_u8(dst8);
447 
448  const uint16x8_t src16_low = vmovl_u8(src8_low);
449  const uint16x8_t dst16_low = vmovl_u8(dst8_low);
450 
451  const uint16x8_t src16_high = vmovl_u8(src8_high);
452  const uint16x8_t dst16_high = vmovl_u8(dst8_high);
453 
454  const uint16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
455  const uint16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);
456 
457  const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
458  const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
459 
460  vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
461  }
462  for (; x<w; ++x) {
463  uint s = src[x];
464  s = BYTE_MUL(s, const_alpha);
465  dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
466  }
467  dst = (quint32 *)(((uchar *) dst) + dbpl);
468  src = (const quint32 *)(((const uchar *) src) + sbpl);
469  }
470  }
471  } else {
472  qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
473  }
474 }
475 
476 void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
477  int x, int y, quint32 color,
478  const uchar *bitmap,
479  int mapWidth, int mapHeight, int mapStride,
480  const QClipData *)
481 {
482  quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
483  const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
484 
485  uchar *mask = const_cast<uchar *>(bitmap);
486 
487  pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, color, 0, mask, mapStride);
488 }
489 
// Defined outside this file with C linkage. In this file it is only used as
// the blend callback of Blend_on_RGB16_SourceAndConstAlpha_Neon, which always
// passes 8-element buffers.
490 extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha);
491 
492 template <typename SRC, typename BlendFunc>
493 struct Blend_on_RGB16_SourceAndConstAlpha_Neon {
494  Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha)
495  : m_index(0)
496  , m_blender(blender)
497  , m_const_alpha(const_alpha)
498  {
499  }
500 
501  inline void write(quint16 *dst, quint32 src)
502  {
503  srcBuffer[m_index++] = src;
504 
505  if (m_index == 8) {
506  m_blender(dst - 7, srcBuffer, m_const_alpha);
507  m_index = 0;
508  }
509  }
510 
511  inline void flush(quint16 *dst)
512  {
513  if (m_index > 0) {
514  quint16 dstBuffer[8];
515  for (int i = 0; i < m_index; ++i)
516  dstBuffer[i] = dst[i - m_index];
517 
518  m_blender(dstBuffer, srcBuffer, m_const_alpha);
519 
520  for (int i = 0; i < m_index; ++i)
521  dst[i - m_index] = dstBuffer[i];
522 
523  m_index = 0;
524  }
525  }
526 
527  SRC srcBuffer[8];
528 
529  int m_index;
530  BlendFunc m_blender;
531  int m_const_alpha;
532 };
533 
534 template <typename SRC, typename BlendFunc>
535 Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>
536 Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha)
537 {
538  return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha);
539 }
540 
541 void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
542  const uchar *srcPixels, int sbpl, int sh,
543  const QRectF &targetRect,
544  const QRectF &sourceRect,
545  const QRect &clip,
546  int const_alpha)
547 {
548  if (const_alpha == 0)
549  return;
550 
551  qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, sh, targetRect, sourceRect, clip,
552  Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
553 }
554 
// Forward declaration of the non-NEON implementation (defined outside this
// file); qt_scale_image_rgb16_on_rgb16_neon() below delegates to it when
// const_alpha == 256.
555 void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
556  const uchar *srcPixels, int sbpl, int sh,
557  const QRectF &targetRect,
558  const QRectF &sourceRect,
559  const QRect &clip,
560  int const_alpha);
561 
562 void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
563  const uchar *srcPixels, int sbpl, int sh,
564  const QRectF &targetRect,
565  const QRectF &sourceRect,
566  const QRect &clip,
567  int const_alpha)
568 {
569  if (const_alpha == 0)
570  return;
571 
572  if (const_alpha == 256) {
573  qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, sh, targetRect, sourceRect, clip, const_alpha);
574  return;
575  }
576 
577  qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, sh, targetRect, sourceRect, clip,
578  Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
579 }
580 
// Declaration of the non-NEON implementation (defined outside this file);
// qt_transform_image_rgb16_on_rgb16_neon() below delegates to it when
// const_alpha == 256.
581 extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
582  const uchar *srcPixels, int sbpl,
583  const QRectF &targetRect,
584  const QRectF &sourceRect,
585  const QRect &clip,
586  const QTransform &targetRectTransform,
587  int const_alpha);
588 
589 void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
590  const uchar *srcPixels, int sbpl,
591  const QRectF &targetRect,
592  const QRectF &sourceRect,
593  const QRect &clip,
594  const QTransform &targetRectTransform,
595  int const_alpha)
596 {
597  if (const_alpha == 0)
598  return;
599 
600  if (const_alpha == 256) {
601  qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha);
602  return;
603  }
604 
605  qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
606  reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
607  Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
608 }
609 
610 void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
611  const uchar *srcPixels, int sbpl,
612  const QRectF &targetRect,
613  const QRectF &sourceRect,
614  const QRect &clip,
615  const QTransform &targetRectTransform,
616  int const_alpha)
617 {
618  if (const_alpha == 0)
619  return;
620 
621  qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
622  reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
623  Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
624 }
625 
// Converts eight r5g6b5 pixels read from `src` into eight 32-bit pixels
// written to `dst`; the alpha byte of every output pixel is set to 0xff.
// The asm always loads 16 bytes and stores 32 bytes, so both pointers must
// reference at least eight elements (callers pad short runs into scratch
// buffers). d0-d5 and r2 are used as scratch and are listed as clobbers.
626 static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src)
627 {
628  asm volatile (
    // Load the eight 16-bit source pixels into q0 (= d0, d1).
629  "vld1.16 { d0, d1 }, [%[SRC]]\n\t"
630 
631  /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
632  and put data into d4 - red, d3 - green, d2 - blue */
633  "vshrn.u16 d4, q0, #8\n\t"
634  "vshrn.u16 d3, q0, #3\n\t"
635  "vsli.u16 q0, q0, #5\n\t"
636  "vsri.u8 d4, d4, #5\n\t"
637  "vsri.u8 d3, d3, #6\n\t"
638  "vshrn.u16 d2, q0, #2\n\t"
639 
640  /* fill d5 - alpha with 0xff */
641  "mov r2, #255\n\t"
642  "vdup.8 d5, r2\n\t"
643 
    // Interleaving store: each output pixel gets one byte from d2, d3, d4 and
    // d5 in that order (blue, green, red, alpha in increasing address order).
644  "vst4.8 { d2, d3, d4, d5 }, [%[DST]]"
645  : : [DST]"r" (dst), [SRC]"r" (src)
646  : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
647  );
648 }
649 
650 uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length)
651 {
652  const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x;
653 
654  int i = 0;
655  for (; i < length - 7; i += 8)
656  convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);
657 
658  if (i < length) {
659  quint16 srcBuffer[8];
660  quint32 dstBuffer[8];
661 
662  int tail = length - i;
663  for (int j = 0; j < tail; ++j)
664  srcBuffer[j] = data[i + j];
665 
666  convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer);
667 
668  for (int j = 0; j < tail; ++j)
669  buffer[i + j] = dstBuffer[j];
670  }
671 
672  return buffer;
673 }
674 
// Converts eight 32-bit pixels read from `src` into eight r5g6b5 pixels
// written to `dst`. The asm always loads 32 bytes and stores 16 bytes, so
// both pointers must reference at least eight elements. The fourth byte of
// every source pixel (loaded into d3) is not used by the conversion.
675 static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src)
676 {
677  asm volatile (
    // De-interleaving load: d0..d3 receive bytes 0..3 of each of the 8 pixels.
678  "vld4.8 { d0, d1, d2, d3 }, [%[SRC]]\n\t"
679 
680  /* convert to r5g6b5 and store it into {d28, d29} */
    // Each channel is moved to the top byte of a 16-bit lane; the vsri
    // instructions then insert the d1 channel below the top 5 bits and the
    // d0 channel below the top 11 bits of q14.
681  "vshll.u8 q14, d2, #8\n\t"
682  "vshll.u8 q8, d1, #8\n\t"
683  "vshll.u8 q9, d0, #8\n\t"
684  "vsri.u16 q14, q8, #5\n\t"
685  "vsri.u16 q14, q9, #11\n\t"
686 
687  "vst1.16 { d28, d29 }, [%[DST]]"
688  : : [DST]"r" (dst), [SRC]"r" (src)
689  : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
690  );
691 }
692 
693 void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length)
694 {
695  quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x;
696 
697  int i = 0;
698  for (; i < length - 7; i += 8)
699  convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);
700 
701  if (i < length) {
702  quint32 srcBuffer[8];
703  quint16 dstBuffer[8];
704 
705  int tail = length - i;
706  for (int j = 0; j < tail; ++j)
707  srcBuffer[j] = buffer[i + j];
708 
709  convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer);
710 
711  for (int j = 0; j < tail; ++j)
712  data[i + j] = dstBuffer[j];
713  }
714 }
715 
716 void QT_FASTCALL comp_func_solid_SourceOver_neon(uint *destPixels, int length, uint color, uint const_alpha)
717 {
718  if ((const_alpha & qAlpha(color)) == 255) {
719  QT_MEMFILL_UINT(destPixels, length, color);
720  } else {
721  if (const_alpha != 255)
722  color = BYTE_MUL(color, const_alpha);
723 
724  const quint32 minusAlphaOfColor = qAlpha(~color);
725  int x = 0;
726 
727  uint32_t *dst = (uint32_t *) destPixels;
728  const uint32x4_t colorVector = vdupq_n_u32(color);
729  uint16x8_t half = vdupq_n_u16(0x80);
730  const uint16x8_t minusAlphaOfColorVector = vdupq_n_u16(minusAlphaOfColor);
731 
732  for (; x < length-3; x += 4) {
733  uint32x4_t dstVector = vld1q_u32(&dst[x]);
734 
735  const uint8x16_t dst8 = vreinterpretq_u8_u32(dstVector);
736 
737  const uint8x8_t dst8_low = vget_low_u8(dst8);
738  const uint8x8_t dst8_high = vget_high_u8(dst8);
739 
740  const uint16x8_t dst16_low = vmovl_u8(dst8_low);
741  const uint16x8_t dst16_high = vmovl_u8(dst8_high);
742 
743  const uint16x8_t result16_low = qvbyte_mul_u16(dst16_low, minusAlphaOfColorVector, half);
744  const uint16x8_t result16_high = qvbyte_mul_u16(dst16_high, minusAlphaOfColorVector, half);
745 
746  const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
747  const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));
748 
749  uint32x4_t blendedPixels = vcombine_u32(result32_low, result32_high);
750  uint32x4_t colorPlusBlendedPixels = vaddq_u32(colorVector, blendedPixels);
751  vst1q_u32(&dst[x], colorPlusBlendedPixels);
752  }
753 
754  for (;x < length; ++x)
755  destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
756  }
757 }
758 
759 void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uint const_alpha)
760 {
761  if (const_alpha == 255) {
762  uint *const end = dst + length;
763  uint *const neonEnd = end - 3;
764 
765  while (dst < neonEnd) {
766  asm volatile (
767  "vld2.8 { d0, d1 }, [%[SRC]] !\n\t"
768  "vld2.8 { d2, d3 }, [%[DST]]\n\t"
769  "vqadd.u8 q0, q0, q1\n\t"
770  "vst2.8 { d0, d1 }, [%[DST]] !\n\t"
771  : [DST]"+r" (dst), [SRC]"+r" (src)
772  :
773  : "memory", "d0", "d1", "d2", "d3", "q0", "q1"
774  );
775  }
776 
777  while (dst != end) {
778  *dst = comp_func_Plus_one_pixel(*dst, *src);
779  ++dst;
780  ++src;
781  }
782  } else {
783  int x = 0;
784  const int one_minus_const_alpha = 255 - const_alpha;
785  const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha);
786  const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha);
787 
788  const uint16x8_t half = vdupq_n_u16(0x80);
789  for (; x < length - 3; x += 4) {
790  const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
791  const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
792  uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]);
793  uint8x16_t result = vqaddq_u8(dst8, src8);
794 
795  uint16x8_t result_low = vmovl_u8(vget_low_u8(result));
796  uint16x8_t result_high = vmovl_u8(vget_high_u8(result));
797 
798  uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8));
799  uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8));
800 
801  result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half);
802  result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half);
803 
804  const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low));
805  const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high));
806  vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
807  }
808 
809  for (; x < length; ++x)
810  dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
811  }
812 }
813 
// Edge length, in pixels, of the tiles used by the two memrotate routines below.
814 static const int tileSize = 32;
815 
// Defined outside this file with C linkage. Both memrotate routines below
// call it with source and destination strides given in bytes (they pass
// stride-in-pixels * 2) and a row count as the last argument.
816 extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
817 
// Tiled rotation of a w x h image of 16-bit pixels: source pixel (x, y) is
// written to destination row (w - 1 - x), column y.
// sstride / dstride are passed in bytes and converted to pixels below.
818 void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride)
819 {
820  const ushort *src = (const ushort *)srcPixels;
821  ushort *dest = (ushort *)destPixels;
822 
823  sstride /= sizeof(ushort);
824  dstride /= sizeof(ushort);
825 
    // pack: number of 16-bit pixels that fit into one 32-bit store.
826  const int pack = sizeof(quint32) / sizeof(ushort);
    // unaligned: leading pixels per destination row that precede the first
    // 4-byte aligned address (capped at h); they are copied one by one.
827  const int unaligned =
828  qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
829  const int restX = w % tileSize;
830  const int restY = (h - unaligned) % tileSize;
    // unoptimizedY: trailing source rows that do not fill a whole 32-bit store.
831  const int unoptimizedY = restY % pack;
832  const int numTilesX = w / tileSize + (restX > 0);
833  const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
834 
835  for (int tx = 0; tx < numTilesX; ++tx) {
    // Source columns are walked from right to left.
836  const int startx = w - tx * tileSize - 1;
837  const int stopx = qMax(startx - tileSize, 0);
838 
839  if (unaligned) {
    // Copy the first `unaligned` source rows pixel by pixel.
840  for (int x = startx; x >= stopx; --x) {
841  ushort *d = dest + (w - x - 1) * dstride;
842  for (int y = 0; y < unaligned; ++y) {
843  *d++ = src[y * sstride + x];
844  }
845  }
846  }
847 
848  for (int ty = 0; ty < numTilesY; ++ty) {
849  const int starty = ty * tileSize + unaligned;
850  const int stopy = qMin(starty + tileSize, h - unoptimizedY);
851 
852  int x = startx;
853  // qt_rotate90_16_neon writes to eight rows, four pixels at a time
    // Eight source columns per call; the strides are converted back to bytes.
854  for (; x >= stopx + 7; x -= 8) {
855  ushort *d = dest + (w - x - 1) * dstride + starty;
856  const ushort *s = &src[starty * sstride + x - 7];
857  qt_rotate90_16_neon(d, s, sstride * 2, dstride * 2, stopy - starty);
858  }
859 
    // Remaining columns of the tile: combine `pack` vertically adjacent
    // source pixels into one 32-bit destination store.
860  for (; x >= stopx; --x) {
861  quint32 *d = reinterpret_cast<quint32*>(dest + (w - x - 1) * dstride + starty);
862  for (int y = starty; y < stopy; y += pack) {
863  quint32 c = src[y * sstride + x];
864  for (int i = 1; i < pack; ++i) {
865  const int shift = (sizeof(int) * 8 / pack * i);
866  const ushort color = src[(y + i) * sstride + x];
867  c |= color << shift;
868  }
869  *d++ = c;
870  }
871  }
872  }
873 
874  if (unoptimizedY) {
    // Copy the last `unoptimizedY` source rows pixel by pixel.
875  const int starty = h - unoptimizedY;
876  for (int x = startx; x >= stopx; --x) {
877  ushort *d = dest + (w - x - 1) * dstride + starty;
878  for (int y = starty; y < h; ++y) {
879  *d++ = src[y * sstride + x];
880  }
881  }
882  }
883  }
884 }
885 
// Defined outside this file with C linkage. NOTE(review): no call to this
// routine appears in the visible part of this file; qt_memrotate270_16_neon()
// below calls qt_rotate90_16_neon with negated strides instead.
886 extern "C" void qt_rotate270_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
887 
888 void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
889  int sstride,
890  uchar *destPixels, int dstride)
891 {
892  const ushort *src = (const ushort *)srcPixels;
893  ushort *dest = (ushort *)destPixels;
894 
895  sstride /= sizeof(ushort);
896  dstride /= sizeof(ushort);
897 
898  const int pack = sizeof(quint32) / sizeof(ushort);
899  const int unaligned =
900  qMin(uint((long(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
901  const int restX = w % tileSize;
902  const int restY = (h - unaligned) % tileSize;
903  const int unoptimizedY = restY % pack;
904  const int numTilesX = w / tileSize + (restX > 0);
905  const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);
906 
907  for (int tx = 0; tx < numTilesX; ++tx) {
908  const int startx = tx * tileSize;
909  const int stopx = qMin(startx + tileSize, w);
910 
911  if (unaligned) {
912  for (int x = startx; x < stopx; ++x) {
913  ushort *d = dest + x * dstride;
914  for (int y = h - 1; y >= h - unaligned; --y) {
915  *d++ = src[y * sstride + x];
916  }
917  }
918  }
919 
920  for (int ty = 0; ty < numTilesY; ++ty) {
921  const int starty = h - 1 - unaligned - ty * tileSize;
922  const int stopy = qMax(starty - tileSize, unoptimizedY);
923 
924  int x = startx;
925  // qt_rotate90_16_neon writes to eight rows, four pixels at a time
926  for (; x < stopx - 7; x += 8) {
927  ushort *d = dest + x * dstride + h - 1 - starty;
928  const ushort *s = &src[starty * sstride + x];
929  qt_rotate90_16_neon(d + 7 * dstride, s, -sstride * 2, -dstride * 2, starty - stopy);
930  }
931 
932  for (; x < stopx; ++x) {
933  quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
934  + h - 1 - starty);
935  for (int y = starty; y > stopy; y -= pack) {
936  quint32 c = src[y * sstride + x];
937  for (int i = 1; i < pack; ++i) {
938  const int shift = (sizeof(int) * 8 / pack * i);
939  const ushort color = src[(y - i) * sstride + x];
940  c |= color << shift;
941  }
942  *d++ = c;
943  }
944  }
945  }
946  if (unoptimizedY) {
947  const int starty = unoptimizedY - 1;
948  for (int x = startx; x < stopx; ++x) {
949  ushort *d = dest + x * dstride + h - 1 - starty;
950  for (int y = starty; y >= 0; --y) {
951  *d++ = src[y * sstride + x];
952  }
953  }
954  }
955  }
956 }
957 
958 class QSimdNeon
959 {
960 public:
961  typedef int32x4_t Int32x4;
962  typedef float32x4_t Float32x4;
963 
964  union Vect_buffer_i { Int32x4 v; int i[4]; };
965  union Vect_buffer_f { Float32x4 v; float f[4]; };
966 
967  static inline Float32x4 v_dup(float x) { return vdupq_n_f32(x); }
968  static inline Int32x4 v_dup(int x) { return vdupq_n_s32(x); }
969  static inline Int32x4 v_dup(uint x) { return vdupq_n_s32(x); }
970 
971  static inline Float32x4 v_add(Float32x4 a, Float32x4 b) { return vaddq_f32(a, b); }
972  static inline Int32x4 v_add(Int32x4 a, Int32x4 b) { return vaddq_s32(a, b); }
973 
974  static inline Float32x4 v_max(Float32x4 a, Float32x4 b) { return vmaxq_f32(a, b); }
975  static inline Float32x4 v_min(Float32x4 a, Float32x4 b) { return vminq_f32(a, b); }
976  static inline Int32x4 v_min_16(Int32x4 a, Int32x4 b) { return vminq_s32(a, b); }
977 
978  static inline Int32x4 v_and(Int32x4 a, Int32x4 b) { return vandq_s32(a, b); }
979 
980  static inline Float32x4 v_sub(Float32x4 a, Float32x4 b) { return vsubq_f32(a, b); }
981  static inline Int32x4 v_sub(Int32x4 a, Int32x4 b) { return vsubq_s32(a, b); }
982 
983  static inline Float32x4 v_mul(Float32x4 a, Float32x4 b) { return vmulq_f32(a, b); }
984 
985  static inline Float32x4 v_sqrt(Float32x4 x) { Float32x4 y = vrsqrteq_f32(x); y = vmulq_f32(y, vrsqrtsq_f32(x, vmulq_f32(y, y))); return vmulq_f32(x, y); }
986 
987  static inline Int32x4 v_toInt(Float32x4 x) { return vcvtq_s32_f32(x); }
988 
989  static inline Int32x4 v_greaterOrEqual(Float32x4 a, Float32x4 b) { return vreinterpretq_s32_u32(vcgeq_f32(a, b)); }
990 };
991 
992 const uint * QT_FASTCALL qt_fetch_radial_gradient_neon(uint *buffer, const Operator *op, const QSpanData *data,
993  int y, int x, int length)
994 {
995  return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdNeon> >(buffer, op, data, y, x, length);
996 }
997 
999 
1000 #endif // QT_HAVE_NEON
1001 
double d
Definition: qnumeric_p.h:62
void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, int sh, const QRectF &targetRect, const QRectF &sourceRect, const QRect &clip, int const_alpha)
static ShiftResult shift(const QBezier *orig, QBezier *shifted, qreal offset, qreal threshold)
Definition: qbezier.cpp:289
void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, int w, int h, int const_alpha)
QIntegerForSizeof< void * >::Unsigned quintptr
Definition: qglobal.h:986
unsigned char c[8]
Definition: qnumeric_p.h:62
Q_DECL_CONSTEXPR const T & qMin(const T &a, const T &b)
Definition: qglobal.h:1215
#define QT_END_NAMESPACE
This macro expands to "}" if QT_NAMESPACE is defined and to nothing otherwise.
Definition: qglobal.h:90
void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl, const uchar *src, int sbpl, int w, int h, int const_alpha)
Q_GUI_EXPORT_INLINE int qAlpha(QRgb rgb)
Definition: qrgb.h:66
int comp_func_Plus_one_pixel_const_alpha(uint d, const uint s, const uint const_alpha, const uint one_minus_const_alpha)
quint16 u
long ASN1_INTEGER_get ASN1_INTEGER * a
unsigned char quint8
Definition: qglobal.h:934
int bytesPerLine() const
#define QT_FASTCALL
Definition: qglobal.h:1161
Q_DECL_CONSTEXPR const T & qMax(const T &a, const T &b)
Definition: qglobal.h:1217
int qAlpha(QRgb rgba)
Returns the alpha component of the ARGB quadruplet rgba.
Definition: qrgb.h:66
#define QT_MEMFILL_UINT(dest, length, color)
unsigned char uchar
Definition: qglobal.h:994
Q_STATIC_INLINE_FUNCTION uint INTERPOLATE_PIXEL_255(uint x, uint a, uint y, uint b)
Q_STATIC_INLINE_FUNCTION uint BYTE_MUL(uint x, uint a)
#define QT_BEGIN_NAMESPACE
This macro expands to "namespace QT_NAMESPACE {" if QT_NAMESPACE is defined and to nothing otherwise.
Definition: qglobal.h:89
void qt_transform_image(DestT *destPixels, int dbpl, const SrcT *srcPixels, int sbpl, const QRectF &targetRect, const QRectF &sourceRect, const QRect &clip, const QTransform &targetRectTransform, Blender blender)
The QRectF class defines a rectangle in the plane using floating point precision. ...
Definition: qrect.h:511
unsigned short quint16
Definition: qglobal.h:936
static const char * data(const QByteArray &arr)
unsigned int uint
Definition: qglobal.h:996
uchar * scanLine(int y)
void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, const QRectF &targetRect, const QRectF &sourceRect, const QRect &clip, const QTransform &targetRectTransform, int const_alpha)
unsigned short ushort
Definition: qglobal.h:995
unsigned int quint32
Definition: qglobal.h:938
void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, int w, int h, int const_alpha)
The QRect class defines a rectangle in the plane using integer precision.
Definition: qrect.h:58
int comp_func_Plus_one_pixel(uint d, const uint s)
Q_CORE_EXPORT QTextStream & flush(QTextStream &s)
static const KeyPair *const end
static const int tileSize
Definition: qmemrotate.cpp:47
The QTransform class specifies 2D transformations of a coordinate system.
Definition: qtransform.h:65
DST qt_colorConvert(SRC color, DST dummy)