Dear congatec git server users, we are currently restructuring the ARM-NXP namespace. Some repositories are moved to a different location. The restructuring will be completed next Monday at the latest. We strongly recommend that you update the repositories after the restructuring. We apologize for the inconvenience.

Best regards, congatec ARM software team

Commit 6d792e90 authored by Koen Kooi

pixman 0.23.6: add initial version with NEON bilinear patches

The overlapped blit patches have been reduced to the generic C version.
Signed-off-by: Koen Kooi <koen@dominion.thruhere.net>
parent 8fcf92fe
From f7d1d45e30b59b513d48294de50dc86af60ea68c Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon, 26 Sep 2011 17:03:54 +0900
Subject: [PATCH 1/8] ARM: NEON: Standard fast path src_n_8_8888
Performance numbers of before/after on cortex-a8 @ 1GHz
- before
L1: 32.39 L2: 31.79 M: 30.84 ( 13.77%) HT: 21.58 VT: 19.75 R: 18.83 RT: 10.46 ( 106Kops/s)
- after
L1: 516.25 L2: 372.00 M:193.49 ( 85.59%) HT:136.93 VT:109.10 R:104.48 RT: 34.77 ( 253Kops/s)
---
pixman/pixman-arm-neon-asm.S | 73 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 7 ++++
2 files changed, 80 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 3fcd07d..1db02db 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1219,6 +1219,79 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+ /* expecting solid source in {d0, d1, d2, d3}, one register per source byte */
+ /* mask is in d24 (d25, d26, d27 are unused) */
+
+ /* in: start mask * source for 8 pixels; the _tail macro finishes the rounded divide by 255 */
+ vmull.u8 q8, d24, d0 /* q8..q11 = t = mask * source byte, widened to u16 */
+ vmull.u8 q9, d24, d1
+ vmull.u8 q10, d24, d2
+ vmull.u8 q11, d24, d3
+ vrsra.u16 q8, q8, #8 /* t += (t + 128) >> 8 (rounding shift right, accumulated) */
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail /* finish the products left in q8..q11 by the _head macro */
+ vrshrn.u16 d28, q8, #8 /* d28..d31 = (q8..q11 + 128) >> 8, narrowed to u8 */
+ vrshrn.u16 d29, q9, #8
+ vrshrn.u16 d30, q10, #8
+ vrshrn.u16 d31, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head /* _tail of the current block interleaved with _head of the next; PF lines are presumably prefetch bookkeeping (PF is defined elsewhere) */
+ fetch_mask_pixblock
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q8, #8 /* tail: d28..d31 = (q8..q11 + 128) >> 8, read before q8..q11 are overwritten below */
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q9, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q10, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q11, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q8, d24, d0 /* head: t = mask * source byte for the next block */
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q9, d24, d1
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q10, d24, d2
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q11, d24, d3
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]! /* interleave the four byte planes into 8 packed pixels and advance DST_W */
+ vrsra.u16 q8, q8, #8 /* head: t += (t + 128) >> 8 */
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init /* load the 32-bit solid source and splat each of its bytes into d0..d3 */
+ add DUMMY, sp, #ARGS_STACK_OFFSET /* DUMMY = sp + ARGS_STACK_OFFSET, the address the source value is read from */
+ vld1.32 {d3[0]}, [DUMMY] /* d3[0] = the 32-bit source value */
+ vdup.8 d0, d3[0] /* d0..d3 = source byte 0..3 replicated across all 8 lanes */
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3] /* done last: it overwrites the register that held the loaded value */
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup /* intentionally empty: this fast path emits no cleanup code */
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, /* presumably src, mask, dst bits per pixel (solid source = 0) */ \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8888_init, \
+ pixman_composite_src_n_8_8888_cleanup, \
+ pixman_composite_src_n_8_8888_process_pixblock_head, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail_head /* last argument: no trailing comma or continuation, so the invocation ends here */
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8_8888_process_pixblock_head
/* expecting deinterleaved source data in {d8, d9, d10, d11} */
/* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index effb50b..3db9adf 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -90,6 +90,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -289,6 +291,11 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8b8g8r8, neon_composite_src_rpixbuf_8888),
PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8r8g8b8, neon_composite_src_rpixbuf_8888),
PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8b8g8r8, neon_composite_src_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
+
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565),
--
1.6.6.1
From fc92ad56c5218157a097f6ed0c06196be9f74906 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon, 26 Sep 2011 18:33:27 +0900
Subject: [PATCH 2/8] ARM: NEON: Standard fast path src_n_8_8
Performance numbers of before/after on cortex-a8 @ 1GHz
- before
L1: 28.05 L2: 28.26 M: 26.97 ( 4.48%) HT: 19.79 VT: 19.14 R: 17.61 RT: 9.88 ( 101Kops/s)
- after
L1:1430.28 L2:1252.10 M:421.93 ( 75.48%) HT:170.16 VT:138.03 R:145.86 RT: 35.51 ( 255Kops/s)
---
pixman/pixman-arm-neon-asm.S | 66 ++++++++++++++++++++++++++++++++++++++++++
pixman/pixman-arm-neon.c | 3 ++
2 files changed, 69 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 1db02db..da8f054 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1292,6 +1292,72 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_src_n_8_8_process_pixblock_head /* start mask * d16 for the 32 mask bytes in d24..d27; the _tail macro finishes the rounded divide by 255 */
+ vmull.u8 q0, d24, d16 /* q0..q3 = t = mask * d16, widened to u16 (d16 is the splatted source byte set up by the _init macro) */
+ vmull.u8 q1, d25, d16
+ vmull.u8 q2, d26, d16
+ vmull.u8 q3, d27, d16
+ vrsra.u16 q0, q0, #8 /* t += (t + 128) >> 8 (rounding shift right, accumulated) */
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail /* finish the products left in q0..q3 by the _head macro */
+ vrshrn.u16 d28, q0, #8 /* d28..d31 = (q0..q3 + 128) >> 8, narrowed to u8 */
+ vrshrn.u16 d29, q1, #8
+ vrshrn.u16 d30, q2, #8
+ vrshrn.u16 d31, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head /* _tail of the current block interleaved with _head of the next; PF lines are presumably prefetch bookkeeping (PF is defined elsewhere) */
+ fetch_mask_pixblock
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q0, #8 /* tail: d28..d31 = (q0..q3 + 128) >> 8, read before q0..q3 are overwritten below */
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q1, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q2, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q3, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q0, d24, d16 /* head: t = mask * d16 for the next block */
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q1, d25, d16
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q2, d26, d16
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q3, d27, d16
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]! /* store the 32 finished bytes and advance DST_W */
+ vrsra.u16 q0, q0, #8 /* head: t += (t + 128) >> 8 */
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init /* load the 32-bit solid source and keep only its byte 3, splatted across d16 */
+ add DUMMY, sp, #ARGS_STACK_OFFSET /* DUMMY = sp + ARGS_STACK_OFFSET, the address the source value is read from */
+ vld1.32 {d16[0]}, [DUMMY] /* d16[0] = the 32-bit source value */
+ vdup.8 d16, d16[3] /* all 8 lanes = byte 3 of that value; NOTE(review): presumably the alpha byte - confirm */
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup /* intentionally empty: this fast path emits no cleanup code */
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, /* presumably src, mask, dst bits per pixel (solid source = 0) */ \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8_init, \
+ pixman_composite_src_n_8_8_cleanup, \
+ pixman_composite_src_n_8_8_process_pixblock_head, \
+ pixman_composite_src_n_8_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8_8888_process_pixblock_head
/* expecting deinterleaved source data in {d8, d9, d10, d11} */
/* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 3db9adf..ca139de 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -92,6 +92,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8,
+ uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -295,6 +297,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8, neon_composite_src_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
--
1.6.6.1
From 524d1cc7acb753167fffdd08d8c10bf71e0634ba Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Tue, 20 Sep 2011 21:32:35 +0900
Subject: [PATCH 4/8] ARM: NEON: Bilinear macro template for instruction scheduling
This macro template takes 6 code blocks.
1. process_last_pixel
2. process_two_pixels
3. process_four_pixels
4. process_pixblock_head
5. process_pixblock_tail
6. process_pixblock_tail_head
process_last_pixel does not need to update the horizontal weight; this
is done by the template. The two- and four-pixel code blocks must update
the horizontal weight themselves. The head/tail/tail_head blocks
make up the unrolled core loop. You can apply instruction scheduling
to the tail_head blocks.
You can also specify size of the pixel block. Supported size is 4
and 8. If you want to use mask, give BILINEAR_FLAG_USE_MASK flags
to the template, then you can use register MASK. When using d8~d15
registers, give BILINEAR_FLAG_USE_ALL_NEON_REGS to make sure
registers are properly saved on the stack and later restored.
---
pixman/pixman-arm-neon-asm-bilinear.S | 195 +++++++++++++++++++++++++++++++++
1 files changed, 195 insertions(+), 0 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index c5ba929..784e5df 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -773,3 +773,198 @@ generate_bilinear_scanline_func_src_a8_dst \
generate_bilinear_scanline_func_src_a8_dst \
pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
8888, 8888, add, 2, 28
+
+.set BILINEAR_FLAG_USE_MASK, 1 /* bit 0: generate the variant whose register layout includes MASK */
+.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2 /* bit 1: save d8-d15 on entry and restore them before returning */
+
+/*
+ * Main template macro for generating NEON optimized bilinear scanline functions.
+ *
+ * The bilinear scanline generator macro takes the following arguments:
+ * fname - name of the function to generate
+ * src_fmt - source color format (8888 or 0565)
+ * dst_fmt - destination color format (8888 or 0565)
+ * src/dst_bpp_shift - (1 << bpp_shift) is the size of src/dst pixel in bytes
+ * bilinear_process_last_pixel - code block that interpolates one pixel and
+ * does not update the horizontal weight
+ * bilinear_process_two_pixels - code block that interpolates two pixels and
+ * updates the horizontal weight
+ * bilinear_process_four_pixels - code block that interpolates four pixels and
+ * updates the horizontal weight (used for pixblock_size 8 only)
+ * bilinear_process_pixblock_head/_tail/_tail_head - head, tail and fused
+ * tail+head parts of the middle loop
+ * pixblock_size - number of pixels per middle loop iteration (4 or 8)
+ * prefetch_distance - prefetch in the source image by that many pixels ahead
+ * flags - BILINEAR_FLAG_USE_MASK and/or BILINEAR_FLAG_USE_ALL_NEON_REGS
+ */
+
+.macro generate_bilinear_scanline_func \
+ fname, \
+ src_fmt, dst_fmt, src_bpp_shift, dst_bpp_shift, \
+ bilinear_process_last_pixel, \
+ bilinear_process_two_pixels, \
+ bilinear_process_four_pixels, \
+ bilinear_process_pixblock_head, \
+ bilinear_process_pixblock_tail, \
+ bilinear_process_pixblock_tail_head, \
+ pixblock_size, \
+ prefetch_distance, \
+ flags
+
+pixman_asm_function fname
+.if pixblock_size == 8
+.elseif pixblock_size == 4
+.else
+ .error unsupported pixblock size
+.endif /* any pixblock_size other than 4 or 8 stops the build */
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+ OUT .req r0 /* r0-r3 are used as they arrive; nothing below loads them */
+ TOP .req r1
+ BOTTOM .req r2
+ WT .req r3
+ WB .req r4 /* r4, r5, r6 and ip are filled from the caller's stack by the ldmia below */
+ X .req r5
+ UX .req r6
+ WIDTH .req ip
+ TMP1 .req r3 /* shares r3 with WT; WT is copied into d28 before any code block runs */
+ TMP2 .req r4 /* shares r4 with WB; WB is copied into d29 before any code block runs */
+ PF_OFFS .req r7
+ TMP3 .req r8
+ TMP4 .req r9
+ STRIDE .req r2 /* shares r2 with BOTTOM */
+
+ mov ip, sp /* remember the entry sp: the stack-passed arguments are read from there */
+ push {r4, r5, r6, r7, r8, r9}
+ mov PF_OFFS, #prefetch_distance
+ ldmia ip, {WB, X, UX, WIDTH} /* WIDTH lives in ip, so this also replaces the base register */
+.else
+ OUT .req r0 /* r0-r3 are used as they arrive; nothing below loads them */
+ MASK .req r1
+ TOP .req r2
+ BOTTOM .req r3
+ WT .req r4 /* r4-r7 and ip are filled from the caller's stack by the ldmia below */
+ WB .req r5
+ X .req r6
+ UX .req r7
+ WIDTH .req ip
+ TMP1 .req r4 /* shares r4 with WT; WT is copied into d28 before any code block runs */
+ TMP2 .req r5 /* shares r5 with WB; WB is copied into d29 before any code block runs */
+ PF_OFFS .req r8
+ TMP3 .req r9
+ TMP4 .req r10
+ STRIDE .req r3 /* shares r3 with BOTTOM */
+
+ mov ip, sp /* remember the entry sp: the stack-passed arguments are read from there */
+ push {r4, r5, r6, r7, r8, r9, r10, ip} /* NOTE(review): ip is presumably pushed only to keep the register count even - confirm */
+ mov PF_OFFS, #prefetch_distance
+ ldmia ip, {WT, WB, X, UX, WIDTH} /* WIDTH lives in ip, so this also replaces the base register */
+.endif
+
+ mul PF_OFFS, PF_OFFS, UX /* PF_OFFS = prefetch_distance * UX */
+
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpush {d8-d15}
+.endif
+
+ sub STRIDE, BOTTOM, TOP /* STRIDE reuses BOTTOM's register: distance from the top row to the bottom row */
+ .unreq BOTTOM
+
+ cmp WIDTH, #0
+ ble 3f /* nothing to draw: go straight to the epilogue */
+
+ vdup.u16 q12, X /* every u16 lane = low 16 bits of X */
+ vdup.u16 q13, UX /* every u16 lane = low 16 bits of UX */
+ vdup.u8 d28, WT /* every byte lane = low 8 bits of WT */
+ vdup.u8 d29, WB /* every byte lane = low 8 bits of WB */
+ vadd.u16 d25, d25, d26 /* upper half of q12 is now one UX step ahead of the lower half */
+
+ /* ensure good destination alignment: peel 1, then 2 (then 4) pixels while the OUT bits tested below are set */
+ cmp WIDTH, #1
+ blt 0f
+ tst OUT, #(1 << dst_bpp_shift)
+ beq 0f
+ vshr.u16 q15, q12, #8 /* q15 = q12 >> 8, the horizontal weight for the peeled pixel */
+ vadd.u16 q12, q12, q13 /* advance q12 by one pixel step */
+ bilinear_process_last_pixel
+ sub WIDTH, WIDTH, #1
+0:
+ vadd.u16 q13, q13, q13 /* q13 = 2 * UX: from here q12 advances two pixels at a time */
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+
+ cmp WIDTH, #2
+ blt 0f
+ tst OUT, #(1 << (dst_bpp_shift + 1))
+ beq 0f
+ bilinear_process_two_pixels
+ sub WIDTH, WIDTH, #2
+0:
+.if pixblock_size == 8
+ cmp WIDTH, #4
+ blt 0f
+ tst OUT, #(1 << (dst_bpp_shift + 2))
+ beq 0f
+ bilinear_process_four_pixels
+ sub WIDTH, WIDTH, #4
+0:
+.endif
+ subs WIDTH, WIDTH, #pixblock_size
+ blt 1f /* fewer than pixblock_size pixels left: skip the middle loop */
+ mov PF_OFFS, PF_OFFS, asr #(16 - src_bpp_shift) /* NOTE(review): presumably turns the 16.16 product into a byte offset - confirm */
+ bilinear_process_pixblock_head
+ subs WIDTH, WIDTH, #pixblock_size
+ blt 5f /* only one full block: finish it with the tail */
+0:
+ bilinear_process_pixblock_tail_head
+ subs WIDTH, WIDTH, #pixblock_size
+ bge 0b
+5:
+ bilinear_process_pixblock_tail
+1:
+.if pixblock_size == 8
+ tst WIDTH, #4 /* WIDTH is negative here, but its low bits still equal the leftover pixel count */
+ beq 2f
+ bilinear_process_four_pixels
+2:
+.endif
+ /* handle the remaining trailing pixels */
+ tst WIDTH, #2
+ beq 2f
+ bilinear_process_two_pixels
+2:
+ tst WIDTH, #1
+ beq 3f
+ bilinear_process_last_pixel
+3:
+.if ((flags) & BILINEAR_FLAG_USE_ALL_NEON_REGS) != 0
+ vpop {d8-d15}
+.endif
+
+.if ((flags) & BILINEAR_FLAG_USE_MASK) == 0
+ pop {r4, r5, r6, r7, r8, r9}
+.else
+ pop {r4, r5, r6, r7, r8, r9, r10, ip} /* mirrors the push in the mask variant above */
+.endif
+ bx lr
+
+ .unreq OUT /* BOTTOM is not listed: it was released right after STRIDE was computed */
+ .unreq TOP
+ .unreq WT
+ .unreq WB
+ .unreq X
+ .unreq UX
+ .unreq WIDTH
+ .unreq TMP1
+ .unreq TMP2
+ .unreq PF_OFFS
+ .unreq TMP3
+ .unreq TMP4
+ .unreq STRIDE
+.if ((flags) & BILINEAR_FLAG_USE_MASK) != 0
+ .unreq MASK
+.endif
+
+.endfunc
+
+.endm
--
1.6.6.1
From c8f7edaebd510ba120d74102a93ad4d202b0e806 Mon Sep 17 00:00:00 2001
From: Taekyun Kim <tkq.kim@samsung.com>
Date: Wed, 21 Sep 2011 15:52:13 +0900
Subject: [PATCH 6/8] ARM: NEON: Instruction scheduling of bilinear over_8888_8888
Instructions are reordered to eliminate pipeline stalls and get
better memory access.
Performance of before/after on cortex-a8 @ 1GHz
<< 2000 x 2000 with scale factor close to 1.x >>
before : 50.43 Mpix/s
after : 61.09 Mpix/s
---
pixman/pixman-arm-neon-asm-bilinear.S | 149 ++++++++++++++++++++++++++++++++-
1 files changed, 146 insertions(+), 3 deletions(-)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 25bcb24..76937e0 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -893,15 +893,158 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8888_process_pixblock_head /* start 4 pixels: vertical pass for pixels 0-3, horizontal pass for pixels 0-1 */
- bilinear_over_8888_8888_process_four_pixels
+ mov TMP1, X, asr #16 /* with the add below: TMP1 = TOP + (X >> 16) * 4, top-row address for pixel 0 */
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16 /* same for pixel 1 */
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+
+ vld1.32 {d22}, [TMP1], STRIDE /* two adjacent top-row pixels; TMP1 then moves on by STRIDE */
+ vld1.32 {d23}, [TMP1] /* the same two columns, STRIDE bytes further (the bottom row) */
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vmull.u8 q8, d22, d28 /* vertical pass, pixel 0: q8 = top * d28 + bottom * d29 */
+ vmlal.u8 q8, d23, d29
+
+ vld1.32 {d22}, [TMP2], STRIDE
+ vld1.32 {d23}, [TMP2]
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmull.u8 q9, d22, d28 /* vertical pass, pixel 1 */
+ vmlal.u8 q9, d23, d29
+
+ vld1.32 {d22}, [TMP3], STRIDE
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28 /* vertical pass, pixel 2 (finished by _tail or _tail_head) */
+ vmlal.u8 q10, d23, d29
+
+ vshll.u16 q0, d16, #8 /* horizontal pass, pixel 0: q0 = left * 256 - left * w + right * w, w = d30 */
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE /* q8 is free again now that q0 has consumed it */
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28 /* vertical pass, pixel 3 (finished by _tail or _tail_head) */
+ vmlal.u8 q11, d17, d29
+
+ vshll.u16 q1, d18, #8 /* horizontal pass, pixel 1, weight in d31 */
+ vmlsl.u16 q1, d18, d31
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8 /* horizontal weights for pixels 2-3 */
+ vadd.u16 q12, q12, q13
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail /* finish the 4 pixels started by _head and blend them over the destination at OUT */
+ vshll.u16 q2, d20, #8 /* horizontal pass, pixel 2, weight in d30 */
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8 /* horizontal pass, pixel 3, weight in d31 */
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vshrn.u32 d0, q0, #16 /* drop the 16 weight bits: q0 = pixels 0-1 as u16 channels */
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128] /* the 4 destination pixels being blended over */
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16 /* q2 = pixels 2-3 as u16 channels */
+ vshr.u16 q15, q12, #8 /* horizontal weights for the next pixel pair */
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0 /* q3 = the 4 interpolated source pixels as bytes */
+ vmovn.u16 d7, q2
+ vuzp.8 d6, d7 /* two vuzp.8 passes regroup the 16 bytes by byte position within the pixel */
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1] /* byte 3 of each source pixel, repeated; NOTE(review): presumably alpha - confirm */
+ vmvn.8 d4, d4 /* d4 = 255 - that byte */
+ vmull.u8 q11, d2, d4 /* t = destination byte * d4 */
+ vmull.u8 q2, d3, d4
+ vrshr.u16 q1, q11, #8 /* with vraddhn below: (t + ((t + 128) >> 8) + 128) >> 8 */
+ vrshr.u16 q10, q2, #8
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q10, q2
+ vqadd.u8 q3, q1, q3 /* saturating add of the scaled destination and the source */
+ vuzp.8 d6, d7 /* two more vuzp.8 passes restore the packed pixel order */
+ vuzp.8 d6, d7
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]! /* store 4 pixels and advance OUT */
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail_head /* _tail for the current 4 pixels interleaved with _head for the next 4 */
- bilinear_over_8888_8888_process_pixblock_tail
- bilinear_over_8888_8888_process_pixblock_head
+ vshll.u16 q2, d20, #8 /* tail: horizontal pass, pixel 2 */
+ mov TMP1, X, asr #16 /* head: source addresses for the next pixels 0 and 1 */
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vmlsl.u16 q2, d20, d30
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8 /* tail: horizontal pass, pixel 3 */
+ vld1.32 {d20}, [TMP1], STRIDE /* head: loads into d20/d21 here because d22/d23 are still read by the tail below */
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vld1.32 {d21}, [TMP1]
+ vmull.u8 q8, d20, d28 /* head: vertical pass, next pixel 0 */
+ vmlal.u8 q8, d21, d29
+ vshrn.u32 d0, q0, #16 /* tail: drop the 16 weight bits */
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128] /* tail: the 4 destination pixels being blended over */
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16
+ vshr.u16 q15, q12, #8 /* horizontal weights for the next pixels 0-1 */
+ vld1.32 {d22}, [TMP2], STRIDE
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0
+ vld1.32 {d23}, [TMP2]
+ vmull.u8 q9, d22, d28 /* head: vertical pass, next pixel 1 */
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmlal.u8 q9, d23, d29
+ vmovn.u16 d7, q2
+ vld1.32 {d22}, [TMP3], STRIDE
+ vuzp.8 d6, d7 /* tail: two vuzp.8 passes regroup the bytes by byte position within the pixel */
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1] /* tail: byte 3 of each source pixel, repeated; NOTE(review): presumably alpha - confirm */
+ vld1.32 {d23}, [TMP3]
+ vmvn.8 d4, d4
+ vmull.u8 q10, d22, d28 /* head: vertical pass, next pixel 2 */
+ vmlal.u8 q10, d23, d29
+ vmull.u8 q11, d2, d4 /* tail: t = destination byte * (255 - byte 3 of the source) */
+ vmull.u8 q2, d3, d4
+ vshll.u16 q0, d16, #8 /* head: horizontal pass, next pixel 0 */
+ vmlsl.u16 q0, d16, d30
+ vrshr.u16 q1, q11, #8
+ vmlal.u16 q0, d17, d30
+ vrshr.u16 q8, q2, #8 /* tail: q8 is the scratch register here because q10 already holds next-block data */
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q8, q2
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vqadd.u8 q3, q1, q3 /* tail: saturating add of the scaled destination and the source */
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28 /* head: vertical pass, next pixel 3 */
+ vmlal.u8 q11, d17, d29
+ vuzp.8 d6, d7 /* tail: two more vuzp.8 passes restore the packed pixel order */
+ vshll.u16 q1, d18, #8 /* head: horizontal pass, next pixel 1 */
+ vuzp.8 d6, d7
+ vmlsl.u16 q1, d18, d31
+ vadd.u16 q12, q12, q13
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8 /* horizontal weights for the next pixels 2-3 */
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]! /* tail: store 4 pixels and advance OUT */
.endm