Skip to content
Snippets Groups Projects
Commit b2732115 authored by Hubert Mazur, committed by Martin Storsjö
Browse files

lavc/aarch64: Add neon implementation for pix_median_abs8


Provide an optimized implementation of the pix_median_abs8 function.

Performance comparison tests are shown below.
- median_sad_1_c: 277.0
- median_sad_1_neon: 82.0

Benchmarks and tests were run with the checkasm tool on AWS Graviton 3.

Signed-off-by: Hubert Mazur <hum@semihalf.com>
Signed-off-by: Martin Storsjö <martin@martin.st>
parent e9a61702
No related branches found
No related tags found
No related merge requests found
......@@ -57,6 +57,8 @@ int nsse16_neon_wrapper(MpegEncContext *c, const uint8_t *s1, const uint8_t *s2,
ptrdiff_t stride, int h);
/* NEON routine installed as c->median_sad[0] by ff_me_cmp_init_aarch64(). */
int pix_median_abs16_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
/* NEON routine installed as c->median_sad[1] by ff_me_cmp_init_aarch64().
 * The assembly implementation does not read the context argument. */
int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1, const uint8_t *pix2,
ptrdiff_t stride, int h);
av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
{
......@@ -85,6 +87,7 @@ av_cold void ff_me_cmp_init_aarch64(MECmpContext *c, AVCodecContext *avctx)
c->nsse[0] = nsse16_neon_wrapper;
c->median_sad[0] = pix_median_abs16_neon;
c->median_sad[1] = pix_median_abs8_neon;
}
}
......
......@@ -1089,3 +1089,65 @@ function vsad_intra8_neon, export=1
ret
endfunc
function pix_median_abs8_neon, export=1
        // int pix_median_abs8_neon(MpegEncContext *v, const uint8_t *pix1,
        //                          const uint8_t *pix2, ptrdiff_t stride, int h)
        //
        // x0           unused
        // x1           uint8_t *pix1
        // x2           uint8_t *pix2
        // x3           ptrdiff_t stride
        // w4           int h
        //
        // Returns in w0 the sum of absolute prediction residuals over an
        // 8-pixel wide block, where V(x) = pix1[x] - pix2[x]:
        //   first row:   |V(0)| + sum over k = 0..6 of |V(k+1) - V(k)|
        //   other rows:  |V(0) - Vprev(0)| + sum over k = 0..6 of
        //                |V(k+1) - median(Vprev(k+1), V(k),
        //                                 Vprev(k+1) + V(k) - Vprev(k))|
        //
        // Register roles:
        //   v28        V of the previous row, lanes 0..7
        //   v26        V of the previous row rotated by one element
        //              (lane k holds V(k+1); lane 7 wraps around to V(0))
        //   v30, v24   the same two vectors for the current row
        //   v31        per-lane accumulator of the horizontal/median terms;
        //              lane 7 only ever sees wrapped-around data and is
        //              cleared before the final reduction
        //   v0         accumulator of |V - Vprev|; only lane 0 is used
        //   h18        |V(0)| of the first row
        //   v17, v19, v20, v22  scratch
        // Only caller-saved vector registers are written.

        ld1             {v2.8b}, [x1], x3
        ld1             {v3.8b}, [x2], x3
        movi            v31.8h, #0

        ext             v0.8b, v2.8b, v2.8b, #1             // pix1 rotated by one byte
        ext             v1.8b, v3.8b, v3.8b, #1             // pix2 rotated by one byte

        usubl           v28.8h, v2.8b, v3.8b                // V(k)
        usubl           v26.8h, v0.8b, v1.8b                // V(k+1)

        sub             w4, w4, #1                          // we need to make h-1 iterations
        saba            v31.8h, v26.8h, v28.8h              // += |V(k+1) - V(k)|

        mov             h18, v28.h[0]
        cmp             w4, #1
        sqabs           h18, h18                            // |V(0)| of the first row
        movi            v0.8h, #0

        b.lt            2f                                  // no further rows to process

1:
        ld1             {v6.8b}, [x1], x3                   // pix1, current row
        ld1             {v7.8b}, [x2], x3                   // pix2, current row
        subs            w4, w4, #1                          // flags are consumed by b.ne below
        ext             v4.8b, v6.8b, v6.8b, #1             // pix1 rotated by one byte
        ext             v5.8b, v7.8b, v7.8b, #1             // pix2 rotated by one byte

        usubl           v30.8h, v6.8b, v7.8b                // V(k), current row
        usubl           v24.8h, v4.8b, v5.8b                // V(k+1), current row

        // Median of a = Vprev(k+1), b = V(k), c = a + b - Vprev(k),
        // computed as max(min(a, b), min(max(a, b), c)).
        saba            v0.8h, v30.8h, v28.8h               // += |V(k) - Vprev(k)|
        add             v22.8h, v26.8h, v30.8h              // a + b
        smin            v20.8h, v26.8h, v30.8h              // min(a, b)
        smax            v19.8h, v26.8h, v30.8h              // max(a, b)
        sub             v22.8h, v22.8h, v28.8h              // c
        smin            v17.8h, v19.8h, v22.8h              // min(max(a, b), c)
        mov             v28.16b, v30.16b                    // current row becomes previous row
        smax            v20.8h, v20.8h, v17.8h              // median(a, b, c)
        saba            v31.8h, v24.8h, v20.8h              // += |V(k+1) - median|
        mov             v26.16b, v24.16b                    // rotated current row becomes previous

        b.ne            1b

2:
        mov             h17, v0.h[0]                        // vertical term, column 0 only
        ins             v31.h[7], wzr                       // drop the wrapped-around lane
        add             d18, d18, d17
        uaddlv          s17, v31.8h                         // sum the remaining lanes
        add             d18, d18, d17
        fmov            w0, s18

        ret

endfunc
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment