diff options
Diffstat (limited to 'drivers/webp/dsp/dec_neon.c')
-rw-r--r-- | drivers/webp/dsp/dec_neon.c | 160 |
1 files changed, 28 insertions, 132 deletions
diff --git a/drivers/webp/dsp/dec_neon.c b/drivers/webp/dsp/dec_neon.c index 9c3d8cc016..ec824b790b 100644 --- a/drivers/webp/dsp/dec_neon.c +++ b/drivers/webp/dsp/dec_neon.c @@ -1,10 +1,8 @@ // Copyright 2012 Google Inc. All Rights Reserved. // -// Use of this source code is governed by a BSD-style license -// that can be found in the COPYING file in the root of the source -// tree. An additional intellectual property rights grant can be found -// in the file PATENTS. All contributing project authors may -// be found in the AUTHORS file in the root of the source tree. +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ // ----------------------------------------------------------------------------- // // ARM NEON version of dsp functions and loop filtering. @@ -18,7 +16,11 @@ #include "../dec/vp8i.h" -#define QRegs "q0", "q1", "q2", "q3", \ +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" #define FLIP_SIGN_BIT2(a, b, s) \ @@ -77,7 +79,7 @@ "vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \ "vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n" -#define STORE8x2(c1, c2, p, stride) \ +#define STORE8x2(c1, c2, p,stride) \ "vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \ "vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \ "vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \ @@ -97,9 +99,9 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) { "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1 "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0 "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0 - "vld1.u8 {q12}, [%[p]] \n" // q1 + "vld1.u8 {q4}, [%[p]] \n" // q1 - DO_FILTER2(q1, q2, q3, q12, %[thresh]) + DO_FILTER2(q1, q2, q3, q4, %[thresh]) "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride @@ -118,18 +120,18 @@ static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) { "add r5, r4, %[stride] \n" // base2 = base1 + stride LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6) - LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6) - "vswp d3, d24 \n" // p1:q1 p0:q3 - "vswp d5, d26 \n" // q0:q2 q1:q4 - "vswp q2, q12 \n" // p1:q1 p0:q2 q0:q3 q1:q4 + LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6) + "vswp d3, d6 \n" // p1:q1 p0:q3 + "vswp d5, d8 \n" // q0:q2 q1:q4 + "vswp q2, q3 \n" // p1:q1 p0:q2 q0:q3 q1:q4 - DO_FILTER2(q1, q2, q12, q13, %[thresh]) + DO_FILTER2(q1, q2, q3, q4, %[thresh]) "sub %[p], %[p], #1 \n" // p - 1 - "vswp d5, d24 \n" + "vswp d5, d6 \n" STORE8x2(d4, d5, [%[p]], %[stride]) - STORE8x2(d24, d25, [%[p]], %[stride]) + STORE8x2(d6, d7, [%[p]], %[stride]) : [p] "+r"(p) : [stride] "r"(stride), [thresh] "r"(thresh) @@ -153,10 +155,7 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { } } -//----------------------------------------------------------------------------- -// Inverse transforms (Paragraph 14.4) - -static void TransformOne(const int16_t* in, uint8_t* dst) { +static void TransformOneNEON(const int16_t *in, uint8_t *dst) { const int kBPS = BPS; const int16_t constants[] = {20091, 17734, 0, 0}; /* kC1, kC2. Padded because vld1.16 loads 8 bytes @@ -305,129 +304,26 @@ static void TransformOne(const int16_t* in, uint8_t* dst) { ); } -static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { - TransformOne(in, dst); +static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) { + TransformOneNEON(in, dst); if (do_two) { - TransformOne(in + 16, dst + 4); + TransformOneNEON(in + 16, dst + 4); } } -static void TransformDC(const int16_t* in, uint8_t* dst) { - const int DC = (in[0] + 4) >> 3; - const int kBPS = BPS; - __asm__ volatile ( - "vdup.16 q1, %[DC] \n" - - "vld1.32 d0[0], [%[dst]], %[kBPS] \n" - "vld1.32 d1[0], [%[dst]], %[kBPS] \n" - "vld1.32 d0[1], [%[dst]], %[kBPS] \n" - "vld1.32 d1[1], [%[dst]], %[kBPS] \n" - - "sub %[dst], %[dst], %[kBPS], lsl #2 \n" - - // add DC and convert to s16. - "vaddw.u8 q2, q1, d0 \n" - "vaddw.u8 q3, q1, d1 \n" - // convert back to u8 with saturation - "vqmovun.s16 d0, q2 \n" - "vqmovun.s16 d1, q3 \n" - - "vst1.32 d0[0], [%[dst]], %[kBPS] \n" - "vst1.32 d1[0], [%[dst]], %[kBPS] \n" - "vst1.32 d0[1], [%[dst]], %[kBPS] \n" - "vst1.32 d1[1], [%[dst]] \n" - : [in] "+r"(in), [dst] "+r"(dst) /* modified registers */ - : [kBPS] "r"(kBPS), /* constants */ - [DC] "r"(DC) - : "memory", "q0", "q1", "q2", "q3" /* clobbered */ - ); -} - -static void TransformWHT(const int16_t* in, int16_t* out) { - const int kStep = 32; // The store is only incrementing the pointer as if we - // had stored a single byte. - __asm__ volatile ( - // part 1 - // load data into q0, q1 - "vld1.16 {q0, q1}, [%[in]] \n" - - "vaddl.s16 q2, d0, d3 \n" // a0 = in[0] + in[12] - "vaddl.s16 q3, d1, d2 \n" // a1 = in[4] + in[8] - "vsubl.s16 q10, d1, d2 \n" // a2 = in[4] - in[8] - "vsubl.s16 q11, d0, d3 \n" // a3 = in[0] - in[12] - - "vadd.s32 q0, q2, q3 \n" // tmp[0] = a0 + a1 - "vsub.s32 q2, q2, q3 \n" // tmp[8] = a0 - a1 - "vadd.s32 q1, q11, q10 \n" // tmp[4] = a3 + a2 - "vsub.s32 q3, q11, q10 \n" // tmp[12] = a3 - a2 - - // Transpose - // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14] - // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15] - "vswp d1, d4 \n" // vtrn.64 q0, q2 - "vswp d3, d6 \n" // vtrn.64 q1, q3 - "vtrn.32 q0, q1 \n" - "vtrn.32 q2, q3 \n" - - "vmov.s32 q10, #3 \n" // dc = 3 - "vadd.s32 q0, q0, q10 \n" // dc = tmp[0] + 3 - "vadd.s32 q12, q0, q3 \n" // a0 = dc + tmp[3] - "vadd.s32 q13, q1, q2 \n" // a1 = tmp[1] + tmp[2] - "vsub.s32 q8, q1, q2 \n" // a2 = tmp[1] - tmp[2] - "vsub.s32 q9, q0, q3 \n" // a3 = dc - tmp[3] - - "vadd.s32 q0, q12, q13 \n" - "vshrn.s32 d0, q0, #3 \n" // (a0 + a1) >> 3 - "vadd.s32 q1, q9, q8 \n" - "vshrn.s32 d1, q1, #3 \n" // (a3 + a2) >> 3 - "vsub.s32 q2, q12, q13 \n" - "vshrn.s32 d2, q2, #3 \n" // (a0 - a1) >> 3 - "vsub.s32 q3, q9, q8 \n" - "vshrn.s32 d3, q3, #3 \n" // (a3 - a2) >> 3 - - // set the results to output - "vst1.16 d0[0], [%[out]], %[kStep] \n" - "vst1.16 d1[0], [%[out]], %[kStep] \n" - "vst1.16 d2[0], [%[out]], %[kStep] \n" - "vst1.16 d3[0], [%[out]], %[kStep] \n" - "vst1.16 d0[1], [%[out]], %[kStep] \n" - "vst1.16 d1[1], [%[out]], %[kStep] \n" - "vst1.16 d2[1], [%[out]], %[kStep] \n" - "vst1.16 d3[1], [%[out]], %[kStep] \n" - "vst1.16 d0[2], [%[out]], %[kStep] \n" - "vst1.16 d1[2], [%[out]], %[kStep] \n" - "vst1.16 d2[2], [%[out]], %[kStep] \n" - "vst1.16 d3[2], [%[out]], %[kStep] \n" - "vst1.16 d0[3], [%[out]], %[kStep] \n" - "vst1.16 d1[3], [%[out]], %[kStep] \n" - "vst1.16 d2[3], [%[out]], %[kStep] \n" - "vst1.16 d3[3], [%[out]], %[kStep] \n" - - : [out] "+r"(out) // modified registers - : [in] "r"(in), [kStep] "r"(kStep) // constants - : "memory", "q0", "q1", "q2", "q3", - "q8", "q9", "q10", "q11", "q12", "q13" // clobbered - ); -} - -#endif // WEBP_USE_NEON - -//------------------------------------------------------------------------------ -// Entry point - extern void VP8DspInitNEON(void); void VP8DspInitNEON(void) { -#if defined(WEBP_USE_NEON) - VP8Transform = TransformTwo; - VP8TransformAC3 = TransformOne; // no special code here - VP8TransformDC = TransformDC; - VP8TransformWHT = TransformWHT; + VP8Transform = TransformTwoNEON; VP8SimpleVFilter16 = SimpleVFilter16NEON; VP8SimpleHFilter16 = SimpleHFilter16NEON; VP8SimpleVFilter16i = SimpleVFilter16iNEON; VP8SimpleHFilter16i = SimpleHFilter16iNEON; -#endif // WEBP_USE_NEON } +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif // WEBP_USE_NEON |