/*=========================================================================
* jdidct-armv7.s
*
* Copyright (c) 2010, Code Aurora Forum. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
* * Neither the name of Code Aurora Forum, Inc. nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*==========================================================================
*==========================================================================
* FUNCTION LIST
*--------------------------------------------------------------------------
* - idct_1x1_venum
* - idct_2x2_venum
* - idct_4x4_venum
* - idct_8x8_venum
*
*==========================================================================
*/
@==========================================================================
@ MACRO DEFINITION
@==========================================================================
.macro Transpose8x8
@==================================================================
@ In-place transpose of an 8 x 8 matrix of 16-bit elements.
@ Input: q8 to q15, one matrix row per register
@ Output: q8 to q15, one transposed row per register
@ Registers used: q8 to q15 only (no scratch registers)
@
@ The work is done in three stages. The four operations inside a
@ stage touch disjoint registers, so only the order of the stages
@ matters, not the order within a stage.
@==================================================================
@ Stage 1: swap 64-bit halves (high half of row n <-> low half of row n+4)
vswp d17, d24 @q8 high <-> q12 low
vswp d19, d26 @q9 high <-> q13 low
vswp d21, d28 @q10 high <-> q14 low
vswp d23, d30 @q11 high <-> q15 low
@ Stage 2: transpose at 32-bit granularity between rows n and n+2
vtrn.32 q8, q10
vtrn.32 q12, q14
vtrn.32 q9, q11
vtrn.32 q13, q15
@ Stage 3: transpose at 16-bit granularity between adjacent rows
vtrn.16 q8, q9
vtrn.16 q12, q13
vtrn.16 q10, q11
vtrn.16 q14, q15
.endm
.macro IDCT1D
@==================================================================
@ One dimensional 64 element inverse DCT (eight 8-point transforms,
@ one per 16-bit lane, computed in parallel).
@ Input: q8 to q15 loaded with data (vx0 .. vx7)
@ q0 loaded with the scalar constants, addressed by lane:
@ d0[0] = c4, d0[1] = a0, d0[2] = a1, d0[3] = a2,
@ d1[1] = ma2
@ Output: q8 to q15 (vy0 .. vy7)
@ Registers used: q0 (read only), q2 to q7 (scratch, clobbered),
@ q8 to q15 (data, overwritten in place)
@ Assumptions: 16 bit data, first elements in least significant
@ halfwords
@ All additions/subtractions saturate (vqadd/vqsub); all multiplies
@ are saturating, rounding, doubling high-half multiplies (vqrdmulh).
@ The instruction order matters: registers are reused as soon as
@ their previous value has been consumed.
@==================================================================
@1st stage
vqrdmulh.s16 q4, q15, d0[2] @q4 = a1*vx7
vqrdmulh.s16 q5, q9, d0[2] @q5 = a1*vx1
vqrdmulh.s16 q6, q13, d0[3] @q6 = a2*vx5
vqrdmulh.s16 q7, q11, d1[1] @q7 = ma2*vx3
vqrdmulh.s16 q2, q14, d0[1] @q2 = a0*vx6
vqrdmulh.s16 q3, q10, d0[1] @q3 = a0*vx2
vqadd.s16 q9, q4, q9 @q9 = t1 = a1*vx7 + vx1
vqsub.s16 q5, q5, q15 @q5 = t8 = a1*vx1 - vx7
vqadd.s16 q15, q6, q11 @q15 = t7 = a2*vx5 + vx3
vqadd.s16 q11, q7, q13 @q11 = t3 = ma2*vx3 + vx5
@2nd stage
vqadd.s16 q13, q8, q12 @q13 = t5 = vx0 + vx4
vqsub.s16 q8, q8, q12 @q8 = t0 = vx0 - vx4
vqadd.s16 q10, q2, q10 @q10 = t2 = a0*vx6 + vx2
vqsub.s16 q12, q3, q14 @q12 = t4 = a0*vx2 - vx6
vqadd.s16 q14, q5, q11 @q14 = t6 = t8 + t3
vqsub.s16 q11, q5, q11 @q11 = t3 = t8 - t3
vqsub.s16 q5, q9, q15 @q5 = t8 = t1 - t7
vqadd.s16 q9, q9, q15 @q9 = t1 = t1 + t7
@3rd stage
vqadd.s16 q15, q13, q10 @q15 = t7 = t5 + t2
vqsub.s16 q10, q13, q10 @q10 = t2 = t5 - t2
vqadd.s16 q13, q8, q12 @q13 = t5 = t0 + t4
vqsub.s16 q7, q8, q12 @q7 = t0 = t0 - t4 (kept in scratch q7)
vqsub.s16 q12, q5, q11 @q12 = t4 = t8 - t3
vqadd.s16 q11, q5, q11 @q11 = t3 = t8 + t3
@4th stage
vqadd.s16 q8, q15, q9 @q8 = vy0 = t7 + t1
vqsub.s16 q15, q15, q9 @q15 = vy7 = t7 - t1
vqrdmulh.s16 q6, q12, d0[0] @q6 = c4*t4
vqrdmulh.s16 q4, q11, d0[0] @q4 = c4*t3
vqsub.s16 q12, q10, q14 @q12 = vy4 = t2 - t6
vqadd.s16 q11, q10, q14 @q11 = vy3 = t2 + t6
vqadd.s16 q10, q7, q6 @q10 = vy2 = t0 + c4*t4
vqsub.s16 q14, q13, q4 @q14 = vy6 = t5 - c4*t3
vqadd.s16 q9, q13, q4 @q9 = vy1 = t5 + c4*t3
vqsub.s16 q13, q7, q6 @q13 = vy5 = t0 - c4*t4
.endm
.macro PART1
@==================================================================
@ Fetch the 8 x 8 block of 16-bit input coefficients and scale it.
@ In: r0 = input pointer; r7, r8, r9 = addresses of
@ constants[2], constants[3], constants[4]
@ Out: q8 .. q15 = row0 .. row7, each lane shifted left by 4
@ q2, q3, q4 = constants[2], constants[3], constants[4]
@ r0 advanced by 128 bytes (8 rows x 16 bytes)
@
@ Rows are fetched two at a time; each 4-register vld1 fills its
@ registers from consecutive memory, so the row-to-register mapping
@ is the same as with one load per row.
@ The shift saturates: only inputs within -2048 .. 2047 survive a
@ left shift by 4 in 16 bits without clipping.
@ NOTE(review): the original author's comments questioned whether
@ 4 is the right shift amount ("Shift 1 instead of 4") - confirm
@ before changing it.
@==================================================================
vld1.16 {d16, d17, d18, d19},[r0]! @q8 =row0, q9 =row1
vld1.16 {d20, d21, d22, d23},[r0]! @q10=row2, q11=row3
vqshl.s16 q8, q8, #4
vqshl.s16 q9, q9, #4
vld1.16 {d24, d25, d26, d27},[r0]! @q12=row4, q13=row5
vqshl.s16 q10, q10, #4
vqshl.s16 q11, q11, #4
vld1.16 {d28, d29, d30, d31},[r0]! @q14=row6, q15=row7
vqshl.s16 q12, q12, #4
vqshl.s16 q13, q13, #4
vqshl.s16 q14, q14, #4
vqshl.s16 q15, q15, #4
@==================================================================
@ q2, q3 and q4 double as IDCT1D scratch registers, so the prescale
@ vectors they hold must be (re)loaded before every block
@==================================================================
vld1.16 {d4, d5},[r7] @q2 =constants[2]
vld1.16 {d6, d7},[r8] @q3 =constants[3]
vld1.16 {d8, d9},[r9] @q4 =constants[4]
.endm
.macro PART2
@==================================================================
@ Second half of the 8x8 IDCT body: prescale, first 1-D pass,
@ transpose, bias, second 1-D pass, scale/clamp and store.
@ In: q8 .. q15 = input rows (already shifted and transposed)
@ q0 = constants[0], q1 .. q4 = constants[1] .. constants[4]
@ r1 = output pointer, r2 = output stride in bytes
@ Clobbers: r1 (post-incremented), r5, q2 .. q15
@==================================================================
@==================================================================
@ Prescale the input
@ Each row is multiplied lane-wise by its prescale vector
@ (rows 0/4 use q1, rows 1/7 use q2, rows 2/6 use q3, rows 3/5 use q4)
@==================================================================
vqrdmulh.s16 q12, q12, q1 @q12=row4 * constants[1] = vx4
vqrdmulh.s16 q15, q15, q2 @q15=row7 * constants[2] = vx7
vqrdmulh.s16 q9, q9, q2 @q9 =row1 * constants[2] = vx1
vqrdmulh.s16 q13, q13, q4 @q13=row5 * constants[4] = vx5
vqrdmulh.s16 q11, q11, q4 @q11=row3 * constants[4] = vx3
vqrdmulh.s16 q14, q14, q3 @q14=row6 * constants[3] = vx6
vqrdmulh.s16 q10, q10, q3 @q10=row2 * constants[3] = vx2
vqrdmulh.s16 q8, q8, q1 @q8 =row0 * constants[1] = vx0
@==================================================================
@ At this point, the input 8x8 x 16 bit coefficients are
@ transposed, prescaled, and loaded in q8 to q15
@ q0 loaded with scalar constants
@ Perform 1D IDCT
@==================================================================
IDCT1D @perform 1d idct
@==================================================================
@ Transpose the intermediate results to get ready for the vertical
@ transformation (same instruction sequence as the Transpose8x8
@ macro, with the bias load slotted in between)
@==================================================================
vswp d17, d24 @q8, q12
vswp d23, d30 @q11, q15
vswp d21, d28 @q10, q14
vswp d19, d26 @q9, q13
@==================================================================
@ Load the bias
@ d1[1] (as a 32-bit lane) is the last two halfwords of
@ constants[0]; vdup.32 replicates that pair across all of q4
@==================================================================
vdup.32 q4, d1[1] @a cycle is saved by loading
@the bias at this point
@==================================================================
@ Finish the transposition
@==================================================================
vtrn.32 q8, q10
vtrn.32 q9, q11
vtrn.32 q12, q14
vtrn.32 q13, q15
vtrn.16 q8, q9
vtrn.16 q10, q11
vtrn.16 q12, q13
vtrn.16 q14, q15
@==================================================================
@ Add bias (to q8 only; must happen before the second IDCT1D,
@ which uses q4 as scratch)
@==================================================================
vqadd.s16 q8, q8, q4
@==================================================================
@ IDCT 2nd half
@==================================================================
IDCT1D @perform 1d idct
@==================================================================
@ Scale and clamp the output to correct range and save to memory
@ 1. scale to 8bits by right shift 6
@ 2. clamp output to [0, 255] by min/max
@ 3. use multiple store. Each store will save one row of output
@ (8 x 16-bit samples), then advance r1 by the stride in r2.
@ The st queue size is 4, so do no more than 4 str in sequence.
@==================================================================
ldr r5, =constants+5*16 @constants[5],
vld1.16 d10, [r5] @load clamping parameters (d10[0]=0, d10[1]=255)
vdup.s16 q6, d10[0] @q6 = lower bound, 0 in every 16-bit lane
vdup.s16 q7, d10[1] @q7 = upper bound, 255 in every 16-bit lane
@Save the results
vshr.s16 q8, q8, #6 @q8 = vy0
vmax.s16 q8, q8, q6 @clamp >0
vmin.s16 q8, q8, q7 @clamp <255
vshr.s16 q9, q9, #6 @q9 = vy1
vmax.s16 q9, q9, q6 @clamp >0
vmin.s16 q9, q9, q7 @clamp <255
vshr.s16 q10, q10, #6 @q10 = vy2
vmax.s16 q10, q10, q6 @clamp >0
vmin.s16 q10, q10, q7 @clamp <255
vshr.s16 q11, q11, #6 @q11 = vy3
vmax.s16 q11, q11, q6 @clamp >0
vmin.s16 q11, q11, q7 @clamp <255
vst1.16 {d16, d17},[r1],r2 @q8 =row0
vst1.16 {d18, d19},[r1],r2 @q9 =row1
vst1.16 {d20, d21},[r1],r2 @q10=row2
vst1.16 {d22, d23},[r1],r2 @q11=row3
vshr.s16 q12, q12, #6 @q12 = vy4
vmax.s16 q12, q12, q6 @clamp >0
vmin.s16 q12, q12, q7 @clamp <255
vshr.s16 q13, q13, #6 @q13 = vy5
vmax.s16 q13, q13, q6 @clamp >0
vmin.s16 q13, q13, q7 @clamp <255
vshr.s16 q14, q14, #6 @q14 = vy6
vmax.s16 q14, q14, q6 @clamp >0
vmin.s16 q14, q14, q7 @clamp <255
vshr.s16 q15, q15, #6 @q15 = vy7
vmax.s16 q15, q15, q6 @clamp >0
vmin.s16 q15, q15, q7 @clamp <255
vst1.16 {d24, d25},[r1],r2 @q12=row4
vst1.16 {d26, d27},[r1],r2 @q13=row5
vst1.16 {d28, d29},[r1],r2 @q14=row6
vst1.16 {d30, d31},[r1] @q15=row7 (last row: no pointer update)
.endm
.macro BIG_BODY_TRANSPOSE_INPUT
@==================================================================
@ Main body of idct: processes one 8x8 block.
@ Expects IDCT_ENTRY to have run first (q0, q1 and r7-r9 set up).
@ The three steps must run in this order:
@ PART1 - load the input rows, shift them, reload q2-q4
@ Transpose8x8 - transpose the loaded block in q8-q15
@ PART2 - prescale, 2-D IDCT, clamp and store the result
@==================================================================
PART1
Transpose8x8
PART2
.endm
.macro IDCT_ENTRY
@==================================================================
@ One-time setup for the 8x8 IDCT.
@ Out: r5 .. r9 = addresses of constants[0] .. constants[4]
@ (each entry is 16 bytes, i.e. eight halfwords)
@ q0 = constants[0] (scalars), q1 = constants[1]
@ constants[2] .. constants[4] are not loaded here because their
@ registers are reused as scratch; PART1 loads them through r7-r9.
@==================================================================
ldr r5, =constants+0*16 @r5 -> constants[0]
vld1.16 {d0, d1},[r5] @q0 =constants[0] (scalars)
ldr r6, =constants+1*16 @r6 -> constants[1]
vld1.16 {d2, d3},[r6] @q1 =constants[1]
@==================================================================
@ Addresses kept for the per-block reload
@==================================================================
ldr r7, =constants+2*16 @r7 -> constants[2]
ldr r8, =constants+3*16 @r8 -> constants[3]
ldr r9, =constants+4*16 @r9 -> constants[4]
.endm
@==========================================================================
@ END of MACRO DEFINITION
@==========================================================================
@ NOTE(review): the .text directive right below switches the current
@ section again, so nothing is assembled into idct_func itself; the
@ functions that follow are placed in .text.
.section idct_func, "x" @ ARE
.text @ idct_func, CODE, READONLY
.align 2 @ 2^2 = 4-byte alignment for ARM instructions
.code 32 @ CODE32 (ARM, not Thumb, instruction set)
@==========================================================================
@ Main Routine
@ Exported entry points, one per supported block size
@==========================================================================
.global idct_1x1_venum
.global idct_2x2_venum
.global idct_4x4_venum
.global idct_8x8_venum
@==========================================================================
@ FUNCTION : idct_1x1_venum
@--------------------------------------------------------------------------
@ DESCRIPTION : ARM optimization of one 1x1 block iDCT
@--------------------------------------------------------------------------
@ C PROTOTYPE : void idct_1x1_venum(int16 * input,
@ int16 * output,
@ int32 stride)
@--------------------------------------------------------------------------
@ REG INPUT : R0 pointer to input (int16)
@ R1 pointer to output (int16)
@ R2 block stride
@--------------------------------------------------------------------------
@ STACK ARG : None
@--------------------------------------------------------------------------
@ MEM INPUT : None
@--------------------------------------------------------------------------
@ REG OUTPUT : None
@--------------------------------------------------------------------------
@ MEM OUTPUT : None
@--------------------------------------------------------------------------
@ REG AFFECTED : R2, R3 (R0 and R1 are only read)
@--------------------------------------------------------------------------
@ STACK USAGE : none
@--------------------------------------------------------------------------
@ CYCLES : 17 cycles
@--------------------------------------------------------------------------
@ NOTES :
@ This idct_1x1_venum code was developed with ARM instruction set.
@
@ ARM REGISTER ALLOCATION
@ =========================================================================
@ r0 : pointer to input data
@ r1 : pointer to output area
@ r2 : stride in the output buffer
@==========================================================================
.type idct_1x1_venum, %function
@ idct_1x1_venum: DC-only inverse DCT.
@ In: r0 = input (int16 *), r1 = output (int16 *); r2 (stride) is not
@ read, it is overwritten and used as a work register
@ Out: output[0] = clamp((input[0] + 1028) >> 3, 0, 255)
@ Clobbers: r2, r3, condition flags
idct_1x1_venum:
ldrsh r3, [r0] @ r3 = input[0], sign-extended to 32 bits
ldr r2, =1028 @ 1028 = (128 << 3) + 4
@ 128 is the level offset (pre-scaled by 8),
@ 4 rounds the divide by 8 to nearest
add r2, r3, r2 @ r2 = input[0] + bias
asrs r2, r2, #3 @ Divide by 8; the S suffix sets N if negative
movmi r2, #0 @ Clamp to be no less than 0
cmp r2, #255
movgt r2, #255 @ Clamp to be no greater than 255
strh r2, [r1] @ Store exactly one int16 sample. A 32-bit str
@ here would also overwrite output[1].
bx lr @ Return to caller
@ end of idct_1x1_venum
@==========================================================================
@ FUNCTION : idct_2x2_venum
@--------------------------------------------------------------------------
@ DESCRIPTION : VeNum optimization of one 2x2 block iDCT
@--------------------------------------------------------------------------
@ C PROTOTYPE : void idct_2x2_venum(int16 * input,
@ int16 * output,
@ int32 stride)
@--------------------------------------------------------------------------
@ REG INPUT : R0 pointer to input (int16)
@ R1 pointer to output (int16)
@ R2 block stride
@--------------------------------------------------------------------------
@ STACK ARG : None
@--------------------------------------------------------------------------
@ MEM INPUT : None
@--------------------------------------------------------------------------
@ REG OUTPUT : None
@--------------------------------------------------------------------------
@ MEM OUTPUT : None
@--------------------------------------------------------------------------
@ REG AFFECTED : R1 (R0 and R2 are only read); VeNum q0, q8 - q13
@--------------------------------------------------------------------------
@ STACK USAGE : none
@--------------------------------------------------------------------------
@ CYCLES : 27 cycles
@--------------------------------------------------------------------------
@ NOTES : Output buffer must be an 8x8 16-bit buffer
@
@ ARM REGISTER ALLOCATION
@ ==========================================
@ r0 : pointer to input data
@ r1 : pointer to output area
@ r2 : stride in the output buffer
@ -------------------------------------------
@
@ VENUM REGISTER ALLOCATION
@ =================================================
@ q0 : output x0 - x4
@ q1 : not used
@ q2 : not used
@ q3 : not used
@ q4 : not used
@ q5 : not used
@ q6 : not used
@ q7 : not used
@ q8 : input y0 - y4
@ q9 : intermediate value
@ q10 : intermediate value
@ q11 : offset value
@ q12 : lower clamp value (0)
@ q13 : upper clamp value (255)
@ q14 : not used
@ q15 : not used
@==========================================================================
.type idct_2x2_venum, %function
@ idct_2x2_venum: inverse DCT of the top-left 2x2 coefficients.
@ In: r0 = input (int16 *), r1 = output (int16 *), r2 = output stride in bytes
@ Out: two int16 samples written at r1 and two at r1 + r2, each in [0, 255]
@ Clobbers: r1, q0, q8 - q13
@ Lanes marked X hold don't-care values; they are computed along with the
@ valid lanes but never stored.
idct_2x2_venum:
vld4.32 {d16, d17, d18, d19}, [r0]
@ Reads 8 words and de-interleaves by 4, so d16
@ gets word 0 and word 4 of the input.
@ d16: y0 | y1 | y2 | y3 (LSB | MSB)
@ NOTE(review): y2 | y3 being the start of input
@ row 1 assumes a 16-byte input row pitch
vtrn.32 d16, d17 @ d16: y0 | y1 | X | X
@ d17: y2 | y3 | X | X
vqadd.s16 d18, d16, d17 @ d18: y0+y2 | y1+y3 | X | X q: saturated
vqsub.s16 d19, d16, d17 @ d19: y0-y2 | y1-y3 | X | X q: saturated
vtrn.16 d18, d19 @ d18: y0+y2 | y0-y2 | X | X
@ d19: y1+y3 | y1-y3 | X | X
vqadd.s16 d20, d18, d19 @ d20: (y0+y2)+(y1+y3) | (y0-y2)+(y1-y3)
@ x0 | x2 | X | X
vqsub.s16 d21, d18, d19 @ d21: (y0+y2)-(y1+y3) | (y0-y2)-(y1-y3)
@ x1 | x3 | X | X
vtrn.16 d20, d21 @ d20: x0 | x1 | X | X
@ d21: x2 | x3 | X | X
vrshr.s16 q10, q10, #3 @ Divide by 8 with rounding
vmov.i16 q11, #128 @ q11 = 128 in every 16-bit lane
vqadd.s16 q0, q10, q11 @ Add offset to make output in [0,255]
vmov.i16 q12, #0 @ q12 = lower bound, 0 in every lane
vmov.i16 q13, #255 @ q13 = upper bound, 255 in every lane
vmax.s16 q0, q0, q12 @ Clamp >= 0
vmin.s16 q0, q0, q13 @ Clamp <= 255
@ Store only the two valid samples of each row (one 32-bit lane = two
@ int16). Writing the whole d register would also write the two X lanes
@ past the 2x2 output area.
vst1.32 {d0[0]}, [r1], r2 @ Row 0: x0 | x1, then r1 += stride
vst1.32 {d1[0]}, [r1] @ Row 1: x2 | x3
bx lr @ Return to caller
@ end of idct_2x2_venum
@==========================================================================
@ FUNCTION : idct_4x4_venum
@--------------------------------------------------------------------------
@ DESCRIPTION : VeNum optimization of one 4x4 block iDCT
@--------------------------------------------------------------------------
@ C PROTOTYPE : void idct_4x4_venum(int16 * input,
@ int16 * output,
@ int32 stride)
@--------------------------------------------------------------------------
@ REG INPUT : R0 pointer to input (int16)
@ R1 pointer to output (int16)
@ R2 block stride
@--------------------------------------------------------------------------
@ STACK ARG : None
@--------------------------------------------------------------------------
@ MEM INPUT : None
@--------------------------------------------------------------------------
@ REG OUTPUT : None
@--------------------------------------------------------------------------
@ MEM OUTPUT : None
@--------------------------------------------------------------------------
@ REG AFFECTED : R0 - R3, R12
@--------------------------------------------------------------------------
@ STACK USAGE : none
@--------------------------------------------------------------------------
@ CYCLES : 56 cycles
@--------------------------------------------------------------------------
@ NOTES :
@
@ ARM REGISTER ALLOCATION
@ ==========================================
@ r0 : pointer to input data
@ r1 : pointer to output area
@ r2 : stride in the output buffer
@ r3 : pointer to the coefficient set
@ r12 : pointer to the coefficient set
@ -------------------------------------------
@
@ VENUM REGISTER ALLOCATION
@ =================================================
@ q0 : coefficients[0]
@ q1 : coefficients[1]
@ q2 : coefficients[2]
@ q3 : coefficients[3]
@ q4 : not used
@ q5 : not used
@ q6 : not used
@ q7 : not used
@ q8 : input y0 - y7
@ q9 : input y8 - y15
@ q10 : intermediate value
@ q11 : intermediate value
@ q12 : intermediate value
@ q13 : intermediate value
@ q14 : intermediate value
@ q15 : intermediate value
@==========================================================================
.type idct_4x4_venum, %function
@ idct_4x4_venum: inverse DCT of the top-left 4x4 coefficients.
@ In: r0 = input (int16 *), r1 = output (int16 *), r2 = output stride in bytes
@ Out: four rows of four int16 samples, each in [0, 255], written at
@ r1, r1 + r2, r1 + 2*r2, r1 + 3*r2
@ Clobbers: r0, r1, r3, r12, q0 - q3, q8 - q15
idct_4x4_venum:
@ Load the locations of the first 2 sets of coefficients
ldr r3, =coefficient+0*16 @ coefficient[0]
ldr r12, =coefficient+1*16 @ coefficient[1]
@ Load the first 2 sets of coefficients
vld1.16 {d0, d1},[r3] @ q0 = C4 | C2 | C4 | C6 | C4 | C2 | C4 | C6
vld1.16 {d2, d3},[r12] @ q1 = C4 | C6 | C4 | C2 | C4 | C6 | C4 | C2
@ Load the locations of the second 2 sets of coefficients
ldr r3, =coefficient+2*16 @ coefficient[2]
ldr r12, =coefficient+3*16 @ coefficient[3]
@ Load the second 2 sets of coefficients
vld1.16 {d4, d5},[r3] @ q2 = C4 | C4 | C4 | C4 | C2 | C2 | C2 | C2
vld1.16 {d6, d7},[r12] @ q3 = C4 | C4 | C4 | C4 | C6 | C6 | C6 | C6
@ Load the input values
@ The input rows are stepped with a fixed pitch of 16 bytes (8 int16
@ coefficients per row, as read by the 8x8 path), NOT with r2: r2 is the
@ stride of the output buffer and is unrelated to the input layout.
mov r3, #16 @ r3 is free again: input row pitch in bytes
vld1.16 {d16}, [r0], r3 @ d16: y0 | y1 | y2 | y3 (LSB | MSB)
vld1.16 {d17}, [r0], r3 @ d17: y4 | y5 | y6 | y7 (LSB | MSB)
vld1.16 {d18}, [r0], r3 @ d18: y8 | y9 | y10 | y11 (LSB | MSB)
vld1.16 {d19}, [r0], r3 @ d19: y12 | y13 | y14 | y15 (LSB | MSB)
@ Apply iDCT Horizontally
@ q8: y0 |y1 |y2 |y3 |y4 |y5 |y6 |y7
@ q9: y8 |y9 |y10|y11|y12|y13|y14|y15
@======================================================================
@ vqrdmulh doubles the result and saves the high 16 bits of the result,
@ this is equivalent to a right shift by 15 bits.
@ Since the coefficients are in Q15 format, that right shift by 15
@ cancels the coefficient scaling, so the final result is in Q0 format
@
@ vqrdmulh will also round the result
@======================================================================
vqrdmulh.s16 q10, q8, q0 @ q10: C4*y0 | C2*y1 | C4*y2 | C6*y3 | C4*y4 | C2*y5 | C4*y6 | C6*y7
vqrdmulh.s16 q11, q8, q1 @ q11: C4*y0 | C6*y1 | C4*y2 | C2*y3 | C4*y4 | C6*y5 | C4*y6 | C2*y7
vqrdmulh.s16 q12, q9, q0 @ q12: C4*y8 | C2*y9 | C4*y10 | C6*y11 | C4*y12 | C2*y13 | C4*y14 | C6*y15
vqrdmulh.s16 q13, q9, q1 @ q13: C4*y8 | C6*y9 | C4*y10 | C2*y11 | C4*y12 | C6*y13 | C4*y14 | C2*y15
vtrn.32 q10, q12 @ q10: C4*y0 | C2*y1 | C4*y8 | C2*y9 | C4*y4 | C2*y5 | C4*y12 | C2*y13
@ q12: C4*y2 | C6*y3 | C4*y10 | C6*y11 | C4*y6 | C6*y7 | C4*y14 | C6*y15
vtrn.32 q11, q13 @ q11: C4*y0 | C6*y1 | C4*y8 | C6*y9 | C4*y4 | C6*y5 | C4*y12 | C6*y13
@ q13: C4*y2 | C2*y3 | C4*y10 | C2*y11 | C4*y6 | C2*y7 | C4*y14 | C2*y15
vqadd.s16 q14, q10, q12 @ q14: C4*y0 + C4*y2 | C2*y1 + C6*y3 | C4*y8 + C4*y10 | C2*y9 + C6*y11 | C4*y4 + C4*y6 | C2*y5 + C6*y7 | C4*y12 + C4*y14 | C2*y13 + C6*y15
@ S0 | S2 | S8 | S10 | S4 | S6 | S12 | S14
vqsub.s16 q15, q11, q13 @ q15: C4*y0 - C4*y2 | C6*y1 - C2*y3 | C4*y8 - C4*y10 | C6*y9 - C2*y11 | C4*y4 - C4*y6 | C6*y5 - C2*y7 | C4*y12 - C4*y14 | C6*y13 - C2*y15
@ S1 | S3 | S9 | S11 | S5 | S7 | S13 | S15
vtrn.16 q14, q15 @ q14: S0 | S1 | S8 | S9 | S4 | S5 | S12 | S13
@ q15: S2 | S3 | S10 | S11 | S6 | S7 | S14 | S15
vqadd.s16 q8, q14, q15 @ q8: Z0 | Z1 | Z8 | Z9 | Z4 | Z5 | Z12 | Z13
vqsub.s16 q9, q14, q15 @ q9: Z3 | Z2 | Z11 | Z10 | Z7 | Z6 | Z15 | Z14
vrev32.16 q9, q9 @ q9: Z2 | Z3 | Z10 | Z11 | Z6 | Z7 | Z14 | Z15
@ Apply iDCT Vertically
vtrn.32 q8, q9 @ q8: Z0 | Z1 | Z2 | Z3 | Z4 | Z5 | Z6 | Z7
@ q9: Z8 | Z9 | Z10 | Z11 | Z12 | Z13 | Z14 | Z15
vqrdmulh.s16 q10, q8, q2 @ q10: C4*Z0 | C4*Z1 | C4*Z2 | C4*Z3 | C2*Z4 | C2*Z5 | C2*Z6 | C2*Z7
vqrdmulh.s16 q11, q8, q3 @ q11: C4*Z0 | C4*Z1 | C4*Z2 | C4*Z3 | C6*Z4 | C6*Z5 | C6*Z6 | C6*Z7
vqrdmulh.s16 q12, q9, q2 @ q12: C4*Z8 | C4*Z9 | C4*Z10 | C4*Z11 | C2*Z12 | C2*Z13 | C2*Z14 | C2*Z15
vqrdmulh.s16 q13, q9, q3 @ q13: C4*Z8 | C4*Z9 | C4*Z10 | C4*Z11 | C6*Z12 | C6*Z13 | C6*Z14 | C6*Z15
vqadd.s16 q14, q10, q13 @ q14: C4*Z0+C4*Z8 | C4*Z1+C4*Z9 | C4*Z2+C4*Z10 | C4*Z3+C4*Z11 | C2*Z4+C6*Z12 | C2*Z5+C6*Z13 | C2*Z6+C6*Z14 | C2*Z7+C6*Z15
@ s0 | s4 | s8 | s12 | s2 | s6 | s10 | s14
vqsub.s16 q15, q11, q12 @ q15: C4*Z0-C4*Z8 | C4*Z1-C4*Z9 | C4*Z2-C4*Z10 | C4*Z3-C4*Z11 | C6*Z4-C2*Z12 | C6*Z5-C2*Z13 | C6*Z6-C2*Z14 | C6*Z7-C2*Z15
@ s1 | s5 | s9 | s13 | s3 | s7 | s11 | s15
vswp d29, d30 @ q14: s0 | s4 | s8 | s12 | s1 | s5 | s9 | s13
@ q15: s2 | s6 | s10 | s14 | s3 | s7 | s11 | s15
vqadd.s16 q8, q14, q15 @ q8: x0 | x4 | x8 | x12 | x1 | x5 | x9 | x13
vqsub.s16 q9, q14, q15 @ q9: x3 | x7 | x11 | x15 | x2 | x6 | x10 | x14
vmov.i16 q10, #0 @ q10 = lower bound, 0 in every 16-bit lane
vmov.i16 q11, #255 @ q11 = upper bound, 255 in every 16-bit lane
vmov.i16 q0, #128 @ q0 = 128|128|128|128|128|128|128|128
vqadd.s16 q8, q8, q0 @ Add the offset
vqadd.s16 q9, q9, q0 @ Add the offset
vmax.s16 q8, q8, q10 @ clamp >= 0
vmin.s16 q8, q8, q11 @ clamp <= 255
vmax.s16 q9, q9, q10 @ clamp >= 0
vmin.s16 q9, q9, q11 @ clamp <= 255
@ Store the four result rows. d19 is stored before d18 on purpose: per the
@ lane bookkeeping above, the subtraction left the x3.. group in the low
@ half of q9 (d18) and the x2.. group in the high half (d19).
vst1.16 {d16}, [r1], r2 @ d16: x0 | x4 | x8 | x12 (LSB | MSB)
vst1.16 {d17}, [r1], r2 @ d17: x1 | x5 | x9 | x13 (LSB | MSB)
vst1.16 {d19}, [r1], r2 @ d19: x2 | x6 | x10 | x14 (LSB | MSB)
vst1.16 {d18}, [r1], r2 @ d18: x3 | x7 | x11 | x15 (LSB | MSB)
bx lr @ Return to caller
@ end of idct_4x4_venum
@==========================================================================
@ FUNCTION : idct_8x8_venum
@--------------------------------------------------------------------------
@ DESCRIPTION : VeNum optimization of one 8x8 block iDCT
@--------------------------------------------------------------------------
@ C PROTOTYPE : void idct_8x8_venum(int16 * input,
@ int16 * output,
@ int32 stride)
@--------------------------------------------------------------------------
@ REG INPUT : R0 pointer to input (int16)
@ R1 pointer to output (int16)
@ R2 block stride
@--------------------------------------------------------------------------
@ STACK ARG : None
@--------------------------------------------------------------------------
@ MEM INPUT : None
@--------------------------------------------------------------------------
@ REG OUTPUT : None
@--------------------------------------------------------------------------
@ MEM OUTPUT : None
@--------------------------------------------------------------------------
@ REG AFFECTED : R0 - R9
@--------------------------------------------------------------------------
@ STACK USAGE : 84 bytes (r5-r9 and d8-d15 are saved on entry and restored on exit)
@--------------------------------------------------------------------------
@ CYCLES : 177 cycles
@--------------------------------------------------------------------------
@ NOTES :
@
@ It was tested to be IEEE 1180 compliant. Since IEEE 1180 compliance is more stringent
@ than MPEG-4 compliance, this version is also MPEG-4 compliant.
@
@ CODE STRUCTURE:
@ (i) Macros for transposing an 8x8 matrix and for configuring the VFP unit are defined.
@ (ii) Macro for IDCT in one dimension is defined as four stages
@ (iii) The two dimensional code begins
@ (iv) constants are defined in the area DataArea
@
@ PROGRAM FLOW:
@
@ The VFP is configured
@ The parameters to IDCT are loaded
@ the coefficients are loaded
@ loop:
@ decrement loop counter
@ The first input Matrix is loaded and pre-scaled
@ The input is prescaled using the constants
@ IDCT is performed in one dimension on the 8 columns
@ The matrix is transposed
@ A bias is loaded an added to the matrix
@ IDCT is performed in one dimension on the 8 rows
@ The matrix is post-scaled
@ The matrix is saved
@ test loop counter and loop if greater than zero
@ stop
@
@
@ ARM REGISTER ALLOCATION
@ ==========================================
@ r0 : pointer to input data
@ r1 : pointer to output area
@ r2 : stride in the output buffer
@ r3 :
@ r4 :
@ r5 : pointer to constants[0]; reloaded with a pointer to constants[5] before the clamp
@ r6 : pointer to constants[1]
@ r7 : pointer to constants[2]
@ r8 : pointer to constants[3]
@ r9 : pointer to constants[4]
@ -------------------------------------------
@
@ VENUM REGISTER ALLOCATION
@ =================================================
@ q0 : constants[0]
@ q1 : constants[1]
@ q2 : constants[2], IDCT1D in-place scratch
@ q3 : constants[3], IDCT1D in-place scratch
@ q4 : constants[4], IDCT1D in-place scratch, and bias compensation
@ q5 : IDCT1D in-place scratch
@ q6 : IDCT1D in-place scratch
@ q7 : IDCT1D in-place scratch
@ q8 : Matrix[0] IDCT1D in-place scratch
@ q9 : Matrix[1] IDCT1D in-place scratch
@ q10 : Matrix[2] IDCT1D in-place scratch
@ q11 : Matrix[3] IDCT1D in-place scratch
@ q12 : Matrix[4] IDCT1D in-place scratch
@ q13 : Matrix[5] IDCT1D in-place scratch
@ q14 : Matrix[6] IDCT1D in-place scratch
@ q15 : Matrix[7] IDCT1D in-place scratch
@==========================================================================
.type idct_8x8_venum, %function
@ idct_8x8_venum: full 8x8 inverse DCT of one block.
@ In: r0 = input (int16 *), r1 = output (int16 *), r2 = output stride in bytes
@ The work is done by the IDCT_ENTRY and BIG_BODY_TRANSPOSE_INPUT macros;
@ this wrapper only preserves the registers they overwrite that the caller
@ expects to survive (r5-r9 and d8-d15, i.e. q4-q7).
idct_8x8_venum:
stmfd sp!, {r5-r9} @ save core registers used as constant pointers
vstmdb sp!, {d8-d15} @ save q4-q7, used as scratch by the macros
IDCT_ENTRY @ set up r5-r9, q0, q1
BIG_BODY_TRANSPOSE_INPUT @ load, transform and store one block
vldmia sp!, {d8-d15} @ restore in reverse order of the saves
ldmfd sp!, {r5-r9}
bx lr @ Return to caller
@ end of idct_8x8_venum
@==========================================================================
@ Constants Definition AREA: define idct kernel, bias
@==========================================================================
@ The tables below are only ever read (through literal-pool addresses), so
@ they are placed in the read-only data section rather than in writable .data.
.section .rodata @ AREA RODataArea, DATA, READONLY
.align 5 @ ALIGN=5 (2^5 = 32-byte boundary)
@ Tables for the 8x8 iDCT. Each full entry is eight halfwords (16 bytes) and
@ is addressed in the code as constants+n*16.
constants:
@ constants[0]: scalars read lane-wise by IDCT1D (lanes 0-3 and 5);
@ lanes 6-7 are the bias pair replicated into q4 by PART2
.hword 23170, 13573, 6518, 21895, -23170, -21895, 8223, 8224
@ constants[1]: prescale vector for input rows 0 and 4
.hword 16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725
@ constants[2]: prescale vector for input rows 1 and 7
.hword 22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521
@ constants[3]: prescale vector for input rows 2 and 6
.hword 21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692
@ constants[4]: prescale vector for input rows 3 and 5
.hword 19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722
@ constants[5]: clamp bounds, lane 0 = lower (0), lane 1 = upper (255).
@ Only four halfwords long, so "coefficient" starts 8 bytes after it.
.hword 0, 255, 0, 0
coefficient: @ These are the coefficients used by 4x4 iDCT in Q15 format
.hword 11585, 15137, 11585, 6270, 11585, 15137, 11585, 6270 @ C4, C2, C4, C6, C4, C2, C4, C6 /2
.hword 11585, 6270, 11585, 15137, 11585, 6270, 11585, 15137 @ C4, C6, C4, C2, C4, C6, C4, C2 /2
.hword 11585, 11585, 11585, 11585, 15137, 15137, 15137, 15137 @ C4, C4, C4, C4, C2, C2, C2, C2 /2
.hword 11585, 11585, 11585, 11585, 6270, 6270, 6270, 6270 @ C4, C4, C4, C4, C6, C6, C6, C6 /2
.end