ff_mlp_rematrix_channel_sse4
/*
 * REMATRIX_CHANNEL_FUNC(cpu_ext) declares the prototype of
 * ff_mlp_rematrix_channel_<cpu_ext>. It is a declaration only; the
 * function body has to be supplied elsewhere. The trailing semicolon is
 * part of the macro, so invocations are written without one.
 */
#define REMATRIX_CHANNEL_FUNC(cpu_ext)                                        \
    void ff_mlp_rematrix_channel_ ## cpu_ext(int32_t       *samples,          \
                                             const int32_t *coeffs,           \
                                             const uint8_t *bypassed_lsbs,    \
                                             const int8_t  *noise_buffer,     \
                                             int            index,            \
                                             unsigned int   dest_ch,          \
                                             uint16_t       blockpos,         \
                                             unsigned int   maxchan,          \
                                             int            matrix_noise_shift, \
                                             int            access_unit_size_pow2, \
                                             int32_t        mask);

/* One prototype per instruction-set variant. */
REMATRIX_CHANNEL_FUNC(sse4)
REMATRIX_CHANNEL_FUNC(avx2_bmi2)
---------------------------------------------------以下是错误的------------------------------------------
#if !(ARCH_X86_64)
/*
 * Stand-in for the AVX2/BMI2 rematrix routine, compiled only when
 * ARCH_X86_64 evaluates to false. Every argument is ignored and no buffer
 * is written.
 *
 * NOTE(review): because this is a silent no-op, a caller that ends up
 * invoking it gets its samples back unmodified.
 */
void ff_mlp_rematrix_channel_avx2_bmi2(int32_t *samples,
                                       const int32_t *coeffs,
                                       const uint8_t *bypassed_lsbs,
                                       const int8_t *noise_buffer,
                                       int index,
                                       unsigned int dest_ch,
                                       uint16_t blockpos,
                                       unsigned int maxchan,
                                       int matrix_noise_shift,
                                       int access_unit_size_pow2,
                                       int32_t mask)
{
}
#endif
#if !(ARCH_X86_64)
/*
 * Stand-in for the SSE4 rematrix routine, compiled only when ARCH_X86_64
 * evaluates to false. Every argument is ignored and no buffer is written.
 *
 * NOTE(review): because this is a silent no-op, a caller that ends up
 * invoking it gets its samples back unmodified.
 */
void ff_mlp_rematrix_channel_sse4(int32_t *samples,
                                  const int32_t *coeffs,
                                  const uint8_t *bypassed_lsbs,
                                  const int8_t *noise_buffer,
                                  int index,
                                  unsigned int dest_ch,
                                  uint16_t blockpos,
                                  unsigned int maxchan,
                                  int matrix_noise_shift,
                                  int access_unit_size_pow2,
                                  int32_t mask)
{
}
#endif
;-----------------------------------------------------------------------
; void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
;                              const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
;                              int index, unsigned int dest_ch, uint16_t blockpos,
;                              unsigned int maxchan, int matrix_noise_shift,
;                              int access_unit_size_pow2, int32_t mask)
;
; MLP_REMATRIX_CHANNEL expands to one implementation of the function above
; for whatever instruction set was selected by the preceding INIT_XMM or
; INIT_YMM.
;
; Register naming: the names listed after cglobal are bound to r0, r1, ...
; in order (x86inc convention) and NAMEm is the argument slot with the same
; number, so mnsm is argument 8 (matrix_noise_shift) and maskm is
; argument 10 (mask). blsbs, accum and cnt do not correspond to the
; arguments in their positions; presumably they are working registers for
; the LOOP_* macros - confirm against those macros.
;
; Four loops are emitted and one is chosen up front:
;   .loop8       / .loop4         matrix_noise_shift == 0
;   .loop8_shift / .loop4_shift   matrix_noise_shift != 0
; The loop4 variants are taken when maxchan < 4.
;
; REMATRIX, LOOP_END and LOOP_SHIFT_END are macros defined outside this
; view. The jne that closes each loop consumes the flags they leave behind.
;-----------------------------------------------------------------------
%macro MLP_REMATRIX_CHANNEL 0
; x86inc cglobal: 0 arguments auto-loaded, 13 GPRs (r0-r12), 5 vector
; registers. Arguments that are not already in a register are loaded by
; hand below.
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
index, dest_ch, blockpos, maxchan, mns, \
accum, mask, cnt
mov mnsd, mnsm ; load matrix_noise_shift
movzx blockposq, word blockposm ; load and zero extend blockpos (16bit)
mov maxchand, maxchanm ; load maxchan
mov maskd, maskm ; load mask
%if WIN64
mov dest_chd, dest_chm ; load dest_chd (not needed on UNIX64)
%endif
shl dest_chd, 2 ; dest_ch *= 4 (sizeof(int32_t)); presumably a byte offset into samples
lea cntq, [blsbs_ptrq + blockposq*8] ; cnt = bypassed_lsbs + blockpos * 8; presumably the end pointer tested by the LOOP_* macros
test mnsd, mnsd ; is matrix_noise_shift != 0?
jne .shift ; jump if true
; NOTE(review): maxchan is unsigned in the C prototype but is tested with
; the signed jl here and again in the .shift path.
cmp maxchand, 4 ; is maxchan < 4?
jl .loop4 ; jump if true
align 16
.loop8:
; Process 5 or more channels
REMATRIX
LOOP_END
jne .loop8 ; repeat based on the flags left by LOOP_END
RET
align 16
.loop4:
; Process up to 4 channels
; Four samples times four coeffs as 64-bit products. pmuldq only multiplies
; dwords 0 and 2, so q2301-shuffled copies are used to reach dwords 1 and 3.
; After paddq, xm0 = { s0*c0 + s1*c1, s2*c2 + s3*c3 } (two 64-bit sums).
movdqa xm0, [samplesq] ; aligned 16-byte load of samples[0..3]
movdqa xm1, [coeffsq ] ; aligned 16-byte load of coeffs[0..3]
pshufd xm2, xm0, q2301 ; swap the two dwords inside each 64-bit half
pshufd xm3, xm1, q2301
pmuldq xm0, xm1 ; signed 32x32 -> 64 on dwords 0 and 2
pmuldq xm3, xm2 ; same, on what were dwords 1 and 3
paddq xm0, xm3
LOOP_END
jne .loop4 ; repeat based on the flags left by LOOP_END
RET
.shift:
; matrix_noise_shift != 0: index and access_unit_size_pow2 are needed too.
%if WIN64
mov indexd, indexm ; load index (not needed on UNIX64)
%endif
; r9 is still named accum at this point. It is addressed by number because
; every DEFINE_ARGS below renames it to ausp.
mov r9d, r9m ; load access_unit_size_pow2
%if cpuflag(bmi2)
; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
; New names: r3 noise_buffer, r6 accum, r7 index2, r9 ausp, r12 noise.
; mns stays in r8.
DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
index, dest_ch, accum, index2, mns, \
ausp, mask, cnt, noise
add mnsd, 7 ; matrix_noise_shift += 7
%else ; sse4
; Without bmi2 the shift count must be in cl, so rcx is vacated into r6.
; r6 held blockpos, whose only visible use is the cnt computation above.
; mns is then re-bound to whichever rN aliases rcx, and the name that was
; displaced (samples on WIN64, noise_buffer on UNIX64) follows its value
; to r6.
mov r6, rcx ; move rcx elsewhere so we can use cl for matrix_noise_shift
%if WIN64
; r0 = rcx
DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
index2, accum, ausp, mask, cnt, noise
%else ; UNIX64
; r3 = rcx
DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
index2, accum, ausp, mask, cnt, noise
%endif
; r8 still contains the matrix_noise_shift loaded at entry (it is now named
; accum). lea reads it and writes the sum into the new mns register, ecx.
lea mnsd, [r8 + 7] ; rcx = matrix_noise_shift + 7
%endif ; cpuflag
sub auspd, 1 ; access_unit_size_pow2 -= 1
; r7 still contains maxchan although it is now named index2. The lea below
; overwrites r7 but does not modify flags, so jl still acts on this cmp.
cmp r7d, 4 ; is maxchan < 4?
lea index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;
jl .loop4_shift ; jump if maxchan < 4
align 16
.loop8_shift:
; Process 5 or more channels
REMATRIX
LOOP_SHIFT_END
jne .loop8_shift ; repeat based on the flags left by LOOP_SHIFT_END
RET
align 16
.loop4_shift:
; Process up to 4 channels
; Same multiply-accumulate as .loop4: xm0 ends up holding
; { s0*c0 + s1*c1, s2*c2 + s3*c3 } as two 64-bit sums.
movdqa xm0, [samplesq] ; aligned 16-byte load of samples[0..3]
movdqa xm1, [coeffsq ] ; aligned 16-byte load of coeffs[0..3]
pshufd xm2, xm0, q2301 ; swap the two dwords inside each 64-bit half
pshufd xm3, xm1, q2301
pmuldq xm0, xm1 ; signed 32x32 -> 64 on dwords 0 and 2
pmuldq xm3, xm2 ; same, on what were dwords 1 and 3
paddq xm0, xm3
LOOP_SHIFT_END
jne .loop4_shift ; repeat based on the flags left by LOOP_SHIFT_END
RET
%endmacro
; Instantiate the macro once per instruction set. The cpu names given to
; INIT_XMM / INIT_YMM are what cpuflag(bmi2) inside the macro tests, and
; (per x86inc naming) they become the symbol suffix, matching the
; ff_mlp_rematrix_channel_sse4 / _avx2_bmi2 names used on the C side.
INIT_XMM sse4
MLP_REMATRIX_CHANNEL
; HAVE_AVX2_EXTERNAL is defined outside this file; presumably it reports
; whether the assembler can emit AVX2 - confirm against the build config.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2, bmi2
MLP_REMATRIX_CHANNEL
%endif
%endif ; ARCH_X86_64
原文地址:https://blog.csdn.net/durongze/article/details/142881957
免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!