ff_mlp_rematrix_channel_sse4

🕗 发布于 2024-10-13 06:26 ffmpeg

/*
 * Declare the prototype of one ff_mlp_rematrix_channel_<opt> routine.
 *
 * `opt` is pasted onto the symbol name, so each expansion yields one
 * declaration with the shared eleven-parameter signature. The terminating
 * semicolon is part of the macro, so expansions are written without one.
 */
#define REMATRIX_CHANNEL_FUNC(opt)                                      \
    void ff_mlp_rematrix_channel_##opt(int32_t *samples,                \
                                       const int32_t *coeffs,           \
                                       const uint8_t *bypassed_lsbs,    \
                                       const int8_t *noise_buffer,      \
                                       int index,                       \
                                       unsigned int dest_ch,            \
                                       uint16_t blockpos,               \
                                       unsigned int maxchan,            \
                                       int matrix_noise_shift,          \
                                       int access_unit_size_pow2,       \
                                       int32_t mask);

/* Prototypes for the two variants named in this file. */
REMATRIX_CHANNEL_FUNC(sse4)
REMATRIX_CHANNEL_FUNC(avx2_bmi2)

---------------------------------------------------以下是错误的------------------------------------------

/*
 * Placeholder definitions used when ARCH_X86_64 is zero or undefined.
 *
 * Both functions take the full rematrix-channel argument list and return
 * immediately without reading or writing any argument, so the sample buffer
 * is left untouched. They only provide the symbols; they do no rematrixing.
 */
#if !(ARCH_X86_64)
void ff_mlp_rematrix_channel_avx2_bmi2(int32_t *samples,
                                       const int32_t *coeffs,
                                       const uint8_t *bypassed_lsbs,
                                       const int8_t *noise_buffer,
                                       int index,
                                       unsigned int dest_ch,
                                       uint16_t blockpos,
                                       unsigned int maxchan,
                                       int matrix_noise_shift,
                                       int access_unit_size_pow2,
                                       int32_t mask)
{
    /* intentionally empty */
}

void ff_mlp_rematrix_channel_sse4(int32_t *samples,
                                  const int32_t *coeffs,
                                  const uint8_t *bypassed_lsbs,
                                  const int8_t *noise_buffer,
                                  int index,
                                  unsigned int dest_ch,
                                  uint16_t blockpos,
                                  unsigned int maxchan,
                                  int matrix_noise_shift,
                                  int access_unit_size_pow2,
                                  int32_t mask)
{
    /* intentionally empty */
}
#endif

;-----------------------------------------------------------------------
;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
; const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
; int index, unsigned int dest_ch, uint16_t blockpos,
; unsigned int maxchan, int matrix_noise_shift,
; int access_unit_size_pow2, int32_t mask)
;
; MLP_REMATRIX_CHANNEL emits one implementation of the function above for
; the instruction set selected by the preceding INIT_XMM/INIT_YMM.
; Syntax: NASM with the x86inc abstraction layer (cglobal, DEFINE_ARGS,
; r#/r#m, RET). Targets: WIN64 and UNIX64.
;
; Structure: four loops, chosen once on entry:
;   matrix_noise_shift == 0, maxchan >= 4 -> .loop8
;   matrix_noise_shift == 0, maxchan <  4 -> .loop4
;   matrix_noise_shift != 0, maxchan >= 4 -> .loop8_shift
;   matrix_noise_shift != 0, maxchan <  4 -> .loop4_shift
;
; Register names at entry (position N in the cglobal list is register rN):
;   r0 samples  r1 coeffs    r2 blsbs_ptr (bypassed_lsbs)
;   r3 blsbs (argument slot of noise_buffer)
;   r4 index    r5 dest_ch   r6 blockpos   r7 maxchan   r8 mns
;   r9 accum (argument slot of access_unit_size_pow2)
;   r10 mask    r11 cnt
; The .shift path rebinds several of these names with DEFINE_ARGS.
;
; REMATRIX, LOOP_END and LOOP_SHIFT_END are macros defined outside this
; block. They take no parameters, so they presumably use the register names
; set up here (dest_ch, mask, cnt, accum, ...) - verify against their
; definitions before renaming anything.
;-----------------------------------------------------------------------
%macro MLP_REMATRIX_CHANNEL 0
; x86inc cglobal: 0 arguments auto-loaded, 13 GPRs, 5 vector registers.
; Arguments not already in registers are therefore loaded by hand below.
cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
index, dest_ch, blockpos, maxchan, mns, \
accum, mask, cnt
mov mnsd, mnsm ; load matrix_noise_shift
movzx blockposq, word blockposm ; load and zero extend blockpos (16bit)
mov maxchand, maxchanm ; load maxchan
mov maskd, maskm ; load mask
%if WIN64
mov dest_chd, dest_chm ; load dest_chd (not needed on UNIX64)
%endif
; dest_ch *= 4; presumably a byte offset for int32_t elements, consumed by
; the loop-end macros (not visible here).
shl dest_chd, 2
; cnt = bypassed_lsbs + blockpos*8; presumably the end pointer the loop-end
; macros compare against - TODO confirm in LOOP_END/LOOP_SHIFT_END.
lea cntq, [blsbs_ptrq + blockposq*8]
test mnsd, mnsd ; is matrix_noise_shift != 0?
jne .shift ; jump if true
; NOTE(review): jl is a signed compare, while maxchan is declared unsigned
; int. The result is the same as an unsigned compare for values below 2^31.
cmp maxchand, 4 ; is maxchan < 4?
jl .loop4 ; jump if true

align 16
.loop8:
; Process 5 or more channels
REMATRIX
; The jne below loops until LOOP_END leaves ZF set.
LOOP_END
jne .loop8
RET

align 16
.loop4:
; Process up to 4 channels
; Load four int32 samples and four int32 coefficients (aligned loads), then
; form two 64-bit partial sums in xm0:
;   low  qword = s0*c0 + s1*c1
;   high qword = s2*c2 + s3*c3
movdqa xm0, [samplesq]
movdqa xm1, [coeffsq ]
; q2301 swaps the two dwords of each qword, moving the odd elements into
; the even positions that pmuldq reads.
pshufd xm2, xm0, q2301
pshufd xm3, xm1, q2301
pmuldq xm0, xm1 ; signed 32x32->64 on even elements (0 and 2)
pmuldq xm3, xm2 ; signed 32x32->64 on the original odd elements (1 and 3)
paddq xm0, xm3 ; add the even and odd products per qword
LOOP_END
jne .loop4
RET

; matrix_noise_shift != 0: extra arguments are loaded and register names
; are rebound before entering the *_shift loops.
.shift:
%if WIN64
mov indexd, indexm ; load index (not needed on UNIX64)
%endif
mov r9d, r9m ; load access_unit_size_pow2
%if cpuflag(bmi2)
; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
; Renames only: r3 -> noise_buffer, r6 (was blockpos) -> accum,
; r7 (was maxchan) -> index2, r9 -> ausp, plus r12 -> noise.
DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
index, dest_ch, accum, index2, mns, \
ausp, mask, cnt, noise
add mnsd, 7 ; matrix_noise_shift += 7
%else ; sse4
; r6 held blockpos, which was already folded into cnt above, so it is free
; to receive the value displaced from rcx.
mov r6, rcx ; move rcx elsewhere so we can use cl for matrix_noise_shift
%if WIN64
; r0 = rcx
; samples now lives in r6; the name mns refers to r0 (rcx).
DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
index2, accum, ausp, mask, cnt, noise
%else ; UNIX64
; r3 = rcx
; noise_buffer now lives in r6; the name mns refers to r3 (rcx).
DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
index2, accum, ausp, mask, cnt, noise
%endif
; r8 still holds matrix_noise_shift from the load at function entry; the
; name mns now maps to rcx, so this writes ecx.
lea mnsd, [r8 + 7] ; rcx = matrix_noise_shift + 7
%endif ; cpuflag
sub auspd, 1 ; access_unit_size_pow2 -= 1
; r7 still contains maxchan but its name was rebound to index2 above, so the
; raw register is used. The lea that follows overwrites r7 without touching
; the flags, so the jl still tests the result of this cmp.
cmp r7d, 4 ; is maxchan < 4?
lea index2q, [indexq*2 + 1] ; index2 = 2 * index + 1;
jl .loop4_shift ; jump if maxchan < 4

align 16
.loop8_shift:
; Process 5 or more channels
REMATRIX
; The jne below loops until LOOP_SHIFT_END leaves ZF set.
LOOP_SHIFT_END
jne .loop8_shift
RET

align 16
.loop4_shift:
; Process up to 4 channels
; Same two-qword multiply-accumulate as .loop4:
;   xm0 = { s0*c0 + s1*c1, s2*c2 + s3*c3 }
movdqa xm0, [samplesq]
movdqa xm1, [coeffsq ]
pshufd xm2, xm0, q2301
pshufd xm3, xm1, q2301
pmuldq xm0, xm1
pmuldq xm3, xm2
paddq xm0, xm3
LOOP_SHIFT_END
jne .loop4_shift
RET
%endmacro

; SSE4 variant. With cpuflag(bmi2) false, the macro's .shift path takes the
; layout that moves matrix_noise_shift into rcx. This is presumably the
; ff_mlp_rematrix_channel_sse4 symbol declared on the C side (the suffix is
; added by cglobal).
INIT_XMM sse4
MLP_REMATRIX_CHANNEL
; AVX2+BMI2 variant, assembled only when HAVE_AVX2_EXTERNAL is non-zero
; (presumably set by the build configuration). With cpuflag(bmi2) true, the
; .shift path keeps registers in place.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2, bmi2
MLP_REMATRIX_CHANNEL
%endif

%endif ; ARCH_X86_64

原文地址：https://blog.csdn.net/durongze/article/details/142881957

免责声明：本站文章内容转载自网络资源，如本站内容侵犯了原著者的合法权益，可联系本站删除。更多内容请关注自学内容网（zxcms.com）！

上一篇：重阳+1 圭峰山
下一篇：Pytest(五)——进阶之fixture的使用（超详细）

使用 `fork()` 和 `waitpid()` 进行进程管理的详解
fork & wait &waitpid用法 & wait waitpid区别
阅读更多2024-10-15
Centos7 搭建logstash
官网：https://www.elastic.co/guide/en/logstash/current/input-plugins.html。事件源可以是从stdin屏幕输入读取，可以从file指定的
阅读更多2024-10-15
面试题tcc补偿事务
‌ TCC（‌Try/‌Confirm/‌Cancel）编程模式的核心思想是：针对每个分支事务操作，都要向全局事务发起方注册Try、Confirm和Cancel三个操作，具体这些操作由我们自己根据业务
阅读更多2024-10-15
访问者模式
访问者模式（Visitor Pattern）是一种行为设计模式，它允许你将数据操作与数据结构分离。ObjectStructure（对象结构）：能够枚举其所有元素，并可以允许访问者访问这些元素。通常是一
阅读更多2024-10-15
深入探讨Python网络爬虫的实现与应用
随着大数据和人工智能的发展，网络爬虫技术的重要性将愈加凸显，掌握这一技能将为数据驱动决策和创新提供更为广阔的视野。Python凭借其简洁的语法和强大的库生态系统，成为开发网络爬虫的理想语言。Scrap
阅读更多2024-10-15
vue父子组件传参的方法
父组件中定义需要传递给子组件的数据。
阅读更多2024-10-15
充电宝哪个牌子性价比高？2024实测分享五款热销高质量产品！
这款移动电源采用白色的外观，看起来非常优雅大方，使用起来也很方便，长按电源键 1 秒即可开机，长按 10 秒即可关机，这款电源容量为10000mAh，可为您的手机、平板电脑和其他电子设备提供持久的续航
阅读更多2024-10-15
C++：从小白到基础（一）
C++是基于C进行开发的，不仅有着超高的运行效率，还有着诸多被封装好的库，这大大加强了C++的开发效率。现在C++主要运用在游戏开发，嵌入式系统开发等领域。以下是C++的关键字关于具体介绍请看以下链接
阅读更多2024-10-15
【C++】——AVL树
插入数据要更新平衡因子，如果高度差大于一，就需要通过旋转平衡。和二叉搜索树相似，AVL树多了一个平衡因子。AVL树是在平衡二叉树的基础上改进的。
阅读更多2024-10-15
【C】数据类型与变量（2）
C语言中为了方便运算，提供了一系列操作符，其中有一组操作符叫:算术操作符。,这些操作符都是:操作符也被叫做运算符，是不同的翻译，意思是一样的。
阅读更多2024-10-15

ff_mlp_rematrix_channel_sse4

相关文章