From 0922c6b01bd50f0ce6e659f765c244f6a8f29eb3 Mon Sep 17 00:00:00 2001
From: James Almer <jamrial@gmail.com>
Date: Thu, 22 Sep 2022 17:10:37 -0300
Subject: [PATCH] x86/lpc: use fused negative multiply-add instructions where
 useful

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/lpc.asm | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/libavcodec/x86/lpc.asm b/libavcodec/x86/lpc.asm
index 61a5796e5dc..a585c17ef58 100644
--- a/libavcodec/x86/lpc.asm
+++ b/libavcodec/x86/lpc.asm
@@ -79,11 +79,12 @@ cglobal lpc_apply_welch_window, 3, 5, 8, data, len, out, off1, off2
 
 .loop_o:
     movapd m1, m6
-    mulpd m2, m0, m0
-    subpd m1, m2
 %if cpuflag(avx2)
+    fnmaddpd m1, m0, m0, m1
     vpermpd m2, m1, q0123
 %else
+    mulpd m2, m0, m0
+    subpd m1, m2
     shufpd m2, m1, m1, 01b
 %endif
 
@@ -116,8 +117,12 @@ cglobal lpc_apply_welch_window, 3, 5, 8, data, len, out, off1, off2
 
 .loop_o_scalar:
     movapd xm1, xm6
+%if cpuflag(avx2)
+    fnmaddpd xm1, xm0, xm0, xm1
+%else
     mulpd xm2, xm0, xm0
     subpd xm1, xm2
+%endif
 
     cvtdq2pd xm3, [dataq + off1q]
     cvtdq2pd xm4, [dataq + off2q]
@@ -174,8 +179,12 @@ cglobal lpc_apply_welch_window, 3, 5, 8, data, len, out, off1, off2
 
 .loop_e:
     movapd m1, m6
+%if cpuflag(avx2)
+    fnmaddpd m1, m0, m0, m1
+%else
     mulpd m2, m0, m0
     subpd m1, m2
+%endif
 %if cpuflag(avx2)
     vpermpd m2, m1, q0123
 %else
@@ -210,8 +219,12 @@ cglobal lpc_apply_welch_window, 3, 5, 8, data, len, out, off1, off2
 
 .loop_e_scalar:
     movapd xm1, xm6
+%if cpuflag(avx2)
+    fnmaddpd xm1, xm0, xm0, xm1
+%else
     mulpd xm2, xm0, xm0
     subpd xm1, xm2
+%endif
 
     cvtdq2pd xm3, [dataq + off1q]
     cvtdq2pd xm4, [dataq + off2q]
-- 
GitLab