[dynarmic] vector AVX2 count leading zeroes

lizzie 2025-07-27 18:59:33 +01:00 committed by crueter
parent 0b0f045943
commit c21a4f8041


@@ -1033,7 +1033,7 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(inst, data);
} else {
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u8>);
}
}
@@ -1101,22 +1101,32 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) {
ctx.reg_alloc.DefineValue(inst, result);
} else {
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u16>);
}
}
void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) {
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
if (code.HasHostFeature(HostFeature::AVX512_Ortho | HostFeature::AVX512CD)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
code.vplzcntd(data, data);
ctx.reg_alloc.DefineValue(inst, data);
return;
// See https://stackoverflow.com/questions/58823140/count-leading-zero-bits-for-each-element-in-avx2-vector-emulate-mm256-lzcnt-ep/58827596#58827596
} else if (code.HasHostFeature(HostFeature::AVX2)) {
const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm temp = ctx.reg_alloc.ScratchXmm();
code.vmovdqa(temp, data);
code.vpsrld(data, data, 8);
code.vpandn(data, data, temp); // data = v & ~(v >> 8): keep only the leading bits so int->float rounding cannot carry into the exponent
code.vmovdqa(temp, code.Const(xword, 0x0000009E0000009E, 0x0000009E0000009E)); // 158, the biased exponent of 2^31
code.vcvtdq2ps(data, data); // the float's exponent now encodes the index of the highest set bit
code.vpsrld(data, data, 23); // extract the biased exponent (plus the sign bit for inputs with bit 31 set)
code.vpsubusw(data, temp, data); // saturating 158 - exponent == lzcnt; inputs with bit 31 set convert as negative, exceed 158, and saturate to 0
code.vpminsw(data, data, code.Const(xword, 0x0000002000000020, 0x0000002000000020)); // v == 0 converts to 0.0f (exponent 0, giving 158): clamp to 32
ctx.reg_alloc.DefineValue(inst, data);
} else {
EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros<u32>);
}
}
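
For reference, the AVX2 branch above relies on the float-exponent trick described in the linked Stack Overflow answer: converting an integer to float makes the biased exponent encode the index of the highest set bit. Below is a minimal scalar sketch of the same computation; it is not part of the commit, and the function name is illustrative. It converts as unsigned, so it sidesteps the sign-bit saturation case that the vector code handles with vpsubusw.

    #include <algorithm>
    #include <cstdint>
    #include <cstring>

    // Scalar model of the vmovdqa/vpsrld/vpandn/vcvtdq2ps sequence above.
    std::uint32_t clz32_via_float(std::uint32_t v) {
        // Keep only the leading bits so int->float rounding cannot
        // carry into the exponent (mirrors vpsrld + vpandn).
        v &= ~(v >> 8);
        const float f = static_cast<float>(v);  // vcvtdq2ps
        std::uint32_t bits;
        std::memcpy(&bits, &f, sizeof(bits));
        const std::uint32_t exponent = bits >> 23;  // vpsrld by 23
        // For v != 0 the biased exponent is 127 + floor(log2(v)), so
        // 158 - exponent == 31 - floor(log2(v)) == lzcnt(v).
        const std::uint32_t diff = exponent > 158 ? 0 : 158 - exponent;
        // v == 0 converts to 0.0f (exponent 0, diff 158): clamp to 32.
        return std::min<std::uint32_t>(diff, 32);
    }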
void EmitX64::EmitVectorDeinterleaveEven8(EmitContext& ctx, IR::Inst* inst) {
@@ -5663,15 +5673,15 @@ static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx,
code.psubd(x, y);
} else {
// Smaller code size - about 36 bytes
code.movdqa(temp, code.Const(xword, 0x8000000080000000, 0x8000000080000000));
code.pxor(x, temp); // flip the sign bits so the signed compare below
code.pxor(y, temp); // orders x and y as unsigned values
code.movdqa(temp, x);
code.psubd(temp, y); // temp = x - y (the bias cancels in the subtraction)
code.pcmpgtd(y, x); // y = (y > x) ? 0xFFFFFFFF : 0
code.psrld(y, 1); // mask becomes 0x7FFFFFFF where y > x
code.pxor(temp, y);
code.psubd(temp, y); // (t ^ 0x7FFFFFFF) - 0x7FFFFFFF == -t (mod 2^32): negate where y > x
}
}
break;
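
The SSE2 fallback above is the usual "bias, then signed compare" idiom for unsigned absolute difference. A scalar sketch of what it computes per lane, assuming nothing beyond the instructions already emitted (the function name is illustrative, not part of the commit):

    #include <cstdint>

    // Scalar model of the pxor/psubd/pcmpgtd sequence above.
    std::uint32_t uabd32(std::uint32_t x, std::uint32_t y) {
        const std::uint32_t xs = x ^ 0x80000000u;  // pxor with the sign bit makes
        const std::uint32_t ys = y ^ 0x80000000u;  // a signed compare order unsigned values
        std::uint32_t t = xs - ys;                 // psubd; the bias cancels, t == x - y
        const std::uint32_t gt =
            static_cast<std::int32_t>(ys) > static_cast<std::int32_t>(xs)
                ? 0xFFFFFFFFu : 0u;                // pcmpgtd
        const std::uint32_t m = gt >> 1;           // psrld by 1: 0x7FFFFFFF where y > x
        // (t ^ 0x7FFFFFFF) - 0x7FFFFFFF == -t (mod 2^32), so this negates t
        // exactly when y > x, yielding |x - y|.
        return (t ^ m) - m;
    }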
@@ -5738,10 +5748,7 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
code.vpmulld(result, x, y);
ctx.reg_alloc.DefineValue(lower_inst, result);
return;
} else if (code.HasHostFeature(HostFeature::AVX)) {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
@@ -5760,39 +5767,33 @@ void EmitX64::EmitVectorUnsignedMultiply32(EmitContext& ctx, IR::Inst* inst) {
code.shufps(result, x, 0b11011101);
ctx.reg_alloc.DefineValue(upper_inst, result);
return;
} else {
const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]);
const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm upper_result = upper_inst ? ctx.reg_alloc.ScratchXmm() : Xbyak::Xmm{-1};
const Xbyak::Xmm lower_result = lower_inst ? ctx.reg_alloc.ScratchXmm() : Xbyak::Xmm{-1};
// calculate unsigned multiply
code.movdqa(tmp, x);
code.pmuludq(tmp, y); // 64-bit products of the even lanes (0 and 2)
code.psrlq(x, 32);
code.psrlq(y, 32); // move the odd lanes (1 and 3) down
code.pmuludq(x, y); // 64-bit products of the odd lanes
// put everything into place - only if needed
if (upper_inst) code.pcmpeqw(upper_result, upper_result);
if (lower_inst) code.pcmpeqw(lower_result, lower_result);
if (upper_inst) code.psllq(upper_result, 32);
if (lower_inst) code.psrlq(lower_result, 32);
if (upper_inst) code.pand(upper_result, x);
if (lower_inst) code.pand(lower_result, tmp);
if (upper_inst) code.psrlq(tmp, 32);
if (lower_inst) code.psllq(x, 32);
if (upper_inst) code.por(upper_result, tmp);
if (lower_inst) code.por(lower_result, x);
if (upper_inst) ctx.reg_alloc.DefineValue(upper_inst, upper_result);
if (lower_inst) ctx.reg_alloc.DefineValue(lower_inst, lower_result);
}
}
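
As a design note, the rewritten tail emits each mask/merge instruction only when the corresponding half of the result was actually requested, instead of always computing both halves and discarding one. A scalar sketch of what the pmuludq/shift/mask dance computes per lane (illustrative, not part of the commit):

    #include <array>
    #include <cstdint>

    using Vec128 = std::array<std::uint32_t, 4>;

    // Scalar model of the SSE2 path above: pmuludq widens the even lanes
    // (0 and 2) to 64-bit products, psrlq by 32 exposes the odd lanes
    // (1 and 3) for a second pmuludq, and the all-ones masks shifted by 32
    // (psllq/psrlq) split each 64-bit product into its low and high halves
    // before por merges them back into lane order.
    void unsigned_multiply_32(const Vec128& x, const Vec128& y,
                              Vec128& lower, Vec128& upper) {
        for (int lane = 0; lane < 4; ++lane) {
            const std::uint64_t product =
                static_cast<std::uint64_t>(x[lane]) * y[lane];
            lower[lane] = static_cast<std::uint32_t>(product);        // pand + por
            upper[lane] = static_cast<std::uint32_t>(product >> 32);  // psllq/psrlq + por
        }
    }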