From 5dba7898b6e18973cec3273df30c681c17ac6cb4 Mon Sep 17 00:00:00 2001 From: lizzie Date: Sun, 27 Jul 2025 18:40:16 +0100 Subject: [PATCH] [dynarmic] new optimization option: codesize --- .../dynarmic/backend/x64/emit_x64_vector.cpp | 41 ++++++++++++------- .../dynarmic/interface/optimization_flags.h | 2 + 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp index e9b8866b52..d2e7fc199d 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector.cpp @@ -25,6 +25,7 @@ #include "dynarmic/backend/x64/constants.h" #include "dynarmic/backend/x64/emit_x64.h" #include "dynarmic/common/math_util.h" +#include "dynarmic/interface/optimization_flags.h" #include "dynarmic/ir/basic_block.h" #include "dynarmic/ir/microinstruction.h" #include "dynarmic/ir/opcodes.h" @@ -1009,10 +1010,7 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) { code.gf2p8affineqb(result, code.BConst<64>(xword, 0xaaccf0ff'00000000), 8); ctx.reg_alloc.DefineValue(inst, result); - return; - } - - if (code.HasHostFeature(HostFeature::SSSE3)) { + } else if (code.HasHostFeature(HostFeature::SSSE3)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); @@ -1034,10 +1032,9 @@ void EmitX64::EmitVectorCountLeadingZeros8(EmitContext& ctx, IR::Inst* inst) { code.paddb(data, tmp1); ctx.reg_alloc.DefineValue(inst, data); - return; - } - + } else { EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros); + } } void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) { @@ -1070,10 +1067,7 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) { code.vpshufb(result, result, data); ctx.reg_alloc.DefineValue(inst, result); - return; - } - - if (code.HasHostFeature(HostFeature::SSSE3)) { + } else if (code.HasHostFeature(HostFeature::SSSE3)) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm data = ctx.reg_alloc.UseScratchXmm(args[0]); @@ -1106,10 +1100,9 @@ void EmitX64::EmitVectorCountLeadingZeros16(EmitContext& ctx, IR::Inst* inst) { code.pshufb(result, data); ctx.reg_alloc.DefineValue(inst, result); - return; - } - + } else { EmitOneArgumentFallback(code, ctx, inst, EmitVectorCountLeadingZeros); + } } void EmitX64::EmitVectorCountLeadingZeros32(EmitContext& ctx, IR::Inst* inst) { @@ -5641,6 +5634,7 @@ static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx, break; } case 32: + // See https://stackoverflow.com/questions/3380785/compute-the-absolute-difference-between-unsigned-integers-using-sse/3527267#3527267 if (code.HasHostFeature(HostFeature::SSE41)) { const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm y = ctx.reg_alloc.UseXmm(args[1]); @@ -5652,7 +5646,23 @@ static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx, } else { const Xbyak::Xmm x = ctx.reg_alloc.UseScratchXmm(args[0]); const Xbyak::Xmm y = ctx.reg_alloc.UseScratchXmm(args[1]); - + if (ctx.HasOptimization(OptimizationFlag::CodeSpeed)) { + // About 45 bytes + const Xbyak::Xmm temp_x = ctx.reg_alloc.ScratchXmm(); + const Xbyak::Xmm temp_y = ctx.reg_alloc.ScratchXmm(); + code.pcmpeqd(temp, temp); + code.pslld(temp, 31); + code.movdqa(temp_x, x); + code.movdqa(temp_y, y); + code.paddd(temp_x, x); + code.paddd(temp_y, y); + code.pcmpgtd(temp_y, temp_x); + code.psubd(x, y); + code.pandn(temp, temp_y); + code.pxor(x, y); + code.psubd(x, y); + } else { + // Smaller code size - about 36 bytes code.movdqa(temp, code.Const(xword, 0x8000000080000000, 0x8000000080000000)); code.pxor(x, temp); code.pxor(y, temp); @@ -5662,6 +5672,7 @@ static void EmitVectorUnsignedAbsoluteDifference(size_t esize, EmitContext& ctx, code.psrld(y, 1); code.pxor(temp, y); code.psubd(temp, y); + } } break; } diff --git a/src/dynarmic/src/dynarmic/interface/optimization_flags.h b/src/dynarmic/src/dynarmic/interface/optimization_flags.h index 2f65f0bfa4..743d902767 100644 --- a/src/dynarmic/src/dynarmic/interface/optimization_flags.h +++ b/src/dynarmic/src/dynarmic/interface/optimization_flags.h @@ -32,6 +32,8 @@ enum class OptimizationFlag : std::uint32_t { ConstProp = 0x00000010, /// This is enables miscellaneous safe IR optimizations. MiscIROpt = 0x00000020, + /// Optimize for code speed rather than for code size (this serves well for tight loops) + CodeSpeed = 0x00000040, /// This is an UNSAFE optimization that reduces accuracy of fused multiply-add operations. /// This unfuses fused instructions to improve performance on host CPUs without FMA support.