From 71241b7560ea4bb2955a57db6688ce2bc01ede9c Mon Sep 17 00:00:00 2001 From: lizzie Date: Wed, 22 Oct 2025 12:23:13 +0200 Subject: [PATCH] [dynarmic] remove bloated LUT usage that makes giant functions for no good reason (#2801) Long story short, each of these LUTs generates a function that is only run once, spams the entire code with a bunch of little "specialized templates" (remember, N * M * L) and basically bloats codesize for no good reason Additionally it tries to outsmart the compiler, which is not always a good idea - herein I replace most (except 1) of those "helper" LUTs with something the compiler can actually work with satisfactorily Happy reminder of the sheer amount of functions spammed: mcl::list thing generates a number from 0 to 63 for (fsize 64), add the 0-31 for fsize 32, 0-15 for fsize 16 and you got around... 64+32+16 = 112 functions for a single parameter Now include in the equation rounding_mode, which is about, what, 6? so 112*6 = 672 Yeah Signed-off-by: lizzie Reviewed-on: https://git.eden-emu.dev/eden-emu/eden/pulls/2801 Reviewed-by: crueter Reviewed-by: Shinmegumi Co-authored-by: lizzie Co-committed-by: lizzie --- .../emit_arm64_vector_floating_point.cpp | 71 ++++-- .../backend/x64/emit_x64_floating_point.cpp | 128 ++++------ .../x64/emit_x64_vector_floating_point.cpp | 241 +++++++++--------- .../src/dynarmic/common/fp/rounding_mode.h | 3 + .../src/dynarmic/common/lut_from_list.h | 2 - 5 files changed, 225 insertions(+), 220 deletions(-) diff --git a/src/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp index e0a53bf9e4..81cfc0b2db 100644 --- a/src/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp +++ b/src/dynarmic/src/dynarmic/backend/arm64/emit_arm64_vector_floating_point.cpp @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + /* This file is part 
of the dynarmic project. * Copyright (c) 2022 MerryMage * SPDX-License-Identifier: 0BSD @@ -566,37 +569,49 @@ void EmitIR(oaknut::CodeGenerator& code, E EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FRECPS(Vresult, Va, Vb); }); } +/// @brief Assembly thunk "parameters" are in template parameters +/// TODO: we have space for a 5th parameter? :) +template +static void EmitIRVectorRoundInt16Thunk(VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { + for (size_t i = 0; i < output.size(); ++i) + output[i] = FPT(FP::FPRoundInt(input[i], fpcr, rounding_mode, exact, fpsr)); +} + template<> void EmitIR(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) { - const auto rounding = static_cast(inst->GetArg(1).GetU8()); + const auto rounding = FP::RoundingMode(inst->GetArg(1).GetU8()); const bool exact = inst->GetArg(2).GetU1(); - - using rounding_list = mp::list< - mp::lift_value, - mp::lift_value, - mp::lift_value, - mp::lift_value, - mp::lift_value>; - using exact_list = mp::list; - - static const auto lut = Common::GenerateLookupTableFromList( - [](I) { - using FPT = u16; - return std::pair{ - mp::lower_to_tuple_v, - Common::FptrCast( - [](VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { - constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value; - constexpr bool exact = mp::get<1, I>::value; - - for (size_t i = 0; i < output.size(); ++i) { - output[i] = static_cast(FP::FPRoundInt(input[i], fpcr, rounding_mode, exact, fpsr)); - } - })}; - }, - mp::cartesian_product{}); - - EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(rounding, exact))); + // Don't even think about making this a LUT -- it's bad + using FPT = u16; // Yes it's u16, no fsize madness + switch (rounding) { + case FP::RoundingMode::ToNearest_TieEven: + exact + ? 
EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk) + : EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk); + break; + case FP::RoundingMode::TowardsPlusInfinity: + exact + ? EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk) + : EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk); + break; + case FP::RoundingMode::TowardsMinusInfinity: + exact + ? EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk) + : EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk); + break; + case FP::RoundingMode::TowardsZero: + exact + ? EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk) + : EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk); + break; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + exact + ? EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk) + : EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk); + break; + default: + UNREACHABLE(); + } } template<> diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp index 47e51acb03..a2fe4001a9 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_floating_point.cpp @@ -1071,57 +1071,42 @@ void EmitX64::EmitFPRecipStepFused64(EmitContext& ctx, IR::Inst* inst) { EmitFPRecipStepFused<64>(code, ctx, inst); } +/// @brief An assembly thunk for the general rounding of FP +/// Consciously making this be a template, because the cost of comparing "fsize" for every +/// round is not worth it. 
Remember we are making only 3 copies (u16,u32,u64) +/// Extra args layout (bit ranges of extra_args, as packed by EmitFPRound): +/// 0-7 rounding_mode +/// 8-15 exact (!= 0) +template +static u64 EmitFPRoundThunk(u64 input, FP::FPSR& fpsr, FP::FPCR fpcr, u32 extra_args) { + auto const exact = ((extra_args >> 8) & 0xff) != 0; + auto const rounding_mode = FP::RoundingMode((extra_args >> 0) & 0xff); + return FP::FPRoundInt(FPT(input), fpcr, rounding_mode, exact, fpsr); +} + static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, size_t fsize) { - const auto rounding_mode = static_cast(inst->GetArg(1).GetU8()); - const bool exact = inst->GetArg(2).GetU1(); - const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode); - + auto const rounding_mode = FP::RoundingMode(inst->GetArg(1).GetU8()); + bool const exact = inst->GetArg(2).GetU1(); + auto const round_imm = ConvertRoundingModeToX64Immediate(rounding_mode); if (fsize != 16 && code.HasHostFeature(HostFeature::SSE41) && round_imm && !exact) { - if (fsize == 64) { - FPTwoOp<64>(code, ctx, inst, [&](Xbyak::Xmm result) { - code.roundsd(result, result, *round_imm); - }); - } else { - FPTwoOp<32>(code, ctx, inst, [&](Xbyak::Xmm result) { - code.roundss(result, result, *round_imm); - }); + fsize == 64 + ? 
FPTwoOp<64>(code, ctx, inst, [&](Xbyak::Xmm result) { code.roundsd(result, result, *round_imm); }) + : FPTwoOp<32>(code, ctx, inst, [&](Xbyak::Xmm result) { code.roundss(result, result, *round_imm); }); + } else { + auto args = ctx.reg_alloc.GetArgumentInfo(inst); + // See EmitFPRoundThunk + auto const extra_args = u32(rounding_mode) | (u32(exact) << 8); + ctx.reg_alloc.HostCall(inst, args[0]); + code.lea(code.ABI_PARAM2, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]); + code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value()); + code.mov(code.ABI_PARAM4.cvt32(), extra_args); + switch (fsize) { + case 64: code.CallFunction(EmitFPRoundThunk); break; + case 32: code.CallFunction(EmitFPRoundThunk); break; + case 16: code.CallFunction(EmitFPRoundThunk); break; + default: UNREACHABLE(); } - - return; } - - using fsize_list = mp::list, - mp::lift_value, - mp::lift_value>; - using rounding_list = mp::list< - mp::lift_value, - mp::lift_value, - mp::lift_value, - mp::lift_value, - mp::lift_value>; - using exact_list = mp::list; - - static const auto lut = Common::GenerateLookupTableFromList( - [](I) { - return std::pair{ - mp::lower_to_tuple_v, - Common::FptrCast( - [](u64 input, FP::FPSR& fpsr, FP::FPCR fpcr) { - constexpr size_t fsize = mp::get<0, I>::value; - constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value; - constexpr bool exact = mp::get<2, I>::value; - using InputSize = mcl::unsigned_integer_of_size; - - return FP::FPRoundInt(static_cast(input), fpcr, rounding_mode, exact, fpsr); - })}; - }, - mp::cartesian_product{}); - - auto args = ctx.reg_alloc.GetArgumentInfo(inst); - ctx.reg_alloc.HostCall(inst, args[0]); - code.lea(code.ABI_PARAM2, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]); - code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value()); - code.CallFunction(lut.at(std::make_tuple(fsize, rounding_mode, exact))); } void EmitX64::EmitFPRoundInt16(EmitContext& ctx, IR::Inst* inst) { @@ -1621,12 +1606,29 @@ 
void EmitX64::EmitFPDoubleToSingle(EmitContext& ctx, IR::Inst* inst) { } } +/// @brief Thunk used in assembly +/// The extra args is conformed as follows: +/// 0-7 fbits +/// 8-15 rounding +/// 16-23 isize +/// 24-31 unsigned_ (!= 0) +/// Better than spamming thousands of templates aye? +template +static u64 EmitFPToFixedThunk(u64 input, FP::FPSR& fpsr, FP::FPCR fpcr, u32 extra_args) { + using FPT = mcl::unsigned_integer_of_size; + auto const unsigned_ = ((extra_args >> 24) & 0xff) != 0; + auto const isize = ((extra_args >> 16) & 0xff); + auto const rounding = FP::RoundingMode((extra_args >> 8) & 0xff); + auto const fbits = ((extra_args >> 0) & 0xff); + return FP::FPToFixed(isize, FPT(input), fbits, unsigned_, fpcr, rounding, fpsr); +} + template static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const size_t fbits = args[1].GetImmediateU8(); - const auto rounding_mode = static_cast(args[2].GetImmediateU8()); + const auto rounding_mode = FP::RoundingMode(args[2].GetImmediateU8()); if constexpr (fsize != 16) { const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode); @@ -1732,34 +1734,14 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { return; } } - - using fbits_list = mp::lift_sequence>; - using rounding_list = mp::list< - mp::lift_value, - mp::lift_value, - mp::lift_value, - mp::lift_value, - mp::lift_value>; - - static const auto lut = Common::GenerateLookupTableFromList( - [](I) { - return std::pair{ - mp::lower_to_tuple_v, - Common::FptrCast( - [](u64 input, FP::FPSR& fpsr, FP::FPCR fpcr) { - constexpr size_t fbits = mp::get<0, I>::value; - constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value; - using FPT = mcl::unsigned_integer_of_size; - - return FP::FPToFixed(isize, static_cast(input), fbits, unsigned_, fpcr, rounding_mode, fpsr); - })}; - }, - mp::cartesian_product{}); - + // See EmitFPToFixedThunk + auto const 
extra_args = (u32(unsigned_) << 24) | (u32(isize) << 16) + | (u32(rounding_mode) << 8) | (u32(fbits)); ctx.reg_alloc.HostCall(inst, args[0]); code.lea(code.ABI_PARAM2, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]); code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value()); - code.CallFunction(lut.at(std::make_tuple(fbits, rounding_mode))); + code.mov(code.ABI_PARAM4.cvt32(), extra_args); + code.CallFunction(EmitFPToFixedThunk); } void EmitX64::EmitFPDoubleToFixedS16(EmitContext& ctx, IR::Inst* inst) { diff --git a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp index a368e6703f..66a179a481 100644 --- a/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/dynarmic/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp @@ -439,9 +439,9 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i ctx.reg_alloc.DefineValue(inst, result); } -template -void EmitTwoOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Lambda lambda, bool fpcr_controlled) { - const auto fn = static_cast*>(lambda); +template +void EmitTwoOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, F lambda, bool fpcr_controlled) { + const auto fn = static_cast*>(lambda); const u32 fpcr = ctx.FPCR(fpcr_controlled).Value(); @@ -459,8 +459,8 @@ void EmitTwoOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak code.add(rsp, stack_space + ABI_SHADOW_SPACE); } -template -void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) { +template +void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, F lambda) { auto args = ctx.reg_alloc.GetArgumentInfo(inst); const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]); const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm(); @@ -665,8 
+665,14 @@ void EmitX64::EmitFPVectorEqual64(EmitContext& ctx, IR::Inst* inst) { ctx.reg_alloc.DefineValue(inst, a); } +template +static void EmitFPVectorFromHalf32Thunk(VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { + for (size_t i = 0; i < output.size(); ++i) + output[i] = FP::FPConvert(input[i], fpcr, rounding_mode, fpsr); +} + void EmitX64::EmitFPVectorFromHalf32(EmitContext& ctx, IR::Inst* inst) { - const auto rounding_mode = static_cast(inst->GetArg(1).GetU8()); + const auto rounding_mode = FP::RoundingMode(inst->GetArg(1).GetU8()); const bool fpcr_controlled = inst->GetArg(2).GetU1(); if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) { @@ -679,32 +685,27 @@ void EmitX64::EmitFPVectorFromHalf32(EmitContext& ctx, IR::Inst* inst) { ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result); ctx.reg_alloc.DefineValue(inst, result); - return; + } else { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorFromHalf32Thunk); + break; + case FP::RoundingMode::TowardsPlusInfinity: + EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorFromHalf32Thunk); + break; + case FP::RoundingMode::TowardsMinusInfinity: + EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorFromHalf32Thunk); + break; + case FP::RoundingMode::TowardsZero: + EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorFromHalf32Thunk); + break; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorFromHalf32Thunk); + break; + default: + UNREACHABLE(); + } } - - using rounding_list = mp::list< - mp::lift_value, - mp::lift_value, - mp::lift_value, - mp::lift_value, - mp::lift_value>; - - static const auto lut = Common::GenerateLookupTableFromList( - [](I) { - return std::pair{ - mp::lower_to_tuple_v, - Common::FptrCast( - [](VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { - constexpr 
FP::RoundingMode rounding_mode = mp::get<0, I>::value; - - for (size_t i = 0; i < output.size(); ++i) { - output[i] = FP::FPConvert(input[i], fpcr, rounding_mode, fpsr); - } - })}; - }, - mp::cartesian_product{}); - - EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode))); } void EmitX64::EmitFPVectorFromSignedFixed32(EmitContext& ctx, IR::Inst* inst) { @@ -1681,25 +1682,26 @@ void EmitX64::EmitFPVectorRecipStepFused64(EmitContext& ctx, IR::Inst* inst) { EmitRecipStepFused<64>(code, ctx, inst); } +template +static void EmitFPVectorRoundIntThunk(VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { + for (size_t i = 0; i < output.size(); ++i) + output[i] = FPT(FP::FPRoundInt(input[i], fpcr, rounding_mode, exact, fpsr)); +} + template void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { - const auto rounding = static_cast(inst->GetArg(1).GetU8()); + const auto rounding = FP::RoundingMode(inst->GetArg(1).GetU8()); const bool exact = inst->GetArg(2).GetU1(); if constexpr (fsize != 16) { if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) { const u8 round_imm = [&]() -> u8 { switch (rounding) { - case FP::RoundingMode::ToNearest_TieEven: - return 0b00; - case FP::RoundingMode::TowardsPlusInfinity: - return 0b10; - case FP::RoundingMode::TowardsMinusInfinity: - return 0b01; - case FP::RoundingMode::TowardsZero: - return 0b11; - default: - UNREACHABLE(); + case FP::RoundingMode::ToNearest_TieEven: return 0b00; + case FP::RoundingMode::TowardsPlusInfinity: return 0b10; + case FP::RoundingMode::TowardsMinusInfinity: return 0b01; + case FP::RoundingMode::TowardsZero: return 0b11; + default: UNREACHABLE(); } }(); @@ -1711,32 +1713,37 @@ void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { } } - using rounding_list = mp::list< - mp::lift_value, - mp::lift_value, - mp::lift_value, - mp::lift_value, - 
mp::lift_value>; - using exact_list = mp::list; - - static const auto lut = Common::GenerateLookupTableFromList( - [](I) { - using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC - return std::pair{ - mp::lower_to_tuple_v, - Common::FptrCast( - [](VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { - constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value; - constexpr bool exact = mp::get<1, I>::value; - - for (size_t i = 0; i < output.size(); ++i) { - output[i] = static_cast(FP::FPRoundInt(input[i], fpcr, rounding_mode, exact, fpsr)); - } - })}; - }, - mp::cartesian_product{}); - - EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(rounding, exact))); + // Do not make a LUT out of this, let the compiler do its thing + using FPT = mcl::unsigned_integer_of_size; + switch (rounding) { + case FP::RoundingMode::ToNearest_TieEven: + exact + ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk) + : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk); + break; + case FP::RoundingMode::TowardsPlusInfinity: + exact + ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk) + : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk); + break; + case FP::RoundingMode::TowardsMinusInfinity: + exact + ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk) + : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk); + break; + case FP::RoundingMode::TowardsZero: + exact + ? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk) + : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk); + break; + case FP::RoundingMode::ToNearest_TieAwayFromZero: + exact + ? 
EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk) + : EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk); + break; + default: + UNREACHABLE(); + } } void EmitX64::EmitFPVectorRoundInt16(EmitContext& ctx, IR::Inst* inst) { @@ -1965,8 +1972,14 @@ void EmitX64::EmitFPVectorSub64(EmitContext& ctx, IR::Inst* inst) { EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::subpd); } +template +static void EmitFPVectorToHalf32Thunk(VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { + for (size_t i = 0; i < output.size(); ++i) + output[i] = i < input.size() ? FP::FPConvert(input[i], fpcr, rounding_mode, fpsr) : 0; +} + void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) { - const auto rounding_mode = static_cast(inst->GetArg(1).GetU8()); + const auto rounding_mode = FP::RoundingMode(inst->GetArg(1).GetU8()); const bool fpcr_controlled = inst->GetArg(2).GetU1(); if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) { @@ -1976,41 +1989,39 @@ void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) { const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]); ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result); - code.vcvtps2ph(result, result, static_cast(*round_imm)); + code.vcvtps2ph(result, result, u8(*round_imm)); ctx.reg_alloc.DefineValue(inst, result); - return; + } else { + switch (rounding_mode) { + case FP::RoundingMode::ToNearest_TieEven: + EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorToHalf32Thunk); + break; + case FP::RoundingMode::TowardsPlusInfinity: + EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorToHalf32Thunk); + break; + case FP::RoundingMode::TowardsMinusInfinity: + EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorToHalf32Thunk); + break; + case FP::RoundingMode::TowardsZero: + EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorToHalf32Thunk); + break; + case 
FP::RoundingMode::ToNearest_TieAwayFromZero: + EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorToHalf32Thunk); + break; + default: + UNREACHABLE(); + } } - - using rounding_list = mp::list< - mp::lift_value, - mp::lift_value, - mp::lift_value, - mp::lift_value, - mp::lift_value>; - - static const auto lut = Common::GenerateLookupTableFromList( - [](I) { - return std::pair{ - mp::lower_to_tuple_v, - Common::FptrCast( - [](VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { - constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value; - - for (size_t i = 0; i < output.size(); ++i) { - if (i < input.size()) { - output[i] = FP::FPConvert(input[i], fpcr, rounding_mode, fpsr); - } else { - output[i] = 0; - } - } - })}; - }, - mp::cartesian_product{}); - - EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode))); } +// Assembly thunk; just remember not to specialise too much otherwise i-cache death! +// template +// static void EmitFPVectorToFixedThunk(VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { +// for (size_t i = 0; i < output.size(); ++i) +// output[i] = FPT(FP::FPToFixed(fsize, input[i], fbits, unsigned_, fpcr, rounding_mode, fpsr)); +// } + template void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { const size_t fbits = inst->GetArg(1).GetU8(); @@ -2108,7 +2119,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { FCODE(andp)(tmp, xmm0); FCODE(subp)(src, tmp); perform_conversion(src); - ICODE(psll)(xmm0, static_cast(fsize - 1)); + ICODE(psll)(xmm0, u8(fsize - 1)); FCODE(orp)(src, xmm0); // Saturate to max @@ -2116,7 +2127,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { } } else { using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC - constexpr u64 integer_max = static_cast((std::numeric_limits>>::max)()); + constexpr u64 integer_max = 
FPT((std::numeric_limits>>::max)()); code.movaps(xmm0, GetVectorOf(code)); FCODE(cmplep)(xmm0, src); @@ -2138,22 +2149,18 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) { mp::lift_value, mp::lift_value>; - static const auto lut = Common::GenerateLookupTableFromList( - [](I) { - using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC - return std::pair{ - mp::lower_to_tuple_v, - Common::FptrCast( - [](VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { - constexpr size_t fbits = mp::get<0, I>::value; - constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value; - - for (size_t i = 0; i < output.size(); ++i) { - output[i] = static_cast(FP::FPToFixed(fsize, input[i], fbits, unsigned_, fpcr, rounding_mode, fpsr)); - } - })}; - }, - mp::cartesian_product{}); + static const auto lut = Common::GenerateLookupTableFromList([](I) { + using FPT = mcl::unsigned_integer_of_size; // WORKAROUND: For issue 678 on MSVC + return std::pair{ + mp::lower_to_tuple_v, + Common::FptrCast([](VectorArray& output, const VectorArray& input, FP::FPCR fpcr, FP::FPSR& fpsr) { + constexpr size_t fbits = mp::get<0, I>::value; + constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value; + for (size_t i = 0; i < output.size(); ++i) + output[i] = FPT(FP::FPToFixed(fsize, input[i], fbits, unsigned_, fpcr, rounding_mode, fpsr)); + }) + }; + }, mp::cartesian_product{}); EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(fbits, rounding))); } diff --git a/src/dynarmic/src/dynarmic/common/fp/rounding_mode.h b/src/dynarmic/src/dynarmic/common/fp/rounding_mode.h index fcf36f6100..6643a8b512 100644 --- a/src/dynarmic/src/dynarmic/common/fp/rounding_mode.h +++ b/src/dynarmic/src/dynarmic/common/fp/rounding_mode.h @@ -1,3 +1,6 @@ +// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project +// SPDX-License-Identifier: GPL-3.0-or-later + /* This file is part of the dynarmic project. 
* Copyright (c) 2018 MerryMage * SPDX-License-Identifier: 0BSD diff --git a/src/dynarmic/src/dynarmic/common/lut_from_list.h b/src/dynarmic/src/dynarmic/common/lut_from_list.h index c904e2c041..633b62aeda 100644 --- a/src/dynarmic/src/dynarmic/common/lut_from_list.h +++ b/src/dynarmic/src/dynarmic/common/lut_from_list.h @@ -40,9 +40,7 @@ inline auto GenerateLookupTableFromList(Function f, mcl::mp::list) { using PairT = std::common_type_t...>; #endif using MapT = mcl::mp::apply; - static_assert(mcl::is_instance_of_template_v); - const std::initializer_list pair_array{f(Values{})...}; return MapT(pair_array.begin(), pair_array.end()); }