[dynarmic] remove bloated LUT usage that makes giant functions for no good reason (#2801)
Long story short, each of these LUTs generates functions that are only run once, spams the entire code with a bunch of little "specialized templates" (remember, N * M * L) and basically bloats code size for no good reason. Additionally, it tries to outsmart the compiler, which is not always a good idea - herein I replace most (all except 1) of those "helper" LUTs with something the compiler can actually work with satisfactorily. Happy reminder of the sheer number of functions spammed: the mcl::list thing generates a number from 0 to 63 for fsize 64; add 0-31 for fsize 32 and 0-15 for fsize 16 and you get around... 64+32+16 = 112 functions for a single parameter. Now include rounding_mode in the equation, which is about, what, 6? So 112*6 = 672. Yeah. Signed-off-by: lizzie <lizzie@eden-emu.dev> Reviewed-on: #2801 Reviewed-by: crueter <crueter@eden-emu.dev> Reviewed-by: Shinmegumi <shinmegumi@eden-emu.dev> Co-authored-by: lizzie <lizzie@eden-emu.dev> Co-committed-by: lizzie <lizzie@eden-emu.dev>
This commit is contained in:
parent
e0c554976a
commit
71241b7560
5 changed files with 225 additions and 220 deletions
|
@ -1,3 +1,6 @@
|
|||
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
/* This file is part of the dynarmic project.
|
||||
* Copyright (c) 2022 MerryMage
|
||||
* SPDX-License-Identifier: 0BSD
|
||||
|
@ -566,37 +569,49 @@ void EmitIR<IR::Opcode::FPVectorRecipStepFused64>(oaknut::CodeGenerator& code, E
|
|||
EmitThreeOpArranged<64>(code, ctx, inst, [&](auto Vresult, auto Va, auto Vb) { code.FRECPS(Vresult, Va, Vb); });
|
||||
}
|
||||
|
||||
/// @brief Assembly thunk "parameters" are in template parameters
|
||||
/// TODO: we have space for a 5th parameter? :)
|
||||
template<typename FPT, FP::RoundingMode rounding_mode, bool exact>
|
||||
static void EmitIRVectorRoundInt16Thunk(VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
|
||||
for (size_t i = 0; i < output.size(); ++i)
|
||||
output[i] = FPT(FP::FPRoundInt<FPT>(input[i], fpcr, rounding_mode, exact, fpsr));
|
||||
}
|
||||
|
||||
template<>
|
||||
void EmitIR<IR::Opcode::FPVectorRoundInt16>(oaknut::CodeGenerator& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
|
||||
const auto rounding = FP::RoundingMode(inst->GetArg(1).GetU8());
|
||||
const bool exact = inst->GetArg(2).GetU1();
|
||||
|
||||
using rounding_list = mp::list<
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsZero>,
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
|
||||
using exact_list = mp::list<std::true_type, std::false_type>;
|
||||
|
||||
static const auto lut = Common::GenerateLookupTableFromList(
|
||||
[]<typename I>(I) {
|
||||
using FPT = u16;
|
||||
return std::pair{
|
||||
mp::lower_to_tuple_v<I>,
|
||||
Common::FptrCast(
|
||||
[](VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
|
||||
constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value;
|
||||
constexpr bool exact = mp::get<1, I>::value;
|
||||
|
||||
for (size_t i = 0; i < output.size(); ++i) {
|
||||
output[i] = static_cast<FPT>(FP::FPRoundInt<FPT>(input[i], fpcr, rounding_mode, exact, fpsr));
|
||||
// Don't even think about making this a LUT -- it's bad
|
||||
using FPT = u16; // Yes it's u16, no fsize madness
|
||||
switch (rounding) {
|
||||
case FP::RoundingMode::ToNearest_TieEven:
|
||||
exact
|
||||
? EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk<FPT, FP::RoundingMode::ToNearest_TieEven, true>)
|
||||
: EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk<FPT, FP::RoundingMode::ToNearest_TieEven, false>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsPlusInfinity:
|
||||
exact
|
||||
? EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk<FPT, FP::RoundingMode::TowardsPlusInfinity, true>)
|
||||
: EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk<FPT, FP::RoundingMode::TowardsPlusInfinity, false>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsMinusInfinity:
|
||||
exact
|
||||
? EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk<FPT, FP::RoundingMode::TowardsMinusInfinity, true>)
|
||||
: EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk<FPT, FP::RoundingMode::TowardsMinusInfinity, false>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsZero:
|
||||
exact
|
||||
? EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk<FPT, FP::RoundingMode::TowardsZero, true>)
|
||||
: EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk<FPT, FP::RoundingMode::TowardsZero, false>);
|
||||
break;
|
||||
case FP::RoundingMode::ToNearest_TieAwayFromZero:
|
||||
exact
|
||||
? EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk<FPT, FP::RoundingMode::ToNearest_TieAwayFromZero, true>)
|
||||
: EmitTwoOpFallback<3>(code, ctx, inst, EmitIRVectorRoundInt16Thunk<FPT, FP::RoundingMode::ToNearest_TieAwayFromZero, false>);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE();
|
||||
}
|
||||
})};
|
||||
},
|
||||
mp::cartesian_product<rounding_list, exact_list>{});
|
||||
|
||||
EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(rounding, exact)));
|
||||
}
|
||||
|
||||
template<>
|
||||
|
|
|
@ -1071,57 +1071,42 @@ void EmitX64::EmitFPRecipStepFused64(EmitContext& ctx, IR::Inst* inst) {
|
|||
EmitFPRecipStepFused<64>(code, ctx, inst);
|
||||
}
|
||||
|
||||
/// @brief An assembly thunk for the general rounding of FP
|
||||
/// Conciously making this be a template, because the cost of comparing "fsize" for every
|
||||
/// round is not worth it. Remember we are making only 3 copies (u16,u32,u64)
|
||||
/// Extra args layout:
|
||||
/// 0-7 rounding_mode
|
||||
/// 8-15 exact (!= 0)
|
||||
template<typename FPT>
|
||||
static u64 EmitFPRoundThunk(u64 input, FP::FPSR& fpsr, FP::FPCR fpcr, u32 extra_args) {
|
||||
auto const exact = ((extra_args >> 16) & 0xff) != 0;
|
||||
auto const rounding_mode = FP::RoundingMode((extra_args >> 8) & 0xff);
|
||||
return FP::FPRoundInt<FPT>(FPT(input), fpcr, rounding_mode, exact, fpsr);
|
||||
}
|
||||
|
||||
static void EmitFPRound(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, size_t fsize) {
|
||||
const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
|
||||
const bool exact = inst->GetArg(2).GetU1();
|
||||
const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
|
||||
|
||||
auto const rounding_mode = FP::RoundingMode(inst->GetArg(1).GetU8());
|
||||
bool const exact = inst->GetArg(2).GetU1();
|
||||
auto const round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
|
||||
if (fsize != 16 && code.HasHostFeature(HostFeature::SSE41) && round_imm && !exact) {
|
||||
if (fsize == 64) {
|
||||
FPTwoOp<64>(code, ctx, inst, [&](Xbyak::Xmm result) {
|
||||
code.roundsd(result, result, *round_imm);
|
||||
});
|
||||
fsize == 64
|
||||
? FPTwoOp<64>(code, ctx, inst, [&](Xbyak::Xmm result) { code.roundsd(result, result, *round_imm); })
|
||||
: FPTwoOp<32>(code, ctx, inst, [&](Xbyak::Xmm result) { code.roundss(result, result, *round_imm); });
|
||||
} else {
|
||||
FPTwoOp<32>(code, ctx, inst, [&](Xbyak::Xmm result) {
|
||||
code.roundss(result, result, *round_imm);
|
||||
});
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
using fsize_list = mp::list<mp::lift_value<size_t(16)>,
|
||||
mp::lift_value<size_t(32)>,
|
||||
mp::lift_value<size_t(64)>>;
|
||||
using rounding_list = mp::list<
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsZero>,
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
|
||||
using exact_list = mp::list<std::true_type, std::false_type>;
|
||||
|
||||
static const auto lut = Common::GenerateLookupTableFromList(
|
||||
[]<typename I>(I) {
|
||||
return std::pair{
|
||||
mp::lower_to_tuple_v<I>,
|
||||
Common::FptrCast(
|
||||
[](u64 input, FP::FPSR& fpsr, FP::FPCR fpcr) {
|
||||
constexpr size_t fsize = mp::get<0, I>::value;
|
||||
constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value;
|
||||
constexpr bool exact = mp::get<2, I>::value;
|
||||
using InputSize = mcl::unsigned_integer_of_size<fsize>;
|
||||
|
||||
return FP::FPRoundInt<InputSize>(static_cast<InputSize>(input), fpcr, rounding_mode, exact, fpsr);
|
||||
})};
|
||||
},
|
||||
mp::cartesian_product<fsize_list, rounding_list, exact_list>{});
|
||||
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
// See EmitFPRoundThunk
|
||||
auto const extra_args = u32(rounding_mode) | (u32(exact) << 8);
|
||||
ctx.reg_alloc.HostCall(inst, args[0]);
|
||||
code.lea(code.ABI_PARAM2, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
|
||||
code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
|
||||
code.CallFunction(lut.at(std::make_tuple(fsize, rounding_mode, exact)));
|
||||
code.mov(code.ABI_PARAM4.cvt32(), extra_args);
|
||||
switch (fsize) {
|
||||
case 64: code.CallFunction(EmitFPRoundThunk<u64>); break;
|
||||
case 32: code.CallFunction(EmitFPRoundThunk<u32>); break;
|
||||
case 16: code.CallFunction(EmitFPRoundThunk<u16>); break;
|
||||
default: UNREACHABLE();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPRoundInt16(EmitContext& ctx, IR::Inst* inst) {
|
||||
|
@ -1621,12 +1606,29 @@ void EmitX64::EmitFPDoubleToSingle(EmitContext& ctx, IR::Inst* inst) {
|
|||
}
|
||||
}
|
||||
|
||||
/// @brief Thunk used in assembly
|
||||
/// The extra args is conformed as follows:
|
||||
/// 0-7 fbits
|
||||
/// 8-15 rounding
|
||||
/// 16-23 isize
|
||||
/// 24-31 unsigned_ (!= 0)
|
||||
/// Better than spamming thousands of templates aye?
|
||||
template<size_t fsize>
|
||||
static u64 EmitFPToFixedThunk(u64 input, FP::FPSR& fpsr, FP::FPCR fpcr, u32 extra_args) {
|
||||
using FPT = mcl::unsigned_integer_of_size<fsize>;
|
||||
auto const unsigned_ = ((extra_args >> 24) & 0xff) != 0;
|
||||
auto const isize = ((extra_args >> 16) & 0xff);
|
||||
auto const rounding = FP::RoundingMode((extra_args >> 8) & 0xff);
|
||||
auto const fbits = ((extra_args >> 0) & 0xff);
|
||||
return FP::FPToFixed<FPT>(isize, FPT(input), fbits, unsigned_, fpcr, rounding, fpsr);
|
||||
}
|
||||
|
||||
template<size_t fsize, bool unsigned_, size_t isize>
|
||||
static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
|
||||
const size_t fbits = args[1].GetImmediateU8();
|
||||
const auto rounding_mode = static_cast<FP::RoundingMode>(args[2].GetImmediateU8());
|
||||
const auto rounding_mode = FP::RoundingMode(args[2].GetImmediateU8());
|
||||
|
||||
if constexpr (fsize != 16) {
|
||||
const auto round_imm = ConvertRoundingModeToX64Immediate(rounding_mode);
|
||||
|
@ -1732,34 +1734,14 @@ static void EmitFPToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
|||
return;
|
||||
}
|
||||
}
|
||||
|
||||
using fbits_list = mp::lift_sequence<std::make_index_sequence<isize + 1>>;
|
||||
using rounding_list = mp::list<
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsZero>,
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
|
||||
|
||||
static const auto lut = Common::GenerateLookupTableFromList(
|
||||
[]<typename I>(I) {
|
||||
return std::pair{
|
||||
mp::lower_to_tuple_v<I>,
|
||||
Common::FptrCast(
|
||||
[](u64 input, FP::FPSR& fpsr, FP::FPCR fpcr) {
|
||||
constexpr size_t fbits = mp::get<0, I>::value;
|
||||
constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value;
|
||||
using FPT = mcl::unsigned_integer_of_size<fsize>;
|
||||
|
||||
return FP::FPToFixed<FPT>(isize, static_cast<FPT>(input), fbits, unsigned_, fpcr, rounding_mode, fpsr);
|
||||
})};
|
||||
},
|
||||
mp::cartesian_product<fbits_list, rounding_list>{});
|
||||
|
||||
// See EmitFPToFixedThunk
|
||||
auto const extra_args = (u32(unsigned_) << 24) | (u32(isize) << 16)
|
||||
| (u32(rounding_mode) << 8) | (u32(fbits));
|
||||
ctx.reg_alloc.HostCall(inst, args[0]);
|
||||
code.lea(code.ABI_PARAM2, code.ptr[code.ABI_JIT_PTR + code.GetJitStateInfo().offsetof_fpsr_exc]);
|
||||
code.mov(code.ABI_PARAM3.cvt32(), ctx.FPCR().Value());
|
||||
code.CallFunction(lut.at(std::make_tuple(fbits, rounding_mode)));
|
||||
code.mov(code.ABI_PARAM4.cvt32(), extra_args);
|
||||
code.CallFunction(EmitFPToFixedThunk<fsize>);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPDoubleToFixedS16(EmitContext& ctx, IR::Inst* inst) {
|
||||
|
|
|
@ -439,9 +439,9 @@ void EmitThreeOpVectorOperation(BlockOfCode& code, EmitContext& ctx, IR::Inst* i
|
|||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
}
|
||||
|
||||
template<typename Lambda>
|
||||
void EmitTwoOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, Lambda lambda, bool fpcr_controlled) {
|
||||
const auto fn = static_cast<mcl::equivalent_function_type<Lambda>*>(lambda);
|
||||
template<typename F>
|
||||
void EmitTwoOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak::Xmm result, Xbyak::Xmm arg1, F lambda, bool fpcr_controlled) {
|
||||
const auto fn = static_cast<mcl::equivalent_function_type<F>*>(lambda);
|
||||
|
||||
const u32 fpcr = ctx.FPCR(fpcr_controlled).Value();
|
||||
|
||||
|
@ -459,8 +459,8 @@ void EmitTwoOpFallbackWithoutRegAlloc(BlockOfCode& code, EmitContext& ctx, Xbyak
|
|||
code.add(rsp, stack_space + ABI_SHADOW_SPACE);
|
||||
}
|
||||
|
||||
template<size_t fpcr_controlled_arg_index = 1, typename Lambda>
|
||||
void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, Lambda lambda) {
|
||||
template<size_t fpcr_controlled_arg_index = 1, typename F>
|
||||
void EmitTwoOpFallback(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst, F lambda) {
|
||||
auto args = ctx.reg_alloc.GetArgumentInfo(inst);
|
||||
const Xbyak::Xmm arg1 = ctx.reg_alloc.UseXmm(args[0]);
|
||||
const Xbyak::Xmm result = ctx.reg_alloc.ScratchXmm();
|
||||
|
@ -665,8 +665,14 @@ void EmitX64::EmitFPVectorEqual64(EmitContext& ctx, IR::Inst* inst) {
|
|||
ctx.reg_alloc.DefineValue(inst, a);
|
||||
}
|
||||
|
||||
template<FP::RoundingMode rounding_mode>
|
||||
static void EmitFPVectorFromHalf32Thunk(VectorArray<u32>& output, const VectorArray<u16>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
|
||||
for (size_t i = 0; i < output.size(); ++i)
|
||||
output[i] = FP::FPConvert<u32, u16>(input[i], fpcr, rounding_mode, fpsr);
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorFromHalf32(EmitContext& ctx, IR::Inst* inst) {
|
||||
const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
|
||||
const auto rounding_mode = FP::RoundingMode(inst->GetArg(1).GetU8());
|
||||
const bool fpcr_controlled = inst->GetArg(2).GetU1();
|
||||
|
||||
if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
|
||||
|
@ -679,32 +685,27 @@ void EmitX64::EmitFPVectorFromHalf32(EmitContext& ctx, IR::Inst* inst) {
|
|||
ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result);
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
return;
|
||||
} else {
|
||||
switch (rounding_mode) {
|
||||
case FP::RoundingMode::ToNearest_TieEven:
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorFromHalf32Thunk<FP::RoundingMode::ToNearest_TieEven>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsPlusInfinity:
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorFromHalf32Thunk<FP::RoundingMode::TowardsPlusInfinity>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsMinusInfinity:
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorFromHalf32Thunk<FP::RoundingMode::TowardsMinusInfinity>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsZero:
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorFromHalf32Thunk<FP::RoundingMode::TowardsZero>);
|
||||
break;
|
||||
case FP::RoundingMode::ToNearest_TieAwayFromZero:
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorFromHalf32Thunk<FP::RoundingMode::ToNearest_TieAwayFromZero>);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE();
|
||||
}
|
||||
|
||||
using rounding_list = mp::list<
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsZero>,
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
|
||||
|
||||
static const auto lut = Common::GenerateLookupTableFromList(
|
||||
[]<typename I>(I) {
|
||||
return std::pair{
|
||||
mp::lower_to_tuple_v<I>,
|
||||
Common::FptrCast(
|
||||
[](VectorArray<u32>& output, const VectorArray<u16>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
|
||||
constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value;
|
||||
|
||||
for (size_t i = 0; i < output.size(); ++i) {
|
||||
output[i] = FP::FPConvert<u32, u16>(input[i], fpcr, rounding_mode, fpsr);
|
||||
}
|
||||
})};
|
||||
},
|
||||
mp::cartesian_product<rounding_list>{});
|
||||
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode)));
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorFromSignedFixed32(EmitContext& ctx, IR::Inst* inst) {
|
||||
|
@ -1681,25 +1682,26 @@ void EmitX64::EmitFPVectorRecipStepFused64(EmitContext& ctx, IR::Inst* inst) {
|
|||
EmitRecipStepFused<64>(code, ctx, inst);
|
||||
}
|
||||
|
||||
template<typename FPT, FP::RoundingMode rounding_mode, bool exact>
|
||||
static void EmitFPVectorRoundIntThunk(VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
|
||||
for (size_t i = 0; i < output.size(); ++i)
|
||||
output[i] = FPT(FP::FPRoundInt<FPT>(input[i], fpcr, rounding_mode, exact, fpsr));
|
||||
}
|
||||
|
||||
template<size_t fsize>
|
||||
void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
const auto rounding = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
|
||||
const auto rounding = FP::RoundingMode(inst->GetArg(1).GetU8());
|
||||
const bool exact = inst->GetArg(2).GetU1();
|
||||
|
||||
if constexpr (fsize != 16) {
|
||||
if (code.HasHostFeature(HostFeature::SSE41) && rounding != FP::RoundingMode::ToNearest_TieAwayFromZero && !exact) {
|
||||
const u8 round_imm = [&]() -> u8 {
|
||||
switch (rounding) {
|
||||
case FP::RoundingMode::ToNearest_TieEven:
|
||||
return 0b00;
|
||||
case FP::RoundingMode::TowardsPlusInfinity:
|
||||
return 0b10;
|
||||
case FP::RoundingMode::TowardsMinusInfinity:
|
||||
return 0b01;
|
||||
case FP::RoundingMode::TowardsZero:
|
||||
return 0b11;
|
||||
default:
|
||||
UNREACHABLE();
|
||||
case FP::RoundingMode::ToNearest_TieEven: return 0b00;
|
||||
case FP::RoundingMode::TowardsPlusInfinity: return 0b10;
|
||||
case FP::RoundingMode::TowardsMinusInfinity: return 0b01;
|
||||
case FP::RoundingMode::TowardsZero: return 0b11;
|
||||
default: UNREACHABLE();
|
||||
}
|
||||
}();
|
||||
|
||||
|
@ -1711,32 +1713,37 @@ void EmitFPVectorRoundInt(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
|||
}
|
||||
}
|
||||
|
||||
using rounding_list = mp::list<
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsZero>,
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
|
||||
using exact_list = mp::list<std::true_type, std::false_type>;
|
||||
|
||||
static const auto lut = Common::GenerateLookupTableFromList(
|
||||
[]<typename I>(I) {
|
||||
using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC
|
||||
return std::pair{
|
||||
mp::lower_to_tuple_v<I>,
|
||||
Common::FptrCast(
|
||||
[](VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
|
||||
constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value;
|
||||
constexpr bool exact = mp::get<1, I>::value;
|
||||
|
||||
for (size_t i = 0; i < output.size(); ++i) {
|
||||
output[i] = static_cast<FPT>(FP::FPRoundInt<FPT>(input[i], fpcr, rounding_mode, exact, fpsr));
|
||||
// Do not make a LUT out of this, let the compiler do it's thing
|
||||
using FPT = mcl::unsigned_integer_of_size<fsize>;
|
||||
switch (rounding) {
|
||||
case FP::RoundingMode::ToNearest_TieEven:
|
||||
exact
|
||||
? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::ToNearest_TieEven, true>)
|
||||
: EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::ToNearest_TieEven, false>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsPlusInfinity:
|
||||
exact
|
||||
? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsPlusInfinity, true>)
|
||||
: EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsPlusInfinity, false>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsMinusInfinity:
|
||||
exact
|
||||
? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsMinusInfinity, true>)
|
||||
: EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsMinusInfinity, false>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsZero:
|
||||
exact
|
||||
? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsZero, true>)
|
||||
: EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::TowardsZero, false>);
|
||||
break;
|
||||
case FP::RoundingMode::ToNearest_TieAwayFromZero:
|
||||
exact
|
||||
? EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::ToNearest_TieAwayFromZero, true>)
|
||||
: EmitTwoOpFallback<3>(code, ctx, inst, EmitFPVectorRoundIntThunk<FPT, FP::RoundingMode::ToNearest_TieAwayFromZero, false>);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE();
|
||||
}
|
||||
})};
|
||||
},
|
||||
mp::cartesian_product<rounding_list, exact_list>{});
|
||||
|
||||
EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(rounding, exact)));
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorRoundInt16(EmitContext& ctx, IR::Inst* inst) {
|
||||
|
@ -1965,8 +1972,14 @@ void EmitX64::EmitFPVectorSub64(EmitContext& ctx, IR::Inst* inst) {
|
|||
EmitThreeOpVectorOperation<64, DefaultIndexer>(code, ctx, inst, &Xbyak::CodeGenerator::subpd);
|
||||
}
|
||||
|
||||
template<FP::RoundingMode rounding_mode>
|
||||
static void EmitFPVectorToHalf32Thunk(VectorArray<u16>& output, const VectorArray<u32>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
|
||||
for (size_t i = 0; i < output.size(); ++i)
|
||||
output[i] = i < input.size() ? FP::FPConvert<u16, u32>(input[i], fpcr, rounding_mode, fpsr) : 0;
|
||||
}
|
||||
|
||||
void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) {
|
||||
const auto rounding_mode = static_cast<FP::RoundingMode>(inst->GetArg(1).GetU8());
|
||||
const auto rounding_mode = FP::RoundingMode(inst->GetArg(1).GetU8());
|
||||
const bool fpcr_controlled = inst->GetArg(2).GetU1();
|
||||
|
||||
if (code.HasHostFeature(HostFeature::F16C) && !ctx.FPCR().AHP() && !ctx.FPCR().FZ16()) {
|
||||
|
@ -1976,41 +1989,39 @@ void EmitX64::EmitFPVectorToHalf32(EmitContext& ctx, IR::Inst* inst) {
|
|||
const Xbyak::Xmm result = ctx.reg_alloc.UseScratchXmm(args[0]);
|
||||
|
||||
ForceToDefaultNaN<32>(code, ctx.FPCR(fpcr_controlled), result);
|
||||
code.vcvtps2ph(result, result, static_cast<u8>(*round_imm));
|
||||
code.vcvtps2ph(result, result, u8(*round_imm));
|
||||
|
||||
ctx.reg_alloc.DefineValue(inst, result);
|
||||
return;
|
||||
}
|
||||
|
||||
using rounding_list = mp::list<
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieEven>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsPlusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsMinusInfinity>,
|
||||
mp::lift_value<FP::RoundingMode::TowardsZero>,
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
|
||||
|
||||
static const auto lut = Common::GenerateLookupTableFromList(
|
||||
[]<typename I>(I) {
|
||||
return std::pair{
|
||||
mp::lower_to_tuple_v<I>,
|
||||
Common::FptrCast(
|
||||
[](VectorArray<u16>& output, const VectorArray<u32>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
|
||||
constexpr FP::RoundingMode rounding_mode = mp::get<0, I>::value;
|
||||
|
||||
for (size_t i = 0; i < output.size(); ++i) {
|
||||
if (i < input.size()) {
|
||||
output[i] = FP::FPConvert<u16, u32>(input[i], fpcr, rounding_mode, fpsr);
|
||||
} else {
|
||||
output[i] = 0;
|
||||
switch (rounding_mode) {
|
||||
case FP::RoundingMode::ToNearest_TieEven:
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorToHalf32Thunk<FP::RoundingMode::ToNearest_TieEven>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsPlusInfinity:
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorToHalf32Thunk<FP::RoundingMode::TowardsPlusInfinity>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsMinusInfinity:
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorToHalf32Thunk<FP::RoundingMode::TowardsMinusInfinity>);
|
||||
break;
|
||||
case FP::RoundingMode::TowardsZero:
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorToHalf32Thunk<FP::RoundingMode::TowardsZero>);
|
||||
break;
|
||||
case FP::RoundingMode::ToNearest_TieAwayFromZero:
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, EmitFPVectorToHalf32Thunk<FP::RoundingMode::ToNearest_TieAwayFromZero>);
|
||||
break;
|
||||
default:
|
||||
UNREACHABLE();
|
||||
}
|
||||
}
|
||||
})};
|
||||
},
|
||||
mp::cartesian_product<rounding_list>{});
|
||||
|
||||
EmitTwoOpFallback<2>(code, ctx, inst, lut.at(std::make_tuple(rounding_mode)));
|
||||
}
|
||||
|
||||
// Assembly thunk; just remember not to specialise too much otherwise i-cache death!
|
||||
// template<typename FPT, size_t fbits, FP::RoundingMode rounding_mode>
|
||||
// static void EmitFPVectorToFixedThunk(VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
|
||||
// for (size_t i = 0; i < output.size(); ++i)
|
||||
// output[i] = FPT(FP::FPToFixed<FPT>(fsize, input[i], fbits, unsigned_, fpcr, rounding_mode, fpsr));
|
||||
// }
|
||||
|
||||
template<size_t fsize, bool unsigned_>
|
||||
void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
||||
const size_t fbits = inst->GetArg(1).GetU8();
|
||||
|
@ -2108,7 +2119,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
|||
FCODE(andp)(tmp, xmm0);
|
||||
FCODE(subp)(src, tmp);
|
||||
perform_conversion(src);
|
||||
ICODE(psll)(xmm0, static_cast<u8>(fsize - 1));
|
||||
ICODE(psll)(xmm0, u8(fsize - 1));
|
||||
FCODE(orp)(src, xmm0);
|
||||
|
||||
// Saturate to max
|
||||
|
@ -2116,7 +2127,7 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
|||
}
|
||||
} else {
|
||||
using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC
|
||||
constexpr u64 integer_max = static_cast<FPT>((std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max)());
|
||||
constexpr u64 integer_max = FPT((std::numeric_limits<std::conditional_t<unsigned_, FPT, std::make_signed_t<FPT>>>::max)());
|
||||
|
||||
code.movaps(xmm0, GetVectorOf<fsize, float_upper_limit_signed>(code));
|
||||
FCODE(cmplep)(xmm0, src);
|
||||
|
@ -2138,22 +2149,18 @@ void EmitFPVectorToFixed(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
|
|||
mp::lift_value<FP::RoundingMode::TowardsZero>,
|
||||
mp::lift_value<FP::RoundingMode::ToNearest_TieAwayFromZero>>;
|
||||
|
||||
static const auto lut = Common::GenerateLookupTableFromList(
|
||||
[]<typename I>(I) {
|
||||
static const auto lut = Common::GenerateLookupTableFromList([]<typename I>(I) {
|
||||
using FPT = mcl::unsigned_integer_of_size<fsize>; // WORKAROUND: For issue 678 on MSVC
|
||||
return std::pair{
|
||||
mp::lower_to_tuple_v<I>,
|
||||
Common::FptrCast(
|
||||
[](VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
|
||||
Common::FptrCast([](VectorArray<FPT>& output, const VectorArray<FPT>& input, FP::FPCR fpcr, FP::FPSR& fpsr) {
|
||||
constexpr size_t fbits = mp::get<0, I>::value;
|
||||
constexpr FP::RoundingMode rounding_mode = mp::get<1, I>::value;
|
||||
|
||||
for (size_t i = 0; i < output.size(); ++i) {
|
||||
output[i] = static_cast<FPT>(FP::FPToFixed<FPT>(fsize, input[i], fbits, unsigned_, fpcr, rounding_mode, fpsr));
|
||||
}
|
||||
})};
|
||||
},
|
||||
mp::cartesian_product<fbits_list, rounding_list>{});
|
||||
for (size_t i = 0; i < output.size(); ++i)
|
||||
output[i] = FPT(FP::FPToFixed<FPT>(fsize, input[i], fbits, unsigned_, fpcr, rounding_mode, fpsr));
|
||||
})
|
||||
};
|
||||
}, mp::cartesian_product<fbits_list, rounding_list>{});
|
||||
|
||||
EmitTwoOpFallback<3>(code, ctx, inst, lut.at(std::make_tuple(fbits, rounding)));
|
||||
}
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
|
||||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
/* This file is part of the dynarmic project.
|
||||
* Copyright (c) 2018 MerryMage
|
||||
* SPDX-License-Identifier: 0BSD
|
||||
|
|
|
@ -40,9 +40,7 @@ inline auto GenerateLookupTableFromList(Function f, mcl::mp::list<Values...>) {
|
|||
using PairT = std::common_type_t<std::invoke_result_t<Function, Values>...>;
|
||||
#endif
|
||||
using MapT = mcl::mp::apply<std::map, PairT>;
|
||||
|
||||
static_assert(mcl::is_instance_of_template_v<std::pair, PairT>);
|
||||
|
||||
const std::initializer_list<PairT> pair_array{f(Values{})...};
|
||||
return MapT(pair_array.begin(), pair_array.end());
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue