revert revert [android] Snapdragon 865 patches (#23)

revert [android] Snapdragon 865 patches (#23)

Co-authored-by: Aleksandr Popovich <alekpopo@pm.me>
Reviewed-on: https://git.bixed.xyz/Bix/eden/pulls/23

Reverted due to heavy performance hits on Android with higher specifications, will be adjusted to be included in a specific build for older A6XX devices, as 855, 860, 865, 870, meanwhile it does fix critical issues with certain games crashing due to memory and VRAM usage, hits performance on SoC that can do it without this special flags.
This commit is contained in:
Bix 2025-07-12 20:15:19 +02:00 committed by crueter
parent 3372fe8678
commit eb3d0cf0d2
Signed by: crueter
GPG key ID: 425ACD2D4830EBC6
8 changed files with 293 additions and 5 deletions

View file

@ -227,7 +227,7 @@ HaltReason ArmNce::RunThread(Kernel::KThread* thread) {
if (auto it = post_handlers.find(m_guest_ctx.pc); it != post_handlers.end()) {
hr = ReturnToRunCodeByTrampoline(thread_params, &m_guest_ctx, it->second);
} else {
hr = ReturnToRunCodeByExceptionLevelChange(m_thread_id, thread_params);
hr = ReturnToRunCodeByExceptionLevelChange(m_thread_id, thread_params); // Android: Use "process handle SIGUSR2 -n true -p true -s false" (and SIGURG) in LLDB when debugging
}
// Critical section for thread cleanup

View file

@ -7,6 +7,7 @@
#include "common/bit_field.h"
#include "common/common_types.h"
#include "shader_recompiler/frontend/maxwell/translate/impl/impl.h"
#include <fstream>
namespace Shader::Maxwell {
namespace {
@ -36,6 +37,17 @@ enum class ShuffleMode : u64 {
}
}
bool IsKONA() {
std::ifstream machineFile("/sys/devices/soc0/machine");
if (machineFile.is_open()) {
std::string line;
std::getline(machineFile, line);
if (line == "KONA")
return true;
}
return false;
}
void Shuffle(TranslatorVisitor& v, u64 insn, const IR::U32& index, const IR::U32& mask) {
union {
u64 insn;
@ -47,6 +59,9 @@ void Shuffle(TranslatorVisitor& v, u64 insn, const IR::U32& index, const IR::U32
const IR::U32 result{ShuffleOperation(v.ir, v.X(shfl.src_reg), index, mask, shfl.mode)};
v.ir.SetPred(shfl.pred, v.ir.GetInBoundsFromOp(result));
if (IsKONA())
v.X(shfl.dest_reg, v.ir.Imm32(0xffffffff)); // This fixes the freeze for Retroid / Snapdragon SD865
else
v.X(shfl.dest_reg, result);
}
} // Anonymous namespace

View file

@ -26,7 +26,9 @@ BufferCache<P>::BufferCache(Tegra::MaxwellDeviceMemoryManager& device_memory_, R
void(slot_buffers.insert(runtime, NullBufferParams{}));
gpu_modified_ranges.Clear();
inline_buffer_id = NULL_BUFFER_ID;
#ifdef ANDROID
immediately_free = (Settings::values.vram_usage_mode.GetValue() == Settings::VramUsageMode::Aggressive);
#endif
if (!runtime.CanReportMemoryUsage()) {
minimum_memory = DEFAULT_EXPECTED_MEMORY;
critical_memory = DEFAULT_CRITICAL_MEMORY;
@ -1378,6 +1380,8 @@ void BufferCache<P>::JoinOverlap(BufferId new_buffer_id, BufferId overlap_id,
});
new_buffer.MarkUsage(copies[0].dst_offset, copies[0].size);
runtime.CopyBuffer(new_buffer, overlap, copies, true);
if (immediately_free)
runtime.Finish();
DeleteBuffer(overlap_id, true);
}
@ -1668,6 +1672,8 @@ void BufferCache<P>::DeleteBuffer(BufferId buffer_id, bool do_not_mark) {
}
Unregister(buffer_id);
if (!do_not_mark || !immediately_free)
delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id]));
slot_buffers.erase(buffer_id);

View file

@ -154,7 +154,11 @@ template <class P>
class BufferCache : public VideoCommon::ChannelSetupCaches<BufferCacheChannelInfo> {
// Page size for caching purposes.
// This is unrelated to the CPU page size and it can be changed as it seems optimal.
#ifdef ANDROID
static constexpr u32 CACHING_PAGEBITS = 12;
#else
static constexpr u32 CACHING_PAGEBITS = 16;
#endif
static constexpr u64 CACHING_PAGESIZE = u64{1} << CACHING_PAGEBITS;
static constexpr bool IS_OPENGL = P::IS_OPENGL;
@ -168,9 +172,15 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<BufferCacheChannelInf
static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS;
static constexpr bool USE_MEMORY_MAPS_FOR_UPLOADS = P::USE_MEMORY_MAPS_FOR_UPLOADS;
#ifdef ANDROID
static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB;
static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB;
static constexpr s64 TARGET_THRESHOLD = 3_GiB;
#else
static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB;
static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB;
static constexpr s64 TARGET_THRESHOLD = 4_GiB;
#endif
// Debug Flags.
@ -446,7 +456,12 @@ private:
Tegra::MaxwellDeviceMemoryManager& device_memory;
Common::SlotVector<Buffer> slot_buffers;
DelayedDestructionRing<Buffer, 8> delayed_destruction_ring;
#ifdef ANDROID
static constexpr size_t TICKS_TO_DESTROY = 6;
#else
static constexpr size_t TICKS_TO_DESTROY = 8;
#endif
DelayedDestructionRing<Buffer, TICKS_TO_DESTROY> delayed_destruction_ring;
const Tegra::Engines::DrawManager::IndirectParams* current_draw_indirect{};
@ -478,6 +493,7 @@ private:
u64 minimum_memory = 0;
u64 critical_memory = 0;
BufferId inline_buffer_id;
bool immediately_free = false;
std::array<BufferId, ((1ULL << 34) >> CACHING_PAGEBITS)> page_table;
Common::ScratchBuffer<u8> tmp_buffer;

View file

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2021 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
@ -18,9 +21,11 @@ Host1x::~Host1x() = default;
void Host1x::StartDevice(s32 fd, ChannelType type, u32 syncpt) {
switch (type) {
case ChannelType::NvDec:
std::call_once(nvdec_first_init, []() {std::this_thread::sleep_for(std::chrono::milliseconds{500});}); // HACK: For Astroneer
devices[fd] = std::make_unique<Tegra::Host1x::Nvdec>(*this, fd, syncpt, frame_queue);
break;
case ChannelType::VIC:
std::call_once(vic_first_init, []() {std::this_thread::sleep_for(std::chrono::milliseconds{500});}); // HACK: For Astroneer
devices[fd] = std::make_unique<Tegra::Host1x::Vic>(*this, fd, syncpt, frame_queue);
break;
default:

View file

@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: Copyright 2025 Eden Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
// SPDX-FileCopyrightText: 2021 yuzu Emulator Project
// SPDX-License-Identifier: GPL-3.0-or-later
@ -201,6 +204,8 @@ private:
std::unique_ptr<Common::FlatAllocator<u32, 0, 32>> allocator;
FrameQueue frame_queue;
std::unordered_map<s32, std::unique_ptr<CDmaPusher>> devices;
std::once_flag nvdec_first_init;
std::once_flag vic_first_init;
};
} // namespace Tegra::Host1x

View file

@ -110,10 +110,17 @@ class TextureCache : public VideoCommon::ChannelSetupCaches<TextureCacheChannelI
static constexpr size_t UNSET_CHANNEL{(std::numeric_limits<size_t>::max)()};
#ifdef ANDROID
static constexpr s64 TARGET_THRESHOLD = 3_GiB;
static constexpr s64 DEFAULT_EXPECTED_MEMORY = 1_GiB + 125_MiB;
static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB + 625_MiB;
static constexpr size_t GC_EMERGENCY_COUNTS = 2;
#else
static constexpr s64 TARGET_THRESHOLD = 4_GiB;
static constexpr s64 DEFAULT_EXPECTED_MEMORY = 1_GiB + 125_MiB;
static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB + 625_MiB;
static constexpr size_t GC_EMERGENCY_COUNTS = 2;
#endif
using Runtime = typename P::Runtime;
using Image = typename P::Image;
@ -479,7 +486,11 @@ private:
};
Common::LeastRecentlyUsedCache<LRUItemParams> lru_cache;
#ifdef ANDROID
static constexpr size_t TICKS_TO_DESTROY = 6;
#else
static constexpr size_t TICKS_TO_DESTROY = 8;
#endif
DelayedDestructionRing<Image, TICKS_TO_DESTROY> sentenced_images;
DelayedDestructionRing<ImageView, TICKS_TO_DESTROY> sentenced_image_view;
DelayedDestructionRing<Framebuffer, TICKS_TO_DESTROY> sentenced_framebuffers;

View file

@ -325,4 +325,234 @@ namespace Vulkan {
return MemoryCommit(allocator, a, info);
}
MemoryAllocator* const allocator; ///< Parent memory allocation.
const vk::DeviceMemory memory; ///< Vulkan memory allocation handler.
const u64 allocation_size; ///< Size of this allocation.
const VkMemoryPropertyFlags property_flags; ///< Vulkan memory property flags.
const u32 shifted_memory_type; ///< Shifted Vulkan memory type.
std::vector<Range> commits; ///< All commit ranges done from this allocation.
std::span<u8> memory_mapped_span; ///< Memory mapped span. Empty if not queried before.
};
MemoryCommit::MemoryCommit(MemoryAllocation* allocation_, VkDeviceMemory memory_, u64 begin_,
u64 end_) noexcept
: allocation{allocation_}, memory{memory_}, begin{begin_}, end{end_} {}
MemoryCommit::~MemoryCommit() {
Release();
}
MemoryCommit& MemoryCommit::operator=(MemoryCommit&& rhs) noexcept {
Release();
allocation = std::exchange(rhs.allocation, nullptr);
memory = rhs.memory;
begin = rhs.begin;
end = rhs.end;
span = std::exchange(rhs.span, std::span<u8>{});
return *this;
}
MemoryCommit::MemoryCommit(MemoryCommit&& rhs) noexcept
: allocation{std::exchange(rhs.allocation, nullptr)}, memory{rhs.memory}, begin{rhs.begin},
end{rhs.end}, span{std::exchange(rhs.span, std::span<u8>{})} {}
std::span<u8> MemoryCommit::Map() {
if (span.empty()) {
span = allocation->Map().subspan(begin, end - begin);
}
return span;
}
void MemoryCommit::Release() {
if (allocation) {
allocation->Free(begin);
}
}
MemoryAllocator::MemoryAllocator(const Device& device_)
: device{device_}, allocator{device.GetAllocator()},
properties{device_.GetPhysical().GetMemoryProperties().memoryProperties},
buffer_image_granularity{
device_.GetPhysical().GetProperties().limits.bufferImageGranularity} {
// GPUs not supporting rebar may only have a region with less than 256MB host visible/device
// local memory. In that case, opening 2 RenderDoc captures side-by-side is not possible due to
// the heap running out of memory. With RenderDoc attached and only a small host/device region,
// only allow the stream buffer in this memory heap.
if (device.HasDebuggingToolAttached()) {
using namespace Common::Literals;
ForEachDeviceLocalHostVisibleHeap(device, [this](size_t index, VkMemoryHeap& heap) {
if (heap.size <= 256_MiB) {
valid_memory_types &= ~(1u << index);
}
});
}
}
MemoryAllocator::~MemoryAllocator() = default;
vk::Image MemoryAllocator::CreateImage(const VkImageCreateInfo& ci) const {
const VmaAllocationCreateInfo alloc_ci = {
.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT,
.usage = VMA_MEMORY_USAGE_AUTO_PREFER_DEVICE,
.requiredFlags = 0,
.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
.memoryTypeBits = 0,
.pool = VK_NULL_HANDLE,
.pUserData = nullptr,
.priority = 0.f,
};
VkImage handle{};
VmaAllocation allocation{};
vk::Check(vmaCreateImage(allocator, &ci, &alloc_ci, &handle, &allocation, nullptr));
return vk::Image(handle, ci.usage, *device.GetLogical(), allocator, allocation,
device.GetDispatchLoader());
}
vk::Buffer MemoryAllocator::CreateBuffer(const VkBufferCreateInfo& ci, MemoryUsage usage) const {
const VmaAllocationCreateInfo alloc_ci = {
.flags = VMA_ALLOCATION_CREATE_WITHIN_BUDGET_BIT | MemoryUsageVmaFlags(usage),
.usage = MemoryUsageVma(usage),
.requiredFlags = 0,
.preferredFlags = MemoryUsagePreferredVmaFlags(usage),
.memoryTypeBits = usage == MemoryUsage::Stream ? 0u : valid_memory_types,
.pool = VK_NULL_HANDLE,
.pUserData = nullptr,
.priority = 0.f,
};
VkBuffer handle{};
VmaAllocationInfo alloc_info{};
VmaAllocation allocation{};
VkMemoryPropertyFlags property_flags{};
VkResult result = vmaCreateBuffer(allocator, &ci, &alloc_ci, &handle, &allocation, &alloc_info);
if (result == VK_ERROR_OUT_OF_DEVICE_MEMORY) {
LOG_ERROR(Render_Vulkan, "Out of memory creating buffer (size: {})", ci.size);
}
vmaGetAllocationMemoryProperties(allocator, allocation, &property_flags);
u8* data = reinterpret_cast<u8*>(alloc_info.pMappedData);
const std::span<u8> mapped_data = data ? std::span<u8>{data, ci.size} : std::span<u8>{};
const bool is_coherent = property_flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
return vk::Buffer(handle, *device.GetLogical(), allocator, allocation, mapped_data, is_coherent,
device.GetDispatchLoader());
}
MemoryCommit MemoryAllocator::Commit(const VkMemoryRequirements& requirements, MemoryUsage usage) {
// Find the fastest memory flags we can afford with the current requirements
const u32 type_mask = requirements.memoryTypeBits;
const VkMemoryPropertyFlags usage_flags = MemoryUsagePropertyFlags(usage);
const VkMemoryPropertyFlags flags = MemoryPropertyFlags(type_mask, usage_flags);
if (std::optional<MemoryCommit> commit = TryCommit(requirements, flags)) {
return std::move(*commit);
}
// Commit has failed, allocate more memory.
const u64 chunk_size = AllocationChunkSize(requirements.size);
if (!TryAllocMemory(flags, type_mask, chunk_size)) {
// TODO(Rodrigo): Handle out of memory situations in some way like flushing to guest memory.
throw vk::Exception(VK_ERROR_OUT_OF_DEVICE_MEMORY);
}
// Commit again, this time it won't fail since there's a fresh allocation above.
// If it does, there's a bug.
return TryCommit(requirements, flags).value();
}
bool MemoryAllocator::TryAllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size) {
const auto type_opt = FindType(flags, type_mask);
if (!type_opt) {
return false;
}
// Adreno stands firm
const u64 aligned_size = (device.GetDriverID() == VK_DRIVER_ID_QUALCOMM_PROPRIETARY) ?
Common::AlignUp(size, 4096) :
size;
vk::DeviceMemory memory = device.GetLogical().TryAllocateMemory({
.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,
.pNext = nullptr,
.allocationSize = aligned_size,
.memoryTypeIndex = *type_opt,
});
if (!memory) {
return false;
}
allocations.push_back(
std::make_unique<MemoryAllocation>(this, std::move(memory), flags, aligned_size, *type_opt));
return true;
}
void MemoryAllocator::ReleaseMemory(MemoryAllocation* alloc) {
const auto it = std::ranges::find(allocations, alloc, &std::unique_ptr<MemoryAllocation>::get);
ASSERT(it != allocations.end());
allocations.erase(it);
}
std::optional<MemoryCommit> MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements,
VkMemoryPropertyFlags flags) {
// Conservative, spec-compliant alignment for suballocation
VkDeviceSize eff_align = requirements.alignment;
const auto& limits = device.GetPhysical().GetProperties().limits;
if ((flags & VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT) &&
!(flags & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT)) {
// Non-coherent memory must be invalidated on atom boundary
if (limits.nonCoherentAtomSize > eff_align) eff_align = limits.nonCoherentAtomSize;
}
// Separate buffers to avoid stalls on tilers
if (buffer_image_granularity > eff_align) {
eff_align = buffer_image_granularity;
}
eff_align = std::bit_ceil(eff_align);
for (auto& allocation : allocations) {
if (!allocation->IsCompatible(flags, requirements.memoryTypeBits)) {
continue;
}
if (auto commit = allocation->Commit(requirements.size, eff_align)) {
return commit;
}
}
if ((flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0) {
// Look for non device local commits on failure
return TryCommit(requirements, flags & ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
}
return std::nullopt;
}
VkMemoryPropertyFlags MemoryAllocator::MemoryPropertyFlags(u32 type_mask,
VkMemoryPropertyFlags flags) const {
if (FindType(flags, type_mask)) {
// Found a memory type with those requirements
return flags;
}
if ((flags & VK_MEMORY_PROPERTY_HOST_CACHED_BIT) != 0) {
// Remove host cached bit in case it's not supported
return MemoryPropertyFlags(type_mask, flags & ~VK_MEMORY_PROPERTY_HOST_CACHED_BIT);
}
if ((flags & VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT) != 0) {
// Remove device local, if it's not supported by the requested resource
return MemoryPropertyFlags(type_mask, flags & ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
}
ASSERT_MSG(false, "No compatible memory types found");
return 0;
}
std::optional<u32> MemoryAllocator::FindType(VkMemoryPropertyFlags flags, u32 type_mask) const {
for (u32 type_index = 0; type_index < properties.memoryTypeCount; ++type_index) {
const VkMemoryPropertyFlags type_flags = properties.memoryTypes[type_index].propertyFlags;
if ((type_mask & (1U << type_index)) != 0 && (type_flags & flags) == flags) {
// The type matches in type and in the wanted properties.
return type_index;
}
}
// Failed to find index
return std::nullopt;
}
} // namespace Vulkan