From 8cb6e6d5d4a767c406316adcdaa398b12220d671 Mon Sep 17 00:00:00 2001 From: Zephyron Date: Mon, 17 Mar 2025 12:20:38 +1000 Subject: [PATCH] Revert "Android: Implement TLB optimization to prevent deadlocks and improve performance" This reverts commit 21594b73aa78a8b49fe45ba87522ae98f9f87da8. --- .../renderer_vulkan/vk_buffer_cache.cpp | 30 -------------- .../renderer_vulkan/vk_buffer_cache.h | 37 ----------------- .../renderer_vulkan/vk_rasterizer.cpp | 41 +------------------ .../renderer_vulkan/vk_scheduler.cpp | 19 --------- .../renderer_vulkan/vk_texture_cache.cpp | 30 +------------- 5 files changed, 2 insertions(+), 155 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index f1497a0da..e5e1e3ab6 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -1,5 +1,4 @@ // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project -// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #include @@ -328,11 +327,8 @@ BufferCacheRuntime::BufferCacheRuntime(const Device& device_, MemoryAllocator& m DescriptorPool& descriptor_pool) : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_}, guest_descriptor_queue{guest_descriptor_queue_}, - accelerate{nullptr}, quad_index_pass(device, scheduler, descriptor_pool, staging_pool, compute_pass_descriptor_queue) { - accelerate = new BufferCacheAccelerator(); - if (device.GetDriverID() != VK_DRIVER_ID_QUALCOMM_PROPRIETARY) { // TODO: FixMe: Uint8Pass compute shader does not build on some Qualcomm drivers. uint8_pass = std::make_unique(device, scheduler, descriptor_pool, staging_pool, @@ -673,30 +669,4 @@ vk::Buffer BufferCacheRuntime::CreateNullBuffer() { return ret; } -void BufferCacheRuntime::InsertTLBBarrierImpl() { -#ifdef ANDROID - // Create a memory barrier specifically optimized for TLB coherency - // This helps prevent Android-specific deadlocks by ensuring proper - // GPU<->GPU memory coherency without a full pipeline stall - static constexpr VkMemoryBarrier TLB_BARRIER{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, - .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, - }; - - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([](vk::CommandBuffer cmdbuf) { - cmdbuf.PipelineBarrier( - VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - 0, TLB_BARRIER, {}, {}); - }); -#endif -} - -BufferCacheRuntime::~BufferCacheRuntime() { - delete accelerate; -} - } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index e7a401246..efe960258 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -1,5 +1,4 @@ // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project -// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #pragma once @@ -23,21 +22,6 @@ class Scheduler; struct HostVertexBinding; class BufferCacheRuntime; -class BufferCacheAccelerator; - -struct OverlapResult { - bool has_stream_buffer; - bool has_written_buffer; -}; - -class BufferCacheAccelerator { -public: - OverlapResult CheckRangeOverlaps(DAddr addr, u64 size) { - // Simple implementation - assume there are overlaps - // This can be expanded with actual buffer tracking if needed - return OverlapResult{true, true}; - } -}; class Buffer : public VideoCommon::BufferBase { public: @@ -96,7 +80,6 @@ public: GuestDescriptorQueue& guest_descriptor_queue, ComputePassDescriptorQueue& compute_pass_descriptor_queue, DescriptorPool& descriptor_pool); - ~BufferCacheRuntime(); void TickFrame(Common::SlotVector& slot_buffers) noexcept; @@ -162,22 +145,6 @@ public: guest_descriptor_queue.AddTexelBuffer(buffer.View(offset, size, format)); } - /// TLB-aware memory barrier to prevent deadlocks, particularly on Android - void InsertTLBBarrier(DAddr addr, u64 size) { - // This provides a more precise way to synchronize memory - // without causing unnecessary TLB invalidations -#ifdef ANDROID - std::scoped_lock lock{mutex}; - OverlapResult result = accelerate->CheckRangeOverlaps(addr, size); - if (!result.has_stream_buffer && !result.has_written_buffer) { - // If no overlap with active memory, skip barrier to maintain TLB entries - return; - } - - InsertTLBBarrierImpl(); -#endif - } - private: void BindBuffer(VkBuffer buffer, u32 offset, u32 size) { guest_descriptor_queue.AddBuffer(buffer, offset, size); @@ -185,7 +152,6 @@ private: void ReserveNullBuffer(); vk::Buffer CreateNullBuffer(); - void InsertTLBBarrierImpl(); const Device& device; MemoryAllocator& memory_allocator; @@ -198,9 +164,6 @@ private: vk::Buffer null_buffer; - std::mutex mutex; - BufferCacheAccelerator* accelerate; - std::unique_ptr uint8_pass; QuadIndexedPass quad_index_pass; }; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index d1260b365..cd0b255a0 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -718,34 +718,7 @@ void RasterizerVulkan::FlushAndInvalidateRegion(DAddr addr, u64 size, if (Settings::IsGPULevelExtreme()) { FlushRegion(addr, size, which); } - - // TLB optimization to avoid redundant flushing and potential deadlocks - static constexpr size_t TLB_CACHE_SIZE = 128; - static std::array, TLB_CACHE_SIZE> tlb_cache; - static size_t tlb_cache_index = 0; - static std::mutex tlb_mutex; - - { - std::scoped_lock lock{tlb_mutex}; - // Check if this region is already in our TLB cache - bool found_in_tlb = false; - for (const auto& entry : tlb_cache) { - if (entry.first <= addr && addr + size <= entry.first + entry.second) { - // This region is already in our TLB cache, no need to flush - found_in_tlb = true; - break; - } - } - - if (!found_in_tlb) { - // Add to TLB cache - tlb_cache[tlb_cache_index] = {addr, size}; - tlb_cache_index = (tlb_cache_index + 1) % TLB_CACHE_SIZE; - - // Proceed with normal invalidation - InvalidateRegion(addr, size, which); - } - } + InvalidateRegion(addr, size, which); } void RasterizerVulkan::WaitForIdle() { @@ -875,18 +848,6 @@ void RasterizerVulkan::LoadDiskResources(u64 title_id, std::stop_token stop_load void RasterizerVulkan::FlushWork() { #ifdef ANDROID static constexpr u32 DRAWS_TO_DISPATCH = 1024; - - // Android-specific TLB optimization to prevent deadlocks - // This limits the maximum number of outstanding memory operations to avoid TLB thrashing - static constexpr u32 MAX_TLB_OPERATIONS = 64; - static u32 tlb_operation_counter = 0; - - if (++tlb_operation_counter >= MAX_TLB_OPERATIONS) { - // Force a flush to ensure memory operations complete - scheduler.Flush(); - scheduler.WaitIdle(); // Make sure all operations complete to clear TLB state - tlb_operation_counter = 0; - } #else static constexpr u32 DRAWS_TO_DISPATCH = 4096; #endif // ANDROID diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 9928efba5..146923db4 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -1,5 +1,4 @@ // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project -// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later #include @@ -282,24 +281,6 @@ void Scheduler::EndPendingOperations() { // This is problematic on Android, disable on GPU Normal. // query_cache->DisableStreams(); } - - // Add TLB-aware memory barrier handling for Android - // This reduces the likelihood of deadlocks due to memory stalls - static constexpr VkMemoryBarrier TLB_OPTIMIZED_BARRIER{ - .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, - // Only use necessary access flags to avoid full TLB flush - .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_SHADER_READ_BIT, - }; - - Record([barrier = TLB_OPTIMIZED_BARRIER](vk::CommandBuffer cmdbuf) { - // Use a more specific pipeline stage for better performance - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, - VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - 0, barrier); - }); #else // query_cache->DisableStreams(); #endif diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index b639c34e3..bcccb0af8 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1677,35 +1677,7 @@ bool TextureCacheRuntime::CanReportMemoryUsage() const { return device.CanReportMemoryUsage(); } -void TextureCacheRuntime::TickFrame() { - // Implement TLB prefetching for better memory access patterns - // This helps avoid the 0.0 FPS deadlock issues on Android - static std::vector tlb_prefetch_offsets; - static std::vector tlb_prefetch_sizes; - static std::vector tlb_prefetch_barriers; - - // Clear previous frame's data - tlb_prefetch_offsets.clear(); - tlb_prefetch_sizes.clear(); - tlb_prefetch_barriers.clear(); - -#ifdef ANDROID - // Prefetch commonly accessed texture memory regions - // This helps the TLB maintain a more stable state and prevents cache thrashing - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([this](vk::CommandBuffer cmdbuf) { - if (!tlb_prefetch_barriers.empty()) { - cmdbuf.PipelineBarrier( - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, - 0, - vk::Span{}, - vk::Span{}, - vk::Span(tlb_prefetch_barriers.data(), tlb_prefetch_barriers.size())); - } - }); -#endif -} +void TextureCacheRuntime::TickFrame() {} Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_)