diff options
author | liamwhite <liamwhite@users.noreply.github.com> | 2023-06-07 20:03:57 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2023-06-07 20:03:57 +0200 |
commit | cfb76d8f3ed8862bc341afeaf6d25a401e2976cf (patch) | |
tree | 30098242f24010db0da3cd6152a4a716d739b20d /src | |
parent | Merge pull request #10583 from ameerj/ill-logic (diff) | |
parent | gl_staging_buffers: Optimization to reduce fence waiting (diff) | |
download | yuzu-cfb76d8f3ed8862bc341afeaf6d25a401e2976cf.tar yuzu-cfb76d8f3ed8862bc341afeaf6d25a401e2976cf.tar.gz yuzu-cfb76d8f3ed8862bc341afeaf6d25a401e2976cf.tar.bz2 yuzu-cfb76d8f3ed8862bc341afeaf6d25a401e2976cf.tar.lz yuzu-cfb76d8f3ed8862bc341afeaf6d25a401e2976cf.tar.xz yuzu-cfb76d8f3ed8862bc341afeaf6d25a401e2976cf.tar.zst yuzu-cfb76d8f3ed8862bc341afeaf6d25a401e2976cf.zip |
Diffstat (limited to 'src')
-rw-r--r-- | src/video_core/CMakeLists.txt | 4 | ||||
-rw-r--r-- | src/video_core/buffer_cache/buffer_cache.h | 10 | ||||
-rw-r--r-- | src/video_core/buffer_cache/buffer_cache_base.h | 1 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_buffer_cache.cpp | 58 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_buffer_cache.h | 29 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.cpp | 6 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_rasterizer.h | 1 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_staging_buffer_pool.cpp | 150 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_staging_buffer_pool.h (renamed from src/video_core/renderer_opengl/gl_stream_buffer.h) | 44 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_stream_buffer.cpp | 63 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_texture_cache.cpp | 87 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/gl_texture_cache.h | 47 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/util_shaders.cpp | 9 | ||||
-rw-r--r-- | src/video_core/renderer_opengl/util_shaders.h | 10 | ||||
-rw-r--r-- | src/video_core/renderer_vulkan/vk_buffer_cache.h | 1 |
15 files changed, 316 insertions, 204 deletions
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 94e3000ba..bf6439530 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -133,8 +133,8 @@ add_library(video_core STATIC renderer_opengl/gl_shader_util.h renderer_opengl/gl_state_tracker.cpp renderer_opengl/gl_state_tracker.h - renderer_opengl/gl_stream_buffer.cpp - renderer_opengl/gl_stream_buffer.h + renderer_opengl/gl_staging_buffer_pool.cpp + renderer_opengl/gl_staging_buffer_pool.h renderer_opengl/gl_texture_cache.cpp renderer_opengl/gl_texture_cache.h renderer_opengl/gl_texture_cache_base.cpp diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index f1ad5f7cb..2f281b370 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -478,7 +478,6 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { if (committed_ranges.empty()) { if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - async_buffers.emplace_back(std::optional<Async_Buffer>{}); } return; @@ -539,7 +538,6 @@ void BufferCache<P>::CommitAsyncFlushesHigh() { committed_ranges.clear(); if (downloads.empty()) { if constexpr (IMPLEMENTS_ASYNC_DOWNLOADS) { - async_buffers.emplace_back(std::optional<Async_Buffer>{}); } return; @@ -691,7 +689,7 @@ void BufferCache<P>::BindHostIndexBuffer() { const u32 size = channel_state->index_buffer.size; const auto& draw_state = maxwell3d->draw_manager->GetDrawState(); if (!draw_state.inline_index_draw_indexes.empty()) [[unlikely]] { - if constexpr (USE_MEMORY_MAPS) { + if constexpr (USE_MEMORY_MAPS_FOR_UPLOADS) { auto upload_staging = runtime.UploadStagingBuffer(size); std::array<BufferCopy, 1> copies{ {BufferCopy{.src_offset = upload_staging.offset, .dst_offset = 0, .size = size}}}; @@ -1462,7 +1460,7 @@ bool BufferCache<P>::SynchronizeBufferNoModified(Buffer& buffer, VAddr cpu_addr, template <class P> void BufferCache<P>::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, std::span<BufferCopy> copies) { - if constexpr (USE_MEMORY_MAPS) { + if constexpr (USE_MEMORY_MAPS_FOR_UPLOADS) { MappedUploadMemory(buffer, total_size_bytes, copies); } else { ImmediateUploadMemory(buffer, largest_copy, copies); @@ -1473,7 +1471,7 @@ template <class P> void BufferCache<P>::ImmediateUploadMemory([[maybe_unused]] Buffer& buffer, [[maybe_unused]] u64 largest_copy, [[maybe_unused]] std::span<const BufferCopy> copies) { - if constexpr (!USE_MEMORY_MAPS) { + if constexpr (!USE_MEMORY_MAPS_FOR_UPLOADS) { std::span<u8> immediate_buffer; for (const BufferCopy& copy : copies) { std::span<const u8> upload_span; @@ -1532,7 +1530,7 @@ bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size, auto& buffer = slot_buffers[buffer_id]; SynchronizeBuffer(buffer, dest_address, static_cast<u32>(copy_size)); - if constexpr (USE_MEMORY_MAPS) { + if constexpr (USE_MEMORY_MAPS_FOR_UPLOADS) { auto upload_staging = runtime.UploadStagingBuffer(copy_size); std::array copies{BufferCopy{ .src_offset = upload_staging.offset, diff --git a/src/video_core/buffer_cache/buffer_cache_base.h b/src/video_core/buffer_cache/buffer_cache_base.h index c689fe06b..60a1f285e 100644 --- a/src/video_core/buffer_cache/buffer_cache_base.h +++ b/src/video_core/buffer_cache/buffer_cache_base.h @@ -173,6 +173,7 @@ class BufferCache : public VideoCommon::ChannelSetupCaches<BufferCacheChannelInf static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; static constexpr bool SEPARATE_IMAGE_BUFFERS_BINDINGS = P::SEPARATE_IMAGE_BUFFER_BINDINGS; static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = P::IMPLEMENTS_ASYNC_DOWNLOADS; + static constexpr bool USE_MEMORY_MAPS_FOR_UPLOADS = P::USE_MEMORY_MAPS_FOR_UPLOADS; static constexpr s64 DEFAULT_EXPECTED_MEMORY = 512_MiB; static constexpr s64 DEFAULT_CRITICAL_MEMORY = 1_GiB; diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 6d3bda192..c419714d4 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -106,8 +106,10 @@ GLuint Buffer::View(u32 offset, u32 size, PixelFormat format) { return views.back().texture.handle; } -BufferCacheRuntime::BufferCacheRuntime(const Device& device_) - : device{device_}, has_fast_buffer_sub_data{device.HasFastBufferSubData()}, +BufferCacheRuntime::BufferCacheRuntime(const Device& device_, + StagingBufferPool& staging_buffer_pool_) + : device{device_}, staging_buffer_pool{staging_buffer_pool_}, + has_fast_buffer_sub_data{device.HasFastBufferSubData()}, use_assembly_shaders{device.UseAssemblyShaders()}, has_unified_vertex_buffers{device.HasVertexBufferUnifiedMemory()}, stream_buffer{has_fast_buffer_sub_data ? std::nullopt : std::make_optional<StreamBuffer>()} { @@ -140,6 +142,14 @@ BufferCacheRuntime::BufferCacheRuntime(const Device& device_) }(); } +StagingBufferMap BufferCacheRuntime::UploadStagingBuffer(size_t size) { + return staging_buffer_pool.RequestUploadBuffer(size); +} + +StagingBufferMap BufferCacheRuntime::DownloadStagingBuffer(size_t size) { + return staging_buffer_pool.RequestDownloadBuffer(size); +} + u64 BufferCacheRuntime::GetDeviceMemoryUsage() const { if (device.CanReportMemoryUsage()) { return device_access_memory - device.GetCurrentDedicatedVideoMemory(); @@ -147,15 +157,49 @@ u64 BufferCacheRuntime::GetDeviceMemoryUsage() const { return 2_GiB; } -void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, - std::span<const VideoCommon::BufferCopy> copies) { +void BufferCacheRuntime::CopyBuffer(GLuint dst_buffer, GLuint src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier) { + if (barrier) { + PreCopyBarrier(); + } for (const VideoCommon::BufferCopy& copy : copies) { - glCopyNamedBufferSubData( - src_buffer.Handle(), dst_buffer.Handle(), static_cast<GLintptr>(copy.src_offset), - static_cast<GLintptr>(copy.dst_offset), static_cast<GLsizeiptr>(copy.size)); + glCopyNamedBufferSubData(src_buffer, dst_buffer, static_cast<GLintptr>(copy.src_offset), + static_cast<GLintptr>(copy.dst_offset), + static_cast<GLsizeiptr>(copy.size)); + } + if (barrier) { + PostCopyBarrier(); } } +void BufferCacheRuntime::CopyBuffer(GLuint dst_buffer, Buffer& src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier) { + CopyBuffer(dst_buffer, src_buffer.Handle(), copies, barrier); +} + +void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, GLuint src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier) { + CopyBuffer(dst_buffer.Handle(), src_buffer, copies, barrier); +} + +void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, + std::span<const VideoCommon::BufferCopy> copies) { + CopyBuffer(dst_buffer.Handle(), src_buffer.Handle(), copies); +} + +void BufferCacheRuntime::PreCopyBarrier() { + // TODO: finer grained barrier? + glMemoryBarrier(GL_ALL_BARRIER_BITS); +} + +void BufferCacheRuntime::PostCopyBarrier() { + glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT | GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT); +} + +void BufferCacheRuntime::Finish() { + glFinish(); +} + void BufferCacheRuntime::ClearBuffer(Buffer& dest_buffer, u32 offset, size_t size, u32 value) { glClearNamedBufferSubData(dest_buffer.Handle(), GL_R32UI, static_cast<GLintptr>(offset), static_cast<GLsizeiptr>(size), GL_RED, GL_UNSIGNED_INT, &value); diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 18d3c3ac0..a24991585 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -12,7 +12,7 @@ #include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" -#include "video_core/renderer_opengl/gl_stream_buffer.h" +#include "video_core/renderer_opengl/gl_staging_buffer_pool.h" namespace OpenGL { @@ -60,11 +60,28 @@ class BufferCacheRuntime { public: static constexpr u8 INVALID_BINDING = std::numeric_limits<u8>::max(); - explicit BufferCacheRuntime(const Device& device_); + explicit BufferCacheRuntime(const Device& device_, StagingBufferPool& staging_buffer_pool_); + + [[nodiscard]] StagingBufferMap UploadStagingBuffer(size_t size); + + [[nodiscard]] StagingBufferMap DownloadStagingBuffer(size_t size); + + void CopyBuffer(GLuint dst_buffer, GLuint src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier = true); + + void CopyBuffer(GLuint dst_buffer, Buffer& src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier = true); + + void CopyBuffer(Buffer& dst_buffer, GLuint src_buffer, + std::span<const VideoCommon::BufferCopy> copies, bool barrier = true); void CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, std::span<const VideoCommon::BufferCopy> copies); + void PreCopyBarrier(); + void PostCopyBarrier(); + void Finish(); + void ClearBuffer(Buffer& dest_buffer, u32 offset, size_t size, u32 value); void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size); @@ -169,6 +186,7 @@ private: }; const Device& device; + StagingBufferPool& staging_buffer_pool; bool has_fast_buffer_sub_data = false; bool use_assembly_shaders = false; @@ -201,7 +219,7 @@ private: struct BufferCacheParams { using Runtime = OpenGL::BufferCacheRuntime; using Buffer = OpenGL::Buffer; - using Async_Buffer = u32; + using Async_Buffer = OpenGL::StagingBufferMap; using MemoryTracker = VideoCommon::MemoryTrackerBase<VideoCore::RasterizerInterface>; static constexpr bool IS_OPENGL = true; @@ -209,9 +227,12 @@ struct BufferCacheParams { static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = true; static constexpr bool NEEDS_BIND_UNIFORM_INDEX = true; static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; - static constexpr bool USE_MEMORY_MAPS = false; + static constexpr bool USE_MEMORY_MAPS = true; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = true; static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = false; + + // TODO: Investigate why OpenGL seems to perform worse with persistently mapped buffer uploads + static constexpr bool USE_MEMORY_MAPS_FOR_UPLOADS = false; }; using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index f5baa0f3c..fc711c44a 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -24,6 +24,7 @@ #include "video_core/renderer_opengl/gl_query_cache.h" #include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_shader_cache.h" +#include "video_core/renderer_opengl/gl_staging_buffer_pool.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/maxwell_to_gl.h" #include "video_core/renderer_opengl/renderer_opengl.h" @@ -58,8 +59,9 @@ RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra StateTracker& state_tracker_) : RasterizerAccelerated(cpu_memory_), gpu(gpu_), device(device_), screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), - texture_cache_runtime(device, program_manager, state_tracker), - texture_cache(texture_cache_runtime, *this), buffer_cache_runtime(device), + texture_cache_runtime(device, program_manager, state_tracker, staging_buffer_pool), + texture_cache(texture_cache_runtime, *this), + buffer_cache_runtime(device, staging_buffer_pool), buffer_cache(*this, cpu_memory_, buffer_cache_runtime), shader_cache(*this, emu_window_, device, texture_cache, buffer_cache, program_manager, state_tracker, gpu.ShaderNotify()), diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 410d8ffc5..a73ad15c1 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -230,6 +230,7 @@ private: ProgramManager& program_manager; StateTracker& state_tracker; + StagingBufferPool staging_buffer_pool; TextureCacheRuntime texture_cache_runtime; TextureCache texture_cache; BufferCacheRuntime buffer_cache_runtime; diff --git a/src/video_core/renderer_opengl/gl_staging_buffer_pool.cpp b/src/video_core/renderer_opengl/gl_staging_buffer_pool.cpp new file mode 100644 index 000000000..bbb06e51f --- /dev/null +++ b/src/video_core/renderer_opengl/gl_staging_buffer_pool.cpp @@ -0,0 +1,150 @@ +// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <array> +#include <memory> +#include <span> + +#include <glad/glad.h> + +#include "common/alignment.h" +#include "common/assert.h" +#include "common/bit_util.h" +#include "common/microprofile.h" +#include "video_core/renderer_opengl/gl_staging_buffer_pool.h" + +MICROPROFILE_DEFINE(OpenGL_BufferRequest, "OpenGL", "BufferRequest", MP_RGB(128, 128, 192)); + +namespace OpenGL { + +StagingBufferMap::~StagingBufferMap() { + if (sync) { + sync->Create(); + } +} + +StagingBuffers::StagingBuffers(GLenum storage_flags_, GLenum map_flags_) + : storage_flags{storage_flags_}, map_flags{map_flags_} {} + +StagingBuffers::~StagingBuffers() = default; + +StagingBufferMap StagingBuffers::RequestMap(size_t requested_size, bool insert_fence) { + MICROPROFILE_SCOPE(OpenGL_BufferRequest); + + const size_t index = RequestBuffer(requested_size); + OGLSync* const sync = insert_fence ? &syncs[index] : nullptr; + sync_indices[index] = insert_fence ? ++current_sync_index : 0; + return StagingBufferMap{ + .mapped_span = std::span(maps[index], requested_size), + .sync = sync, + .buffer = buffers[index].handle, + }; +} + +size_t StagingBuffers::RequestBuffer(size_t requested_size) { + if (const std::optional<size_t> index = FindBuffer(requested_size); index) { + return *index; + } + + OGLBuffer& buffer = buffers.emplace_back(); + buffer.Create(); + const auto next_pow2_size = Common::NextPow2(requested_size); + glNamedBufferStorage(buffer.handle, next_pow2_size, nullptr, + storage_flags | GL_MAP_PERSISTENT_BIT); + maps.push_back(static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, next_pow2_size, + map_flags | GL_MAP_PERSISTENT_BIT))); + syncs.emplace_back(); + sync_indices.emplace_back(); + sizes.push_back(next_pow2_size); + + ASSERT(syncs.size() == buffers.size() && buffers.size() == maps.size() && + maps.size() == sizes.size()); + + return buffers.size() - 1; +} + +std::optional<size_t> StagingBuffers::FindBuffer(size_t requested_size) { + size_t known_unsignaled_index = current_sync_index + 1; + size_t smallest_buffer = std::numeric_limits<size_t>::max(); + std::optional<size_t> found; + const size_t num_buffers = sizes.size(); + for (size_t index = 0; index < num_buffers; ++index) { + const size_t buffer_size = sizes[index]; + if (buffer_size < requested_size || buffer_size >= smallest_buffer) { + continue; + } + if (syncs[index].handle != 0) { + if (sync_indices[index] >= known_unsignaled_index) { + // This fence is later than a fence that is known to not be signaled + continue; + } + if (!syncs[index].IsSignaled()) { + // Since this fence hasn't been signaled, it's safe to assume all later + // fences haven't been signaled either + known_unsignaled_index = std::min(known_unsignaled_index, sync_indices[index]); + continue; + } + syncs[index].Release(); + } + smallest_buffer = buffer_size; + found = index; + } + return found; +} + +StreamBuffer::StreamBuffer() { + static constexpr GLenum flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; + buffer.Create(); + glObjectLabel(GL_BUFFER, buffer.handle, -1, "Stream Buffer"); + glNamedBufferStorage(buffer.handle, STREAM_BUFFER_SIZE, nullptr, flags); + mapped_pointer = + static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, STREAM_BUFFER_SIZE, flags)); + for (OGLSync& sync : fences) { + sync.Create(); + } +} + +std::pair<std::span<u8>, size_t> StreamBuffer::Request(size_t size) noexcept { + ASSERT(size < REGION_SIZE); + for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end; + ++region) { + fences[region].Create(); + } + used_iterator = iterator; + + for (size_t region = Region(free_iterator) + 1, + region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS); + region < region_end; ++region) { + glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); + fences[region].Release(); + } + if (iterator + size >= free_iterator) { + free_iterator = iterator + size; + } + if (iterator + size > STREAM_BUFFER_SIZE) { + for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) { + fences[region].Create(); + } + used_iterator = 0; + iterator = 0; + free_iterator = size; + + for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) { + glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); + fences[region].Release(); + } + } + const size_t offset = iterator; + iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT); + return {std::span(mapped_pointer + offset, size), offset}; +} + +StagingBufferMap StagingBufferPool::RequestUploadBuffer(size_t size) { + return upload_buffers.RequestMap(size, true); +} + +StagingBufferMap StagingBufferPool::RequestDownloadBuffer(size_t size) { + return download_buffers.RequestMap(size, false); +} + +} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_staging_buffer_pool.h index 8fe927aaf..60f72d3a0 100644 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_staging_buffer_pool.h @@ -4,8 +4,10 @@ #pragma once #include <array> +#include <optional> #include <span> #include <utility> +#include <vector> #include <glad/glad.h> @@ -17,6 +19,35 @@ namespace OpenGL { using namespace Common::Literals; +struct StagingBufferMap { + ~StagingBufferMap(); + + std::span<u8> mapped_span; + size_t offset = 0; + OGLSync* sync; + GLuint buffer; +}; + +struct StagingBuffers { + explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_); + ~StagingBuffers(); + + StagingBufferMap RequestMap(size_t requested_size, bool insert_fence); + + size_t RequestBuffer(size_t requested_size); + + std::optional<size_t> FindBuffer(size_t requested_size); + + std::vector<OGLSync> syncs; + std::vector<OGLBuffer> buffers; + std::vector<u8*> maps; + std::vector<size_t> sizes; + std::vector<size_t> sync_indices; + GLenum storage_flags; + GLenum map_flags; + size_t current_sync_index = 0; +}; + class StreamBuffer { static constexpr size_t STREAM_BUFFER_SIZE = 64_MiB; static constexpr size_t NUM_SYNCS = 16; @@ -48,4 +79,17 @@ private: std::array<OGLSync, NUM_SYNCS> fences; }; +class StagingBufferPool { +public: + StagingBufferPool() = default; + ~StagingBufferPool() = default; + + StagingBufferMap RequestUploadBuffer(size_t size); + StagingBufferMap RequestDownloadBuffer(size_t size); + +private: + StagingBuffers upload_buffers{GL_MAP_WRITE_BIT, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT}; + StagingBuffers download_buffers{GL_MAP_READ_BIT | GL_CLIENT_STORAGE_BIT, GL_MAP_READ_BIT}; +}; + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp deleted file mode 100644 index 2005c8993..000000000 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ /dev/null @@ -1,63 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2021 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include <array> -#include <memory> -#include <span> - -#include <glad/glad.h> - -#include "common/alignment.h" -#include "common/assert.h" -#include "video_core/renderer_opengl/gl_stream_buffer.h" - -namespace OpenGL { - -StreamBuffer::StreamBuffer() { - static constexpr GLenum flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; - buffer.Create(); - glObjectLabel(GL_BUFFER, buffer.handle, -1, "Stream Buffer"); - glNamedBufferStorage(buffer.handle, STREAM_BUFFER_SIZE, nullptr, flags); - mapped_pointer = - static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, STREAM_BUFFER_SIZE, flags)); - for (OGLSync& sync : fences) { - sync.Create(); - } -} - -std::pair<std::span<u8>, size_t> StreamBuffer::Request(size_t size) noexcept { - ASSERT(size < REGION_SIZE); - for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end; - ++region) { - fences[region].Create(); - } - used_iterator = iterator; - - for (size_t region = Region(free_iterator) + 1, - region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS); - region < region_end; ++region) { - glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); - fences[region].Release(); - } - if (iterator + size >= free_iterator) { - free_iterator = iterator + size; - } - if (iterator + size > STREAM_BUFFER_SIZE) { - for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) { - fences[region].Create(); - } - used_iterator = 0; - iterator = 0; - free_iterator = size; - - for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) { - glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); - fences[region].Release(); - } - } - const size_t offset = iterator; - iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT); - return {std::span(mapped_pointer + offset, size), offset}; -} - -} // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 56d0ff869..1c5dbcdd8 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -456,19 +456,14 @@ OGLTexture MakeImage(const VideoCommon::ImageInfo& info, GLenum gl_internal_form return is_srgb ? GL_SRGB8_ALPHA8 : GL_RGBA8; } } - } // Anonymous namespace -ImageBufferMap::~ImageBufferMap() { - if (sync) { - sync->Create(); - } -} - TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& program_manager, - StateTracker& state_tracker_) - : device{device_}, state_tracker{state_tracker_}, util_shaders(program_manager), - format_conversion_pass{util_shaders}, resolution{Settings::values.resolution_info} { + StateTracker& state_tracker_, + StagingBufferPool& staging_buffer_pool_) + : device{device_}, state_tracker{state_tracker_}, staging_buffer_pool{staging_buffer_pool_}, + util_shaders(program_manager), format_conversion_pass{util_shaders}, + resolution{Settings::values.resolution_info} { static constexpr std::array TARGETS{GL_TEXTURE_1D_ARRAY, GL_TEXTURE_2D_ARRAY, GL_TEXTURE_3D}; for (size_t i = 0; i < TARGETS.size(); ++i) { const GLenum target = TARGETS[i]; @@ -558,12 +553,12 @@ void TextureCacheRuntime::Finish() { glFinish(); } -ImageBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) { - return upload_buffers.RequestMap(size, true); +StagingBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) { + return staging_buffer_pool.RequestUploadBuffer(size); } -ImageBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) { - return download_buffers.RequestMap(size, false); +StagingBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) { + return staging_buffer_pool.RequestDownloadBuffer(size); } u64 TextureCacheRuntime::GetDeviceMemoryUsage() const { @@ -648,7 +643,7 @@ void TextureCacheRuntime::BlitFramebuffer(Framebuffer* dst, Framebuffer* src, is_linear ? GL_LINEAR : GL_NEAREST); } -void TextureCacheRuntime::AccelerateImageUpload(Image& image, const ImageBufferMap& map, +void TextureCacheRuntime::AccelerateImageUpload(Image& image, const StagingBufferMap& map, std::span<const SwizzleParameters> swizzles) { switch (image.info.type) { case ImageType::e2D: @@ -690,64 +685,6 @@ bool TextureCacheRuntime::HasNativeASTC() const noexcept { return device.HasASTC(); } -TextureCacheRuntime::StagingBuffers::StagingBuffers(GLenum storage_flags_, GLenum map_flags_) - : storage_flags{storage_flags_}, map_flags{map_flags_} {} - -TextureCacheRuntime::StagingBuffers::~StagingBuffers() = default; - -ImageBufferMap TextureCacheRuntime::StagingBuffers::RequestMap(size_t requested_size, - bool insert_fence) { - const size_t index = RequestBuffer(requested_size); - OGLSync* const sync = insert_fence ? &syncs[index] : nullptr; - return ImageBufferMap{ - .mapped_span = std::span(maps[index], requested_size), - .sync = sync, - .buffer = buffers[index].handle, - }; -} - -size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) { - if (const std::optional<size_t> index = FindBuffer(requested_size); index) { - return *index; - } - - OGLBuffer& buffer = buffers.emplace_back(); - buffer.Create(); - glNamedBufferStorage(buffer.handle, requested_size, nullptr, - storage_flags | GL_MAP_PERSISTENT_BIT); - maps.push_back(static_cast<u8*>(glMapNamedBufferRange(buffer.handle, 0, requested_size, - map_flags | GL_MAP_PERSISTENT_BIT))); - - syncs.emplace_back(); - sizes.push_back(requested_size); - - ASSERT(syncs.size() == buffers.size() && buffers.size() == maps.size() && - maps.size() == sizes.size()); - - return buffers.size() - 1; -} - -std::optional<size_t> TextureCacheRuntime::StagingBuffers::FindBuffer(size_t requested_size) { - size_t smallest_buffer = std::numeric_limits<size_t>::max(); - std::optional<size_t> found; - const size_t num_buffers = sizes.size(); - for (size_t index = 0; index < num_buffers; ++index) { - const size_t buffer_size = sizes[index]; - if (buffer_size < requested_size || buffer_size >= smallest_buffer) { - continue; - } - if (syncs[index].handle != 0) { - if (!syncs[index].IsSignaled()) { - continue; - } - syncs[index].Release(); - } - smallest_buffer = buffer_size; - found = index; - } - return found; -} - Image::Image(TextureCacheRuntime& runtime_, const VideoCommon::ImageInfo& info_, GPUVAddr gpu_addr_, VAddr cpu_addr_) : VideoCommon::ImageBase(info_, gpu_addr_, cpu_addr_), runtime{&runtime_} { @@ -823,7 +760,7 @@ void Image::UploadMemory(GLuint buffer_handle, size_t buffer_offset, } } -void Image::UploadMemory(const ImageBufferMap& map, +void Image::UploadMemory(const StagingBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies) { UploadMemory(map.buffer, map.offset, copies); } @@ -870,7 +807,7 @@ void Image::DownloadMemory(std::span<GLuint> buffer_handles, std::span<size_t> b } } -void Image::DownloadMemory(ImageBufferMap& map, +void Image::DownloadMemory(StagingBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies) { DownloadMemory(map.buffer, map.offset, copies); } diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 3e9b3302b..1148b73d7 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -11,6 +11,7 @@ #include "shader_recompiler/shader_info.h" #include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/renderer_opengl/gl_staging_buffer_pool.h" #include "video_core/renderer_opengl/util_shaders.h" #include "video_core/texture_cache/image_view_base.h" #include "video_core/texture_cache/texture_cache_base.h" @@ -37,15 +38,6 @@ using VideoCommon::Region2D; using VideoCommon::RenderTargets; using VideoCommon::SlotVector; -struct ImageBufferMap { - ~ImageBufferMap(); - - std::span<u8> mapped_span; - size_t offset = 0; - OGLSync* sync; - GLuint buffer; -}; - struct FormatProperties { GLenum compatibility_class; bool compatibility_by_size; @@ -74,14 +66,15 @@ class TextureCacheRuntime { public: explicit TextureCacheRuntime(const Device& device, ProgramManager& program_manager, - StateTracker& state_tracker); + StateTracker& state_tracker, + StagingBufferPool& staging_buffer_pool); ~TextureCacheRuntime(); void Finish(); - ImageBufferMap UploadStagingBuffer(size_t size); + StagingBufferMap UploadStagingBuffer(size_t size); - ImageBufferMap DownloadStagingBuffer(size_t size); + StagingBufferMap DownloadStagingBuffer(size_t size); u64 GetDeviceLocalMemory() const { return device_access_memory; @@ -120,7 +113,7 @@ public: const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation); - void AccelerateImageUpload(Image& image, const ImageBufferMap& map, + void AccelerateImageUpload(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); void InsertUploadMemoryBarrier(); @@ -149,35 +142,16 @@ public: } private: - struct StagingBuffers { - explicit StagingBuffers(GLenum storage_flags_, GLenum map_flags_); - ~StagingBuffers(); - - ImageBufferMap RequestMap(size_t requested_size, bool insert_fence); - - size_t RequestBuffer(size_t requested_size); - - std::optional<size_t> FindBuffer(size_t requested_size); - - std::vector<OGLSync> syncs; - std::vector<OGLBuffer> buffers; - std::vector<u8*> maps; - std::vector<size_t> sizes; - GLenum storage_flags; - GLenum map_flags; - }; - const Device& device; StateTracker& state_tracker; + StagingBufferPool& staging_buffer_pool; + UtilShaders util_shaders; FormatConversionPass format_conversion_pass; std::array<std::unordered_map<GLenum, FormatProperties>, 3> format_properties; bool has_broken_texture_view_formats = false; - StagingBuffers upload_buffers{GL_MAP_WRITE_BIT, GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT}; - StagingBuffers download_buffers{GL_MAP_READ_BIT | GL_CLIENT_STORAGE_BIT, GL_MAP_READ_BIT}; - OGLTexture null_image_1d_array; OGLTexture null_image_cube_array; OGLTexture null_image_3d; @@ -213,7 +187,7 @@ public: void UploadMemory(GLuint buffer_handle, size_t buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies); - void UploadMemory(const ImageBufferMap& map, + void UploadMemory(const StagingBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); void DownloadMemory(GLuint buffer_handle, size_t buffer_offset, @@ -222,7 +196,8 @@ public: void DownloadMemory(std::span<GLuint> buffer_handle, std::span<size_t> buffer_offset, std::span<const VideoCommon::BufferImageCopy> copies); - void DownloadMemory(ImageBufferMap& map, std::span<const VideoCommon::BufferImageCopy> copies); + void DownloadMemory(StagingBufferMap& map, + std::span<const VideoCommon::BufferImageCopy> copies); GLuint StorageHandle() noexcept; diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 2c7ac210b..544982d18 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -19,6 +19,7 @@ #include "video_core/host_shaders/pitch_unswizzle_comp.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_shader_util.h" +#include "video_core/renderer_opengl/gl_staging_buffer_pool.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/renderer_opengl/util_shaders.h" #include "video_core/texture_cache/accelerated_swizzle.h" @@ -63,7 +64,7 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) UtilShaders::~UtilShaders() = default; -void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, +void UtilShaders::ASTCDecode(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles) { static constexpr GLuint BINDING_INPUT_BUFFER = 0; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; @@ -111,7 +112,7 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, program_manager.RestoreGuestCompute(); } -void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, +void UtilShaders::BlockLinearUpload2D(Image& image, const StagingBufferMap& map, std::span<const SwizzleParameters> swizzles) { static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; @@ -148,7 +149,7 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, program_manager.RestoreGuestCompute(); } -void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, +void UtilShaders::BlockLinearUpload3D(Image& image, const StagingBufferMap& map, std::span<const SwizzleParameters> swizzles) { static constexpr Extent3D WORKGROUP_SIZE{16, 8, 8}; @@ -189,7 +190,7 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, program_manager.RestoreGuestCompute(); } -void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, +void UtilShaders::PitchUpload(Image& image, const StagingBufferMap& map, std::span<const SwizzleParameters> swizzles) { static constexpr Extent3D WORKGROUP_SIZE{32, 32, 1}; static constexpr GLuint BINDING_INPUT_BUFFER = 0; diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h index 9013808e7..feecd404c 100644 --- a/src/video_core/renderer_opengl/util_shaders.h +++ b/src/video_core/renderer_opengl/util_shaders.h @@ -16,23 +16,23 @@ namespace OpenGL { class Image; class ProgramManager; -struct ImageBufferMap; +struct StagingBufferMap; class UtilShaders { public: explicit UtilShaders(ProgramManager& program_manager); ~UtilShaders(); - void ASTCDecode(Image& image, const ImageBufferMap& map, + void ASTCDecode(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); - void BlockLinearUpload2D(Image& image, const ImageBufferMap& map, + void BlockLinearUpload2D(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); - void BlockLinearUpload3D(Image& image, const ImageBufferMap& map, + void BlockLinearUpload3D(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); - void PitchUpload(Image& image, const ImageBufferMap& map, + void PitchUpload(Image& image, const StagingBufferMap& map, std::span<const VideoCommon::SwizzleParameters> swizzles); void CopyBC4(Image& dst_image, Image& src_image, diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 794dd0758..92b4f7859 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -157,6 +157,7 @@ struct BufferCacheParams { static constexpr bool USE_MEMORY_MAPS = true; static constexpr bool SEPARATE_IMAGE_BUFFER_BINDINGS = false; static constexpr bool IMPLEMENTS_ASYNC_DOWNLOADS = true; + static constexpr bool USE_MEMORY_MAPS_FOR_UPLOADS = true; }; using BufferCache = VideoCommon::BufferCache<BufferCacheParams>; |