Diffstat (limited to 'src/video_core')
39 files changed, 1216 insertions, 1032 deletions
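Many of the vic.cpp hunks below read their parameters out of a 64-bit VicConfig word via Common::BitField. As a reader aid, here is a self-contained sketch of that decoding, with plain shifts standing in for BitField; the struct and function names are illustrative and not part of the diff.

    #include <cstdint>

    // Field layout mirrors the VicConfig union added in vic.cpp:
    //   pixel_format: bits [0, 7), block_linear_kind: bits [11, 15),
    //   surface_width_minus1: bits [32, 46), surface_height_minus1: bits [46, 60).
    struct VicConfigFields {
        std::uint64_t pixel_format;
        std::uint64_t block_linear_kind;
        std::uint64_t surface_width;  // decoded as the minus1 field + 1
        std::uint64_t surface_height; // decoded as the minus1 field + 1
    };

    constexpr VicConfigFields DecodeVicConfig(std::uint64_t raw) {
        const auto bits = [raw](unsigned pos, unsigned size) {
            return (raw >> pos) & ((std::uint64_t{1} << size) - 1);
        };
        return {bits(0, 7), bits(11, 4), bits(32, 14) + 1, bits(46, 14) + 1};
    }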
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 7bfd57369..d350c9b36 100644 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -570,13 +570,12 @@ bool BufferCache<P>::DMACopy(GPUVAddr src_address, GPUVAddr dest_address, u64 am ForEachWrittenRange(*cpu_src_address, amount, mirror); // This subtraction in this order is important for overlapping copies. common_ranges.subtract(subtract_interval); - bool atleast_1_download = tmp_intervals.size() != 0; - for (const IntervalType add_interval : tmp_intervals) { + const bool has_new_downloads = tmp_intervals.size() != 0; + for (const IntervalType& add_interval : tmp_intervals) { common_ranges.add(add_interval); } - runtime.CopyBuffer(dest_buffer, src_buffer, copies); - if (atleast_1_download) { + if (has_new_downloads) { dest_buffer.MarkRegionAsGpuModified(*cpu_dest_address, amount); } std::vector<u8> tmp_buffer(amount); diff --git a/src/video_core/cdma_pusher.cpp b/src/video_core/cdma_pusher.cpp index 8b86ad050..a8c4b4415 100644 --- a/src/video_core/cdma_pusher.cpp +++ b/src/video_core/cdma_pusher.cpp @@ -24,6 +24,7 @@ #include "command_classes/vic.h" #include "video_core/cdma_pusher.h" #include "video_core/command_classes/nvdec_common.h" +#include "video_core/command_classes/sync_manager.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" diff --git a/src/video_core/cdma_pusher.h b/src/video_core/cdma_pusher.h index 1bada44dd..87b49d6ea 100644 --- a/src/video_core/cdma_pusher.h +++ b/src/video_core/cdma_pusher.h @@ -9,13 +9,13 @@ #include "common/bit_field.h" #include "common/common_types.h" -#include "video_core/command_classes/sync_manager.h" namespace Tegra { class GPU; class Host1x; class Nvdec; +class SyncptIncrManager; class Vic; enum class ChSubmissionMode : u32 { diff --git a/src/video_core/command_classes/codecs/h264.cpp b/src/video_core/command_classes/codecs/h264.cpp index 51ee14c13..5519c4705 100644 --- a/src/video_core/command_classes/codecs/h264.cpp +++ b/src/video_core/command_classes/codecs/h264.cpp @@ -20,6 +20,8 @@ #include <array> #include <bit> + +#include "common/settings.h" #include "video_core/command_classes/codecs/h264.h" #include "video_core/gpu.h" #include "video_core/memory_manager.h" @@ -96,7 +98,10 @@ const std::vector<u8>& H264::ComposeFrameHeader(const NvdecCommon::NvdecRegister (context.h264_parameter_set.frame_mbs_only_flag ? 1 : 2); // TODO (ameerj): Where do we get this number, it seems to be particular for each stream - writer.WriteUe(6); // Max number of reference frames + const auto nvdec_decoding = Settings::values.nvdec_emulation.GetValue(); + const bool uses_gpu_decoding = nvdec_decoding == Settings::NvdecEmulation::GPU; + const u32 max_num_ref_frames = uses_gpu_decoding ? 
6u : 16u; + writer.WriteUe(max_num_ref_frames); writer.WriteBit(false); writer.WriteUe(context.h264_parameter_set.pic_width_in_mbs - 1); writer.WriteUe(pic_height - 1); diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index 0ee07f398..051616124 100644 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp @@ -16,6 +16,7 @@ extern "C" { } #include "common/assert.h" +#include "common/bit_field.h" #include "common/logging/log.h" #include "video_core/command_classes/nvdec.h" @@ -26,6 +27,25 @@ extern "C" { #include "video_core/textures/decoders.h" namespace Tegra { +namespace { +enum class VideoPixelFormat : u64_le { + RGBA8 = 0x1f, + BGRA8 = 0x20, + RGBX8 = 0x23, + YUV420 = 0x44, +}; +} // Anonymous namespace + +union VicConfig { + u64_le raw{}; + BitField<0, 7, VideoPixelFormat> pixel_format; + BitField<7, 2, u64_le> chroma_loc_horiz; + BitField<9, 2, u64_le> chroma_loc_vert; + BitField<11, 4, u64_le> block_linear_kind; + BitField<15, 4, u64_le> block_linear_height_log2; + BitField<32, 14, u64_le> surface_width_minus1; + BitField<46, 14, u64_le> surface_height_minus1; +}; Vic::Vic(GPU& gpu_, std::shared_ptr<Nvdec> nvdec_processor_) : gpu(gpu_), @@ -65,134 +85,155 @@ void Vic::Execute() { if (!frame) { return; } - const auto pixel_format = static_cast<VideoPixelFormat>(config.pixel_format.Value()); - switch (pixel_format) { + const u64 surface_width = config.surface_width_minus1 + 1; + const u64 surface_height = config.surface_height_minus1 + 1; + if (static_cast<u64>(frame->width) != surface_width || + static_cast<u64>(frame->height) != surface_height) { + // TODO: Properly support multiple video streams with differing frame dimensions + LOG_WARNING(Service_NVDRV, "Frame dimensions {}x{} don't match surface dimensions {}x{}", + frame->width, frame->height, surface_width, surface_height); + } + switch (config.pixel_format) { + case VideoPixelFormat::RGBA8: case VideoPixelFormat::BGRA8: - case VideoPixelFormat::RGBA8: { - LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); + case VideoPixelFormat::RGBX8: + WriteRGBFrame(frame, config); + break; + case VideoPixelFormat::YUV420: + WriteYUVFrame(frame, config); + break; + default: + UNIMPLEMENTED_MSG("Unknown video pixel format {:X}", config.pixel_format.Value()); + break; + } +} - if (scaler_ctx == nullptr || frame->width != scaler_width || - frame->height != scaler_height) { - const AVPixelFormat target_format = - (pixel_format == VideoPixelFormat::RGBA8) ? AV_PIX_FMT_RGBA : AV_PIX_FMT_BGRA; +void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) { + LOG_TRACE(Service_NVDRV, "Writing RGB Frame"); + + if (!scaler_ctx || frame->width != scaler_width || frame->height != scaler_height) { + const AVPixelFormat target_format = [pixel_format = config.pixel_format]() { + switch (pixel_format) { + case VideoPixelFormat::RGBA8: + return AV_PIX_FMT_RGBA; + case VideoPixelFormat::BGRA8: + return AV_PIX_FMT_BGRA; + case VideoPixelFormat::RGBX8: + return AV_PIX_FMT_RGB0; + default: + return AV_PIX_FMT_RGBA; + } + }(); + + sws_freeContext(scaler_ctx); + // Frames are decoded into either YUV420 or NV12 formats. 
Convert to desired RGB format + scaler_ctx = sws_getContext(frame->width, frame->height, + static_cast<AVPixelFormat>(frame->format), frame->width, + frame->height, target_format, 0, nullptr, nullptr, nullptr); + scaler_width = frame->width; + scaler_height = frame->height; + converted_frame_buffer.reset(); + } + if (!converted_frame_buffer) { + const size_t frame_size = frame->width * frame->height * 4; + converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(frame_size)), av_free}; + } + const std::array<int, 4> converted_stride{frame->width * 4, frame->height * 4, 0, 0}; + u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; + sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, &converted_frame_buf_addr, + converted_stride.data()); + + // Use the minimum of surface/frame dimensions to avoid buffer overflow. + const u32 surface_width = static_cast<u32>(config.surface_width_minus1) + 1; + const u32 surface_height = static_cast<u32>(config.surface_height_minus1) + 1; + const u32 width = std::min(surface_width, static_cast<u32>(frame->width)); + const u32 height = std::min(surface_height, static_cast<u32>(frame->height)); + const u32 blk_kind = static_cast<u32>(config.block_linear_kind); + if (blk_kind != 0) { + // swizzle pitch linear to block linear + const u32 block_height = static_cast<u32>(config.block_linear_height_log2); + const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); + luma_buffer.resize(size); + Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(), + converted_frame_buf_addr, block_height, 0, 0); + + gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); + } else { + // send pitch linear frame + const size_t linear_size = width * height * 4; + gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, + linear_size); + } +} - sws_freeContext(scaler_ctx); - scaler_ctx = nullptr; +void Vic::WriteYUVFrame(const AVFrame* frame, const VicConfig& config) { + LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); - // Frames are decoded into either YUV420 or NV12 formats. Convert to desired format - scaler_ctx = sws_getContext(frame->width, frame->height, - static_cast<AVPixelFormat>(frame->format), frame->width, - frame->height, target_format, 0, nullptr, nullptr, nullptr); + const std::size_t surface_width = config.surface_width_minus1 + 1; + const std::size_t surface_height = config.surface_height_minus1 + 1; + const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL; + // Use the minimum of surface/frame dimensions to avoid buffer overflow. 
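The aligned_width computation a few lines up rounds the row pitch up to a multiple of 0x100 bytes before the luma and chroma planes are written out. A minimal illustration of that bit trick (the function name is ours):

    #include <cstddef>

    // (width + 0xff) & ~0xff rounds up to the next multiple of 256.
    constexpr std::size_t AlignUp256(std::size_t width) {
        return (width + 0xff) & ~static_cast<std::size_t>(0xff);
    }
    static_assert(AlignUp256(1280) == 1280, "already a multiple of 0x100");
    static_assert(AlignUp256(1281) == 1536, "rounded up to the next multiple of 0x100");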
+ const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width)); + const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height)); - scaler_width = frame->width; - scaler_height = frame->height; - } - // Get Converted frame - const u32 width = static_cast<u32>(frame->width); - const u32 height = static_cast<u32>(frame->height); - const std::size_t linear_size = width * height * 4; - - // Only allocate frame_buffer once per stream, as the size is not expected to change - if (!converted_frame_buffer) { - converted_frame_buffer = AVMallocPtr{static_cast<u8*>(av_malloc(linear_size)), av_free}; + const auto stride = static_cast<size_t>(frame->linesize[0]); + + luma_buffer.resize(aligned_width * surface_height); + chroma_buffer.resize(aligned_width * surface_height / 2); + + // Populate luma buffer + const u8* luma_src = frame->data[0]; + for (std::size_t y = 0; y < frame_height; ++y) { + const std::size_t src = y * stride; + const std::size_t dst = y * aligned_width; + for (std::size_t x = 0; x < frame_width; ++x) { + luma_buffer[dst + x] = luma_src[src + x]; } - const std::array<int, 4> converted_stride{frame->width * 4, frame->height * 4, 0, 0}; - u8* const converted_frame_buf_addr{converted_frame_buffer.get()}; - - sws_scale(scaler_ctx, frame->data, frame->linesize, 0, frame->height, - &converted_frame_buf_addr, converted_stride.data()); - - const u32 blk_kind = static_cast<u32>(config.block_linear_kind); - if (blk_kind != 0) { - // swizzle pitch linear to block linear - const u32 block_height = static_cast<u32>(config.block_linear_height_log2); - const auto size = - Tegra::Texture::CalculateSize(true, 4, width, height, 1, block_height, 0); - luma_buffer.resize(size); - Tegra::Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(), - converted_frame_buffer.get(), block_height, 0, 0); - - gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size); - } else { - // send pitch linear frame - gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, - linear_size); + } + gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), + luma_buffer.size()); + + // Chroma + const std::size_t half_height = frame_height / 2; + const auto half_stride = static_cast<size_t>(frame->linesize[1]); + + switch (frame->format) { + case AV_PIX_FMT_YUV420P: { + // Frame from FFmpeg software + // Populate chroma buffer from both channels with interleaving. 
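The loop that follows weaves the two half-resolution chroma planes of an AV_PIX_FMT_YUV420P frame into a single interleaved CbCr plane (the NV12-style layout the output surface expects). The same transform as a freestanding sketch, with illustrative names and explicit strides:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // u_plane/v_plane are the half-resolution Cb/Cr planes (frame->data[1]/data[2]).
    // The output stores one interleaved Cb/Cr byte pair per chroma sample; it
    // assumes dst_stride >= 2 * half_width.
    std::vector<std::uint8_t> InterleaveChroma(const std::uint8_t* u_plane,
                                               const std::uint8_t* v_plane,
                                               std::size_t half_width, std::size_t half_height,
                                               std::size_t src_stride, std::size_t dst_stride) {
        std::vector<std::uint8_t> out(dst_stride * half_height);
        for (std::size_t y = 0; y < half_height; ++y) {
            const std::size_t src = y * src_stride;
            const std::size_t dst = y * dst_stride;
            for (std::size_t x = 0; x < half_width; ++x) {
                out[dst + x * 2] = u_plane[src + x];     // Cb
                out[dst + x * 2 + 1] = v_plane[src + x]; // Cr
            }
        }
        return out;
    }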
+ const std::size_t half_width = frame_width / 2; + const u8* chroma_b_src = frame->data[1]; + const u8* chroma_r_src = frame->data[2]; + for (std::size_t y = 0; y < half_height; ++y) { + const std::size_t src = y * half_stride; + const std::size_t dst = y * aligned_width; + + for (std::size_t x = 0; x < half_width; ++x) { + chroma_buffer[dst + x * 2] = chroma_b_src[src + x]; + chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x]; + } } break; } - case VideoPixelFormat::Yuv420: { - LOG_TRACE(Service_NVDRV, "Writing YUV420 Frame"); - - const std::size_t surface_width = config.surface_width_minus1 + 1; - const std::size_t surface_height = config.surface_height_minus1 + 1; - const auto frame_width = std::min(surface_width, static_cast<size_t>(frame->width)); - const auto frame_height = std::min(surface_height, static_cast<size_t>(frame->height)); - const std::size_t aligned_width = (surface_width + 0xff) & ~0xffUL; - - const auto stride = static_cast<size_t>(frame->linesize[0]); - - luma_buffer.resize(aligned_width * surface_height); - chroma_buffer.resize(aligned_width * surface_height / 2); - - // Populate luma buffer - const u8* luma_src = frame->data[0]; - for (std::size_t y = 0; y < frame_height; ++y) { + case AV_PIX_FMT_NV12: { + // Frame from VA-API hardware + // This is already interleaved so just copy + const u8* chroma_src = frame->data[1]; + for (std::size_t y = 0; y < half_height; ++y) { const std::size_t src = y * stride; const std::size_t dst = y * aligned_width; for (std::size_t x = 0; x < frame_width; ++x) { - luma_buffer[dst + x] = luma_src[src + x]; - } - } - gpu.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), - luma_buffer.size()); - - // Chroma - const std::size_t half_height = frame_height / 2; - const auto half_stride = static_cast<size_t>(frame->linesize[1]); - - switch (frame->format) { - case AV_PIX_FMT_YUV420P: { - // Frame from FFmpeg software - // Populate chroma buffer from both channels with interleaving. 
- const std::size_t half_width = frame_width / 2; - const u8* chroma_b_src = frame->data[1]; - const u8* chroma_r_src = frame->data[2]; - for (std::size_t y = 0; y < half_height; ++y) { - const std::size_t src = y * half_stride; - const std::size_t dst = y * aligned_width; - - for (std::size_t x = 0; x < half_width; ++x) { - chroma_buffer[dst + x * 2] = chroma_b_src[src + x]; - chroma_buffer[dst + x * 2 + 1] = chroma_r_src[src + x]; - } + chroma_buffer[dst + x] = chroma_src[src + x]; } - break; - } - case AV_PIX_FMT_NV12: { - // Frame from VA-API hardware - // This is already interleaved so just copy - const u8* chroma_src = frame->data[1]; - for (std::size_t y = 0; y < half_height; ++y) { - const std::size_t src = y * stride; - const std::size_t dst = y * aligned_width; - for (std::size_t x = 0; x < frame_width; ++x) { - chroma_buffer[dst + x] = chroma_src[src + x]; - } - } - break; - } - default: - UNREACHABLE(); - break; } - gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(), - chroma_buffer.size()); break; } default: - UNIMPLEMENTED_MSG("Unknown video pixel format {}", config.pixel_format.Value()); + UNREACHABLE(); break; } + gpu.MemoryManager().WriteBlock(output_surface_chroma_address, chroma_buffer.data(), + chroma_buffer.size()); } } // namespace Tegra diff --git a/src/video_core/command_classes/vic.h b/src/video_core/command_classes/vic.h index 74246e08c..6d4cdfd57 100644 --- a/src/video_core/command_classes/vic.h +++ b/src/video_core/command_classes/vic.h @@ -6,7 +6,6 @@ #include <memory> #include <vector> -#include "common/bit_field.h" #include "common/common_types.h" struct SwsContext; @@ -14,6 +13,7 @@ struct SwsContext; namespace Tegra { class GPU; class Nvdec; +union VicConfig; class Vic { public: @@ -27,6 +27,7 @@ public: }; explicit Vic(GPU& gpu, std::shared_ptr<Nvdec> nvdec_processor); + ~Vic(); /// Write to the device state. @@ -35,22 +36,9 @@ public: private: void Execute(); - enum class VideoPixelFormat : u64_le { - RGBA8 = 0x1f, - BGRA8 = 0x20, - Yuv420 = 0x44, - }; + void WriteRGBFrame(const AVFrame* frame, const VicConfig& config); - union VicConfig { - u64_le raw{}; - BitField<0, 7, u64_le> pixel_format; - BitField<7, 2, u64_le> chroma_loc_horiz; - BitField<9, 2, u64_le> chroma_loc_vert; - BitField<11, 4, u64_le> block_linear_kind; - BitField<15, 4, u64_le> block_linear_height_log2; - BitField<32, 14, u64_le> surface_width_minus1; - BitField<46, 14, u64_le> surface_height_minus1; - }; + void WriteYUVFrame(const AVFrame* frame, const VicConfig& config); GPU& gpu; std::shared_ptr<Tegra::Nvdec> nvdec_processor; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 7f4ca6282..f22342dfb 100644 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -6,6 +6,7 @@ #include <array> #include <bitset> +#include <cmath> #include <limits> #include <optional> #include <type_traits> diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index c7ec1eac9..67388d980 100644 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -82,41 +82,41 @@ void MaxwellDMA::Launch() { } void MaxwellDMA::CopyPitchToPitch() { - // When `multi_line_enable` bit is disabled the copy is performed as if we were copying a 1D - // buffer of length `line_length_in`. - // Otherwise we copy a 2D image of dimensions (line_length_in, line_count). 
- auto& accelerate = rasterizer->AccessAccelerateDMA(); - if (!regs.launch_dma.multi_line_enable) { - const bool is_buffer_clear = regs.launch_dma.remap_enable != 0 && - regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A; - // TODO: allow multisized components. - if (is_buffer_clear) { - ASSERT(regs.remap_const.component_size_minus_one == 3); - accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value); - std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value); - memory_manager.WriteBlockUnsafe(regs.offset_out, - reinterpret_cast<u8*>(tmp_buffer.data()), - regs.line_length_in * sizeof(u32)); - return; - } - UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); - if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) { - std::vector<u8> tmp_buffer(regs.line_length_in); - memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in); - memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), regs.line_length_in); + // When `multi_line_enable` bit is enabled we copy a 2D image of dimensions + // (line_length_in, line_count). + // Otherwise the copy is performed as if we were copying a 1D buffer of length line_length_in. + const bool remap_enabled = regs.launch_dma.remap_enable != 0; + if (regs.launch_dma.multi_line_enable) { + UNIMPLEMENTED_IF(remap_enabled); + + // Perform a line-by-line copy. + // We're going to take a subrect of size (line_length_in, line_count) from the source + // rectangle. There is no need to manually flush/invalidate the regions because CopyBlock + // does that for us. + for (u32 line = 0; line < regs.line_count; ++line) { + const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in; + const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out; + memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in); } return; } - - UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0); - - // Perform a line-by-line copy. - // We're going to take a subrect of size (line_length_in, line_count) from the source rectangle. - // There is no need to manually flush/invalidate the regions because CopyBlock does that for us. - for (u32 line = 0; line < regs.line_count; ++line) { - const GPUVAddr source_line = regs.offset_in + static_cast<size_t>(line) * regs.pitch_in; - const GPUVAddr dest_line = regs.offset_out + static_cast<size_t>(line) * regs.pitch_out; - memory_manager.CopyBlock(dest_line, source_line, regs.line_length_in); + // TODO: allow multisized components. 
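The rewritten CopyPitchToPitch() handles the multi_line_enable case first with a plain line-by-line pitched copy. The same idea as a freestanding sketch, with memcpy standing in for MemoryManager::CopyBlock and raw pointers standing in for GPU virtual addresses:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Copies line_count rows of line_length_in bytes; source rows are pitch_in
    // bytes apart, destination rows pitch_out bytes apart.
    void CopyPitched(std::uint8_t* dst, const std::uint8_t* src, std::size_t line_length_in,
                     std::size_t line_count, std::size_t pitch_in, std::size_t pitch_out) {
        for (std::size_t line = 0; line < line_count; ++line) {
            std::memcpy(dst + line * pitch_out, src + line * pitch_in, line_length_in);
        }
    }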
+ auto& accelerate = rasterizer->AccessAccelerateDMA(); + const bool is_const_a_dst = regs.remap_const.dst_x == RemapConst::Swizzle::CONST_A; + const bool is_buffer_clear = remap_enabled && is_const_a_dst; + if (is_buffer_clear) { + ASSERT(regs.remap_const.component_size_minus_one == 3); + accelerate.BufferClear(regs.offset_out, regs.line_length_in, regs.remap_consta_value); + std::vector<u32> tmp_buffer(regs.line_length_in, regs.remap_consta_value); + memory_manager.WriteBlockUnsafe(regs.offset_out, reinterpret_cast<u8*>(tmp_buffer.data()), + regs.line_length_in * sizeof(u32)); + return; + } + UNIMPLEMENTED_IF(remap_enabled); + if (!accelerate.BufferCopy(regs.offset_in, regs.offset_out, regs.line_length_in)) { + std::vector<u8> tmp_buffer(regs.line_length_in); + memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(), regs.line_length_in); + memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(), regs.line_length_in); } } diff --git a/src/video_core/engines/maxwell_dma.h b/src/video_core/engines/maxwell_dma.h index 9e457ae16..a04514425 100644 --- a/src/video_core/engines/maxwell_dma.h +++ b/src/video_core/engines/maxwell_dma.h @@ -175,7 +175,7 @@ public: static_assert(sizeof(LaunchDMA) == 4); struct RemapConst { - enum Swizzle : u32 { + enum class Swizzle : u32 { SRC_X = 0, SRC_Y = 1, SRC_Z = 2, diff --git a/src/video_core/framebuffer_config.h b/src/video_core/framebuffer_config.h index b86c3a757..b1d455e30 100644 --- a/src/video_core/framebuffer_config.h +++ b/src/video_core/framebuffer_config.h @@ -4,8 +4,10 @@ #pragma once -namespace Tegra { +#include "common/common_types.h" +#include "common/math_util.h" +namespace Tegra { /** * Struct describing framebuffer configuration */ @@ -16,6 +18,21 @@ struct FramebufferConfig { B8G8R8A8_UNORM = 5, }; + enum class TransformFlags : u32 { + /// No transform flags are set + Unset = 0x00, + /// Flip source image horizontally (around the vertical axis) + FlipH = 0x01, + /// Flip source image vertically (around the horizontal axis) + FlipV = 0x02, + /// Rotate source image 90 degrees clockwise + Rotate90 = 0x04, + /// Rotate source image 180 degrees + Rotate180 = 0x03, + /// Rotate source image 270 degrees clockwise + Rotate270 = 0x07, + }; + VAddr address{}; u32 offset{}; u32 width{}; @@ -23,7 +40,6 @@ struct FramebufferConfig { u32 stride{}; PixelFormat pixel_format{}; - using TransformFlags = Service::NVFlinger::BufferQueue::BufferTransformFlags; TransformFlags transform_flags{}; Common::Rectangle<int> crop_rect; }; diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index ff024f530..ab7c21a49 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -2,548 +2,913 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. 
+#include <array> +#include <atomic> #include <chrono> +#include <condition_variable> +#include <list> +#include <memory> #include "common/assert.h" #include "common/microprofile.h" #include "common/settings.h" #include "core/core.h" #include "core/core_timing.h" -#include "core/core_timing_util.h" #include "core/frontend/emu_window.h" #include "core/hardware_interrupt_manager.h" -#include "core/memory.h" +#include "core/hle/service/nvdrv/nvdata.h" +#include "core/hle/service/nvflinger/buffer_queue.h" #include "core/perf_stats.h" +#include "video_core/cdma_pusher.h" +#include "video_core/dma_pusher.h" #include "video_core/engines/fermi_2d.h" #include "video_core/engines/kepler_compute.h" #include "video_core/engines/kepler_memory.h" #include "video_core/engines/maxwell_3d.h" #include "video_core/engines/maxwell_dma.h" #include "video_core/gpu.h" +#include "video_core/gpu_thread.h" #include "video_core/memory_manager.h" #include "video_core/renderer_base.h" #include "video_core/shader_notify.h" -#include "video_core/video_core.h" namespace Tegra { MICROPROFILE_DEFINE(GPU_wait, "GPU", "Wait for the GPU", MP_RGB(128, 128, 192)); -GPU::GPU(Core::System& system_, bool is_async_, bool use_nvdec_) - : system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>(system)}, - dma_pusher{std::make_unique<Tegra::DmaPusher>(system, *this)}, use_nvdec{use_nvdec_}, - maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, - fermi_2d{std::make_unique<Engines::Fermi2D>()}, - kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, - maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)}, - kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)}, - shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_}, - gpu_thread{system_, is_async_} {} +struct GPU::Impl { + explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_) + : gpu{gpu_}, system{system_}, memory_manager{std::make_unique<Tegra::MemoryManager>( + system)}, + dma_pusher{std::make_unique<Tegra::DmaPusher>(system, gpu)}, use_nvdec{use_nvdec_}, + maxwell_3d{std::make_unique<Engines::Maxwell3D>(system, *memory_manager)}, + fermi_2d{std::make_unique<Engines::Fermi2D>()}, + kepler_compute{std::make_unique<Engines::KeplerCompute>(system, *memory_manager)}, + maxwell_dma{std::make_unique<Engines::MaxwellDMA>(system, *memory_manager)}, + kepler_memory{std::make_unique<Engines::KeplerMemory>(system, *memory_manager)}, + shader_notify{std::make_unique<VideoCore::ShaderNotify>()}, is_async{is_async_}, + gpu_thread{system_, is_async_} {} + + ~Impl() = default; + + /// Binds a renderer to the GPU. + void BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) { + renderer = std::move(renderer_); + rasterizer = renderer->ReadRasterizer(); + + memory_manager->BindRasterizer(rasterizer); + maxwell_3d->BindRasterizer(rasterizer); + fermi_2d->BindRasterizer(rasterizer); + kepler_compute->BindRasterizer(rasterizer); + maxwell_dma->BindRasterizer(rasterizer); + } -GPU::~GPU() = default; + /// Calls a GPU method. + void CallMethod(const GPU::MethodCall& method_call) { + LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method, + method_call.subchannel); + + ASSERT(method_call.subchannel < bound_engines.size()); + + if (ExecuteMethodOnEngine(method_call.method)) { + CallEngineMethod(method_call); + } else { + CallPullerMethod(method_call); + } + } + + /// Calls a GPU multivalue method. 
+ void CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending) { + LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel); + + ASSERT(subchannel < bound_engines.size()); + + if (ExecuteMethodOnEngine(method)) { + CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); + } else { + for (std::size_t i = 0; i < amount; i++) { + CallPullerMethod(GPU::MethodCall{ + method, + base_start[i], + subchannel, + methods_pending - static_cast<u32>(i), + }); + } + } + } + + /// Flush all current written commands into the host GPU for execution. + void FlushCommands() { + rasterizer->FlushCommands(); + } + + /// Synchronizes CPU writes with Host GPU memory. + void SyncGuestHost() { + rasterizer->SyncGuestHost(); + } + + /// Signal the ending of command list. + void OnCommandListEnd() { + if (is_async) { + // This command only applies to asynchronous GPU mode + gpu_thread.OnCommandListEnd(); + } + } + + /// Request a host GPU memory flush from the CPU. + [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size) { + std::unique_lock lck{flush_request_mutex}; + const u64 fence = ++last_flush_fence; + flush_requests.emplace_back(fence, addr, size); + return fence; + } + + /// Obtains current flush request fence id. + [[nodiscard]] u64 CurrentFlushRequestFence() const { + return current_flush_fence.load(std::memory_order_relaxed); + } + + /// Tick pending requests within the GPU. + void TickWork() { + std::unique_lock lck{flush_request_mutex}; + while (!flush_requests.empty()) { + auto& request = flush_requests.front(); + const u64 fence = request.fence; + const VAddr addr = request.addr; + const std::size_t size = request.size; + flush_requests.pop_front(); + flush_request_mutex.unlock(); + rasterizer->FlushRegion(addr, size); + current_flush_fence.store(fence); + flush_request_mutex.lock(); + } + } + + /// Returns a reference to the Maxwell3D GPU engine. + [[nodiscard]] Engines::Maxwell3D& Maxwell3D() { + return *maxwell_3d; + } + + /// Returns a const reference to the Maxwell3D GPU engine. + [[nodiscard]] const Engines::Maxwell3D& Maxwell3D() const { + return *maxwell_3d; + } + + /// Returns a reference to the KeplerCompute GPU engine. + [[nodiscard]] Engines::KeplerCompute& KeplerCompute() { + return *kepler_compute; + } + + /// Returns a reference to the KeplerCompute GPU engine. + [[nodiscard]] const Engines::KeplerCompute& KeplerCompute() const { + return *kepler_compute; + } + + /// Returns a reference to the GPU memory manager. + [[nodiscard]] Tegra::MemoryManager& MemoryManager() { + return *memory_manager; + } + + /// Returns a const reference to the GPU memory manager. + [[nodiscard]] const Tegra::MemoryManager& MemoryManager() const { + return *memory_manager; + } + + /// Returns a reference to the GPU DMA pusher. + [[nodiscard]] Tegra::DmaPusher& DmaPusher() { + return *dma_pusher; + } + + /// Returns a const reference to the GPU DMA pusher. + [[nodiscard]] const Tegra::DmaPusher& DmaPusher() const { + return *dma_pusher; + } + + /// Returns a reference to the GPU CDMA pusher. + [[nodiscard]] Tegra::CDmaPusher& CDmaPusher() { + return *cdma_pusher; + } + + /// Returns a const reference to the GPU CDMA pusher. + [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const { + return *cdma_pusher; + } + + /// Returns a reference to the underlying renderer. + [[nodiscard]] VideoCore::RendererBase& Renderer() { + return *renderer; + } + + /// Returns a const reference to the underlying renderer. 
+ [[nodiscard]] const VideoCore::RendererBase& Renderer() const { + return *renderer; + } + + /// Returns a reference to the shader notifier. + [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() { + return *shader_notify; + } + + /// Returns a const reference to the shader notifier. + [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const { + return *shader_notify; + } + + /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame. + void WaitFence(u32 syncpoint_id, u32 value) { + // Synced GPU, is always in sync + if (!is_async) { + return; + } + if (syncpoint_id == UINT32_MAX) { + // TODO: Research what this does. + LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented"); + return; + } + MICROPROFILE_SCOPE(GPU_wait); + std::unique_lock lock{sync_mutex}; + sync_cv.wait(lock, [=, this] { + if (shutting_down.load(std::memory_order_relaxed)) { + // We're shutting down, ensure no threads continue to wait for the next syncpoint + return true; + } + return syncpoints.at(syncpoint_id).load() >= value; + }); + } + + void IncrementSyncPoint(u32 syncpoint_id) { + auto& syncpoint = syncpoints.at(syncpoint_id); + syncpoint++; + std::lock_guard lock{sync_mutex}; + sync_cv.notify_all(); + auto& interrupt = syncpt_interrupts.at(syncpoint_id); + if (!interrupt.empty()) { + u32 value = syncpoint.load(); + auto it = interrupt.begin(); + while (it != interrupt.end()) { + if (value >= *it) { + TriggerCpuInterrupt(syncpoint_id, *it); + it = interrupt.erase(it); + continue; + } + it++; + } + } + } + + [[nodiscard]] u32 GetSyncpointValue(u32 syncpoint_id) const { + return syncpoints.at(syncpoint_id).load(); + } + + void RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) { + std::lock_guard lock{sync_mutex}; + auto& interrupt = syncpt_interrupts.at(syncpoint_id); + bool contains = std::any_of(interrupt.begin(), interrupt.end(), + [value](u32 in_value) { return in_value == value; }); + if (contains) { + return; + } + interrupt.emplace_back(value); + } + + [[nodiscard]] bool CancelSyncptInterrupt(u32 syncpoint_id, u32 value) { + std::lock_guard lock{sync_mutex}; + auto& interrupt = syncpt_interrupts.at(syncpoint_id); + const auto iter = + std::find_if(interrupt.begin(), interrupt.end(), + [value](u32 interrupt_value) { return value == interrupt_value; }); + + if (iter == interrupt.end()) { + return false; + } + interrupt.erase(iter); + return true; + } + + [[nodiscard]] u64 GetTicks() const { + // This values were reversed engineered by fincs from NVN + // The gpu clock is reported in units of 385/625 nanoseconds + constexpr u64 gpu_ticks_num = 384; + constexpr u64 gpu_ticks_den = 625; + + u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); + if (Settings::values.use_fast_gpu_time.GetValue()) { + nanoseconds /= 256; + } + const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; + const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; + return nanoseconds_num * gpu_ticks_num + (nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; + } + + [[nodiscard]] bool IsAsync() const { + return is_async; + } + + [[nodiscard]] bool UseNvdec() const { + return use_nvdec; + } + + void RendererFrameEndNotify() { + system.GetPerfStats().EndGameFrame(); + } + + /// Performs any additional setup necessary in order to begin GPU emulation. + /// This can be used to launch any necessary threads and register any necessary + /// core timing events. 
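GetTicks() in the Impl above multiplies emulated nanoseconds by 384/625 (the constants in the code), splitting the operation around the division so the intermediate product cannot overflow 64 bits for large timestamps. A worked, self-contained version (the function name is ours):

    #include <cstdint>

    std::uint64_t NsToGpuTicks(std::uint64_t nanoseconds) {
        constexpr std::uint64_t gpu_ticks_num = 384;
        constexpr std::uint64_t gpu_ticks_den = 625;
        // (ns / 625) * 384 + (ns % 625) * 384 / 625 equals floor(ns * 384 / 625)
        // exactly, without forming the full ns * 384 product.
        return (nanoseconds / gpu_ticks_den) * gpu_ticks_num +
               (nanoseconds % gpu_ticks_den) * gpu_ticks_num / gpu_ticks_den;
    }
    // One emulated second: NsToGpuTicks(1'000'000'000) == 614'400'000 ticks.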
+ void Start() { + gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); + cpu_context = renderer->GetRenderWindow().CreateSharedContext(); + cpu_context->MakeCurrent(); + } + + /// Obtain the CPU Context + void ObtainContext() { + cpu_context->MakeCurrent(); + } + + /// Release the CPU Context + void ReleaseContext() { + cpu_context->DoneCurrent(); + } + + /// Push GPU command entries to be processed + void PushGPUEntries(Tegra::CommandList&& entries) { + gpu_thread.SubmitList(std::move(entries)); + } + + /// Push GPU command buffer entries to be processed + void PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { + if (!use_nvdec) { + return; + } + + if (!cdma_pusher) { + cdma_pusher = std::make_unique<Tegra::CDmaPusher>(gpu); + } + + // SubmitCommandBuffer would make the nvdec operations async, this is not currently working + // TODO(ameerj): RE proper async nvdec operation + // gpu_thread.SubmitCommandBuffer(std::move(entries)); + + cdma_pusher->ProcessEntries(std::move(entries)); + } + + /// Frees the CDMAPusher instance to free up resources + void ClearCdmaInstance() { + cdma_pusher.reset(); + } + + /// Swap buffers (render frame) + void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { + gpu_thread.SwapBuffers(framebuffer); + } + + /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory + void FlushRegion(VAddr addr, u64 size) { + gpu_thread.FlushRegion(addr, size); + } + + /// Notify rasterizer that any caches of the specified region should be invalidated + void InvalidateRegion(VAddr addr, u64 size) { + gpu_thread.InvalidateRegion(addr, size); + } + + /// Notify rasterizer that any caches of the specified region should be flushed and invalidated + void FlushAndInvalidateRegion(VAddr addr, u64 size) { + gpu_thread.FlushAndInvalidateRegion(addr, size); + } + + void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const { + auto& interrupt_manager = system.InterruptManager(); + interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); + } + + void ProcessBindMethod(const GPU::MethodCall& method_call) { + // Bind the current subchannel to the desired engine id. 
+ LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel, + method_call.argument); + const auto engine_id = static_cast<EngineID>(method_call.argument); + bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id); + switch (engine_id) { + case EngineID::FERMI_TWOD_A: + dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel); + break; + case EngineID::MAXWELL_B: + dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel); + break; + case EngineID::KEPLER_COMPUTE_B: + dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel); + break; + case EngineID::MAXWELL_DMA_COPY_A: + dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id); + } + } -void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer_) { - renderer = std::move(renderer_); - rasterizer = renderer->ReadRasterizer(); + void ProcessFenceActionMethod() { + switch (regs.fence_action.op) { + case GPU::FenceOperation::Acquire: + WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); + break; + case GPU::FenceOperation::Increment: + IncrementSyncPoint(regs.fence_action.syncpoint_id); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value()); + } + } + + void ProcessWaitForInterruptMethod() { + // TODO(bunnei) ImplementMe + LOG_WARNING(HW_GPU, "(STUBBED) called"); + } + + void ProcessSemaphoreTriggerMethod() { + const auto semaphoreOperationMask = 0xF; + const auto op = + static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask); + if (op == GpuSemaphoreOperation::WriteLong) { + struct Block { + u32 sequence; + u32 zeros = 0; + u64 timestamp; + }; + + Block block{}; + block.sequence = regs.semaphore_sequence; + // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of + // CoreTiming + block.timestamp = GetTicks(); + memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block, + sizeof(block)); + } else { + const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())}; + if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) || + (op == GpuSemaphoreOperation::AcquireGequal && + static_cast<s32>(word - regs.semaphore_sequence) > 0) || + (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) { + // Nothing to do in this case + } else { + regs.acquire_source = true; + regs.acquire_value = regs.semaphore_sequence; + if (op == GpuSemaphoreOperation::AcquireEqual) { + regs.acquire_active = true; + regs.acquire_mode = false; + } else if (op == GpuSemaphoreOperation::AcquireGequal) { + regs.acquire_active = true; + regs.acquire_mode = true; + } else if (op == GpuSemaphoreOperation::AcquireMask) { + // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with + // semaphore_sequence, gives a non-0 result + LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented"); + } else { + LOG_ERROR(HW_GPU, "Invalid semaphore operation"); + } + } + } + } + + void ProcessSemaphoreRelease() { + memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(), + regs.semaphore_release); + } + + void ProcessSemaphoreAcquire() { + const u32 word = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress()); + const auto 
value = regs.semaphore_acquire; + if (word != value) { + regs.acquire_active = true; + regs.acquire_value = value; + // TODO(kemathe73) figure out how to do the acquire_timeout + regs.acquire_mode = false; + regs.acquire_source = false; + } + } - memory_manager->BindRasterizer(rasterizer); - maxwell_3d->BindRasterizer(rasterizer); - fermi_2d->BindRasterizer(rasterizer); - kepler_compute->BindRasterizer(rasterizer); - maxwell_dma->BindRasterizer(rasterizer); + /// Calls a GPU puller method. + void CallPullerMethod(const GPU::MethodCall& method_call) { + regs.reg_array[method_call.method] = method_call.argument; + const auto method = static_cast<BufferMethods>(method_call.method); + + switch (method) { + case BufferMethods::BindObject: { + ProcessBindMethod(method_call); + break; + } + case BufferMethods::Nop: + case BufferMethods::SemaphoreAddressHigh: + case BufferMethods::SemaphoreAddressLow: + case BufferMethods::SemaphoreSequence: + case BufferMethods::UnkCacheFlush: + case BufferMethods::WrcacheFlush: + case BufferMethods::FenceValue: + break; + case BufferMethods::RefCnt: + rasterizer->SignalReference(); + break; + case BufferMethods::FenceAction: + ProcessFenceActionMethod(); + break; + case BufferMethods::WaitForInterrupt: + ProcessWaitForInterruptMethod(); + break; + case BufferMethods::SemaphoreTrigger: { + ProcessSemaphoreTriggerMethod(); + break; + } + case BufferMethods::NotifyIntr: { + // TODO(Kmather73): Research and implement this method. + LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented"); + break; + } + case BufferMethods::Unk28: { + // TODO(Kmather73): Research and implement this method. + LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented"); + break; + } + case BufferMethods::SemaphoreAcquire: { + ProcessSemaphoreAcquire(); + break; + } + case BufferMethods::SemaphoreRelease: { + ProcessSemaphoreRelease(); + break; + } + case BufferMethods::Yield: { + // TODO(Kmather73): Research and implement this method. + LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented"); + break; + } + default: + LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method); + break; + } + } + + /// Calls a GPU engine method. + void CallEngineMethod(const GPU::MethodCall& method_call) { + const EngineID engine = bound_engines[method_call.subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + fermi_2d->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::MAXWELL_B: + maxwell_3d->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::KEPLER_COMPUTE_B: + kepler_compute->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::MAXWELL_DMA_COPY_A: + maxwell_dma->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + kepler_memory->CallMethod(method_call.method, method_call.argument, + method_call.IsLastCall()); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); + } + } + + /// Calls a GPU engine multivalue method. 
+ void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending) { + const EngineID engine = bound_engines[subchannel]; + + switch (engine) { + case EngineID::FERMI_TWOD_A: + fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::MAXWELL_B: + maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::KEPLER_COMPUTE_B: + kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::MAXWELL_DMA_COPY_A: + maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending); + break; + case EngineID::KEPLER_INLINE_TO_MEMORY_B: + kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending); + break; + default: + UNIMPLEMENTED_MSG("Unimplemented engine"); + } + } + + /// Determines where the method should be executed. + [[nodiscard]] bool ExecuteMethodOnEngine(u32 method) { + const auto buffer_method = static_cast<BufferMethods>(method); + return buffer_method >= BufferMethods::NonPullerMethods; + } + + struct Regs { + static constexpr size_t NUM_REGS = 0x40; + + union { + struct { + INSERT_PADDING_WORDS_NOINIT(0x4); + struct { + u32 address_high; + u32 address_low; + + [[nodiscard]] GPUVAddr SemaphoreAddress() const { + return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | + address_low); + } + } semaphore_address; + + u32 semaphore_sequence; + u32 semaphore_trigger; + INSERT_PADDING_WORDS_NOINIT(0xC); + + // The pusher and the puller share the reference counter, the pusher only has read + // access + u32 reference_count; + INSERT_PADDING_WORDS_NOINIT(0x5); + + u32 semaphore_acquire; + u32 semaphore_release; + u32 fence_value; + GPU::FenceAction fence_action; + INSERT_PADDING_WORDS_NOINIT(0xE2); + + // Puller state + u32 acquire_mode; + u32 acquire_source; + u32 acquire_active; + u32 acquire_timeout; + u32 acquire_value; + }; + std::array<u32, NUM_REGS> reg_array; + }; + } regs{}; + + GPU& gpu; + Core::System& system; + std::unique_ptr<Tegra::MemoryManager> memory_manager; + std::unique_ptr<Tegra::DmaPusher> dma_pusher; + std::unique_ptr<Tegra::CDmaPusher> cdma_pusher; + std::unique_ptr<VideoCore::RendererBase> renderer; + VideoCore::RasterizerInterface* rasterizer = nullptr; + const bool use_nvdec; + + /// Mapping of command subchannels to their bound engine ids + std::array<EngineID, 8> bound_engines{}; + /// 3D engine + std::unique_ptr<Engines::Maxwell3D> maxwell_3d; + /// 2D engine + std::unique_ptr<Engines::Fermi2D> fermi_2d; + /// Compute engine + std::unique_ptr<Engines::KeplerCompute> kepler_compute; + /// DMA engine + std::unique_ptr<Engines::MaxwellDMA> maxwell_dma; + /// Inline memory engine + std::unique_ptr<Engines::KeplerMemory> kepler_memory; + /// Shader build notifier + std::unique_ptr<VideoCore::ShaderNotify> shader_notify; + /// When true, we are about to shut down emulation session, so terminate outstanding tasks + std::atomic_bool shutting_down{}; + + std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{}; + + std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; + + std::mutex sync_mutex; + std::mutex device_mutex; + + std::condition_variable sync_cv; + + struct FlushRequest { + explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_) + : fence{fence_}, addr{addr_}, size{size_} {} + u64 fence; + VAddr addr; + std::size_t size; + }; + + std::list<FlushRequest> flush_requests; + std::atomic<u64> 
current_flush_fence{}; + u64 last_flush_fence{}; + std::mutex flush_request_mutex; + + const bool is_async; + + VideoCommon::GPUThread::ThreadManager gpu_thread; + std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context; + +#define ASSERT_REG_POSITION(field_name, position) \ + static_assert(offsetof(Regs, field_name) == position * 4, \ + "Field " #field_name " has invalid position") + + ASSERT_REG_POSITION(semaphore_address, 0x4); + ASSERT_REG_POSITION(semaphore_sequence, 0x6); + ASSERT_REG_POSITION(semaphore_trigger, 0x7); + ASSERT_REG_POSITION(reference_count, 0x14); + ASSERT_REG_POSITION(semaphore_acquire, 0x1A); + ASSERT_REG_POSITION(semaphore_release, 0x1B); + ASSERT_REG_POSITION(fence_value, 0x1C); + ASSERT_REG_POSITION(fence_action, 0x1D); + + ASSERT_REG_POSITION(acquire_mode, 0x100); + ASSERT_REG_POSITION(acquire_source, 0x101); + ASSERT_REG_POSITION(acquire_active, 0x102); + ASSERT_REG_POSITION(acquire_timeout, 0x103); + ASSERT_REG_POSITION(acquire_value, 0x104); + +#undef ASSERT_REG_POSITION + + enum class GpuSemaphoreOperation { + AcquireEqual = 0x1, + WriteLong = 0x2, + AcquireGequal = 0x4, + AcquireMask = 0x8, + }; +}; + +GPU::GPU(Core::System& system, bool is_async, bool use_nvdec) + : impl{std::make_unique<Impl>(*this, system, is_async, use_nvdec)} {} + +GPU::~GPU() = default; + +void GPU::BindRenderer(std::unique_ptr<VideoCore::RendererBase> renderer) { + impl->BindRenderer(std::move(renderer)); } -Engines::Maxwell3D& GPU::Maxwell3D() { - return *maxwell_3d; +void GPU::CallMethod(const MethodCall& method_call) { + impl->CallMethod(method_call); } -const Engines::Maxwell3D& GPU::Maxwell3D() const { - return *maxwell_3d; +void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, + u32 methods_pending) { + impl->CallMultiMethod(method, subchannel, base_start, amount, methods_pending); } -Engines::KeplerCompute& GPU::KeplerCompute() { - return *kepler_compute; +void GPU::FlushCommands() { + impl->FlushCommands(); } -const Engines::KeplerCompute& GPU::KeplerCompute() const { - return *kepler_compute; +void GPU::SyncGuestHost() { + impl->SyncGuestHost(); } -MemoryManager& GPU::MemoryManager() { - return *memory_manager; +void GPU::OnCommandListEnd() { + impl->OnCommandListEnd(); } -const MemoryManager& GPU::MemoryManager() const { - return *memory_manager; +u64 GPU::RequestFlush(VAddr addr, std::size_t size) { + return impl->RequestFlush(addr, size); } -DmaPusher& GPU::DmaPusher() { - return *dma_pusher; +u64 GPU::CurrentFlushRequestFence() const { + return impl->CurrentFlushRequestFence(); } -Tegra::CDmaPusher& GPU::CDmaPusher() { - return *cdma_pusher; +void GPU::TickWork() { + impl->TickWork(); } -const DmaPusher& GPU::DmaPusher() const { - return *dma_pusher; +Engines::Maxwell3D& GPU::Maxwell3D() { + return impl->Maxwell3D(); } -const Tegra::CDmaPusher& GPU::CDmaPusher() const { - return *cdma_pusher; +const Engines::Maxwell3D& GPU::Maxwell3D() const { + return impl->Maxwell3D(); } -void GPU::WaitFence(u32 syncpoint_id, u32 value) { - // Synced GPU, is always in sync - if (!is_async) { - return; - } - if (syncpoint_id == UINT32_MAX) { - // TODO: Research what this does. 
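The deleted lines that follow are the pre-refactor GPU::WaitFence/IncrementSyncPoint, now living in Impl above; both are instances of a standard counter-plus-condition-variable pattern. A stripped-down sketch of that pattern (the class name is ours):

    #include <atomic>
    #include <condition_variable>
    #include <cstdint>
    #include <mutex>

    class Syncpoint {
    public:
        // Mirrors IncrementSyncPoint: bump the counter, then wake all waiters.
        void Increment() {
            value.fetch_add(1, std::memory_order_relaxed);
            std::lock_guard lock{mutex};
            cv.notify_all();
        }
        // Mirrors the WaitFence predicate: block until the counter reaches target.
        void Wait(std::uint32_t target) {
            std::unique_lock lock{mutex};
            cv.wait(lock, [&] { return value.load(std::memory_order_relaxed) >= target; });
        }

    private:
        std::atomic<std::uint32_t> value{};
        std::mutex mutex;
        std::condition_variable cv;
    };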
- LOG_ERROR(HW_GPU, "Waiting for syncpoint -1 not implemented"); - return; - } - MICROPROFILE_SCOPE(GPU_wait); - std::unique_lock lock{sync_mutex}; - sync_cv.wait(lock, [=, this] { - if (shutting_down.load(std::memory_order_relaxed)) { - // We're shutting down, ensure no threads continue to wait for the next syncpoint - return true; - } - return syncpoints.at(syncpoint_id).load() >= value; - }); -} - -void GPU::IncrementSyncPoint(const u32 syncpoint_id) { - auto& syncpoint = syncpoints.at(syncpoint_id); - syncpoint++; - std::lock_guard lock{sync_mutex}; - sync_cv.notify_all(); - auto& interrupt = syncpt_interrupts.at(syncpoint_id); - if (!interrupt.empty()) { - u32 value = syncpoint.load(); - auto it = interrupt.begin(); - while (it != interrupt.end()) { - if (value >= *it) { - TriggerCpuInterrupt(syncpoint_id, *it); - it = interrupt.erase(it); - continue; - } - it++; - } - } +Engines::KeplerCompute& GPU::KeplerCompute() { + return impl->KeplerCompute(); } -u32 GPU::GetSyncpointValue(const u32 syncpoint_id) const { - return syncpoints.at(syncpoint_id).load(); +const Engines::KeplerCompute& GPU::KeplerCompute() const { + return impl->KeplerCompute(); } -void GPU::RegisterSyncptInterrupt(const u32 syncpoint_id, const u32 value) { - auto& interrupt = syncpt_interrupts.at(syncpoint_id); - bool contains = std::any_of(interrupt.begin(), interrupt.end(), - [value](u32 in_value) { return in_value == value; }); - if (contains) { - return; - } - interrupt.emplace_back(value); +Tegra::MemoryManager& GPU::MemoryManager() { + return impl->MemoryManager(); } -bool GPU::CancelSyncptInterrupt(const u32 syncpoint_id, const u32 value) { - std::lock_guard lock{sync_mutex}; - auto& interrupt = syncpt_interrupts.at(syncpoint_id); - const auto iter = - std::find_if(interrupt.begin(), interrupt.end(), - [value](u32 interrupt_value) { return value == interrupt_value; }); +const Tegra::MemoryManager& GPU::MemoryManager() const { + return impl->MemoryManager(); +} - if (iter == interrupt.end()) { - return false; - } - interrupt.erase(iter); - return true; +Tegra::DmaPusher& GPU::DmaPusher() { + return impl->DmaPusher(); } -u64 GPU::RequestFlush(VAddr addr, std::size_t size) { - std::unique_lock lck{flush_request_mutex}; - const u64 fence = ++last_flush_fence; - flush_requests.emplace_back(fence, addr, size); - return fence; +const Tegra::DmaPusher& GPU::DmaPusher() const { + return impl->DmaPusher(); } -void GPU::TickWork() { - std::unique_lock lck{flush_request_mutex}; - while (!flush_requests.empty()) { - auto& request = flush_requests.front(); - const u64 fence = request.fence; - const VAddr addr = request.addr; - const std::size_t size = request.size; - flush_requests.pop_front(); - flush_request_mutex.unlock(); - rasterizer->FlushRegion(addr, size); - current_flush_fence.store(fence); - flush_request_mutex.lock(); - } +Tegra::CDmaPusher& GPU::CDmaPusher() { + return impl->CDmaPusher(); } -u64 GPU::GetTicks() const { - // This values were reversed engineered by fincs from NVN - // The gpu clock is reported in units of 385/625 nanoseconds - constexpr u64 gpu_ticks_num = 384; - constexpr u64 gpu_ticks_den = 625; +const Tegra::CDmaPusher& GPU::CDmaPusher() const { + return impl->CDmaPusher(); +} - u64 nanoseconds = system.CoreTiming().GetGlobalTimeNs().count(); - if (Settings::values.use_fast_gpu_time.GetValue()) { - nanoseconds /= 256; - } - const u64 nanoseconds_num = nanoseconds / gpu_ticks_den; - const u64 nanoseconds_rem = nanoseconds % gpu_ticks_den; - return nanoseconds_num * gpu_ticks_num + 
(nanoseconds_rem * gpu_ticks_num) / gpu_ticks_den; +VideoCore::RendererBase& GPU::Renderer() { + return impl->Renderer(); } -void GPU::RendererFrameEndNotify() { - system.GetPerfStats().EndGameFrame(); +const VideoCore::RendererBase& GPU::Renderer() const { + return impl->Renderer(); } -void GPU::FlushCommands() { - rasterizer->FlushCommands(); +VideoCore::ShaderNotify& GPU::ShaderNotify() { + return impl->ShaderNotify(); } -void GPU::SyncGuestHost() { - rasterizer->SyncGuestHost(); +const VideoCore::ShaderNotify& GPU::ShaderNotify() const { + return impl->ShaderNotify(); } -enum class GpuSemaphoreOperation { - AcquireEqual = 0x1, - WriteLong = 0x2, - AcquireGequal = 0x4, - AcquireMask = 0x8, -}; +void GPU::WaitFence(u32 syncpoint_id, u32 value) { + impl->WaitFence(syncpoint_id, value); +} -void GPU::CallMethod(const MethodCall& method_call) { - LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method_call.method, - method_call.subchannel); +void GPU::IncrementSyncPoint(u32 syncpoint_id) { + impl->IncrementSyncPoint(syncpoint_id); +} - ASSERT(method_call.subchannel < bound_engines.size()); +u32 GPU::GetSyncpointValue(u32 syncpoint_id) const { + return impl->GetSyncpointValue(syncpoint_id); +} - if (ExecuteMethodOnEngine(method_call.method)) { - CallEngineMethod(method_call); - } else { - CallPullerMethod(method_call); - } +void GPU::RegisterSyncptInterrupt(u32 syncpoint_id, u32 value) { + impl->RegisterSyncptInterrupt(syncpoint_id, value); } -void GPU::CallMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, - u32 methods_pending) { - LOG_TRACE(HW_GPU, "Processing method {:08X} on subchannel {}", method, subchannel); - - ASSERT(subchannel < bound_engines.size()); - - if (ExecuteMethodOnEngine(method)) { - CallEngineMultiMethod(method, subchannel, base_start, amount, methods_pending); - } else { - for (std::size_t i = 0; i < amount; i++) { - CallPullerMethod(MethodCall{ - method, - base_start[i], - subchannel, - methods_pending - static_cast<u32>(i), - }); - } - } +bool GPU::CancelSyncptInterrupt(u32 syncpoint_id, u32 value) { + return impl->CancelSyncptInterrupt(syncpoint_id, value); } -bool GPU::ExecuteMethodOnEngine(u32 method) { - const auto buffer_method = static_cast<BufferMethods>(method); - return buffer_method >= BufferMethods::NonPullerMethods; -} - -void GPU::CallPullerMethod(const MethodCall& method_call) { - regs.reg_array[method_call.method] = method_call.argument; - const auto method = static_cast<BufferMethods>(method_call.method); - - switch (method) { - case BufferMethods::BindObject: { - ProcessBindMethod(method_call); - break; - } - case BufferMethods::Nop: - case BufferMethods::SemaphoreAddressHigh: - case BufferMethods::SemaphoreAddressLow: - case BufferMethods::SemaphoreSequence: - case BufferMethods::UnkCacheFlush: - case BufferMethods::WrcacheFlush: - case BufferMethods::FenceValue: - break; - case BufferMethods::RefCnt: - rasterizer->SignalReference(); - break; - case BufferMethods::FenceAction: - ProcessFenceActionMethod(); - break; - case BufferMethods::WaitForInterrupt: - ProcessWaitForInterruptMethod(); - break; - case BufferMethods::SemaphoreTrigger: { - ProcessSemaphoreTriggerMethod(); - break; - } - case BufferMethods::NotifyIntr: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method NotifyIntr not implemented"); - break; - } - case BufferMethods::Unk28: { - // TODO(Kmather73): Research and implement this method. 
- LOG_ERROR(HW_GPU, "Special puller engine method Unk28 not implemented"); - break; - } - case BufferMethods::SemaphoreAcquire: { - ProcessSemaphoreAcquire(); - break; - } - case BufferMethods::SemaphoreRelease: { - ProcessSemaphoreRelease(); - break; - } - case BufferMethods::Yield: { - // TODO(Kmather73): Research and implement this method. - LOG_ERROR(HW_GPU, "Special puller engine method Yield not implemented"); - break; - } - default: - LOG_ERROR(HW_GPU, "Special puller engine method {:X} not implemented", method); - break; - } -} - -void GPU::CallEngineMethod(const MethodCall& method_call) { - const EngineID engine = bound_engines[method_call.subchannel]; - - switch (engine) { - case EngineID::FERMI_TWOD_A: - fermi_2d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); - break; - case EngineID::MAXWELL_B: - maxwell_3d->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); - break; - case EngineID::KEPLER_COMPUTE_B: - kepler_compute->CallMethod(method_call.method, method_call.argument, - method_call.IsLastCall()); - break; - case EngineID::MAXWELL_DMA_COPY_A: - maxwell_dma->CallMethod(method_call.method, method_call.argument, method_call.IsLastCall()); - break; - case EngineID::KEPLER_INLINE_TO_MEMORY_B: - kepler_memory->CallMethod(method_call.method, method_call.argument, - method_call.IsLastCall()); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented engine"); - } -} - -void GPU::CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, - u32 methods_pending) { - const EngineID engine = bound_engines[subchannel]; - - switch (engine) { - case EngineID::FERMI_TWOD_A: - fermi_2d->CallMultiMethod(method, base_start, amount, methods_pending); - break; - case EngineID::MAXWELL_B: - maxwell_3d->CallMultiMethod(method, base_start, amount, methods_pending); - break; - case EngineID::KEPLER_COMPUTE_B: - kepler_compute->CallMultiMethod(method, base_start, amount, methods_pending); - break; - case EngineID::MAXWELL_DMA_COPY_A: - maxwell_dma->CallMultiMethod(method, base_start, amount, methods_pending); - break; - case EngineID::KEPLER_INLINE_TO_MEMORY_B: - kepler_memory->CallMultiMethod(method, base_start, amount, methods_pending); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented engine"); - } -} - -void GPU::ProcessBindMethod(const MethodCall& method_call) { - // Bind the current subchannel to the desired engine id. 
- LOG_DEBUG(HW_GPU, "Binding subchannel {} to engine {}", method_call.subchannel, - method_call.argument); - const auto engine_id = static_cast<EngineID>(method_call.argument); - bound_engines[method_call.subchannel] = static_cast<EngineID>(engine_id); - switch (engine_id) { - case EngineID::FERMI_TWOD_A: - dma_pusher->BindSubchannel(fermi_2d.get(), method_call.subchannel); - break; - case EngineID::MAXWELL_B: - dma_pusher->BindSubchannel(maxwell_3d.get(), method_call.subchannel); - break; - case EngineID::KEPLER_COMPUTE_B: - dma_pusher->BindSubchannel(kepler_compute.get(), method_call.subchannel); - break; - case EngineID::MAXWELL_DMA_COPY_A: - dma_pusher->BindSubchannel(maxwell_dma.get(), method_call.subchannel); - break; - case EngineID::KEPLER_INLINE_TO_MEMORY_B: - dma_pusher->BindSubchannel(kepler_memory.get(), method_call.subchannel); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented engine {:04X}", engine_id); - } -} - -void GPU::ProcessFenceActionMethod() { - switch (regs.fence_action.op) { - case FenceOperation::Acquire: - WaitFence(regs.fence_action.syncpoint_id, regs.fence_value); - break; - case FenceOperation::Increment: - IncrementSyncPoint(regs.fence_action.syncpoint_id); - break; - default: - UNIMPLEMENTED_MSG("Unimplemented operation {}", regs.fence_action.op.Value()); - } -} - -void GPU::ProcessWaitForInterruptMethod() { - // TODO(bunnei) ImplementMe - LOG_WARNING(HW_GPU, "(STUBBED) called"); -} - -void GPU::ProcessSemaphoreTriggerMethod() { - const auto semaphoreOperationMask = 0xF; - const auto op = - static_cast<GpuSemaphoreOperation>(regs.semaphore_trigger & semaphoreOperationMask); - if (op == GpuSemaphoreOperation::WriteLong) { - struct Block { - u32 sequence; - u32 zeros = 0; - u64 timestamp; - }; +u64 GPU::GetTicks() const { + return impl->GetTicks(); +} - Block block{}; - block.sequence = regs.semaphore_sequence; - // TODO(Kmather73): Generate a real GPU timestamp and write it here instead of - // CoreTiming - block.timestamp = GetTicks(); - memory_manager->WriteBlock(regs.semaphore_address.SemaphoreAddress(), &block, - sizeof(block)); - } else { - const u32 word{memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress())}; - if ((op == GpuSemaphoreOperation::AcquireEqual && word == regs.semaphore_sequence) || - (op == GpuSemaphoreOperation::AcquireGequal && - static_cast<s32>(word - regs.semaphore_sequence) > 0) || - (op == GpuSemaphoreOperation::AcquireMask && (word & regs.semaphore_sequence))) { - // Nothing to do in this case - } else { - regs.acquire_source = true; - regs.acquire_value = regs.semaphore_sequence; - if (op == GpuSemaphoreOperation::AcquireEqual) { - regs.acquire_active = true; - regs.acquire_mode = false; - } else if (op == GpuSemaphoreOperation::AcquireGequal) { - regs.acquire_active = true; - regs.acquire_mode = true; - } else if (op == GpuSemaphoreOperation::AcquireMask) { - // TODO(kemathe) The acquire mask operation waits for a value that, ANDed with - // semaphore_sequence, gives a non-0 result - LOG_ERROR(HW_GPU, "Invalid semaphore operation AcquireMask not implemented"); - } else { - LOG_ERROR(HW_GPU, "Invalid semaphore operation"); - } - } - } +bool GPU::IsAsync() const { + return impl->IsAsync(); } -void GPU::ProcessSemaphoreRelease() { - memory_manager->Write<u32>(regs.semaphore_address.SemaphoreAddress(), regs.semaphore_release); +bool GPU::UseNvdec() const { + return impl->UseNvdec(); } -void GPU::ProcessSemaphoreAcquire() { - const u32 word = memory_manager->Read<u32>(regs.semaphore_address.SemaphoreAddress()); - 
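[Editor's note] The deleted semaphore logic above keeps the AcquireGequal test wraparound-safe by subtracting in unsigned 32-bit arithmetic and then reinterpreting the difference as signed, so the comparison stays correct when the syncpoint counter wraps past 0xFFFFFFFF. A standalone sketch of that idiom; the helper name is illustrative:

// Wraparound-safe "word is ahead of sequence" test, mirroring
// static_cast<s32>(word - regs.semaphore_sequence) > 0 above.
#include <cstdint>

[[nodiscard]] constexpr bool GequalSatisfied(uint32_t word, uint32_t sequence) {
    return static_cast<int32_t>(word - sequence) > 0;
}

static_assert(GequalSatisfied(5, 3));
static_assert(GequalSatisfied(2, 0xFFFFFFF0)); // still "ahead" across the 32-bit wrap
static_assert(!GequalSatisfied(3, 5));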
const auto value = regs.semaphore_acquire; - if (word != value) { - regs.acquire_active = true; - regs.acquire_value = value; - // TODO(kemathe73) figure out how to do the acquire_timeout - regs.acquire_mode = false; - regs.acquire_source = false; - } +void GPU::RendererFrameEndNotify() { + impl->RendererFrameEndNotify(); } void GPU::Start() { - gpu_thread.StartThread(*renderer, renderer->Context(), *dma_pusher); - cpu_context = renderer->GetRenderWindow().CreateSharedContext(); - cpu_context->MakeCurrent(); + impl->Start(); } void GPU::ObtainContext() { - cpu_context->MakeCurrent(); + impl->ObtainContext(); } void GPU::ReleaseContext() { - cpu_context->DoneCurrent(); + impl->ReleaseContext(); } void GPU::PushGPUEntries(Tegra::CommandList&& entries) { - gpu_thread.SubmitList(std::move(entries)); + impl->PushGPUEntries(std::move(entries)); } void GPU::PushCommandBuffer(Tegra::ChCommandHeaderList& entries) { - if (!use_nvdec) { - return; - } - - if (!cdma_pusher) { - cdma_pusher = std::make_unique<Tegra::CDmaPusher>(*this); - } - - // SubmitCommandBuffer would make the nvdec operations async, this is not currently working - // TODO(ameerj): RE proper async nvdec operation - // gpu_thread.SubmitCommandBuffer(std::move(entries)); - - cdma_pusher->ProcessEntries(std::move(entries)); + impl->PushCommandBuffer(entries); } void GPU::ClearCdmaInstance() { - cdma_pusher.reset(); + impl->ClearCdmaInstance(); } void GPU::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { - gpu_thread.SwapBuffers(framebuffer); + impl->SwapBuffers(framebuffer); } void GPU::FlushRegion(VAddr addr, u64 size) { - gpu_thread.FlushRegion(addr, size); + impl->FlushRegion(addr, size); } void GPU::InvalidateRegion(VAddr addr, u64 size) { - gpu_thread.InvalidateRegion(addr, size); + impl->InvalidateRegion(addr, size); } void GPU::FlushAndInvalidateRegion(VAddr addr, u64 size) { - gpu_thread.FlushAndInvalidateRegion(addr, size); -} - -void GPU::TriggerCpuInterrupt(const u32 syncpoint_id, const u32 value) const { - auto& interrupt_manager = system.InterruptManager(); - interrupt_manager.GPUInterruptSyncpt(syncpoint_id, value); -} - -void GPU::ShutDown() { - // Signal that threads should no longer block on syncpoint fences - shutting_down.store(true, std::memory_order_relaxed); - sync_cv.notify_all(); - - gpu_thread.ShutDown(); -} - -void GPU::OnCommandListEnd() { - if (is_async) { - // This command only applies to asynchronous GPU mode - gpu_thread.OnCommandListEnd(); - } + impl->FlushAndInvalidateRegion(addr, size); } } // namespace Tegra diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index a8e98e51b..05e5c94f3 100644 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -4,28 +4,12 @@ #pragma once -#include <array> -#include <atomic> -#include <condition_variable> -#include <list> #include <memory> -#include <mutex> + +#include "common/bit_field.h" #include "common/common_types.h" -#include "core/hle/service/nvdrv/nvdata.h" -#include "core/hle/service/nvflinger/buffer_queue.h" #include "video_core/cdma_pusher.h" -#include "video_core/dma_pusher.h" #include "video_core/framebuffer_config.h" -#include "video_core/gpu_thread.h" - -using CacheAddr = std::uintptr_t; -[[nodiscard]] inline CacheAddr ToCacheAddr(const void* host_ptr) { - return reinterpret_cast<CacheAddr>(host_ptr); -} - -[[nodiscard]] inline u8* FromCacheAddr(CacheAddr cache_addr) { - return reinterpret_cast<u8*>(cache_addr); -} namespace Core { namespace Frontend { @@ -40,6 +24,9 @@ class ShaderNotify; } // namespace VideoCore namespace Tegra { 
+class DmaPusher; +class CDmaPusher; +struct CommandList; enum class RenderTargetFormat : u32 { NONE = 0x0, @@ -138,7 +125,18 @@ public: } }; - explicit GPU(Core::System& system_, bool is_async_, bool use_nvdec_); + enum class FenceOperation : u32 { + Acquire = 0, + Increment = 1, + }; + + union FenceAction { + u32 raw; + BitField<0, 1, FenceOperation> op; + BitField<8, 24, u32> syncpoint_id; + }; + + explicit GPU(Core::System& system, bool is_async, bool use_nvdec); ~GPU(); /// Binds a renderer to the GPU. @@ -162,9 +160,7 @@ public: [[nodiscard]] u64 RequestFlush(VAddr addr, std::size_t size); /// Obtains current flush request fence id. - [[nodiscard]] u64 CurrentFlushRequestFence() const { - return current_flush_fence.load(std::memory_order_relaxed); - } + [[nodiscard]] u64 CurrentFlushRequestFence() const; /// Tick pending requests within the GPU. void TickWork(); @@ -200,27 +196,16 @@ public: [[nodiscard]] const Tegra::CDmaPusher& CDmaPusher() const; /// Returns a reference to the underlying renderer. - [[nodiscard]] VideoCore::RendererBase& Renderer() { - return *renderer; - } + [[nodiscard]] VideoCore::RendererBase& Renderer(); /// Returns a const reference to the underlying renderer. - [[nodiscard]] const VideoCore::RendererBase& Renderer() const { - return *renderer; - } + [[nodiscard]] const VideoCore::RendererBase& Renderer() const; /// Returns a reference to the shader notifier. - [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify() { - return *shader_notify; - } + [[nodiscard]] VideoCore::ShaderNotify& ShaderNotify(); /// Returns a const reference to the shader notifier. - [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const { - return *shader_notify; - } - - // Stops the GPU execution and waits for the GPU to finish working - void ShutDown(); + [[nodiscard]] const VideoCore::ShaderNotify& ShaderNotify() const; /// Allows the CPU/NvFlinger to wait on the GPU before presenting a frame. 
void WaitFence(u32 syncpoint_id, u32 value); @@ -235,80 +220,12 @@ public: [[nodiscard]] u64 GetTicks() const; - [[nodiscard]] std::unique_lock<std::mutex> LockSync() { - return std::unique_lock{sync_mutex}; - } - - [[nodiscard]] bool IsAsync() const { - return is_async; - } + [[nodiscard]] bool IsAsync() const; - [[nodiscard]] bool UseNvdec() const { - return use_nvdec; - } + [[nodiscard]] bool UseNvdec() const; void RendererFrameEndNotify(); - enum class FenceOperation : u32 { - Acquire = 0, - Increment = 1, - }; - - union FenceAction { - u32 raw; - BitField<0, 1, FenceOperation> op; - BitField<8, 24, u32> syncpoint_id; - - [[nodiscard]] static CommandHeader Build(FenceOperation op, u32 syncpoint_id) { - FenceAction result{}; - result.op.Assign(op); - result.syncpoint_id.Assign(syncpoint_id); - return {result.raw}; - } - }; - - struct Regs { - static constexpr size_t NUM_REGS = 0x40; - - union { - struct { - INSERT_PADDING_WORDS_NOINIT(0x4); - struct { - u32 address_high; - u32 address_low; - - [[nodiscard]] GPUVAddr SemaphoreAddress() const { - return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | - address_low); - } - } semaphore_address; - - u32 semaphore_sequence; - u32 semaphore_trigger; - INSERT_PADDING_WORDS_NOINIT(0xC); - - // The pusher and the puller share the reference counter, the pusher only has read - // access - u32 reference_count; - INSERT_PADDING_WORDS_NOINIT(0x5); - - u32 semaphore_acquire; - u32 semaphore_release; - u32 fence_value; - FenceAction fence_action; - INSERT_PADDING_WORDS_NOINIT(0xE2); - - // Puller state - u32 acquire_mode; - u32 acquire_source; - u32 acquire_active; - u32 acquire_timeout; - u32 acquire_value; - }; - std::array<u32, NUM_REGS> reg_array; - }; - } regs{}; - /// Performs any additional setup necessary in order to begin GPU emulation. /// This can be used to launch any necessary threads and register any necessary /// core timing events. @@ -341,104 +258,9 @@ public: /// Notify rasterizer that any caches of the specified region should be flushed and invalidated void FlushAndInvalidateRegion(VAddr addr, u64 size); -protected: - void TriggerCpuInterrupt(u32 syncpoint_id, u32 value) const; - -private: - void ProcessBindMethod(const MethodCall& method_call); - void ProcessFenceActionMethod(); - void ProcessWaitForInterruptMethod(); - void ProcessSemaphoreTriggerMethod(); - void ProcessSemaphoreRelease(); - void ProcessSemaphoreAcquire(); - - /// Calls a GPU puller method. - void CallPullerMethod(const MethodCall& method_call); - - /// Calls a GPU engine method. - void CallEngineMethod(const MethodCall& method_call); - - /// Calls a GPU engine multivalue method. - void CallEngineMultiMethod(u32 method, u32 subchannel, const u32* base_start, u32 amount, - u32 methods_pending); - - /// Determines where the method should be executed. 
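[Editor's note] The header slimming in these hunks is the pImpl idiom: every inline accessor, the Regs block, and the engine members move behind the forward-declared struct Impl visible at the end of the next hunk, so gpu.h consumers no longer pull in engine, threading, and nvdrv headers. A minimal sketch of the shape, using placeholder names rather than yuzu's actual members:

// pImpl sketch (placeholder names). The header sees only an incomplete Impl.
#include <memory>

class Gpu {
public:
    Gpu();
    ~Gpu(); // declared here, defined where Impl is complete
    void Start();

private:
    struct Impl; // defined only in the .cpp; heavy includes stay out of the header
    std::unique_ptr<Impl> impl;
};

// --- gpu.cpp ---
struct Gpu::Impl {
    void Start() { /* spin up worker thread, acquire context, ... */ }
};

Gpu::Gpu() : impl{std::make_unique<Impl>()} {}
Gpu::~Gpu() = default; // unique_ptr<Impl> destroyed here, where Impl is complete
void Gpu::Start() { impl->Start(); }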
- [[nodiscard]] bool ExecuteMethodOnEngine(u32 method); - -protected: - Core::System& system; - std::unique_ptr<Tegra::MemoryManager> memory_manager; - std::unique_ptr<Tegra::DmaPusher> dma_pusher; - std::unique_ptr<Tegra::CDmaPusher> cdma_pusher; - std::unique_ptr<VideoCore::RendererBase> renderer; - VideoCore::RasterizerInterface* rasterizer = nullptr; - const bool use_nvdec; - private: - /// Mapping of command subchannels to their bound engine ids - std::array<EngineID, 8> bound_engines = {}; - /// 3D engine - std::unique_ptr<Engines::Maxwell3D> maxwell_3d; - /// 2D engine - std::unique_ptr<Engines::Fermi2D> fermi_2d; - /// Compute engine - std::unique_ptr<Engines::KeplerCompute> kepler_compute; - /// DMA engine - std::unique_ptr<Engines::MaxwellDMA> maxwell_dma; - /// Inline memory engine - std::unique_ptr<Engines::KeplerMemory> kepler_memory; - /// Shader build notifier - std::unique_ptr<VideoCore::ShaderNotify> shader_notify; - /// When true, we are about to shut down emulation session, so terminate outstanding tasks - std::atomic_bool shutting_down{}; - - std::array<std::atomic<u32>, Service::Nvidia::MaxSyncPoints> syncpoints{}; - - std::array<std::list<u32>, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; - - std::mutex sync_mutex; - std::mutex device_mutex; - - std::condition_variable sync_cv; - - struct FlushRequest { - explicit FlushRequest(u64 fence_, VAddr addr_, std::size_t size_) - : fence{fence_}, addr{addr_}, size{size_} {} - u64 fence; - VAddr addr; - std::size_t size; - }; - - std::list<FlushRequest> flush_requests; - std::atomic<u64> current_flush_fence{}; - u64 last_flush_fence{}; - std::mutex flush_request_mutex; - - const bool is_async; - - VideoCommon::GPUThread::ThreadManager gpu_thread; - std::unique_ptr<Core::Frontend::GraphicsContext> cpu_context; + struct Impl; + std::unique_ptr<Impl> impl; }; -#define ASSERT_REG_POSITION(field_name, position) \ - static_assert(offsetof(GPU::Regs, field_name) == position * 4, \ - "Field " #field_name " has invalid position") - -ASSERT_REG_POSITION(semaphore_address, 0x4); -ASSERT_REG_POSITION(semaphore_sequence, 0x6); -ASSERT_REG_POSITION(semaphore_trigger, 0x7); -ASSERT_REG_POSITION(reference_count, 0x14); -ASSERT_REG_POSITION(semaphore_acquire, 0x1A); -ASSERT_REG_POSITION(semaphore_release, 0x1B); -ASSERT_REG_POSITION(fence_value, 0x1C); -ASSERT_REG_POSITION(fence_action, 0x1D); - -ASSERT_REG_POSITION(acquire_mode, 0x100); -ASSERT_REG_POSITION(acquire_source, 0x101); -ASSERT_REG_POSITION(acquire_active, 0x102); -ASSERT_REG_POSITION(acquire_timeout, 0x103); -ASSERT_REG_POSITION(acquire_value, 0x104); - -#undef ASSERT_REG_POSITION - } // namespace Tegra diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 46f642b19..9547f277a 100644 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -17,9 +17,9 @@ namespace VideoCommon::GPUThread { /// Runs the GPU thread -static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, - Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher, - SynchState& state) { +static void RunThread(std::stop_token stop_token, Core::System& system, + VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, + Tegra::DmaPusher& dma_pusher, SynchState& state) { std::string name = "yuzu:GPU"; MicroProfileOnThreadCreate(name.c_str()); SCOPE_EXIT({ MicroProfileOnThreadExit(); }); @@ -28,20 +28,14 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, 
Common::SetCurrentThreadPriority(Common::ThreadPriority::High); system.RegisterHostThread(); - // Wait for first GPU command before acquiring the window context - state.queue.Wait(); - - // If emulation was stopped during disk shader loading, abort before trying to acquire context - if (!state.is_running) { - return; - } - auto current_context = context.Acquire(); VideoCore::RasterizerInterface* const rasterizer = renderer.ReadRasterizer(); - CommandDataContainer next; - while (state.is_running) { - next = state.queue.PopWait(); + while (!stop_token.stop_requested()) { + CommandDataContainer next = state.queue.PopWait(stop_token); + if (stop_token.stop_requested()) { + break; + } if (auto* submit_list = std::get_if<SubmitListCommand>(&next.data)) { dma_pusher.Push(std::move(submit_list->entries)); dma_pusher.DispatchCalls(); @@ -55,8 +49,6 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, rasterizer->FlushRegion(flush->addr, flush->size); } else if (const auto* invalidate = std::get_if<InvalidateRegionCommand>(&next.data)) { rasterizer->OnCPUWrite(invalidate->addr, invalidate->size); - } else if (std::holds_alternative<EndProcessingCommand>(next.data)) { - ASSERT(state.is_running == false); } else { UNREACHABLE(); } @@ -73,16 +65,14 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, ThreadManager::ThreadManager(Core::System& system_, bool is_async_) : system{system_}, is_async{is_async_} {} -ThreadManager::~ThreadManager() { - ShutDown(); -} +ThreadManager::~ThreadManager() = default; void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher) { rasterizer = renderer.ReadRasterizer(); - thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), - std::ref(dma_pusher), std::ref(state)); + thread = std::jthread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), + std::ref(dma_pusher), std::ref(state)); } void ThreadManager::SubmitList(Tegra::CommandList&& entries) { @@ -117,26 +107,6 @@ void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { rasterizer->OnCPUWrite(addr, size); } -void ThreadManager::ShutDown() { - if (!state.is_running) { - return; - } - - { - std::lock_guard lk(state.write_lock); - state.is_running = false; - state.cv.notify_all(); - } - - if (!thread.joinable()) { - return; - } - - // Notify GPU thread that a shutdown is pending - PushCommand(EndProcessingCommand()); - thread.join(); -} - void ThreadManager::OnCommandListEnd() { PushCommand(OnCommandListEndCommand()); } @@ -152,9 +122,8 @@ u64 ThreadManager::PushCommand(CommandData&& command_data, bool block) { state.queue.Push(CommandDataContainer(std::move(command_data), fence, block)); if (block) { - state.cv.wait(lk, [this, fence] { - return fence <= state.signaled_fence.load(std::memory_order_relaxed) || - !state.is_running; + state.cv.wait(lk, thread.get_stop_token(), [this, fence] { + return fence <= state.signaled_fence.load(std::memory_order_relaxed); }); } diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 11a648f38..00984188e 100644 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -33,9 +33,6 @@ class RendererBase; namespace VideoCommon::GPUThread { -/// Command to signal to the GPU thread that processing has ended -struct EndProcessingCommand final {}; - /// Command to signal to the GPU thread that a command list is ready for processing struct SubmitListCommand 
final { explicit SubmitListCommand(Tegra::CommandList&& entries_) : entries{std::move(entries_)} {} @@ -83,7 +80,7 @@ struct OnCommandListEndCommand final {}; struct GPUTickCommand final {}; using CommandData = - std::variant<EndProcessingCommand, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, + std::variant<std::monostate, SubmitListCommand, SwapBuffersCommand, FlushRegionCommand, InvalidateRegionCommand, FlushAndInvalidateRegionCommand, OnCommandListEndCommand, GPUTickCommand>; @@ -100,14 +97,12 @@ struct CommandDataContainer { /// Struct used to synchronize the GPU thread struct SynchState final { - std::atomic_bool is_running{true}; - - using CommandQueue = Common::SPSCQueue<CommandDataContainer>; + using CommandQueue = Common::SPSCQueue<CommandDataContainer, true>; std::mutex write_lock; CommandQueue queue; u64 last_fence{}; std::atomic<u64> signaled_fence{}; - std::condition_variable cv; + std::condition_variable_any cv; }; /// Class used to manage the GPU thread @@ -135,9 +130,6 @@ public: /// Notify rasterizer that any caches of the specified region should be flushed and invalidated void FlushAndInvalidateRegion(VAddr addr, u64 size); - // Stops the GPU execution and waits for the GPU to finish working - void ShutDown(); - void OnCommandListEnd(); private: @@ -149,7 +141,7 @@ private: VideoCore::RasterizerInterface* rasterizer = nullptr; SynchState state; - std::thread thread; + std::jthread thread; }; } // namespace VideoCommon::GPUThread diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index c9cff7450..20d748c12 100644 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -6,7 +6,6 @@ set(SHADER_FILES convert_float_to_depth.frag full_screen_triangle.vert opengl_copy_bc4.comp - opengl_copy_bgra.comp opengl_present.frag opengl_present.vert pitch_unswizzle.comp diff --git a/src/video_core/host_shaders/opengl_copy_bgra.comp b/src/video_core/host_shaders/opengl_copy_bgra.comp deleted file mode 100644 index 2571a4abf..000000000 --- a/src/video_core/host_shaders/opengl_copy_bgra.comp +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright 2021 yuzu Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. 
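[Editor's note] The gpu_thread changes above, and the vk_scheduler changes later in this diff, apply the same C++20 migration: a manual is_running flag plus condition_variable becomes std::jthread with its built-in stop_token and a condition_variable_any, so shutdown needs no dedicated EndProcessingCommand. A distilled sketch of the pattern, with a simplified queue and payload standing in for the real command containers:

// Worker-loop pattern with std::jthread + stop_token (simplified sketch).
#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>

class Worker {
public:
    Worker() : thread{[this](std::stop_token token) { Run(token); }} {}
    // ~Worker(): std::jthread requests stop and joins automatically.

    void Push(int work) {
        {
            std::lock_guard lock{mutex};
            queue.push(work);
        }
        cv.notify_one();
    }

private:
    void Run(std::stop_token token) {
        while (!token.stop_requested()) {
            int work{};
            {
                std::unique_lock lock{mutex};
                // condition_variable_any::wait(lock, stop_token, pred) returns
                // false when a stop is requested while the predicate is false.
                if (!cv.wait(lock, token, [this] { return !queue.empty(); })) {
                    break;
                }
                work = queue.front();
                queue.pop();
            }
            // process(work);
        }
    }

    std::mutex mutex;
    std::condition_variable_any cv;
    std::queue<int> queue;
    std::jthread thread; // last member: stopped and joined before the rest is destroyed
};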
- -#version 430 core - -layout (local_size_x = 4, local_size_y = 4) in; - -layout(binding = 0, rgba8) readonly uniform image2DArray bgr_input; -layout(binding = 1, rgba8) writeonly uniform image2DArray bgr_output; - -void main() { - vec4 color = imageLoad(bgr_input, ivec3(gl_GlobalInvocationID)); - imageStore(bgr_output, ivec3(gl_GlobalInvocationID), color.bgra); -} diff --git a/src/video_core/query_cache.h b/src/video_core/query_cache.h index aac851253..73231061a 100644 --- a/src/video_core/query_cache.h +++ b/src/video_core/query_cache.h @@ -8,6 +8,7 @@ #include <array> #include <cstring> #include <iterator> +#include <list> #include <memory> #include <mutex> #include <optional> diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 07a995f7d..187a28e4d 100644 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -147,8 +147,7 @@ void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, void BufferCacheRuntime::ClearBuffer(Buffer& dest_buffer, u32 offset, size_t size, u32 value) { glClearNamedBufferSubData(dest_buffer.Handle(), GL_R32UI, static_cast<GLintptr>(offset), - static_cast<GLsizeiptr>(size / sizeof(u32)), GL_RED, GL_UNSIGNED_INT, - &value); + static_cast<GLsizeiptr>(size), GL_RED, GL_UNSIGNED_INT, &value); } void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) { diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index b0aee6cc1..8c3ca3d82 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -20,6 +20,7 @@ #include "video_core/surface.h" #include "video_core/texture_cache/formatter.h" #include "video_core/texture_cache/samples_helper.h" +#include "video_core/texture_cache/util.h" namespace OpenGL { namespace { @@ -461,7 +462,7 @@ bool TextureCacheRuntime::CanImageBeCopied(const Image& dst, const Image& src) { if (dst.info.type == ImageType::e3D && dst.info.format == PixelFormat::BC4_UNORM) { return false; } - if (IsPixelFormatBGR(dst.info.format) || IsPixelFormatBGR(src.info.format)) { + if (IsPixelFormatBGR(dst.info.format) != IsPixelFormatBGR(src.info.format)) { return false; } return true; @@ -473,7 +474,7 @@ void TextureCacheRuntime::EmulateCopyImage(Image& dst, Image& src, ASSERT(src.info.type == ImageType::e3D); util_shaders.CopyBC4(dst, src, copies); } else if (IsPixelFormatBGR(dst.info.format) || IsPixelFormatBGR(src.info.format)) { - util_shaders.CopyBGR(dst, src, copies); + bgr_copy_pass.CopyBGR(dst, src, copies); } else { UNREACHABLE(); } @@ -1112,4 +1113,37 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM framebuffer.handle = handle; } +void BGRCopyPass::CopyBGR(Image& dst_image, Image& src_image, + std::span<const VideoCommon::ImageCopy> copies) { + static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0}; + const u32 requested_pbo_size = + std::max(src_image.unswizzled_size_bytes, dst_image.unswizzled_size_bytes); + + if (bgr_pbo_size < requested_pbo_size) { + bgr_pbo.Create(); + bgr_pbo_size = requested_pbo_size; + glNamedBufferData(bgr_pbo.handle, bgr_pbo_size, nullptr, GL_STREAM_COPY); + } + for (const ImageCopy& copy : copies) { + ASSERT(copy.src_offset == zero_offset); + ASSERT(copy.dst_offset == zero_offset); + + // Copy from source to PBO + glPixelStorei(GL_PACK_ALIGNMENT, 1); + glPixelStorei(GL_PACK_ROW_LENGTH, 
copy.extent.width); + glBindBuffer(GL_PIXEL_PACK_BUFFER, bgr_pbo.handle); + glGetTextureSubImage(src_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height, + copy.src_subresource.num_layers, src_image.GlFormat(), + src_image.GlType(), static_cast<GLsizei>(bgr_pbo_size), nullptr); + + // Copy from PBO to destination in desired GL format + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glPixelStorei(GL_UNPACK_ROW_LENGTH, copy.extent.width); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bgr_pbo.handle); + glTextureSubImage3D(dst_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height, + copy.dst_subresource.num_layers, dst_image.GlFormat(), + dst_image.GlType(), nullptr); + } +} + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 4a4f6301c..1ca2c90be 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -12,6 +12,7 @@ #include "shader_recompiler/shader_info.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/util_shaders.h" +#include "video_core/texture_cache/image_view_base.h" #include "video_core/texture_cache/texture_cache_base.h" namespace OpenGL { @@ -47,6 +48,19 @@ struct FormatProperties { bool is_compressed; }; +class BGRCopyPass { +public: + BGRCopyPass() = default; + ~BGRCopyPass() = default; + + void CopyBGR(Image& dst_image, Image& src_image, + std::span<const VideoCommon::ImageCopy> copies); + +private: + OGLBuffer bgr_pbo; + size_t bgr_pbo_size{}; +}; + class TextureCacheRuntime { friend Framebuffer; friend Image; @@ -118,6 +132,7 @@ private: const Device& device; StateTracker& state_tracker; UtilShaders util_shaders; + BGRCopyPass bgr_copy_pass; std::array<std::unordered_map<GLenum, FormatProperties>, 3> format_properties; bool has_broken_texture_view_formats = false; @@ -162,6 +177,14 @@ public: return texture.handle; } + GLuint GlFormat() const noexcept { + return gl_format; + } + + GLuint GlType() const noexcept { + return gl_type; + } + private: void CopyBufferToImage(const VideoCommon::BufferImageCopy& copy, size_t buffer_offset); diff --git a/src/video_core/renderer_opengl/maxwell_to_gl.h b/src/video_core/renderer_opengl/maxwell_to_gl.h index 672f94bfc..39158aa3e 100644 --- a/src/video_core/renderer_opengl/maxwell_to_gl.h +++ b/src/video_core/renderer_opengl/maxwell_to_gl.h @@ -52,7 +52,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB {GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT}, // BC6H_UFLOAT {GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT}, // BC6H_SFLOAT {GL_COMPRESSED_RGBA_ASTC_4x4_KHR}, // ASTC_2D_4X4_UNORM - {GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE}, // B8G8R8A8_UNORM + {GL_RGBA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV}, // B8G8R8A8_UNORM {GL_RGBA32F, GL_RGBA, GL_FLOAT}, // R32G32B32A32_FLOAT {GL_RGBA32I, GL_RGBA_INTEGER, GL_INT}, // R32G32B32A32_SINT {GL_RG32F, GL_RG, GL_FLOAT}, // R32G32_FLOAT @@ -81,7 +81,7 @@ constexpr std::array<FormatTuple, VideoCore::Surface::MaxPixelFormat> FORMAT_TAB {GL_COMPRESSED_RGBA_ASTC_8x8_KHR}, // ASTC_2D_8X8_UNORM {GL_COMPRESSED_RGBA_ASTC_8x5_KHR}, // ASTC_2D_8X5_UNORM {GL_COMPRESSED_RGBA_ASTC_5x4_KHR}, // ASTC_2D_5X4_UNORM - {GL_SRGB8_ALPHA8, GL_RGBA, GL_UNSIGNED_BYTE}, // B8G8R8A8_SRGB + {GL_SRGB8_ALPHA8, GL_BGRA, GL_UNSIGNED_INT_8_8_8_8_REV}, // B8G8R8A8_SRGB {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT}, // BC1_RGBA_SRGB {GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT}, // BC2_SRGB 
{GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, // BC3_SRGB diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index 333f35a1c..897c380b3 100644 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -14,7 +14,6 @@ #include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h" #include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h" #include "video_core/host_shaders/opengl_copy_bc4_comp.h" -#include "video_core/host_shaders/opengl_copy_bgra_comp.h" #include "video_core/host_shaders/pitch_unswizzle_comp.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_shader_util.h" @@ -44,11 +43,6 @@ namespace { OGLProgram MakeProgram(std::string_view source) { return CreateProgram(source, GL_COMPUTE_SHADER); } - -size_t NumPixelsInCopy(const VideoCommon::ImageCopy& copy) { - return static_cast<size_t>(copy.extent.width * copy.extent.height * - copy.src_subresource.num_layers); -} } // Anonymous namespace UtilShaders::UtilShaders(ProgramManager& program_manager_) @@ -56,7 +50,6 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) block_linear_unswizzle_2d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_2D_COMP)), block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)), pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)), - copy_bgra_program(MakeProgram(OPENGL_COPY_BGRA_COMP)), copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); swizzle_table_buffer.Create(); @@ -255,43 +248,6 @@ void UtilShaders::CopyBC4(Image& dst_image, Image& src_image, std::span<const Im program_manager.RestoreGuestCompute(); } -void UtilShaders::CopyBGR(Image& dst_image, Image& src_image, - std::span<const VideoCommon::ImageCopy> copies) { - static constexpr GLuint BINDING_INPUT_IMAGE = 0; - static constexpr GLuint BINDING_OUTPUT_IMAGE = 1; - static constexpr VideoCommon::Offset3D zero_offset{0, 0, 0}; - const u32 bytes_per_block = BytesPerBlock(dst_image.info.format); - switch (bytes_per_block) { - case 2: - // BGR565 copy - for (const ImageCopy& copy : copies) { - ASSERT(copy.src_offset == zero_offset); - ASSERT(copy.dst_offset == zero_offset); - bgr_copy_pass.Execute(dst_image, src_image, copy); - } - break; - case 4: { - // BGRA8 copy - program_manager.BindComputeProgram(copy_bgra_program.handle); - constexpr GLenum FORMAT = GL_RGBA8; - for (const ImageCopy& copy : copies) { - ASSERT(copy.src_offset == zero_offset); - ASSERT(copy.dst_offset == zero_offset); - glBindImageTexture(BINDING_INPUT_IMAGE, src_image.StorageHandle(), - copy.src_subresource.base_level, GL_FALSE, 0, GL_READ_ONLY, FORMAT); - glBindImageTexture(BINDING_OUTPUT_IMAGE, dst_image.StorageHandle(), - copy.dst_subresource.base_level, GL_FALSE, 0, GL_WRITE_ONLY, FORMAT); - glDispatchCompute(copy.extent.width, copy.extent.height, copy.extent.depth); - } - program_manager.RestoreGuestCompute(); - break; - } - default: - UNREACHABLE(); - break; - } -} - GLenum StoreFormat(u32 bytes_per_block) { switch (bytes_per_block) { case 1: @@ -309,36 +265,4 @@ GLenum StoreFormat(u32 bytes_per_block) { return GL_R8UI; } -void Bgr565CopyPass::Execute(const Image& dst_image, const Image& src_image, - const ImageCopy& copy) { - if (CopyBufferCreationNeeded(copy)) { - CreateNewCopyBuffer(copy, GL_TEXTURE_2D_ARRAY, GL_RGB565); - } - // Copy from source to PBO - glPixelStorei(GL_PACK_ALIGNMENT, 1); - 
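[Editor's note] Both deleted copy paths, the BGRA8 compute shader and the BGR565 Bgr565CopyPass, converge on the technique the new BGRCopyPass (added earlier in gl_texture_cache.cpp) now applies to every BGR format: pack the source texture into a pixel buffer object with one format/type pair, then unpack into the destination with another, letting GL's pixel transfer do the component reordering. A hedged sketch of that round-trip; the glad loader, valid texture/buffer handles, and the concrete format/type pair are assumptions for illustration:

// PBO round-trip sketch (GL 4.5 DSA assumed). `src`/`dst` are complete
// 2D-array textures; `pbo` is pre-sized to hold one copy.
#include <glad/glad.h>

void CopyBgraViaPbo(GLuint src, GLuint dst, GLuint pbo, GLsizei pbo_size,
                    GLsizei width, GLsizei height, GLsizei layers) {
    // Read the source image into the PBO with the source's byte order...
    glPixelStorei(GL_PACK_ALIGNMENT, 1);
    glPixelStorei(GL_PACK_ROW_LENGTH, width);
    glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo);
    glGetTextureSubImage(src, 0, 0, 0, 0, width, height, layers, GL_BGRA,
                         GL_UNSIGNED_INT_8_8_8_8_REV, pbo_size, nullptr);

    // ...then write it back with the destination's format/type; the pixel
    // transfer performs the red/blue swap, no compute shader required.
    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
    glPixelStorei(GL_UNPACK_ROW_LENGTH, width);
    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
    glTextureSubImage3D(dst, 0, 0, 0, 0, width, height, layers, GL_RGBA,
                        GL_UNSIGNED_BYTE, nullptr);
}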
glPixelStorei(GL_PACK_ROW_LENGTH, copy.extent.width); - glBindBuffer(GL_PIXEL_PACK_BUFFER, bgr16_pbo.handle); - glGetTextureSubImage(src_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height, - copy.src_subresource.num_layers, GL_RGB, GL_UNSIGNED_SHORT_5_6_5, - static_cast<GLsizei>(bgr16_pbo_size), nullptr); - - // Copy from PBO to destination in reverse order - glPixelStorei(GL_UNPACK_ALIGNMENT, 1); - glPixelStorei(GL_UNPACK_ROW_LENGTH, copy.extent.width); - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, bgr16_pbo.handle); - glTextureSubImage3D(dst_image.Handle(), 0, 0, 0, 0, copy.extent.width, copy.extent.height, - copy.dst_subresource.num_layers, GL_RGB, GL_UNSIGNED_SHORT_5_6_5_REV, - nullptr); -} - -bool Bgr565CopyPass::CopyBufferCreationNeeded(const ImageCopy& copy) { - return bgr16_pbo_size < NumPixelsInCopy(copy) * sizeof(u16); -} - -void Bgr565CopyPass::CreateNewCopyBuffer(const ImageCopy& copy, GLenum target, GLuint format) { - bgr16_pbo.Create(); - bgr16_pbo_size = NumPixelsInCopy(copy) * sizeof(u16); - glNamedBufferData(bgr16_pbo.handle, bgr16_pbo_size, nullptr, GL_STREAM_COPY); -} - } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h index ef881e35f..5de95ea7a 100644 --- a/src/video_core/renderer_opengl/util_shaders.h +++ b/src/video_core/renderer_opengl/util_shaders.h @@ -19,22 +19,6 @@ class ProgramManager; struct ImageBufferMap; -class Bgr565CopyPass { -public: - Bgr565CopyPass() = default; - ~Bgr565CopyPass() = default; - - void Execute(const Image& dst_image, const Image& src_image, - const VideoCommon::ImageCopy& copy); - -private: - [[nodiscard]] bool CopyBufferCreationNeeded(const VideoCommon::ImageCopy& copy); - void CreateNewCopyBuffer(const VideoCommon::ImageCopy& copy, GLenum target, GLuint format); - - OGLBuffer bgr16_pbo; - size_t bgr16_pbo_size{}; -}; - class UtilShaders { public: explicit UtilShaders(ProgramManager& program_manager); @@ -55,9 +39,6 @@ public: void CopyBC4(Image& dst_image, Image& src_image, std::span<const VideoCommon::ImageCopy> copies); - void CopyBGR(Image& dst_image, Image& src_image, - std::span<const VideoCommon::ImageCopy> copies); - private: ProgramManager& program_manager; @@ -67,10 +48,7 @@ private: OGLProgram block_linear_unswizzle_2d_program; OGLProgram block_linear_unswizzle_3d_program; OGLProgram pitch_unswizzle_program; - OGLProgram copy_bgra_program; OGLProgram copy_bc4_program; - - Bgr565CopyPass bgr_copy_pass; }; GLenum StoreFormat(u32 bytes_per_block); diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 9ff0a28cd..74822814d 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -97,19 +97,14 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, std::unique_ptr<Core::Frontend::GraphicsContext> context_) try - : RendererBase(emu_window, std::move(context_)), - telemetry_session(telemetry_session_), - cpu_memory(cpu_memory_), - gpu(gpu_), - library(OpenLibrary()), + : RendererBase(emu_window, std::move(context_)), telemetry_session(telemetry_session_), + cpu_memory(cpu_memory_), gpu(gpu_), library(OpenLibrary()), instance(CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type, true, Settings::values.renderer_debug.GetValue())), 
debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr), surface(CreateSurface(instance, render_window)), - device(CreateDevice(instance, dld, *surface)), - memory_allocator(device, false), - state_tracker(gpu), - scheduler(device, state_tracker), + device(CreateDevice(instance, dld, *surface)), memory_allocator(device, false), + state_tracker(gpu), scheduler(device, state_tracker), swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width, render_window.GetFramebufferLayout().height, false), blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, scheduler, @@ -149,7 +144,7 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { const Layout::FramebufferLayout layout = render_window.GetFramebufferLayout(); swapchain.Create(layout.width, layout.height, is_srgb); }; - if (swapchain.IsSubOptimal() || swapchain.HasColorSpaceChanged(is_srgb)) { + if (swapchain.NeedsRecreation(is_srgb)) { recreate_swapchain(); } bool is_outdated; diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index 7c0f91007..8634c3316 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -507,8 +507,9 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) { vertex_attributes.push_back({ .location = static_cast<u32>(index), .binding = 0, - .format = type == 1 ? VK_FORMAT_R32_SFLOAT - : type == 2 ? VK_FORMAT_R32_SINT : VK_FORMAT_R32_UINT, + .format = type == 1 ? VK_FORMAT_R32_SFLOAT + : type == 2 ? VK_FORMAT_R32_SINT + : VK_FORMAT_R32_UINT, .offset = 0, }); } @@ -567,12 +568,21 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) { if (!vertex_binding_divisors.empty()) { vertex_input_ci.pNext = &input_divisor_ci; } + const bool has_tess_stages = spv_modules[1] || spv_modules[2]; auto input_assembly_topology = MaxwellToVK::PrimitiveTopology(device, key.state.topology); if (input_assembly_topology == VK_PRIMITIVE_TOPOLOGY_PATCH_LIST) { - if (!spv_modules[1] && !spv_modules[2]) { + if (!has_tess_stages) { LOG_WARNING(Render_Vulkan, "Patch topology used without tessellation, using points"); input_assembly_topology = VK_PRIMITIVE_TOPOLOGY_POINT_LIST; } + } else { + if (has_tess_stages) { + // The Vulkan spec requires patch list IA topology be used with tessellation + // shader stages. 
Forcing it fixes a crash on some drivers + LOG_WARNING(Render_Vulkan, + "Patch topology not used with tessellation, using patch list"); + input_assembly_topology = VK_PRIMITIVE_TOPOLOGY_PATCH_LIST; + } } const VkPipelineInputAssemblyStateCreateInfo input_assembly_ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO, diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 1d438787a..0c11c814f 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -43,17 +43,10 @@ VKScheduler::VKScheduler(const Device& device_, StateTracker& state_tracker_) command_pool{std::make_unique<CommandPool>(*master_semaphore, device)} { AcquireNewChunk(); AllocateWorkerCommandBuffer(); - worker_thread = std::thread(&VKScheduler::WorkerThread, this); + worker_thread = std::jthread([this](std::stop_token token) { WorkerThread(token); }); } -VKScheduler::~VKScheduler() { - { - std::lock_guard lock{work_mutex}; - quit = true; - } - work_cv.notify_all(); - worker_thread.join(); -} +VKScheduler::~VKScheduler() = default; void VKScheduler::Flush(VkSemaphore signal_semaphore, VkSemaphore wait_semaphore) { SubmitExecution(signal_semaphore, wait_semaphore); @@ -135,7 +128,7 @@ bool VKScheduler::UpdateGraphicsPipeline(GraphicsPipeline* pipeline) { return true; } -void VKScheduler::WorkerThread() { +void VKScheduler::WorkerThread(std::stop_token stop_token) { Common::SetCurrentThreadName("yuzu:VulkanWorker"); do { if (work_queue.empty()) { @@ -144,8 +137,8 @@ void VKScheduler::WorkerThread() { std::unique_ptr<CommandChunk> work; { std::unique_lock lock{work_mutex}; - work_cv.wait(lock, [this] { return !work_queue.empty() || quit; }); - if (quit) { + work_cv.wait(lock, stop_token, [this] { return !work_queue.empty(); }); + if (stop_token.stop_requested()) { continue; } work = std::move(work_queue.front()); @@ -158,7 +151,7 @@ void VKScheduler::WorkerThread() { } std::lock_guard reserve_lock{reserve_mutex}; chunk_reserve.push_back(std::move(work)); - } while (!quit); + } while (!stop_token.stop_requested()); } void VKScheduler::AllocateWorkerCommandBuffer() { diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 759ed5a48..85fc1712f 100644 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -187,7 +187,7 @@ private: GraphicsPipeline* graphics_pipeline = nullptr; }; - void WorkerThread(); + void WorkerThread(std::stop_token stop_token); void AllocateWorkerCommandBuffer(); @@ -212,7 +212,6 @@ private: vk::CommandBuffer current_cmdbuf; std::unique_ptr<CommandChunk> chunk; - std::thread worker_thread; State state; @@ -224,9 +223,9 @@ private: std::vector<std::unique_ptr<CommandChunk>> chunk_reserve; std::mutex reserve_mutex; std::mutex work_mutex; - std::condition_variable work_cv; + std::condition_variable_any work_cv; std::condition_variable wait_cv; - std::atomic_bool quit{}; + std::jthread worker_thread; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index aadf03cb0..8972a6921 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -9,6 +9,7 @@ #include "common/assert.h" #include "common/logging/log.h" +#include "common/settings.h" #include "core/core.h" #include "core/frontend/framebuffer_layout.h" #include 
"video_core/renderer_vulkan/vk_scheduler.h" @@ -36,8 +37,19 @@ VkSurfaceFormatKHR ChooseSwapSurfaceFormat(vk::Span<VkSurfaceFormatKHR> formats) VkPresentModeKHR ChooseSwapPresentMode(vk::Span<VkPresentModeKHR> modes) { // Mailbox doesn't lock the application like fifo (vsync), prefer it - const auto found = std::find(modes.begin(), modes.end(), VK_PRESENT_MODE_MAILBOX_KHR); - return found != modes.end() ? *found : VK_PRESENT_MODE_FIFO_KHR; + const auto found_mailbox = std::find(modes.begin(), modes.end(), VK_PRESENT_MODE_MAILBOX_KHR); + if (found_mailbox != modes.end()) { + return VK_PRESENT_MODE_MAILBOX_KHR; + } + if (Settings::values.disable_fps_limit.GetValue()) { + // FIFO present mode locks the framerate to the monitor's refresh rate, + // Find an alternative to surpass this limitation if FPS is unlocked. + const auto found_imm = std::find(modes.begin(), modes.end(), VK_PRESENT_MODE_IMMEDIATE_KHR); + if (found_imm != modes.end()) { + return VK_PRESENT_MODE_IMMEDIATE_KHR; + } + } + return VK_PRESENT_MODE_FIFO_KHR; } VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 width, u32 height) { @@ -143,7 +155,7 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, const auto present_modes{physical_device.GetSurfacePresentModesKHR(surface)}; const VkSurfaceFormatKHR surface_format{ChooseSwapSurfaceFormat(formats)}; - const VkPresentModeKHR present_mode{ChooseSwapPresentMode(present_modes)}; + present_mode = ChooseSwapPresentMode(present_modes); u32 requested_image_count{capabilities.minImageCount + 1}; if (capabilities.maxImageCount > 0 && requested_image_count > capabilities.maxImageCount) { @@ -196,6 +208,7 @@ void VKSwapchain::CreateSwapchain(const VkSurfaceCapabilitiesKHR& capabilities, extent = swapchain_ci.imageExtent; current_srgb = srgb; + current_fps_unlocked = Settings::values.disable_fps_limit.GetValue(); images = swapchain.GetImages(); image_count = static_cast<u32>(images.size()); @@ -248,4 +261,14 @@ void VKSwapchain::Destroy() { swapchain.reset(); } +bool VKSwapchain::HasFpsUnlockChanged() const { + return current_fps_unlocked != Settings::values.disable_fps_limit.GetValue(); +} + +bool VKSwapchain::NeedsPresentModeUpdate() const { + // Mailbox present mode is the ideal for all scenarios. If it is not available, + // A different present mode is needed to support unlocked FPS above the monitor's refresh rate. + return present_mode != VK_PRESENT_MODE_MAILBOX_KHR && HasFpsUnlockChanged(); +} + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h index 5bce41e21..61a6d959e 100644 --- a/src/video_core/renderer_vulkan/vk_swapchain.h +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -33,6 +33,11 @@ public: /// Presents the rendered image to the swapchain. void Present(VkSemaphore render_semaphore); + /// Returns true when the swapchain needs to be recreated. + bool NeedsRecreation(bool is_srgb) const { + return HasColorSpaceChanged(is_srgb) || IsSubOptimal() || NeedsPresentModeUpdate(); + } + /// Returns true when the color space has changed. 
bool HasColorSpaceChanged(bool is_srgb) const { return current_srgb != is_srgb; @@ -84,6 +89,10 @@ private: void Destroy(); + bool HasFpsUnlockChanged() const; + + bool NeedsPresentModeUpdate() const; + const VkSurfaceKHR surface; const Device& device; VKScheduler& scheduler; @@ -102,8 +111,10 @@ private: VkFormat image_view_format{}; VkExtent2D extent{}; + VkPresentModeKHR present_mode{}; bool current_srgb{}; + bool current_fps_unlocked{}; bool is_outdated{}; bool is_suboptimal{}; }; diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index ff979a7ac..06c5fb867 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -21,6 +21,7 @@ #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/texture_cache/formatter.h" #include "video_core/texture_cache/samples_helper.h" +#include "video_core/texture_cache/util.h" #include "video_core/vulkan_common/vulkan_device.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" @@ -127,7 +128,7 @@ constexpr VkBorderColor ConvertBorderColor(const std::array<float, 4>& color) { const auto format_info = MaxwellToVK::SurfaceFormat(device, FormatType::Optimal, false, format); VkImageCreateFlags flags = VK_IMAGE_CREATE_MUTABLE_FORMAT_BIT; if (info.type == ImageType::e2D && info.resources.layers >= 6 && - info.size.width == info.size.height) { + info.size.width == info.size.height && !device.HasBrokenCubeImageCompability()) { flags |= VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT; } if (info.type == ImageType::e3D) { diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index 6d5a68bfe..b09c468e4 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -4,11 +4,11 @@ #pragma once -#include <compare> #include <span> #include "shader_recompiler/shader_info.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/texture_cache/image_view_base.h" #include "video_core/texture_cache/texture_cache_base.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" diff --git a/src/video_core/shader_environment.cpp b/src/video_core/shader_environment.cpp index 81a878bb2..05850afd0 100644 --- a/src/video_core/shader_environment.cpp +++ b/src/video_core/shader_environment.cpp @@ -16,6 +16,7 @@ #include "common/fs/fs.h" #include "common/logging/log.h" #include "shader_recompiler/environment.h" +#include "video_core/engines/kepler_compute.h" #include "video_core/memory_manager.h" #include "video_core/shader_environment.h" #include "video_core/textures/texture.h" diff --git a/src/video_core/shader_environment.h b/src/video_core/shader_environment.h index 2079979db..6640e53d0 100644 --- a/src/video_core/shader_environment.h +++ b/src/video_core/shader_environment.h @@ -5,13 +5,13 @@ #pragma once #include <array> -#include <atomic> #include <filesystem> #include <iosfwd> #include <limits> #include <memory> #include <optional> #include <span> +#include <stop_token> #include <type_traits> #include <unordered_map> #include <vector> @@ -19,9 +19,7 @@ #include "common/common_types.h" #include "common/unique_function.h" #include "shader_recompiler/environment.h" -#include "video_core/engines/kepler_compute.h" #include "video_core/engines/maxwell_3d.h" -#include 
"video_core/textures/texture.h" namespace Tegra { class Memorymanager; diff --git a/src/video_core/texture_cache/image_view_info.cpp b/src/video_core/texture_cache/image_view_info.cpp index 6527e14c8..e751f26c7 100644 --- a/src/video_core/texture_cache/image_view_info.cpp +++ b/src/video_core/texture_cache/image_view_info.cpp @@ -8,6 +8,7 @@ #include "video_core/texture_cache/image_view_info.h" #include "video_core/texture_cache/texture_cache_base.h" #include "video_core/texture_cache/types.h" +#include "video_core/texture_cache/util.h" #include "video_core/textures/texture.h" namespace VideoCommon { diff --git a/src/video_core/texture_cache/slot_vector.h b/src/video_core/texture_cache/slot_vector.h index 74cd3c9d8..50df06409 100644 --- a/src/video_core/texture_cache/slot_vector.h +++ b/src/video_core/texture_cache/slot_vector.h @@ -31,8 +31,8 @@ struct SlotId { }; template <class T> -requires std::is_nothrow_move_assignable_v<T>&& - std::is_nothrow_move_constructible_v<T> class SlotVector { +requires std::is_nothrow_move_assignable_v<T> && std::is_nothrow_move_constructible_v<T> +class SlotVector { public: class Iterator { friend SlotVector<T>; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 24b809242..329df2e49 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -4,10 +4,15 @@ #pragma once +#include <unordered_set> + #include "common/alignment.h" #include "video_core/dirty_flags.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/texture_cache/image_view_base.h" #include "video_core/texture_cache/samples_helper.h" #include "video_core/texture_cache/texture_cache_base.h" +#include "video_core/texture_cache/util.h" namespace VideoCommon { diff --git a/src/video_core/texture_cache/texture_cache_base.h b/src/video_core/texture_cache/texture_cache_base.h index d7528ed24..2d1893c1c 100644 --- a/src/video_core/texture_cache/texture_cache_base.h +++ b/src/video_core/texture_cache/texture_cache_base.h @@ -4,13 +4,12 @@ #pragma once -#include <array> #include <mutex> #include <span> #include <type_traits> #include <unordered_map> -#include <unordered_set> #include <vector> +#include <queue> #include "common/common_types.h" #include "common/literals.h" @@ -18,10 +17,6 @@ #include "video_core/compatible_formats.h" #include "video_core/delayed_destruction_ring.h" #include "video_core/engines/fermi_2d.h" -#include "video_core/engines/kepler_compute.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/memory_manager.h" -#include "video_core/rasterizer_interface.h" #include "video_core/surface.h" #include "video_core/texture_cache/descriptor_table.h" #include "video_core/texture_cache/image_base.h" @@ -30,7 +25,6 @@ #include "video_core/texture_cache/render_targets.h" #include "video_core/texture_cache/slot_vector.h" #include "video_core/texture_cache/types.h" -#include "video_core/texture_cache/util.h" #include "video_core/textures/texture.h" namespace VideoCommon { diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index c2ec9f76a..6388ed2eb 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -588,22 +588,27 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR ext_extended_dynamic_state = false; } } - sets_per_pool = 64; - if (driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == 
VK_DRIVER_ID_AMD_OPEN_SOURCE) {
+
+    const bool is_amd =
+        driver_id == VK_DRIVER_ID_AMD_PROPRIETARY || driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE;
+    if (is_amd) {
         // AMD drivers need a higher amount of Sets per Pool in certain circumstances like in XC2.
         sets_per_pool = 96;
-    }
-
-    const bool is_amd = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY ||
-                        driver_id == VK_DRIVER_ID_MESA_RADV ||
-                        driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE;
-    if (ext_sampler_filter_minmax && is_amd) {
-        // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken.
+        // Disable VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT on AMD GCN4 and lower as it is broken.
         if (!is_float16_supported) {
             LOG_WARNING(
                 Render_Vulkan,
-                "Blacklisting AMD GCN4 and lower for VK_EXT_SAMPLER_FILTER_MINMAX_EXTENSION_NAME");
+                "AMD GCN4 and earlier do not properly support VK_IMAGE_CREATE_CUBE_COMPATIBLE_BIT");
+            has_broken_cube_compatibility = true;
+        }
+    }
+    const bool is_amd_or_radv = is_amd || driver_id == VK_DRIVER_ID_MESA_RADV;
+    if (ext_sampler_filter_minmax && is_amd_or_radv) {
+        // Disable ext_sampler_filter_minmax on AMD GCN4 and lower as it is broken.
+        if (!is_float16_supported) {
+            LOG_WARNING(Render_Vulkan,
+                        "Blacklisting AMD GCN4 and earlier for VK_EXT_sampler_filter_minmax");
             ext_sampler_filter_minmax = false;
         }
     }
diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h
index bc180a32a..d9e74f1aa 100644
--- a/src/video_core/vulkan_common/vulkan_device.h
+++ b/src/video_core/vulkan_common/vulkan_device.h
@@ -309,6 +309,11 @@ public:
         return has_renderdoc || has_nsight_graphics;
     }
 
+    /// Returns true when the device does not properly support cube compatibility.
+    bool HasBrokenCubeImageCompability() const {
+        return has_broken_cube_compatibility;
+    }
+
     /// Returns the vendor name reported from Vulkan.
     std::string_view GetVendorName() const {
         return vendor_name;
@@ -417,6 +422,7 @@ private:
     bool ext_conservative_rasterization{}; ///< Support for VK_EXT_conservative_rasterization.
     bool ext_provoking_vertex{};           ///< Support for VK_EXT_provoking_vertex.
     bool nv_device_diagnostics_config{};   ///< Support for VK_NV_device_diagnostics_config.
+    bool has_broken_cube_compatibility{};  ///< Has broken cube compatibility bit
     bool has_renderdoc{};                  ///< Has RenderDoc attached
     bool has_nsight_graphics{};            ///< Has Nsight Graphics attached
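[Editor's note] The vulkan_device.cpp hunk above is an instance of a recurring quirk-gating pattern: key off the driver id reported via VK_KHR_driver_properties, then use a feature bit (here float16 support) as a proxy for the GPU generation, since GCN4 and earlier lack float16 arithmetic. A condensed sketch; the Quirks struct and DetectQuirks helper are assumptions, not yuzu's API:

// Driver-quirk gating sketch (assumed names), mirroring the checks above.
#include <vulkan/vulkan_core.h>

struct Quirks {
    bool broken_cube_compatibility = false;
    bool disable_sampler_filter_minmax = false;
};

Quirks DetectQuirks(VkDriverId driver_id, bool is_float16_supported) {
    Quirks quirks{};
    const bool is_amd = driver_id == VK_DRIVER_ID_AMD_PROPRIETARY ||
                        driver_id == VK_DRIVER_ID_AMD_OPEN_SOURCE;
    if (is_amd && !is_float16_supported) {
        // Missing float16 arithmetic serves as a GCN4-and-earlier heuristic.
        quirks.broken_cube_compatibility = true;
    }
    const bool is_amd_or_radv = is_amd || driver_id == VK_DRIVER_ID_MESA_RADV;
    if (is_amd_or_radv && !is_float16_supported) {
        quirks.disable_sampler_filter_minmax = true;
    }
    return quirks;
}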