diff options
author | bunnei <bunneidev@gmail.com> | 2017-10-10 23:32:14 +0200 |
---|---|---|
committer | bunnei <bunneidev@gmail.com> | 2017-10-10 23:32:14 +0200 |
commit | 0906de9a14b735d1d409290ca050eb7d2c2d3d84 (patch) | |
tree | 79bb57d3a4dc4ca377e7a62744c3941de29e785b /src/core/hw | |
parent | Merge remote-tracking branch 'upstream/master' into nx (diff) | |
download | yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar.gz yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar.bz2 yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar.lz yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar.xz yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.tar.zst yuzu-0906de9a14b735d1d409290ca050eb7d2c2d3d84.zip |
Diffstat (limited to 'src/core/hw')
-rw-r--r-- | src/core/hw/y2r.cpp | 382 | ||||
-rw-r--r-- | src/core/hw/y2r.h | 17 |
2 files changed, 0 insertions, 399 deletions
diff --git a/src/core/hw/y2r.cpp b/src/core/hw/y2r.cpp deleted file mode 100644 index e697f84b3..000000000 --- a/src/core/hw/y2r.cpp +++ /dev/null @@ -1,382 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#include <algorithm> -#include <array> -#include <cstddef> -#include <memory> -#include "common/assert.h" -#include "common/color.h" -#include "common/common_types.h" -#include "common/math_util.h" -#include "common/vector_math.h" -#include "core/hle/service/y2r_u.h" -#include "core/hw/y2r.h" -#include "core/memory.h" - -namespace HW { -namespace Y2R { - -using namespace Service::Y2R; - -static const size_t MAX_TILES = 1024 / 8; -static const size_t TILE_SIZE = 8 * 8; -using ImageTile = std::array<u32, TILE_SIZE>; - -/// Converts a image strip from the source YUV format into individual 8x8 RGB32 tiles. -static void ConvertYUVToRGB(InputFormat input_format, const u8* input_Y, const u8* input_U, - const u8* input_V, ImageTile output[], unsigned int width, - unsigned int height, const CoefficientSet& coefficients) { - - for (unsigned int y = 0; y < height; ++y) { - for (unsigned int x = 0; x < width; ++x) { - s32 Y = 0; - s32 U = 0; - s32 V = 0; - switch (input_format) { - case InputFormat::YUV422_Indiv8: - case InputFormat::YUV422_Indiv16: - Y = input_Y[y * width + x]; - U = input_U[(y * width + x) / 2]; - V = input_V[(y * width + x) / 2]; - break; - case InputFormat::YUV420_Indiv8: - case InputFormat::YUV420_Indiv16: - Y = input_Y[y * width + x]; - U = input_U[((y / 2) * width + x) / 2]; - V = input_V[((y / 2) * width + x) / 2]; - break; - case InputFormat::YUYV422_Interleaved: - Y = input_Y[(y * width + x) * 2]; - U = input_Y[(y * width + (x / 2) * 2) * 2 + 1]; - V = input_Y[(y * width + (x / 2) * 2) * 2 + 3]; - break; - } - - // This conversion process is bit-exact with hardware, as far as could be tested. - auto& c = coefficients; - s32 cY = c[0] * Y; - - s32 r = cY + c[1] * V; - s32 g = cY - c[2] * V - c[3] * U; - s32 b = cY + c[4] * U; - - const s32 rounding_offset = 0x18; - r = (r >> 3) + c[5] + rounding_offset; - g = (g >> 3) + c[6] + rounding_offset; - b = (b >> 3) + c[7] + rounding_offset; - - unsigned int tile = x / 8; - unsigned int tile_x = x % 8; - u32* out = &output[tile][y * 8 + tile_x]; - - using MathUtil::Clamp; - *out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) | ((u32)Clamp(g >> 5, 0, 0xFF) << 16) | - ((u32)Clamp(b >> 5, 0, 0xFF) << 8); - } - } -} - -/// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit -/// formats to 8-bit. -template <size_t N> -static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) { - const u8* input = Memory::GetPointer(buf.address); - - size_t output_unit = buf.transfer_unit / N; - ASSERT(amount_of_data % output_unit == 0); - - while (amount_of_data > 0) { - for (size_t i = 0; i < output_unit; ++i) { - output[i] = input[i * N]; - } - - output += output_unit; - input += buf.transfer_unit + buf.gap; - - buf.address += buf.transfer_unit + buf.gap; - buf.image_size -= buf.transfer_unit; - amount_of_data -= output_unit; - } -} - -/// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA -/// transfer. -static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data, - OutputFormat output_format, u8 alpha) { - - u8* output = Memory::GetPointer(buf.address); - - while (amount_of_data > 0) { - u8* unit_end = output + buf.transfer_unit; - while (output < unit_end) { - u32 color = *input++; - Math::Vec4<u8> col_vec{(u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha}; - - switch (output_format) { - case OutputFormat::RGBA8: - Color::EncodeRGBA8(col_vec, output); - output += 4; - break; - case OutputFormat::RGB8: - Color::EncodeRGB8(col_vec, output); - output += 3; - break; - case OutputFormat::RGB5A1: - Color::EncodeRGB5A1(col_vec, output); - output += 2; - break; - case OutputFormat::RGB565: - Color::EncodeRGB565(col_vec, output); - output += 2; - break; - } - - amount_of_data -= 1; - } - - output += buf.gap; - buf.address += buf.transfer_unit + buf.gap; - buf.image_size -= buf.transfer_unit; - } -} - -static const u8 linear_lut[TILE_SIZE] = { - // clang-format off - 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15, - 16, 17, 18, 19, 20, 21, 22, 23, - 24, 25, 26, 27, 28, 29, 30, 31, - 32, 33, 34, 35, 36, 37, 38, 39, - 40, 41, 42, 43, 44, 45, 46, 47, - 48, 49, 50, 51, 52, 53, 54, 55, - 56, 57, 58, 59, 60, 61, 62, 63, - // clang-format on -}; - -static const u8 morton_lut[TILE_SIZE] = { - // clang-format off - 0, 1, 4, 5, 16, 17, 20, 21, - 2, 3, 6, 7, 18, 19, 22, 23, - 8, 9, 12, 13, 24, 25, 28, 29, - 10, 11, 14, 15, 26, 27, 30, 31, - 32, 33, 36, 37, 48, 49, 52, 53, - 34, 35, 38, 39, 50, 51, 54, 55, - 40, 41, 44, 45, 56, 57, 60, 61, - 42, 43, 46, 47, 58, 59, 62, 63, - // clang-format on -}; - -static void RotateTile0(const ImageTile& input, ImageTile& output, int height, - const u8 out_map[64]) { - for (int i = 0; i < height * 8; ++i) { - output[out_map[i]] = input[i]; - } -} - -static void RotateTile90(const ImageTile& input, ImageTile& output, int height, - const u8 out_map[64]) { - int out_i = 0; - for (int x = 0; x < 8; ++x) { - for (int y = height - 1; y >= 0; --y) { - output[out_map[out_i++]] = input[y * 8 + x]; - } - } -} - -static void RotateTile180(const ImageTile& input, ImageTile& output, int height, - const u8 out_map[64]) { - int out_i = 0; - for (int i = height * 8 - 1; i >= 0; --i) { - output[out_map[out_i++]] = input[i]; - } -} - -static void RotateTile270(const ImageTile& input, ImageTile& output, int height, - const u8 out_map[64]) { - int out_i = 0; - for (int x = 8 - 1; x >= 0; --x) { - for (int y = 0; y < height; ++y) { - output[out_map[out_i++]] = input[y * 8 + x]; - } - } -} - -static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < 8; ++x) { - output[y * line_stride + x] = tile[y * 8 + x]; - } - } -} - -/** - * Performs a Y2R colorspace conversion. - * - * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most - * commonly used for video playback or to display camera input to the screen. - * - * The conversion process is quite configurable, and can be divided in distinct steps. From - * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data - * internally and converts it in one go before writing to the output and loading the next strip. - * - * The steps taken to convert one strip of image data are: - * - * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is - * presumably stored in one or more internal buffers. This process can be done in several separate - * transfers, as long as they don't exceed the size of the internal image buffer. This allows - * flexibility in input strides. - * - The input data is decoded into a YUV tuple. Several formats are suported, see the `InputFormat` - * enum. - * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured - * using a set of coefficients to support different colorspace standards. See `CoefficientSet`. - * - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed - * independently, this notably rotates each *strip*, not the entire image. This means that for 90 - * or 270 degree rotations, the output will be in terms of several 8 x height images, and for any - * non-zero rotation the strips will have to be re-arranged so that the parts of the image will - * not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90 - * or 270 degree rotations in images with non-even height don't seem to work properly. - * - The data is converted to the output RGB format. See the `OutputFormat` enum. - * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by - * the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the - * image must have a height divisible by 8. The image width must always be divisible by 8. - * - The final data is then CDMAed out to main memory and the next image strip is processed. This - * offers the same flexibility as the input stage. - * - * In this implementation, to avoid the combinatorial explosion of parameter combinations, common - * intermediate formats are used and where possible tables or parameters are used instead of - * diverging code paths to keep the amount of branches in check. Some steps are also merged to - * increase efficiency. - * - * Output for all valid settings combinations matches hardware, however output in some edge-cases - * differs: - * - * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last - * strip, especially when combined with rotation. - * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation - * produces misaligned output on the last strip. This implmentation produces output with the - * correct "expected" alignment. - * - * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases, - * so they are believed to be invalid configurations anyway. - */ -void PerformConversion(ConversionConfiguration& cvt) { - ASSERT(cvt.input_line_width % 8 == 0); - ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0); - // Tiles per row - size_t num_tiles = cvt.input_line_width / 8; - ASSERT(num_tiles <= MAX_TILES); - - // Buffer used as a CDMA source/target. - std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]); - // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32. - std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]); - ImageTile tmp_tile; - - // LUT used to remap writes to a tile. Used to allow linear or swizzled output without - // requiring two different code paths. - const u8* tile_remap = nullptr; - switch (cvt.block_alignment) { - case BlockAlignment::Linear: - tile_remap = linear_lut; - break; - case BlockAlignment::Block8x8: - tile_remap = morton_lut; - break; - } - - for (unsigned int y = 0; y < cvt.input_lines; y += 8) { - unsigned int row_height = std::min(cvt.input_lines - y, 8u); - - // Total size in pixels of incoming data required for this strip. - const size_t row_data_size = row_height * cvt.input_line_width; - - u8* input_Y = data_buffer.get(); - u8* input_U = input_Y + 8 * cvt.input_line_width; - u8* input_V = input_U + 8 * cvt.input_line_width / 2; - - switch (cvt.input_format) { - case InputFormat::YUV422_Indiv8: - ReceiveData<1>(input_Y, cvt.src_Y, row_data_size); - ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2); - ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2); - break; - case InputFormat::YUV420_Indiv8: - ReceiveData<1>(input_Y, cvt.src_Y, row_data_size); - ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4); - ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4); - break; - case InputFormat::YUV422_Indiv16: - ReceiveData<2>(input_Y, cvt.src_Y, row_data_size); - ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2); - ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2); - break; - case InputFormat::YUV420_Indiv16: - ReceiveData<2>(input_Y, cvt.src_Y, row_data_size); - ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4); - ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4); - break; - case InputFormat::YUYV422_Interleaved: - input_U = nullptr; - input_V = nullptr; - ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2); - break; - } - - // Note(yuriks): If additional optimization is required, input_format can be moved to a - // template parameter, so that its dispatch can be moved to outside the inner loop. - ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(), - cvt.input_line_width, row_height, cvt.coefficients); - - u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get()); - - for (size_t i = 0; i < num_tiles; ++i) { - int image_strip_width = 0; - int output_stride = 0; - - switch (cvt.rotation) { - case Rotation::None: - RotateTile0(tiles[i], tmp_tile, row_height, tile_remap); - image_strip_width = cvt.input_line_width; - output_stride = 8; - break; - case Rotation::Clockwise_90: - RotateTile90(tiles[i], tmp_tile, row_height, tile_remap); - image_strip_width = 8; - output_stride = 8 * row_height; - break; - case Rotation::Clockwise_180: - // For 180 and 270 degree rotations we also invert the order of tiles in the strip, - // since the rotates are done individually on each tile. - RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap); - image_strip_width = cvt.input_line_width; - output_stride = 8; - break; - case Rotation::Clockwise_270: - RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap); - image_strip_width = 8; - output_stride = 8 * row_height; - break; - } - - switch (cvt.block_alignment) { - case BlockAlignment::Linear: - WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width); - output_buffer += output_stride; - break; - case BlockAlignment::Block8x8: - WriteTileToOutput(output_buffer, tmp_tile, 8, 8); - output_buffer += TILE_SIZE; - break; - } - } - - // Note(yuriks): If additional optimization is required, output_format can be moved to a - // template parameter, so that its dispatch can be moved to outside the inner loop. - SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size, - cvt.output_format, (u8)cvt.alpha); - } -} -} -} diff --git a/src/core/hw/y2r.h b/src/core/hw/y2r.h deleted file mode 100644 index 25fcd781c..000000000 --- a/src/core/hw/y2r.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright 2015 Citra Emulator Project -// Licensed under GPLv2 or any later version -// Refer to the license.txt file included. - -#pragma once - -namespace Service { -namespace Y2R { -struct ConversionConfiguration; -} -} - -namespace HW { -namespace Y2R { -void PerformConversion(Service::Y2R::ConversionConfiguration& cvt); -} -} |