| 
									
										
										
										
											2015-06-07 22:24:03 -03:00
										 |  |  | // Copyright 2015 Citra Emulator Project
 | 
					
						
							|  |  |  | // Licensed under GPLv2 or any later version
 | 
					
						
							|  |  |  | // Refer to the license.txt file included.
 | 
					
						
							|  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2015-06-27 17:56:17 +01:00
										 |  |  | #include <algorithm>
 | 
					
						
							| 
									
										
										
										
											2015-06-07 22:24:03 -03:00
										 |  |  | #include <array>
 | 
					
						
							| 
									
										
										
										
											2015-06-27 17:56:17 +01:00
										 |  |  | #include <cstddef>
 | 
					
						
							|  |  |  | #include <memory>
 | 
					
						
							| 
									
										
										
										
											2015-06-07 22:24:03 -03:00
										 |  |  | 
 | 
					
						
							|  |  |  | #include "common/assert.h"
 | 
					
						
							|  |  |  | #include "common/color.h"
 | 
					
						
							|  |  |  | #include "common/common_types.h"
 | 
					
						
							|  |  |  | #include "common/math_util.h"
 | 
					
						
							|  |  |  | #include "common/vector_math.h"
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | #include "core/hle/service/y2r_u.h"
 | 
					
						
							| 
									
										
										
										
											2015-07-29 12:12:39 -04:00
										 |  |  | #include "core/hw/y2r.h"
 | 
					
						
							| 
									
										
										
										
											2015-06-07 22:24:03 -03:00
										 |  |  | #include "core/memory.h"
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | namespace HW { | 
					
						
							|  |  |  | namespace Y2R { | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | using namespace Y2R_U; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static const size_t MAX_TILES = 1024 / 8; | 
					
						
							|  |  |  | static const size_t TILE_SIZE = 8 * 8; | 
					
						
							|  |  |  | using ImageTile = std::array<u32, TILE_SIZE>; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /// Converts a image strip from the source YUV format into individual 8x8 RGB32 tiles.
 | 
					
						
							|  |  |  | static void ConvertYUVToRGB(InputFormat input_format, | 
					
						
							|  |  |  |         const u8* input_Y, const u8* input_U, const u8* input_V, ImageTile output[], | 
					
						
							|  |  |  |         unsigned int width, unsigned int height, const CoefficientSet& coefficients) { | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for (unsigned int y = 0; y < height; ++y) { | 
					
						
							|  |  |  |         for (unsigned int x = 0; x < width; ++x) { | 
					
						
							|  |  |  |             s32 Y, U, V; | 
					
						
							|  |  |  |             switch (input_format) { | 
					
						
							|  |  |  |             case InputFormat::YUV422_Indiv8: | 
					
						
							|  |  |  |             case InputFormat::YUV422_Indiv16: | 
					
						
							|  |  |  |                 Y = input_Y[y * width + x]; | 
					
						
							|  |  |  |                 U = input_U[(y * width + x) / 2]; | 
					
						
							|  |  |  |                 V = input_V[(y * width + x) / 2]; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             case InputFormat::YUV420_Indiv8: | 
					
						
							|  |  |  |             case InputFormat::YUV420_Indiv16: | 
					
						
							|  |  |  |                 Y = input_Y[y * width + x]; | 
					
						
							|  |  |  |                 U = input_U[((y / 2) * width + x) / 2]; | 
					
						
							|  |  |  |                 V = input_V[((y / 2) * width + x) / 2]; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             case InputFormat::YUYV422_Interleaved: | 
					
						
							|  |  |  |                 Y = input_Y[(y * width + x) * 2]; | 
					
						
							|  |  |  |                 U = input_Y[(y * width + (x / 2) * 2) * 2 + 1]; | 
					
						
							|  |  |  |                 V = input_Y[(y * width + (x / 2) * 2) * 2 + 3]; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             // This conversion process is bit-exact with hardware, as far as could be tested.
 | 
					
						
							|  |  |  |             auto& c = coefficients; | 
					
						
							|  |  |  |             s32 cY = c[0]*Y; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             s32 r = cY          + c[1]*V; | 
					
						
							|  |  |  |             s32 g = cY - c[3]*U - c[2]*V; | 
					
						
							|  |  |  |             s32 b = cY + c[4]*U; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             const s32 rounding_offset = 0x18; | 
					
						
							|  |  |  |             r = (r >> 3) + c[5] + rounding_offset; | 
					
						
							|  |  |  |             g = (g >> 3) + c[6] + rounding_offset; | 
					
						
							|  |  |  |             b = (b >> 3) + c[7] + rounding_offset; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             unsigned int tile = x / 8; | 
					
						
							|  |  |  |             unsigned int tile_x = x % 8; | 
					
						
							|  |  |  |             u32* out = &output[tile][y * 8 + tile_x]; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             using MathUtil::Clamp; | 
					
						
							|  |  |  |             *out = ((u32)Clamp(r >> 5, 0, 0xFF) << 24) | | 
					
						
							|  |  |  |                    ((u32)Clamp(g >> 5, 0, 0xFF) << 16) | | 
					
						
							|  |  |  |                    ((u32)Clamp(b >> 5, 0, 0xFF) << 8); | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /// Simulates an incoming CDMA transfer. The N parameter is used to automatically convert 16-bit formats to 8-bit.
 | 
					
						
							|  |  |  | template <size_t N> | 
					
						
							|  |  |  | static void ReceiveData(u8* output, ConversionBuffer& buf, size_t amount_of_data) { | 
					
						
							|  |  |  |     const u8* input = Memory::GetPointer(buf.address); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     size_t output_unit = buf.transfer_unit / N; | 
					
						
							|  |  |  |     ASSERT(amount_of_data % output_unit == 0); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     while (amount_of_data > 0) { | 
					
						
							|  |  |  |         for (size_t i = 0; i < output_unit; ++i) { | 
					
						
							|  |  |  |             output[i] = input[i * N]; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         output += output_unit; | 
					
						
							|  |  |  |         input += buf.transfer_unit + buf.gap; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         buf.address += buf.transfer_unit + buf.gap; | 
					
						
							|  |  |  |         buf.image_size -= buf.transfer_unit; | 
					
						
							|  |  |  |         amount_of_data -= output_unit; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /// Convert intermediate RGB32 format to the final output format while simulating an outgoing CDMA transfer.
 | 
					
						
							|  |  |  | static void SendData(const u32* input, ConversionBuffer& buf, int amount_of_data, | 
					
						
							|  |  |  |         OutputFormat output_format, u8 alpha) { | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     u8* output = Memory::GetPointer(buf.address); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     while (amount_of_data > 0) { | 
					
						
							|  |  |  |         u8* unit_end = output + buf.transfer_unit; | 
					
						
							|  |  |  |         while (output < unit_end) { | 
					
						
							|  |  |  |             u32 color = *input++; | 
					
						
							|  |  |  |             Math::Vec4<u8> col_vec{ | 
					
						
							| 
									
										
										
										
											2015-07-07 20:41:26 +02:00
										 |  |  |                 (u8)(color >> 24), (u8)(color >> 16), (u8)(color >> 8), alpha | 
					
						
							| 
									
										
										
										
											2015-06-07 22:24:03 -03:00
										 |  |  |             }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             switch (output_format) { | 
					
						
							|  |  |  |             case OutputFormat::RGBA8: | 
					
						
							|  |  |  |                 Color::EncodeRGBA8(col_vec, output); | 
					
						
							|  |  |  |                 output += 4; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             case OutputFormat::RGB8: | 
					
						
							|  |  |  |                 Color::EncodeRGB8(col_vec, output); | 
					
						
							|  |  |  |                 output += 3; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             case OutputFormat::RGB5A1: | 
					
						
							|  |  |  |                 Color::EncodeRGB5A1(col_vec, output); | 
					
						
							|  |  |  |                 output += 2; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             case OutputFormat::RGB565: | 
					
						
							|  |  |  |                 Color::EncodeRGB565(col_vec, output); | 
					
						
							|  |  |  |                 output += 2; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             amount_of_data -= 1; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         output += buf.gap; | 
					
						
							|  |  |  |         buf.address += buf.transfer_unit + buf.gap; | 
					
						
							|  |  |  |         buf.image_size -= buf.transfer_unit; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static const u8 linear_lut[64] = { | 
					
						
							|  |  |  |      0,  1,  2,  3,  4,  5,  6,  7, | 
					
						
							|  |  |  |      8,  9, 10, 11, 12, 13, 14, 15, | 
					
						
							|  |  |  |     16, 17, 18, 19, 20, 21, 22, 23, | 
					
						
							|  |  |  |     24, 25, 26, 27, 28, 29, 30, 31, | 
					
						
							|  |  |  |     32, 33, 34, 35, 36, 37, 38, 39, | 
					
						
							|  |  |  |     40, 41, 42, 43, 44, 45, 46, 47, | 
					
						
							|  |  |  |     48, 49, 50, 51, 52, 53, 54, 55, | 
					
						
							|  |  |  |     56, 57, 58, 59, 60, 61, 62, 63, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static const u8 morton_lut[64] = { | 
					
						
							|  |  |  |      0,  1,  4,  5, 16, 17, 20, 21, | 
					
						
							|  |  |  |      2,  3,  6,  7, 18, 19, 22, 23, | 
					
						
							|  |  |  |      8,  9, 12, 13, 24, 25, 28, 29, | 
					
						
							|  |  |  |     10, 11, 14, 15, 26, 27, 30, 31, | 
					
						
							|  |  |  |     32, 33, 36, 37, 48, 49, 52, 53, | 
					
						
							|  |  |  |     34, 35, 38, 39, 50, 51, 54, 55, | 
					
						
							|  |  |  |     40, 41, 44, 45, 56, 57, 60, 61, | 
					
						
							|  |  |  |     42, 43, 46, 47, 58, 59, 62, 63, | 
					
						
							|  |  |  | }; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void RotateTile0(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { | 
					
						
							|  |  |  |     for (int i = 0; i < height * 8; ++i) { | 
					
						
							|  |  |  |         output[out_map[i]] = input[i]; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void RotateTile90(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { | 
					
						
							|  |  |  |     int out_i = 0; | 
					
						
							|  |  |  |     for (int x = 0; x < 8; ++x) { | 
					
						
							|  |  |  |         for (int y = height - 1; y >= 0; --y) { | 
					
						
							|  |  |  |             output[out_map[out_i++]] = input[y * 8 + x]; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void RotateTile180(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { | 
					
						
							|  |  |  |     int out_i = 0; | 
					
						
							|  |  |  |     for (int i = height * 8 - 1; i >= 0; --i) { | 
					
						
							|  |  |  |         output[out_map[out_i++]] = input[i]; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void RotateTile270(const ImageTile& input, ImageTile& output, int height, const u8 out_map[64]) { | 
					
						
							|  |  |  |     int out_i = 0; | 
					
						
							|  |  |  |     for (int x = 8-1; x >= 0; --x) { | 
					
						
							|  |  |  |         for (int y = 0; y < height; ++y) { | 
					
						
							|  |  |  |             output[out_map[out_i++]] = input[y * 8 + x]; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | static void WriteTileToOutput(u32* output, const ImageTile& tile, int height, int line_stride) { | 
					
						
							|  |  |  |     for (int y = 0; y < height; ++y) { | 
					
						
							|  |  |  |         for (int x = 0; x < 8; ++x) { | 
					
						
							|  |  |  |             output[y * line_stride + x] = tile[y * 8 + x]; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | /**
 | 
					
						
							|  |  |  |  * Performs a Y2R colorspace conversion. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * The Y2R hardware implements hardware-accelerated YUV to RGB colorspace conversions. It is most | 
					
						
							|  |  |  |  * commonly used for video playback or to display camera input to the screen. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * The conversion process is quite configurable, and can be divided in distinct steps. From | 
					
						
							|  |  |  |  * observation, it appears that the hardware buffers a single 8-pixel tall strip of image data | 
					
						
							|  |  |  |  * internally and converts it in one go before writing to the output and loading the next strip. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * The steps taken to convert one strip of image data are: | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * - The hardware receives data via CDMA (http://3dbrew.org/wiki/Corelink_DMA_Engines), which is
 | 
					
						
							|  |  |  |  *   presumably stored in one or more internal buffers. This process can be done in several separate | 
					
						
							|  |  |  |  *   transfers, as long as they don't exceed the size of the internal image buffer. This allows | 
					
						
							|  |  |  |  *   flexibility in input strides. | 
					
						
							|  |  |  |  * - The input data is decoded into a YUV tuple. Several formats are suported, see the `InputFormat` | 
					
						
							|  |  |  |  *   enum. | 
					
						
							|  |  |  |  * - The YUV tuple is converted, using fixed point calculations, to RGB. This step can be configured | 
					
						
							|  |  |  |  *   using a set of coefficients to support different colorspace standards. See `CoefficientSet`. | 
					
						
							|  |  |  |  * - The strip can be optionally rotated 90, 180 or 270 degrees. Since each strip is processed | 
					
						
							|  |  |  |  *   independently, this notably rotates each *strip*, not the entire image. This means that for 90 | 
					
						
							|  |  |  |  *   or 270 degree rotations, the output will be in terms of several 8 x height images, and for any | 
					
						
							|  |  |  |  *   non-zero rotation the strips will have to be re-arranged so that the parts of the image will | 
					
						
							|  |  |  |  *   not be shuffled together. This limitation makes this a feature of somewhat dubious utility. 90 | 
					
						
							|  |  |  |  *   or 270 degree rotations in images with non-even height don't seem to work properly. | 
					
						
							|  |  |  |  * - The data is converted to the output RGB format. See the `OutputFormat` enum. | 
					
						
							|  |  |  |  * - The data can be output either linearly line-by-line or in the swizzled 8x8 tile format used by | 
					
						
							|  |  |  |  *   the PICA. This is decided by the `BlockAlignment` enum. If 8x8 alignment is used, then the | 
					
						
							|  |  |  |  *   image must have a height divisible by 8. The image width must always be divisible by 8. | 
					
						
							|  |  |  |  * - The final data is then CDMAed out to main memory and the next image strip is processed. This | 
					
						
							|  |  |  |  *   offers the same flexibility as the input stage. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * In this implementation, to avoid the combinatorial explosion of parameter combinations, common | 
					
						
							|  |  |  |  * intermediate formats are used and where possible tables or parameters are used instead of | 
					
						
							|  |  |  |  * diverging code paths to keep the amount of branches in check. Some steps are also merged to | 
					
						
							|  |  |  |  * increase efficiency. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * Output for all valid settings combinations matches hardware, however output in some edge-cases | 
					
						
							|  |  |  |  * differs: | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * - `Block8x8` alignment with non-mod8 height produces different garbage patterns on the last | 
					
						
							|  |  |  |  *   strip, especially when combined with rotation. | 
					
						
							|  |  |  |  * - Hardware, when using `Linear` alignment with a non-even height and 90 or 270 degree rotation | 
					
						
							|  |  |  |  *   produces misaligned output on the last strip. This implmentation produces output with the | 
					
						
							|  |  |  |  *   correct "expected" alignment. | 
					
						
							|  |  |  |  * | 
					
						
							|  |  |  |  * Hardware behaves strangely (doesn't fire the completion interrupt, for example) in these cases, | 
					
						
							|  |  |  |  * so they are believed to be invalid configurations anyway. | 
					
						
							|  |  |  |  */ | 
					
						
							|  |  |  | void PerformConversion(ConversionConfiguration& cvt) { | 
					
						
							|  |  |  |     ASSERT(cvt.input_line_width % 8 == 0); | 
					
						
							|  |  |  |     ASSERT(cvt.block_alignment != BlockAlignment::Block8x8 || cvt.input_lines % 8 == 0); | 
					
						
							|  |  |  |     // Tiles per row
 | 
					
						
							|  |  |  |     size_t num_tiles = cvt.input_line_width / 8; | 
					
						
							|  |  |  |     ASSERT(num_tiles < MAX_TILES); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     // Buffer used as a CDMA source/target.
 | 
					
						
							|  |  |  |     std::unique_ptr<u8[]> data_buffer(new u8[cvt.input_line_width * 8 * 4]); | 
					
						
							|  |  |  |     // Intermediate storage for decoded 8x8 image tiles. Always stored as RGB32.
 | 
					
						
							|  |  |  |     std::unique_ptr<ImageTile[]> tiles(new ImageTile[num_tiles]); | 
					
						
							|  |  |  |     ImageTile tmp_tile; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     // LUT used to remap writes to a tile. Used to allow linear or swizzled output without
 | 
					
						
							|  |  |  |     // requiring two different code paths.
 | 
					
						
							|  |  |  |     const u8* tile_remap; | 
					
						
							|  |  |  |     switch (cvt.block_alignment) { | 
					
						
							|  |  |  |     case BlockAlignment::Linear: | 
					
						
							|  |  |  |         tile_remap = linear_lut; break; | 
					
						
							|  |  |  |     case BlockAlignment::Block8x8: | 
					
						
							|  |  |  |         tile_remap = morton_lut; break; | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     for (unsigned int y = 0; y < cvt.input_lines; y += 8) { | 
					
						
							|  |  |  |         unsigned int row_height = std::min(cvt.input_lines - y, 8u); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Total size in pixels of incoming data required for this strip.
 | 
					
						
							|  |  |  |         const size_t row_data_size = row_height * cvt.input_line_width; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         u8* input_Y = data_buffer.get(); | 
					
						
							|  |  |  |         u8* input_U = input_Y + 8 * cvt.input_line_width; | 
					
						
							|  |  |  |         u8* input_V = input_U + 8 * cvt.input_line_width / 2; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         switch (cvt.input_format) { | 
					
						
							|  |  |  |         case InputFormat::YUV422_Indiv8: | 
					
						
							|  |  |  |             ReceiveData<1>(input_Y, cvt.src_Y, row_data_size); | 
					
						
							|  |  |  |             ReceiveData<1>(input_U, cvt.src_U, row_data_size / 2); | 
					
						
							|  |  |  |             ReceiveData<1>(input_V, cvt.src_V, row_data_size / 2); | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |         case InputFormat::YUV420_Indiv8: | 
					
						
							|  |  |  |             ReceiveData<1>(input_Y, cvt.src_Y, row_data_size); | 
					
						
							|  |  |  |             ReceiveData<1>(input_U, cvt.src_U, row_data_size / 4); | 
					
						
							|  |  |  |             ReceiveData<1>(input_V, cvt.src_V, row_data_size / 4); | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |         case InputFormat::YUV422_Indiv16: | 
					
						
							|  |  |  |             ReceiveData<2>(input_Y, cvt.src_Y, row_data_size); | 
					
						
							|  |  |  |             ReceiveData<2>(input_U, cvt.src_U, row_data_size / 2); | 
					
						
							|  |  |  |             ReceiveData<2>(input_V, cvt.src_V, row_data_size / 2); | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |         case InputFormat::YUV420_Indiv16: | 
					
						
							|  |  |  |             ReceiveData<2>(input_Y, cvt.src_Y, row_data_size); | 
					
						
							|  |  |  |             ReceiveData<2>(input_U, cvt.src_U, row_data_size / 4); | 
					
						
							|  |  |  |             ReceiveData<2>(input_V, cvt.src_V, row_data_size / 4); | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |         case InputFormat::YUYV422_Interleaved: | 
					
						
							|  |  |  |             input_U = nullptr; | 
					
						
							|  |  |  |             input_V = nullptr; | 
					
						
							|  |  |  |             ReceiveData<1>(input_Y, cvt.src_YUYV, row_data_size * 2); | 
					
						
							|  |  |  |             break; | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Note(yuriks): If additional optimization is required, input_format can be moved to a
 | 
					
						
							|  |  |  |         // template parameter, so that its dispatch can be moved to outside the inner loop.
 | 
					
						
							|  |  |  |         ConvertYUVToRGB(cvt.input_format, input_Y, input_U, input_V, tiles.get(), | 
					
						
							|  |  |  |                 cvt.input_line_width, row_height, cvt.coefficients); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         u32* output_buffer = reinterpret_cast<u32*>(data_buffer.get()); | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         for (int i = 0; i < num_tiles; ++i) { | 
					
						
							|  |  |  |             int image_strip_width, output_stride; | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             switch (cvt.rotation) { | 
					
						
							|  |  |  |             case Rotation::None: | 
					
						
							|  |  |  |                 RotateTile0(tiles[i], tmp_tile, row_height, tile_remap); | 
					
						
							|  |  |  |                 image_strip_width = cvt.input_line_width; | 
					
						
							|  |  |  |                 output_stride = 8; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             case Rotation::Clockwise_90: | 
					
						
							|  |  |  |                 RotateTile90(tiles[i], tmp_tile, row_height, tile_remap); | 
					
						
							|  |  |  |                 image_strip_width = 8; | 
					
						
							|  |  |  |                 output_stride = 8 * row_height; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             case Rotation::Clockwise_180: | 
					
						
							|  |  |  |                 // For 180 and 270 degree rotations we also invert the order of tiles in the strip,
 | 
					
						
							|  |  |  |                 // since the rotates are done individually on each tile.
 | 
					
						
							|  |  |  |                 RotateTile180(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap); | 
					
						
							|  |  |  |                 image_strip_width = cvt.input_line_width; | 
					
						
							|  |  |  |                 output_stride = 8; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             case Rotation::Clockwise_270: | 
					
						
							|  |  |  |                 RotateTile270(tiles[num_tiles - i - 1], tmp_tile, row_height, tile_remap); | 
					
						
							|  |  |  |                 image_strip_width = 8; | 
					
						
							|  |  |  |                 output_stride = 8 * row_height; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |             switch (cvt.block_alignment) { | 
					
						
							|  |  |  |             case BlockAlignment::Linear: | 
					
						
							|  |  |  |                 WriteTileToOutput(output_buffer, tmp_tile, row_height, image_strip_width); | 
					
						
							|  |  |  |                 output_buffer += output_stride; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             case BlockAlignment::Block8x8: | 
					
						
							|  |  |  |                 WriteTileToOutput(output_buffer, tmp_tile, 8, 8); | 
					
						
							|  |  |  |                 output_buffer += TILE_SIZE; | 
					
						
							|  |  |  |                 break; | 
					
						
							|  |  |  |             } | 
					
						
							|  |  |  |         } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |         // Note(yuriks): If additional optimization is required, output_format can be moved to a
 | 
					
						
							|  |  |  |         // template parameter, so that its dispatch can be moved to outside the inner loop.
 | 
					
						
							|  |  |  |         SendData(reinterpret_cast<u32*>(data_buffer.get()), cvt.dst, (int)row_data_size, cvt.output_format, (u8)cvt.alpha); | 
					
						
							|  |  |  |     } | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | } | 
					
						
							|  |  |  | } |