forked from eden-emu/eden
		
	decoders: Optimize swizzle copy performance (#6790)
This makes UnswizzleTexture up to two times faster. UnswizzleTexture is the main bottleneck in NVDEC video decoding.
This commit is contained in:
		
							parent
							
								
									381aacdbb1
								
							
						
					
					
						commit
						f56d0db5bd
					
				
					 1 changed file with 43 additions and 9 deletions
				
			
		|  | @ -18,9 +18,9 @@ | ||||||
| 
 | 
 | ||||||
| namespace Tegra::Texture { | namespace Tegra::Texture { | ||||||
| namespace { | namespace { | ||||||
| template <bool TO_LINEAR> | template <bool TO_LINEAR, u32 BYTES_PER_PIXEL> | ||||||
| void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, | void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height, u32 depth, | ||||||
|              u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { |                  u32 block_height, u32 block_depth, u32 stride_alignment) { | ||||||
|     // The origin of the transformation can be configured here, leave it as zero as the current API
 |     // The origin of the transformation can be configured here, leave it as zero as the current API
 | ||||||
|     // doesn't expose it.
 |     // doesn't expose it.
 | ||||||
|     static constexpr u32 origin_x = 0; |     static constexpr u32 origin_x = 0; | ||||||
|  | @ -28,9 +28,9 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe | ||||||
|     static constexpr u32 origin_z = 0; |     static constexpr u32 origin_z = 0; | ||||||
| 
 | 
 | ||||||
|     // We can configure here a custom pitch
 |     // We can configure here a custom pitch
 | ||||||
|     // As it's not exposed 'width * bpp' will be the expected pitch.
 |     // As it's not exposed 'width * BYTES_PER_PIXEL' will be the expected pitch.
 | ||||||
|     const u32 pitch = width * bytes_per_pixel; |     const u32 pitch = width * BYTES_PER_PIXEL; | ||||||
|     const u32 stride = Common::AlignUpLog2(width, stride_alignment) * bytes_per_pixel; |     const u32 stride = Common::AlignUpLog2(width, stride_alignment) * BYTES_PER_PIXEL; | ||||||
| 
 | 
 | ||||||
|     const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT); |     const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT); | ||||||
|     const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); |     const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth); | ||||||
|  | @ -54,14 +54,14 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe | ||||||
|                                  ((block_y & block_height_mask) << GOB_SIZE_SHIFT); |                                  ((block_y & block_height_mask) << GOB_SIZE_SHIFT); | ||||||
| 
 | 
 | ||||||
|             for (u32 column = 0; column < width; ++column) { |             for (u32 column = 0; column < width; ++column) { | ||||||
|                 const u32 x = (column + origin_x) * bytes_per_pixel; |                 const u32 x = (column + origin_x) * BYTES_PER_PIXEL; | ||||||
|                 const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift; |                 const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift; | ||||||
| 
 | 
 | ||||||
|                 const u32 base_swizzled_offset = offset_z + offset_y + offset_x; |                 const u32 base_swizzled_offset = offset_z + offset_y + offset_x; | ||||||
|                 const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X]; |                 const u32 swizzled_offset = base_swizzled_offset + table[x % GOB_SIZE_X]; | ||||||
| 
 | 
 | ||||||
|                 const u32 unswizzled_offset = |                 const u32 unswizzled_offset = | ||||||
|                     slice * pitch * height + line * pitch + column * bytes_per_pixel; |                     slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL; | ||||||
| 
 | 
 | ||||||
|                 if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset); |                 if (const auto offset = (TO_LINEAR ? unswizzled_offset : swizzled_offset); | ||||||
|                     offset >= input.size()) { |                     offset >= input.size()) { | ||||||
|  | @ -73,11 +73,45 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe | ||||||
| 
 | 
 | ||||||
|                 u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset]; |                 u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset]; | ||||||
|                 const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset]; |                 const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset]; | ||||||
|                 std::memcpy(dst, src, bytes_per_pixel); | 
 | ||||||
|  |                 std::memcpy(dst, src, BYTES_PER_PIXEL); | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
| } | } | ||||||
|  | 
 | ||||||
|  | template <bool TO_LINEAR> | ||||||
|  | void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width, | ||||||
|  |              u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) { | ||||||
|  |     switch (bytes_per_pixel) { | ||||||
|  |     case 1: | ||||||
|  |         return SwizzleImpl<TO_LINEAR, 1>(output, input, width, height, depth, block_height, | ||||||
|  |                                          block_depth, stride_alignment); | ||||||
|  |     case 2: | ||||||
|  |         return SwizzleImpl<TO_LINEAR, 2>(output, input, width, height, depth, block_height, | ||||||
|  |                                          block_depth, stride_alignment); | ||||||
|  |     case 3: | ||||||
|  |         return SwizzleImpl<TO_LINEAR, 3>(output, input, width, height, depth, block_height, | ||||||
|  |                                          block_depth, stride_alignment); | ||||||
|  |     case 4: | ||||||
|  |         return SwizzleImpl<TO_LINEAR, 4>(output, input, width, height, depth, block_height, | ||||||
|  |                                          block_depth, stride_alignment); | ||||||
|  |     case 6: | ||||||
|  |         return SwizzleImpl<TO_LINEAR, 6>(output, input, width, height, depth, block_height, | ||||||
|  |                                          block_depth, stride_alignment); | ||||||
|  |     case 8: | ||||||
|  |         return SwizzleImpl<TO_LINEAR, 8>(output, input, width, height, depth, block_height, | ||||||
|  |                                          block_depth, stride_alignment); | ||||||
|  |     case 12: | ||||||
|  |         return SwizzleImpl<TO_LINEAR, 12>(output, input, width, height, depth, block_height, | ||||||
|  |                                           block_depth, stride_alignment); | ||||||
|  |     case 16: | ||||||
|  |         return SwizzleImpl<TO_LINEAR, 16>(output, input, width, height, depth, block_height, | ||||||
|  |                                           block_depth, stride_alignment); | ||||||
|  |     default: | ||||||
|  |         UNREACHABLE_MSG("Invalid bytes_per_pixel={}", bytes_per_pixel); | ||||||
|  |     } | ||||||
|  | } | ||||||
| } // Anonymous namespace
 | } // Anonymous namespace
 | ||||||
| 
 | 
 | ||||||
| void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, | void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 yzct12345
						yzct12345