forked from eden-emu/eden
		
	Merge pull request #6791 from ameerj/astc-opt
astc_decoder: Various performance and memory optimizations
This commit is contained in:
		
						commit
						268b5764c7
					
				
					 7 changed files with 250 additions and 420 deletions
				
			
		|  | @ -10,33 +10,27 @@ | ||||||
| #define END_PUSH_CONSTANTS }; | #define END_PUSH_CONSTANTS }; | ||||||
| #define UNIFORM(n) | #define UNIFORM(n) | ||||||
| #define BINDING_INPUT_BUFFER 0 | #define BINDING_INPUT_BUFFER 0 | ||||||
| #define BINDING_ENC_BUFFER 1 | #define BINDING_OUTPUT_IMAGE 1 | ||||||
| #define BINDING_SWIZZLE_BUFFER 2 |  | ||||||
| #define BINDING_OUTPUT_IMAGE 3 |  | ||||||
| 
 | 
 | ||||||
| #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | #else // ^^^ Vulkan ^^^ // vvv OpenGL vvv | ||||||
| 
 | 
 | ||||||
| #define BEGIN_PUSH_CONSTANTS | #define BEGIN_PUSH_CONSTANTS | ||||||
| #define END_PUSH_CONSTANTS | #define END_PUSH_CONSTANTS | ||||||
| #define UNIFORM(n) layout(location = n) uniform | #define UNIFORM(n) layout(location = n) uniform | ||||||
| #define BINDING_SWIZZLE_BUFFER 0 | #define BINDING_INPUT_BUFFER 0 | ||||||
| #define BINDING_INPUT_BUFFER 1 |  | ||||||
| #define BINDING_ENC_BUFFER 2 |  | ||||||
| #define BINDING_OUTPUT_IMAGE 0 | #define BINDING_OUTPUT_IMAGE 0 | ||||||
| 
 | 
 | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| layout(local_size_x = 32, local_size_y = 32, local_size_z = 1) in; | layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in; | ||||||
| 
 | 
 | ||||||
| BEGIN_PUSH_CONSTANTS | BEGIN_PUSH_CONSTANTS | ||||||
| UNIFORM(1) uvec2 block_dims; | UNIFORM(1) uvec2 block_dims; | ||||||
| 
 | UNIFORM(2) uint layer_stride; | ||||||
| UNIFORM(2) uint bytes_per_block_log2; | UNIFORM(3) uint block_size; | ||||||
| UNIFORM(3) uint layer_stride; | UNIFORM(4) uint x_shift; | ||||||
| UNIFORM(4) uint block_size; | UNIFORM(5) uint block_height; | ||||||
| UNIFORM(5) uint x_shift; | UNIFORM(6) uint block_height_mask; | ||||||
| UNIFORM(6) uint block_height; |  | ||||||
| UNIFORM(7) uint block_height_mask; |  | ||||||
| END_PUSH_CONSTANTS | END_PUSH_CONSTANTS | ||||||
| 
 | 
 | ||||||
| struct EncodingData { | struct EncodingData { | ||||||
|  | @ -55,45 +49,35 @@ struct TexelWeightParams { | ||||||
|     bool void_extent_hdr; |     bool void_extent_hdr; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| // Swizzle data |  | ||||||
| layout(binding = BINDING_SWIZZLE_BUFFER, std430) readonly buffer SwizzleTable { |  | ||||||
|     uint swizzle_table[]; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { | layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 { | ||||||
|     uint astc_data[]; |     uvec4 astc_data[]; | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| // ASTC Encodings data |  | ||||||
| layout(binding = BINDING_ENC_BUFFER, std430) readonly buffer EncodingsValues { |  | ||||||
|     EncodingData encoding_values[]; |  | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; | layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image; | ||||||
| 
 | 
 | ||||||
| const uint GOB_SIZE_X = 64; |  | ||||||
| const uint GOB_SIZE_Y = 8; |  | ||||||
| const uint GOB_SIZE_Z = 1; |  | ||||||
| const uint GOB_SIZE = GOB_SIZE_X * GOB_SIZE_Y * GOB_SIZE_Z; |  | ||||||
| 
 |  | ||||||
| const uint GOB_SIZE_X_SHIFT = 6; | const uint GOB_SIZE_X_SHIFT = 6; | ||||||
| const uint GOB_SIZE_Y_SHIFT = 3; | const uint GOB_SIZE_Y_SHIFT = 3; | ||||||
| const uint GOB_SIZE_Z_SHIFT = 0; | const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT; | ||||||
| const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT + GOB_SIZE_Z_SHIFT; |  | ||||||
| 
 | 
 | ||||||
| const uvec2 SWIZZLE_MASK = uvec2(GOB_SIZE_X - 1, GOB_SIZE_Y - 1); | const uint BYTES_PER_BLOCK_LOG2 = 4; | ||||||
| 
 |  | ||||||
| const int BLOCK_SIZE_IN_BYTES = 16; |  | ||||||
| 
 |  | ||||||
| const int BLOCK_INFO_ERROR = 0; |  | ||||||
| const int BLOCK_INFO_VOID_EXTENT_HDR = 1; |  | ||||||
| const int BLOCK_INFO_VOID_EXTENT_LDR = 2; |  | ||||||
| const int BLOCK_INFO_NORMAL = 3; |  | ||||||
| 
 | 
 | ||||||
| const int JUST_BITS = 0; | const int JUST_BITS = 0; | ||||||
| const int QUINT = 1; | const int QUINT = 1; | ||||||
| const int TRIT = 2; | const int TRIT = 2; | ||||||
| 
 | 
 | ||||||
|  | // ASTC Encodings data, sorted in ascending order based on their BitLength value | ||||||
|  | // (see GetBitLength() function) | ||||||
|  | EncodingData encoding_values[22] = EncodingData[]( | ||||||
|  |     EncodingData(JUST_BITS, 0, 0, 0), EncodingData(JUST_BITS, 1, 0, 0), EncodingData(TRIT, 0, 0, 0), | ||||||
|  |     EncodingData(JUST_BITS, 2, 0, 0), EncodingData(QUINT, 0, 0, 0), EncodingData(TRIT, 1, 0, 0), | ||||||
|  |     EncodingData(JUST_BITS, 3, 0, 0), EncodingData(QUINT, 1, 0, 0), EncodingData(TRIT, 2, 0, 0), | ||||||
|  |     EncodingData(JUST_BITS, 4, 0, 0), EncodingData(QUINT, 2, 0, 0), EncodingData(TRIT, 3, 0, 0), | ||||||
|  |     EncodingData(JUST_BITS, 5, 0, 0), EncodingData(QUINT, 3, 0, 0), EncodingData(TRIT, 4, 0, 0), | ||||||
|  |     EncodingData(JUST_BITS, 6, 0, 0), EncodingData(QUINT, 4, 0, 0), EncodingData(TRIT, 5, 0, 0), | ||||||
|  |     EncodingData(JUST_BITS, 7, 0, 0), EncodingData(QUINT, 5, 0, 0), EncodingData(TRIT, 6, 0, 0), | ||||||
|  |     EncodingData(JUST_BITS, 8, 0, 0) | ||||||
|  | ); | ||||||
|  | 
 | ||||||
| // The following constants are expanded variants of the Replicate() | // The following constants are expanded variants of the Replicate() | ||||||
| // function calls corresponding to the following arguments: | // function calls corresponding to the following arguments: | ||||||
| // value: index into the generated table | // value: index into the generated table | ||||||
|  | @ -135,44 +119,37 @@ const uint REPLICATE_7_BIT_TO_8_TABLE[128] = | ||||||
| // Input ASTC texture globals | // Input ASTC texture globals | ||||||
| uint current_index = 0; | uint current_index = 0; | ||||||
| int bitsread = 0; | int bitsread = 0; | ||||||
| uint total_bitsread = 0; | int total_bitsread = 0; | ||||||
| uint local_buff[16]; | uvec4 local_buff; | ||||||
| 
 | 
 | ||||||
| // Color data globals | // Color data globals | ||||||
| uint color_endpoint_data[16]; | uvec4 color_endpoint_data; | ||||||
| int color_bitsread = 0; | int color_bitsread = 0; | ||||||
| uint total_color_bitsread = 0; |  | ||||||
| int color_index = 0; |  | ||||||
| 
 | 
 | ||||||
| // Four values, two endpoints, four maximum partitions | // Four values, two endpoints, four maximum partitions | ||||||
| uint color_values[32]; | uint color_values[32]; | ||||||
| int colvals_index = 0; | int colvals_index = 0; | ||||||
| 
 | 
 | ||||||
| // Weight data globals | // Weight data globals | ||||||
| uint texel_weight_data[16]; | uvec4 texel_weight_data; | ||||||
| int texel_bitsread = 0; | int texel_bitsread = 0; | ||||||
| uint total_texel_bitsread = 0; |  | ||||||
| int texel_index = 0; |  | ||||||
| 
 | 
 | ||||||
| bool texel_flag = false; | bool texel_flag = false; | ||||||
| 
 | 
 | ||||||
| // Global "vectors" to be pushed into when decoding | // Global "vectors" to be pushed into when decoding | ||||||
| EncodingData result_vector[100]; | EncodingData result_vector[144]; | ||||||
| int result_index = 0; | int result_index = 0; | ||||||
| 
 | 
 | ||||||
| EncodingData texel_vector[100]; | EncodingData texel_vector[144]; | ||||||
| int texel_vector_index = 0; | int texel_vector_index = 0; | ||||||
| 
 | 
 | ||||||
| uint unquantized_texel_weights[2][144]; | uint unquantized_texel_weights[2][144]; | ||||||
| 
 | 
 | ||||||
| uint SwizzleOffset(uvec2 pos) { | uint SwizzleOffset(uvec2 pos) { | ||||||
|     pos = pos & SWIZZLE_MASK; |     uint x = pos.x; | ||||||
|     return swizzle_table[pos.y * 64 + pos.x]; |     uint y = pos.y; | ||||||
| } |     return ((x % 64) / 32) * 256 + ((y % 8) / 2) * 64 + ((x % 32) / 16) * 32 + | ||||||
| 
 |                           (y % 2) * 16 + (x % 16); | ||||||
| uint ReadTexel(uint offset) { |  | ||||||
|     // extract the 8-bit value from the 32-bit packed data. |  | ||||||
|     return bitfieldExtract(astc_data[offset / 4], int((offset * 8) & 24), 8); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] | // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)] | ||||||
|  | @ -278,14 +255,10 @@ uint Hash52(uint p) { | ||||||
|     return p; |     return p; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bool small_block) { | uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { | ||||||
|     if (partition_count == 1) { |  | ||||||
|         return 0; |  | ||||||
|     } |  | ||||||
|     if (small_block) { |     if (small_block) { | ||||||
|         x <<= 1; |         x <<= 1; | ||||||
|         y <<= 1; |         y <<= 1; | ||||||
|         z <<= 1; |  | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     seed += (partition_count - 1) * 1024; |     seed += (partition_count - 1) * 1024; | ||||||
|  | @ -299,10 +272,6 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo | ||||||
|     uint seed6 = uint((rnum >> 20) & 0xF); |     uint seed6 = uint((rnum >> 20) & 0xF); | ||||||
|     uint seed7 = uint((rnum >> 24) & 0xF); |     uint seed7 = uint((rnum >> 24) & 0xF); | ||||||
|     uint seed8 = uint((rnum >> 28) & 0xF); |     uint seed8 = uint((rnum >> 28) & 0xF); | ||||||
|     uint seed9 = uint((rnum >> 18) & 0xF); |  | ||||||
|     uint seed10 = uint((rnum >> 22) & 0xF); |  | ||||||
|     uint seed11 = uint((rnum >> 26) & 0xF); |  | ||||||
|     uint seed12 = uint(((rnum >> 30) | (rnum << 2)) & 0xF); |  | ||||||
| 
 | 
 | ||||||
|     seed1 = (seed1 * seed1); |     seed1 = (seed1 * seed1); | ||||||
|     seed2 = (seed2 * seed2); |     seed2 = (seed2 * seed2); | ||||||
|  | @ -312,12 +281,8 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo | ||||||
|     seed6 = (seed6 * seed6); |     seed6 = (seed6 * seed6); | ||||||
|     seed7 = (seed7 * seed7); |     seed7 = (seed7 * seed7); | ||||||
|     seed8 = (seed8 * seed8); |     seed8 = (seed8 * seed8); | ||||||
|     seed9 = (seed9 * seed9); |  | ||||||
|     seed10 = (seed10 * seed10); |  | ||||||
|     seed11 = (seed11 * seed11); |  | ||||||
|     seed12 = (seed12 * seed12); |  | ||||||
| 
 | 
 | ||||||
|     int sh1, sh2, sh3; |     uint sh1, sh2; | ||||||
|     if ((seed & 1) > 0) { |     if ((seed & 1) > 0) { | ||||||
|         sh1 = (seed & 2) > 0 ? 4 : 5; |         sh1 = (seed & 2) > 0 ? 4 : 5; | ||||||
|         sh2 = (partition_count == 3) ? 6 : 5; |         sh2 = (partition_count == 3) ? 6 : 5; | ||||||
|  | @ -325,25 +290,19 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo | ||||||
|         sh1 = (partition_count == 3) ? 6 : 5; |         sh1 = (partition_count == 3) ? 6 : 5; | ||||||
|         sh2 = (seed & 2) > 0 ? 4 : 5; |         sh2 = (seed & 2) > 0 ? 4 : 5; | ||||||
|     } |     } | ||||||
|     sh3 = (seed & 0x10) > 0 ? sh1 : sh2; |     seed1 >>= sh1; | ||||||
|  |     seed2 >>= sh2; | ||||||
|  |     seed3 >>= sh1; | ||||||
|  |     seed4 >>= sh2; | ||||||
|  |     seed5 >>= sh1; | ||||||
|  |     seed6 >>= sh2; | ||||||
|  |     seed7 >>= sh1; | ||||||
|  |     seed8 >>= sh2; | ||||||
| 
 | 
 | ||||||
|     seed1 = (seed1 >> sh1); |     uint a = seed1 * x + seed2 * y + (rnum >> 14); | ||||||
|     seed2 = (seed2 >> sh2); |     uint b = seed3 * x + seed4 * y + (rnum >> 10); | ||||||
|     seed3 = (seed3 >> sh1); |     uint c = seed5 * x + seed6 * y + (rnum >> 6); | ||||||
|     seed4 = (seed4 >> sh2); |     uint d = seed7 * x + seed8 * y + (rnum >> 2); | ||||||
|     seed5 = (seed5 >> sh1); |  | ||||||
|     seed6 = (seed6 >> sh2); |  | ||||||
|     seed7 = (seed7 >> sh1); |  | ||||||
|     seed8 = (seed8 >> sh2); |  | ||||||
|     seed9 = (seed9 >> sh3); |  | ||||||
|     seed10 = (seed10 >> sh3); |  | ||||||
|     seed11 = (seed11 >> sh3); |  | ||||||
|     seed12 = (seed12 >> sh3); |  | ||||||
| 
 |  | ||||||
|     uint a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); |  | ||||||
|     uint b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); |  | ||||||
|     uint c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); |  | ||||||
|     uint d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); |  | ||||||
| 
 | 
 | ||||||
|     a &= 0x3F; |     a &= 0x3F; | ||||||
|     b &= 0x3F; |     b &= 0x3F; | ||||||
|  | @ -368,58 +327,37 @@ uint SelectPartition(uint seed, uint x, uint y, uint z, uint partition_count, bo | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) { | uint ExtractBits(uvec4 payload, int offset, int bits) { | ||||||
|     return SelectPartition(seed, x, y, 0, partition_count, small_block); |     if (bits <= 0) { | ||||||
| } |  | ||||||
| 
 |  | ||||||
| uint ReadBit() { |  | ||||||
|     if (current_index >= local_buff.length()) { |  | ||||||
|         return 0; |         return 0; | ||||||
|     } |     } | ||||||
|     uint bit = bitfieldExtract(local_buff[current_index], bitsread, 1); |     int last_offset = offset + bits - 1; | ||||||
|     ++bitsread; |     int shifted_offset = offset >> 5; | ||||||
|     ++total_bitsread; |     if ((last_offset >> 5) == shifted_offset) { | ||||||
|     if (bitsread == 8) { |         return bitfieldExtract(payload[shifted_offset], offset & 31, bits); | ||||||
|         ++current_index; |  | ||||||
|         bitsread = 0; |  | ||||||
|     } |     } | ||||||
|     return bit; |     int first_bits = 32 - (offset & 31); | ||||||
|  |     int result_first = int(bitfieldExtract(payload[shifted_offset], offset & 31, first_bits)); | ||||||
|  |     int result_second = int(bitfieldExtract(payload[shifted_offset + 1], 0, bits - first_bits)); | ||||||
|  |     return result_first | (result_second << first_bits); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| uint StreamBits(uint num_bits) { | uint StreamBits(uint num_bits) { | ||||||
|     uint ret = 0; |     int int_bits = int(num_bits); | ||||||
|     for (uint i = 0; i < num_bits; i++) { |     uint ret = ExtractBits(local_buff, total_bitsread, int_bits); | ||||||
|         ret |= ((ReadBit() & 1) << i); |     total_bitsread += int_bits; | ||||||
|     } |  | ||||||
|     return ret; |     return ret; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| uint ReadColorBit() { |  | ||||||
|     uint bit = 0; |  | ||||||
|     if (texel_flag) { |  | ||||||
|         bit = bitfieldExtract(texel_weight_data[texel_index], texel_bitsread, 1); |  | ||||||
|         ++texel_bitsread; |  | ||||||
|         ++total_texel_bitsread; |  | ||||||
|         if (texel_bitsread == 8) { |  | ||||||
|             ++texel_index; |  | ||||||
|             texel_bitsread = 0; |  | ||||||
|         } |  | ||||||
|     } else { |  | ||||||
|         bit = bitfieldExtract(color_endpoint_data[color_index], color_bitsread, 1); |  | ||||||
|         ++color_bitsread; |  | ||||||
|         ++total_color_bitsread; |  | ||||||
|         if (color_bitsread == 8) { |  | ||||||
|             ++color_index; |  | ||||||
|             color_bitsread = 0; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     return bit; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| uint StreamColorBits(uint num_bits) { | uint StreamColorBits(uint num_bits) { | ||||||
|     uint ret = 0; |     uint ret = 0; | ||||||
|     for (uint i = 0; i < num_bits; i++) { |     int int_bits = int(num_bits); | ||||||
|         ret |= ((ReadColorBit() & 1) << i); |     if (texel_flag) { | ||||||
|  |         ret = ExtractBits(texel_weight_data, texel_bitsread, int_bits); | ||||||
|  |         texel_bitsread += int_bits; | ||||||
|  |     } else { | ||||||
|  |         ret = ExtractBits(color_endpoint_data, color_bitsread, int_bits); | ||||||
|  |         color_bitsread += int_bits; | ||||||
|     } |     } | ||||||
|     return ret; |     return ret; | ||||||
| } | } | ||||||
|  | @ -596,22 +534,16 @@ void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) { | ||||||
|     for (uint i = 0; i < num_partitions; i++) { |     for (uint i = 0; i < num_partitions; i++) { | ||||||
|         num_values += ((modes[i] >> 2) + 1) << 1; |         num_values += ((modes[i] >> 2) + 1) << 1; | ||||||
|     } |     } | ||||||
|     int range = 256; |     // Find the largest encoding that's within color_data_bits | ||||||
|     while (--range > 0) { |     // TODO(ameerj): profile with binary search | ||||||
|         EncodingData val = encoding_values[range]; |     int range = 0; | ||||||
|  |     while (++range < encoding_values.length()) { | ||||||
|         uint bit_length = GetBitLength(num_values, range); |         uint bit_length = GetBitLength(num_values, range); | ||||||
|         if (bit_length <= color_data_bits) { |         if (bit_length > color_data_bits) { | ||||||
|             while (--range > 0) { |  | ||||||
|                 EncodingData newval = encoding_values[range]; |  | ||||||
|                 if (newval.encoding != val.encoding && newval.num_bits != val.num_bits) { |  | ||||||
|             break; |             break; | ||||||
|         } |         } | ||||||
|     } |     } | ||||||
|             ++range; |     DecodeIntegerSequence(range - 1, num_values); | ||||||
|             break; |  | ||||||
|         } |  | ||||||
|     } |  | ||||||
|     DecodeIntegerSequence(range, num_values); |  | ||||||
|     uint out_index = 0; |     uint out_index = 0; | ||||||
|     for (int itr = 0; itr < result_index; ++itr) { |     for (int itr = 0; itr < result_index; ++itr) { | ||||||
|         if (out_index >= num_values) { |         if (out_index >= num_values) { | ||||||
|  | @ -1028,7 +960,7 @@ int FindLayout(uint mode) { | ||||||
|     return 5; |     return 5; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| TexelWeightParams DecodeBlockInfo(uint block_index) { | TexelWeightParams DecodeBlockInfo() { | ||||||
|     TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false); |     TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false); | ||||||
|     uint mode = StreamBits(11); |     uint mode = StreamBits(11); | ||||||
|     if ((mode & 0x1ff) == 0x1fc) { |     if ((mode & 0x1ff) == 0x1fc) { | ||||||
|  | @ -1110,10 +1042,10 @@ TexelWeightParams DecodeBlockInfo(uint block_index) { | ||||||
|     } |     } | ||||||
|     weight_index -= 2; |     weight_index -= 2; | ||||||
|     if ((mode_layout != 9) && ((mode & 0x200) != 0)) { |     if ((mode_layout != 9) && ((mode & 0x200) != 0)) { | ||||||
|         const int max_weights[6] = int[6](9, 11, 15, 19, 23, 31); |         const int max_weights[6] = int[6](7, 8, 9, 10, 11, 12); | ||||||
|         params.max_weight = max_weights[weight_index]; |         params.max_weight = max_weights[weight_index]; | ||||||
|     } else { |     } else { | ||||||
|         const int max_weights[6] = int[6](1, 2, 3, 4, 5, 7); |         const int max_weights[6] = int[6](1, 2, 3, 4, 5, 6); | ||||||
|         params.max_weight = max_weights[weight_index]; |         params.max_weight = max_weights[weight_index]; | ||||||
|     } |     } | ||||||
|     return params; |     return params; | ||||||
|  | @ -1144,8 +1076,8 @@ void FillVoidExtentLDR(ivec3 coord) { | ||||||
|     } |     } | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| void DecompressBlock(ivec3 coord, uint block_index) { | void DecompressBlock(ivec3 coord) { | ||||||
|     TexelWeightParams params = DecodeBlockInfo(block_index); |     TexelWeightParams params = DecodeBlockInfo(); | ||||||
|     if (params.error_state) { |     if (params.error_state) { | ||||||
|         FillError(coord); |         FillError(coord); | ||||||
|         return; |         return; | ||||||
|  | @ -1212,7 +1144,7 @@ void DecompressBlock(ivec3 coord, uint block_index) { | ||||||
|     // Read color data... |     // Read color data... | ||||||
|     uint color_data_bits = remaining_bits; |     uint color_data_bits = remaining_bits; | ||||||
|     while (remaining_bits > 0) { |     while (remaining_bits > 0) { | ||||||
|         int nb = int(min(remaining_bits, 8U)); |         int nb = int(min(remaining_bits, 32U)); | ||||||
|         uint b = StreamBits(nb); |         uint b = StreamBits(nb); | ||||||
|         color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); |         color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb)); | ||||||
|         ++ced_pointer; |         ++ced_pointer; | ||||||
|  | @ -1254,25 +1186,20 @@ void DecompressBlock(ivec3 coord, uint block_index) { | ||||||
|         ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]); |         ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     for (uint i = 0; i < 16; i++) { |     texel_weight_data = local_buff; | ||||||
|         texel_weight_data[i] = local_buff[i]; |     texel_weight_data = bitfieldReverse(texel_weight_data).wzyx; | ||||||
|     } |  | ||||||
|     for (uint i = 0; i < 8; i++) { |  | ||||||
| #define REVERSE_BYTE(b) ((b * 0x0802U & 0x22110U) | (b * 0x8020U & 0x88440U)) * 0x10101U >> 16 |  | ||||||
|         uint a = REVERSE_BYTE(texel_weight_data[i]); |  | ||||||
|         uint b = REVERSE_BYTE(texel_weight_data[15 - i]); |  | ||||||
| #undef REVERSE_BYTE |  | ||||||
|         texel_weight_data[i] = uint(bitfieldExtract(b, 0, 8)); |  | ||||||
|         texel_weight_data[15 - i] = uint(bitfieldExtract(a, 0, 8)); |  | ||||||
|     } |  | ||||||
|     uint clear_byte_start = |     uint clear_byte_start = | ||||||
|         (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1; |         (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) >> 3) + 1; | ||||||
|     texel_weight_data[clear_byte_start - 1] = | 
 | ||||||
|         texel_weight_data[clear_byte_start - 1] & |     uint byte_insert = ExtractBits(texel_weight_data, int(clear_byte_start - 1) * 8, 8) & | ||||||
|         uint( |         uint( | ||||||
|             ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); |             ((1 << (GetPackedBitSize(params.size, params.dual_plane, params.max_weight) % 8)) - 1)); | ||||||
|     for (uint i = 0; i < 16 - clear_byte_start; i++) { |     uint vec_index = (clear_byte_start - 1) >> 2; | ||||||
|         texel_weight_data[clear_byte_start + i] = 0U; |     texel_weight_data[vec_index] = | ||||||
|  |         bitfieldInsert(texel_weight_data[vec_index], byte_insert, int((clear_byte_start - 1) % 4) * 8, 8); | ||||||
|  |     for (uint i = clear_byte_start; i < 16; ++i) { | ||||||
|  |         uint idx = i >> 2; | ||||||
|  |         texel_weight_data[idx] = bitfieldInsert(texel_weight_data[idx], 0, int(i % 4) * 8, 8); | ||||||
|     } |     } | ||||||
|     texel_flag = true; // use texel "vector" and bit stream in integer decoding |     texel_flag = true; // use texel "vector" and bit stream in integer decoding | ||||||
|     DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); |     DecodeIntegerSequence(params.max_weight, GetNumWeightValues(params.size, params.dual_plane)); | ||||||
|  | @ -1281,8 +1208,11 @@ void DecompressBlock(ivec3 coord, uint block_index) { | ||||||
| 
 | 
 | ||||||
|     for (uint j = 0; j < block_dims.y; j++) { |     for (uint j = 0; j < block_dims.y; j++) { | ||||||
|         for (uint i = 0; i < block_dims.x; i++) { |         for (uint i = 0; i < block_dims.x; i++) { | ||||||
|             uint local_partition = Select2DPartition(partition_index, i, j, num_partitions, |             uint local_partition = 0; | ||||||
|  |             if (num_partitions > 1) { | ||||||
|  |                 local_partition = Select2DPartition(partition_index, i, j, num_partitions, | ||||||
|                                                      (block_dims.y * block_dims.x) < 32); |                                                      (block_dims.y * block_dims.x) < 32); | ||||||
|  |             } | ||||||
|             vec4 p; |             vec4 p; | ||||||
|             uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]); |             uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]); | ||||||
|             uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]); |             uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]); | ||||||
|  | @ -1303,7 +1233,7 @@ void DecompressBlock(ivec3 coord, uint block_index) { | ||||||
| 
 | 
 | ||||||
| void main() { | void main() { | ||||||
|     uvec3 pos = gl_GlobalInvocationID; |     uvec3 pos = gl_GlobalInvocationID; | ||||||
|     pos.x <<= bytes_per_block_log2; |     pos.x <<= BYTES_PER_BLOCK_LOG2; | ||||||
| 
 | 
 | ||||||
|     // Read as soon as possible due to its latency |     // Read as soon as possible due to its latency | ||||||
|     const uint swizzle = SwizzleOffset(pos.xy); |     const uint swizzle = SwizzleOffset(pos.xy); | ||||||
|  | @ -1321,13 +1251,8 @@ void main() { | ||||||
|     if (any(greaterThanEqual(coord, imageSize(dest_image)))) { |     if (any(greaterThanEqual(coord, imageSize(dest_image)))) { | ||||||
|         return; |         return; | ||||||
|     } |     } | ||||||
|     uint block_index = |  | ||||||
|         pos.z * gl_WorkGroupSize.x * gl_WorkGroupSize.y + pos.y * gl_WorkGroupSize.x + pos.x; |  | ||||||
| 
 |  | ||||||
|     current_index = 0; |     current_index = 0; | ||||||
|     bitsread = 0; |     bitsread = 0; | ||||||
|     for (int i = 0; i < 16; i++) { |     local_buff = astc_data[offset / 16]; | ||||||
|         local_buff[i] = ReadTexel(offset + i); |     DecompressBlock(coord); | ||||||
|     } |  | ||||||
|     DecompressBlock(coord, block_index); |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  | @ -60,19 +60,14 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_) | ||||||
|       copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { |       copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) { | ||||||
|     const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); |     const auto swizzle_table = Tegra::Texture::MakeSwizzleTable(); | ||||||
|     swizzle_table_buffer.Create(); |     swizzle_table_buffer.Create(); | ||||||
|     astc_buffer.Create(); |  | ||||||
|     glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); |     glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0); | ||||||
|     glNamedBufferStorage(astc_buffer.handle, sizeof(ASTC_ENCODINGS_VALUES), &ASTC_ENCODINGS_VALUES, |  | ||||||
|                          0); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| UtilShaders::~UtilShaders() = default; | UtilShaders::~UtilShaders() = default; | ||||||
| 
 | 
 | ||||||
| void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, | void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, | ||||||
|                              std::span<const VideoCommon::SwizzleParameters> swizzles) { |                              std::span<const VideoCommon::SwizzleParameters> swizzles) { | ||||||
|     static constexpr GLuint BINDING_SWIZZLE_BUFFER = 0; |     static constexpr GLuint BINDING_INPUT_BUFFER = 0; | ||||||
|     static constexpr GLuint BINDING_INPUT_BUFFER = 1; |  | ||||||
|     static constexpr GLuint BINDING_ENC_BUFFER = 2; |  | ||||||
|     static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; |     static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; | ||||||
| 
 | 
 | ||||||
|     const Extent2D tile_size{ |     const Extent2D tile_size{ | ||||||
|  | @ -80,34 +75,32 @@ void UtilShaders::ASTCDecode(Image& image, const ImageBufferMap& map, | ||||||
|         .height = VideoCore::Surface::DefaultBlockHeight(image.info.format), |         .height = VideoCore::Surface::DefaultBlockHeight(image.info.format), | ||||||
|     }; |     }; | ||||||
|     program_manager.BindComputeProgram(astc_decoder_program.handle); |     program_manager.BindComputeProgram(astc_decoder_program.handle); | ||||||
|     glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); |  | ||||||
|     glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_ENC_BUFFER, astc_buffer.handle); |  | ||||||
| 
 |  | ||||||
|     glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); |     glFlushMappedNamedBufferRange(map.buffer, map.offset, image.guest_size_bytes); | ||||||
|     glUniform2ui(1, tile_size.width, tile_size.height); |     glUniform2ui(1, tile_size.width, tile_size.height); | ||||||
|  | 
 | ||||||
|     // Ensure buffer data is valid before dispatching
 |     // Ensure buffer data is valid before dispatching
 | ||||||
|     glFlush(); |     glFlush(); | ||||||
|     for (const SwizzleParameters& swizzle : swizzles) { |     for (const SwizzleParameters& swizzle : swizzles) { | ||||||
|         const size_t input_offset = swizzle.buffer_offset + map.offset; |         const size_t input_offset = swizzle.buffer_offset + map.offset; | ||||||
|         const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); |         const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U); | ||||||
|         const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); |         const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U); | ||||||
| 
 | 
 | ||||||
|         const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); |         const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); | ||||||
|         ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0})); |         ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0})); | ||||||
|         ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0})); |         ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0})); | ||||||
|  |         ASSERT(params.bytes_per_block_log2 == 4); | ||||||
| 
 | 
 | ||||||
|         glUniform1ui(2, params.bytes_per_block_log2); |         glUniform1ui(2, params.layer_stride); | ||||||
|         glUniform1ui(3, params.layer_stride); |         glUniform1ui(3, params.block_size); | ||||||
|         glUniform1ui(4, params.block_size); |         glUniform1ui(4, params.x_shift); | ||||||
|         glUniform1ui(5, params.x_shift); |         glUniform1ui(5, params.block_height); | ||||||
|         glUniform1ui(6, params.block_height); |         glUniform1ui(6, params.block_height_mask); | ||||||
|         glUniform1ui(7, params.block_height_mask); |  | ||||||
| 
 | 
 | ||||||
|         glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, |  | ||||||
|                            GL_WRITE_ONLY, GL_RGBA8); |  | ||||||
|         // ASTC texture data
 |         // ASTC texture data
 | ||||||
|         glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, |         glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, | ||||||
|                           image.guest_size_bytes - swizzle.buffer_offset); |                           image.guest_size_bytes - swizzle.buffer_offset); | ||||||
|  |         glBindImageTexture(BINDING_OUTPUT_IMAGE, image.StorageHandle(), swizzle.level, GL_TRUE, 0, | ||||||
|  |                            GL_WRITE_ONLY, GL_RGBA8); | ||||||
| 
 | 
 | ||||||
|         glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); |         glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); | ||||||
|     } |     } | ||||||
|  |  | ||||||
|  | @ -62,7 +62,6 @@ private: | ||||||
|     ProgramManager& program_manager; |     ProgramManager& program_manager; | ||||||
| 
 | 
 | ||||||
|     OGLBuffer swizzle_table_buffer; |     OGLBuffer swizzle_table_buffer; | ||||||
|     OGLBuffer astc_buffer; |  | ||||||
| 
 | 
 | ||||||
|     OGLProgram astc_decoder_program; |     OGLProgram astc_decoder_program; | ||||||
|     OGLProgram block_linear_unswizzle_2d_program; |     OGLProgram block_linear_unswizzle_2d_program; | ||||||
|  |  | ||||||
|  | @ -30,16 +30,12 @@ | ||||||
| namespace Vulkan { | namespace Vulkan { | ||||||
| 
 | 
 | ||||||
| using Tegra::Texture::SWIZZLE_TABLE; | using Tegra::Texture::SWIZZLE_TABLE; | ||||||
| using Tegra::Texture::ASTC::ASTC_ENCODINGS_VALUES; |  | ||||||
| using namespace Tegra::Texture::ASTC; |  | ||||||
| 
 | 
 | ||||||
| namespace { | namespace { | ||||||
| 
 | 
 | ||||||
| constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; | constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0; | ||||||
| constexpr u32 ASTC_BINDING_ENC_BUFFER = 1; | constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 1; | ||||||
| constexpr u32 ASTC_BINDING_SWIZZLE_BUFFER = 2; | constexpr size_t ASTC_NUM_BINDINGS = 2; | ||||||
| constexpr u32 ASTC_BINDING_OUTPUT_IMAGE = 3; |  | ||||||
| constexpr size_t ASTC_NUM_BINDINGS = 4; |  | ||||||
| 
 | 
 | ||||||
| template <size_t size> | template <size_t size> | ||||||
| inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{ | inline constexpr VkPushConstantRange COMPUTE_PUSH_CONSTANT_RANGE{ | ||||||
|  | @ -75,7 +71,7 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ | ||||||
|     .score = 2, |     .score = 2, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDINGS{{ | constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ | ||||||
|     { |     { | ||||||
|         .binding = ASTC_BINDING_INPUT_BUFFER, |         .binding = ASTC_BINDING_INPUT_BUFFER, | ||||||
|         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, | ||||||
|  | @ -83,20 +79,6 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDIN | ||||||
|         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, | ||||||
|         .pImmutableSamplers = nullptr, |         .pImmutableSamplers = nullptr, | ||||||
|     }, |     }, | ||||||
|     { |  | ||||||
|         .binding = ASTC_BINDING_ENC_BUFFER, |  | ||||||
|         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |  | ||||||
|         .descriptorCount = 1, |  | ||||||
|         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |  | ||||||
|         .pImmutableSamplers = nullptr, |  | ||||||
|     }, |  | ||||||
|     { |  | ||||||
|         .binding = ASTC_BINDING_SWIZZLE_BUFFER, |  | ||||||
|         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |  | ||||||
|         .descriptorCount = 1, |  | ||||||
|         .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, |  | ||||||
|         .pImmutableSamplers = nullptr, |  | ||||||
|     }, |  | ||||||
|     { |     { | ||||||
|         .binding = ASTC_BINDING_OUTPUT_IMAGE, |         .binding = ASTC_BINDING_OUTPUT_IMAGE, | ||||||
|         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, |         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, | ||||||
|  | @ -108,12 +90,12 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 4> ASTC_DESCRIPTOR_SET_BINDIN | ||||||
| 
 | 
 | ||||||
| constexpr DescriptorBankInfo ASTC_BANK_INFO{ | constexpr DescriptorBankInfo ASTC_BANK_INFO{ | ||||||
|     .uniform_buffers = 0, |     .uniform_buffers = 0, | ||||||
|     .storage_buffers = 3, |     .storage_buffers = 1, | ||||||
|     .texture_buffers = 0, |     .texture_buffers = 0, | ||||||
|     .image_buffers = 0, |     .image_buffers = 0, | ||||||
|     .textures = 0, |     .textures = 0, | ||||||
|     .images = 1, |     .images = 1, | ||||||
|     .score = 4, |     .score = 2, | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{ | constexpr VkDescriptorUpdateTemplateEntryKHR INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE{ | ||||||
|  | @ -135,22 +117,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS> | ||||||
|             .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry), |             .offset = ASTC_BINDING_INPUT_BUFFER * sizeof(DescriptorUpdateEntry), | ||||||
|             .stride = sizeof(DescriptorUpdateEntry), |             .stride = sizeof(DescriptorUpdateEntry), | ||||||
|         }, |         }, | ||||||
|         { |  | ||||||
|             .dstBinding = ASTC_BINDING_ENC_BUFFER, |  | ||||||
|             .dstArrayElement = 0, |  | ||||||
|             .descriptorCount = 1, |  | ||||||
|             .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |  | ||||||
|             .offset = ASTC_BINDING_ENC_BUFFER * sizeof(DescriptorUpdateEntry), |  | ||||||
|             .stride = sizeof(DescriptorUpdateEntry), |  | ||||||
|         }, |  | ||||||
|         { |  | ||||||
|             .dstBinding = ASTC_BINDING_SWIZZLE_BUFFER, |  | ||||||
|             .dstArrayElement = 0, |  | ||||||
|             .descriptorCount = 1, |  | ||||||
|             .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, |  | ||||||
|             .offset = ASTC_BINDING_SWIZZLE_BUFFER * sizeof(DescriptorUpdateEntry), |  | ||||||
|             .stride = sizeof(DescriptorUpdateEntry), |  | ||||||
|         }, |  | ||||||
|         { |         { | ||||||
|             .dstBinding = ASTC_BINDING_OUTPUT_IMAGE, |             .dstBinding = ASTC_BINDING_OUTPUT_IMAGE, | ||||||
|             .dstArrayElement = 0, |             .dstArrayElement = 0, | ||||||
|  | @ -163,7 +129,6 @@ constexpr std::array<VkDescriptorUpdateTemplateEntryKHR, ASTC_NUM_BINDINGS> | ||||||
| 
 | 
 | ||||||
| struct AstcPushConstants { | struct AstcPushConstants { | ||||||
|     std::array<u32, 2> blocks_dims; |     std::array<u32, 2> blocks_dims; | ||||||
|     u32 bytes_per_block_log2; |  | ||||||
|     u32 layer_stride; |     u32 layer_stride; | ||||||
|     u32 block_size; |     u32 block_size; | ||||||
|     u32 x_shift; |     u32 x_shift; | ||||||
|  | @ -354,46 +319,6 @@ ASTCDecoderPass::ASTCDecoderPass(const Device& device_, VKScheduler& scheduler_, | ||||||
| 
 | 
 | ||||||
| ASTCDecoderPass::~ASTCDecoderPass() = default; | ASTCDecoderPass::~ASTCDecoderPass() = default; | ||||||
| 
 | 
 | ||||||
| void ASTCDecoderPass::MakeDataBuffer() { |  | ||||||
|     constexpr size_t TOTAL_BUFFER_SIZE = sizeof(ASTC_ENCODINGS_VALUES) + sizeof(SWIZZLE_TABLE); |  | ||||||
|     data_buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ |  | ||||||
|         .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, |  | ||||||
|         .pNext = nullptr, |  | ||||||
|         .flags = 0, |  | ||||||
|         .size = TOTAL_BUFFER_SIZE, |  | ||||||
|         .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, |  | ||||||
|         .sharingMode = VK_SHARING_MODE_EXCLUSIVE, |  | ||||||
|         .queueFamilyIndexCount = 0, |  | ||||||
|         .pQueueFamilyIndices = nullptr, |  | ||||||
|     }); |  | ||||||
|     data_buffer_commit = memory_allocator.Commit(data_buffer, MemoryUsage::Upload); |  | ||||||
| 
 |  | ||||||
|     const auto staging_ref = staging_buffer_pool.Request(TOTAL_BUFFER_SIZE, MemoryUsage::Upload); |  | ||||||
|     std::memcpy(staging_ref.mapped_span.data(), &ASTC_ENCODINGS_VALUES, |  | ||||||
|                 sizeof(ASTC_ENCODINGS_VALUES)); |  | ||||||
|     // Tack on the swizzle table at the end of the buffer
 |  | ||||||
|     std::memcpy(staging_ref.mapped_span.data() + sizeof(ASTC_ENCODINGS_VALUES), &SWIZZLE_TABLE, |  | ||||||
|                 sizeof(SWIZZLE_TABLE)); |  | ||||||
| 
 |  | ||||||
|     scheduler.Record([src = staging_ref.buffer, offset = staging_ref.offset, dst = *data_buffer, |  | ||||||
|                       TOTAL_BUFFER_SIZE](vk::CommandBuffer cmdbuf) { |  | ||||||
|         static constexpr VkMemoryBarrier write_barrier{ |  | ||||||
|             .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, |  | ||||||
|             .pNext = nullptr, |  | ||||||
|             .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, |  | ||||||
|             .dstAccessMask = VK_ACCESS_SHADER_READ_BIT, |  | ||||||
|         }; |  | ||||||
|         const VkBufferCopy copy{ |  | ||||||
|             .srcOffset = offset, |  | ||||||
|             .dstOffset = 0, |  | ||||||
|             .size = TOTAL_BUFFER_SIZE, |  | ||||||
|         }; |  | ||||||
|         cmdbuf.CopyBuffer(src, dst, copy); |  | ||||||
|         cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, |  | ||||||
|                                0, write_barrier); |  | ||||||
|     }); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | ||||||
|                                std::span<const VideoCommon::SwizzleParameters> swizzles) { |                                std::span<const VideoCommon::SwizzleParameters> swizzles) { | ||||||
|     using namespace VideoCommon::Accelerated; |     using namespace VideoCommon::Accelerated; | ||||||
|  | @ -402,9 +327,6 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | ||||||
|         VideoCore::Surface::DefaultBlockHeight(image.info.format), |         VideoCore::Surface::DefaultBlockHeight(image.info.format), | ||||||
|     }; |     }; | ||||||
|     scheduler.RequestOutsideRenderPassOperationContext(); |     scheduler.RequestOutsideRenderPassOperationContext(); | ||||||
|     if (!data_buffer) { |  | ||||||
|         MakeDataBuffer(); |  | ||||||
|     } |  | ||||||
|     const VkPipeline vk_pipeline = *pipeline; |     const VkPipeline vk_pipeline = *pipeline; | ||||||
|     const VkImageAspectFlags aspect_mask = image.AspectMask(); |     const VkImageAspectFlags aspect_mask = image.AspectMask(); | ||||||
|     const VkImage vk_image = image.Handle(); |     const VkImage vk_image = image.Handle(); | ||||||
|  | @ -436,16 +358,13 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | ||||||
|         }); |         }); | ||||||
|     for (const VideoCommon::SwizzleParameters& swizzle : swizzles) { |     for (const VideoCommon::SwizzleParameters& swizzle : swizzles) { | ||||||
|         const size_t input_offset = swizzle.buffer_offset + map.offset; |         const size_t input_offset = swizzle.buffer_offset + map.offset; | ||||||
|         const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 32U); |         const u32 num_dispatches_x = Common::DivCeil(swizzle.num_tiles.width, 8U); | ||||||
|         const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 32U); |         const u32 num_dispatches_y = Common::DivCeil(swizzle.num_tiles.height, 8U); | ||||||
|         const u32 num_dispatches_z = image.info.resources.layers; |         const u32 num_dispatches_z = image.info.resources.layers; | ||||||
| 
 | 
 | ||||||
|         update_descriptor_queue.Acquire(); |         update_descriptor_queue.Acquire(); | ||||||
|         update_descriptor_queue.AddBuffer(map.buffer, input_offset, |         update_descriptor_queue.AddBuffer(map.buffer, input_offset, | ||||||
|                                           image.guest_size_bytes - swizzle.buffer_offset); |                                           image.guest_size_bytes - swizzle.buffer_offset); | ||||||
|         update_descriptor_queue.AddBuffer(*data_buffer, 0, sizeof(ASTC_ENCODINGS_VALUES)); |  | ||||||
|         update_descriptor_queue.AddBuffer(*data_buffer, sizeof(ASTC_ENCODINGS_VALUES), |  | ||||||
|                                           sizeof(SWIZZLE_TABLE)); |  | ||||||
|         update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); |         update_descriptor_queue.AddImage(image.StorageImageView(swizzle.level)); | ||||||
|         const void* const descriptor_data{update_descriptor_queue.UpdateData()}; |         const void* const descriptor_data{update_descriptor_queue.UpdateData()}; | ||||||
| 
 | 
 | ||||||
|  | @ -453,11 +372,11 @@ void ASTCDecoderPass::Assemble(Image& image, const StagingBufferRef& map, | ||||||
|         const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); |         const auto params = MakeBlockLinearSwizzle2DParams(swizzle, image.info); | ||||||
|         ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0})); |         ASSERT(params.origin == (std::array<u32, 3>{0, 0, 0})); | ||||||
|         ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0})); |         ASSERT(params.destination == (std::array<s32, 3>{0, 0, 0})); | ||||||
|  |         ASSERT(params.bytes_per_block_log2 == 4); | ||||||
|         scheduler.Record([this, num_dispatches_x, num_dispatches_y, num_dispatches_z, block_dims, |         scheduler.Record([this, num_dispatches_x, num_dispatches_y, num_dispatches_z, block_dims, | ||||||
|                           params, descriptor_data](vk::CommandBuffer cmdbuf) { |                           params, descriptor_data](vk::CommandBuffer cmdbuf) { | ||||||
|             const AstcPushConstants uniforms{ |             const AstcPushConstants uniforms{ | ||||||
|                 .blocks_dims = block_dims, |                 .blocks_dims = block_dims, | ||||||
|                 .bytes_per_block_log2 = params.bytes_per_block_log2, |  | ||||||
|                 .layer_stride = params.layer_stride, |                 .layer_stride = params.layer_stride, | ||||||
|                 .block_size = params.block_size, |                 .block_size = params.block_size, | ||||||
|                 .x_shift = params.x_shift, |                 .x_shift = params.x_shift, | ||||||
|  |  | ||||||
|  | @ -96,15 +96,10 @@ public: | ||||||
|                   std::span<const VideoCommon::SwizzleParameters> swizzles); |                   std::span<const VideoCommon::SwizzleParameters> swizzles); | ||||||
| 
 | 
 | ||||||
| private: | private: | ||||||
|     void MakeDataBuffer(); |  | ||||||
| 
 |  | ||||||
|     VKScheduler& scheduler; |     VKScheduler& scheduler; | ||||||
|     StagingBufferPool& staging_buffer_pool; |     StagingBufferPool& staging_buffer_pool; | ||||||
|     VKUpdateDescriptorQueue& update_descriptor_queue; |     VKUpdateDescriptorQueue& update_descriptor_queue; | ||||||
|     MemoryAllocator& memory_allocator; |     MemoryAllocator& memory_allocator; | ||||||
| 
 |  | ||||||
|     vk::Buffer data_buffer; |  | ||||||
|     MemoryCommit data_buffer_commit; |  | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| } // namespace Vulkan
 | } // namespace Vulkan
 | ||||||
|  |  | ||||||
|  | @ -151,6 +151,76 @@ private: | ||||||
|     const IntType& m_Bits; |     const IntType& m_Bits; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | enum class IntegerEncoding { JustBits, Quint, Trit }; | ||||||
|  | 
 | ||||||
|  | struct IntegerEncodedValue { | ||||||
|  |     constexpr IntegerEncodedValue() = default; | ||||||
|  | 
 | ||||||
|  |     constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_) | ||||||
|  |         : encoding{encoding_}, num_bits{num_bits_} {} | ||||||
|  | 
 | ||||||
|  |     constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { | ||||||
|  |         return encoding == other.encoding && num_bits == other.num_bits; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Returns the number of bits required to encode num_vals values.
 | ||||||
|  |     u32 GetBitLength(u32 num_vals) const { | ||||||
|  |         u32 total_bits = num_bits * num_vals; | ||||||
|  |         if (encoding == IntegerEncoding::Trit) { | ||||||
|  |             total_bits += (num_vals * 8 + 4) / 5; | ||||||
|  |         } else if (encoding == IntegerEncoding::Quint) { | ||||||
|  |             total_bits += (num_vals * 7 + 2) / 3; | ||||||
|  |         } | ||||||
|  |         return total_bits; | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     IntegerEncoding encoding{}; | ||||||
|  |     u32 num_bits = 0; | ||||||
|  |     u32 bit_value = 0; | ||||||
|  |     union { | ||||||
|  |         u32 quint_value = 0; | ||||||
|  |         u32 trit_value; | ||||||
|  |     }; | ||||||
|  | }; | ||||||
|  | 
 | ||||||
|  | // Returns a new instance of this struct that corresponds to the largest
 | ||||||
|  | // encoding whose maximum value is no more than mav_value
 | ||||||
|  | static constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) { | ||||||
|  |     while (mav_value > 0) { | ||||||
|  |         u32 check = mav_value + 1; | ||||||
|  | 
 | ||||||
|  |         // Is mav_value + 1 a power of two?
 | ||||||
|  |         if (!(check & (check - 1))) { | ||||||
|  |             return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value)); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         // Is mav_value of the type 3*2^n - 1?
 | ||||||
|  |         if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { | ||||||
|  |             return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1)); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         // Is mav_value of the type 5*2^n - 1?
 | ||||||
|  |         if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { | ||||||
|  |             return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1)); | ||||||
|  |         } | ||||||
|  | 
 | ||||||
|  |         // Apparently it can't be represented with a bounded integer sequence...
 | ||||||
|  |         // just iterate.
 | ||||||
|  |         mav_value--; | ||||||
|  |     } | ||||||
|  |     return IntegerEncodedValue(IntegerEncoding::JustBits, 0); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { | ||||||
|  |     std::array<IntegerEncodedValue, 256> encodings{}; | ||||||
|  |     for (std::size_t i = 0; i < encodings.size(); ++i) { | ||||||
|  |         encodings[i] = CreateEncoding(static_cast<u32>(i)); | ||||||
|  |     } | ||||||
|  |     return encodings; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues(); | ||||||
|  | 
 | ||||||
| namespace Tegra::Texture::ASTC { | namespace Tegra::Texture::ASTC { | ||||||
| using IntegerEncodedVector = boost::container::static_vector< | using IntegerEncodedVector = boost::container::static_vector< | ||||||
|     IntegerEncodedValue, 256, |     IntegerEncodedValue, 256, | ||||||
|  | @ -521,35 +591,41 @@ static TexelWeightParams DecodeBlockInfo(InputBitStream& strm) { | ||||||
|     return params; |     return params; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth, | // Replicates low num_bits such that [(to_bit - 1):(to_bit - num_bits)]
 | ||||||
|                               u32 blockHeight) { | // is the same as [(num_bits - 1):0] and repeats all the way down.
 | ||||||
|     // Don't actually care about the void extent, just read the bits...
 | template <typename IntType> | ||||||
|     for (s32 i = 0; i < 4; ++i) { | static constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) { | ||||||
|         strm.ReadBits<13>(); |     if (num_bits == 0 || to_bit == 0) { | ||||||
|  |         return 0; | ||||||
|     } |     } | ||||||
| 
 |     const IntType v = val & static_cast<IntType>((1 << num_bits) - 1); | ||||||
|     // Decode the RGBA components and renormalize them to the range [0, 255]
 |     IntType res = v; | ||||||
|     u16 r = static_cast<u16>(strm.ReadBits<16>()); |     u32 reslen = num_bits; | ||||||
|     u16 g = static_cast<u16>(strm.ReadBits<16>()); |     while (reslen < to_bit) { | ||||||
|     u16 b = static_cast<u16>(strm.ReadBits<16>()); |         u32 comp = 0; | ||||||
|     u16 a = static_cast<u16>(strm.ReadBits<16>()); |         if (num_bits > to_bit - reslen) { | ||||||
| 
 |             u32 newshift = to_bit - reslen; | ||||||
|     u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 | |             comp = num_bits - newshift; | ||||||
|                (static_cast<u32>(a) & 0xFF00) << 16; |             num_bits = newshift; | ||||||
| 
 |  | ||||||
|     for (u32 j = 0; j < blockHeight; j++) { |  | ||||||
|         for (u32 i = 0; i < blockWidth; i++) { |  | ||||||
|             outBuf[j * blockWidth + i] = rgba; |  | ||||||
|         } |         } | ||||||
|  |         res = static_cast<IntType>(res << num_bits); | ||||||
|  |         res = static_cast<IntType>(res | (v >> comp)); | ||||||
|  |         reslen += num_bits; | ||||||
|     } |     } | ||||||
|  |     return res; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { | static constexpr std::size_t NumReplicateEntries(u32 num_bits) { | ||||||
|     for (u32 j = 0; j < blockHeight; j++) { |     return std::size_t(1) << num_bits; | ||||||
|         for (u32 i = 0; i < blockWidth; i++) { | } | ||||||
|             outBuf[j * blockWidth + i] = 0xFFFF00FF; | 
 | ||||||
|         } | template <typename IntType, u32 num_bits, u32 to_bit> | ||||||
|  | static constexpr auto MakeReplicateTable() { | ||||||
|  |     std::array<IntType, NumReplicateEntries(num_bits)> table{}; | ||||||
|  |     for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) { | ||||||
|  |         table[value] = Replicate(value, num_bits, to_bit); | ||||||
|     } |     } | ||||||
|  |     return table; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); | static constexpr auto REPLICATE_BYTE_TO_16_TABLE = MakeReplicateTable<u32, 8, 16>(); | ||||||
|  | @ -572,6 +648,9 @@ static constexpr auto REPLICATE_2_BIT_TO_8_TABLE = MakeReplicateTable<u32, 2, 8> | ||||||
| static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>(); | static constexpr auto REPLICATE_3_BIT_TO_8_TABLE = MakeReplicateTable<u32, 3, 8>(); | ||||||
| static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>(); | static constexpr auto REPLICATE_4_BIT_TO_8_TABLE = MakeReplicateTable<u32, 4, 8>(); | ||||||
| static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>(); | static constexpr auto REPLICATE_5_BIT_TO_8_TABLE = MakeReplicateTable<u32, 5, 8>(); | ||||||
|  | static constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>(); | ||||||
|  | static constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>(); | ||||||
|  | static constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>(); | ||||||
| /// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
 | /// Use a precompiled table with the most common usages, if it's not in the expected range, fallback
 | ||||||
| /// to the runtime implementation
 | /// to the runtime implementation
 | ||||||
| static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { | static constexpr u32 FastReplicateTo8(u32 value, u32 num_bits) { | ||||||
|  | @ -1316,6 +1395,37 @@ static void ComputeEndpoints(Pixel& ep1, Pixel& ep2, const u32*& colorValues, | ||||||
| #undef READ_INT_VALUES | #undef READ_INT_VALUES | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static void FillVoidExtentLDR(InputBitStream& strm, std::span<u32> outBuf, u32 blockWidth, | ||||||
|  |                               u32 blockHeight) { | ||||||
|  |     // Don't actually care about the void extent, just read the bits...
 | ||||||
|  |     for (s32 i = 0; i < 4; ++i) { | ||||||
|  |         strm.ReadBits<13>(); | ||||||
|  |     } | ||||||
|  | 
 | ||||||
|  |     // Decode the RGBA components and renormalize them to the range [0, 255]
 | ||||||
|  |     u16 r = static_cast<u16>(strm.ReadBits<16>()); | ||||||
|  |     u16 g = static_cast<u16>(strm.ReadBits<16>()); | ||||||
|  |     u16 b = static_cast<u16>(strm.ReadBits<16>()); | ||||||
|  |     u16 a = static_cast<u16>(strm.ReadBits<16>()); | ||||||
|  | 
 | ||||||
|  |     u32 rgba = (r >> 8) | (g & 0xFF00) | (static_cast<u32>(b) & 0xFF00) << 8 | | ||||||
|  |                (static_cast<u32>(a) & 0xFF00) << 16; | ||||||
|  | 
 | ||||||
|  |     for (u32 j = 0; j < blockHeight; j++) { | ||||||
|  |         for (u32 i = 0; i < blockWidth; i++) { | ||||||
|  |             outBuf[j * blockWidth + i] = rgba; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static void FillError(std::span<u32> outBuf, u32 blockWidth, u32 blockHeight) { | ||||||
|  |     for (u32 j = 0; j < blockHeight; j++) { | ||||||
|  |         for (u32 i = 0; i < blockWidth; i++) { | ||||||
|  |             outBuf[j * blockWidth + i] = 0xFFFF00FF; | ||||||
|  |         } | ||||||
|  |     } | ||||||
|  | } | ||||||
|  | 
 | ||||||
| static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth, | static void DecompressBlock(std::span<const u8, 16> inBuf, const u32 blockWidth, | ||||||
|                             const u32 blockHeight, std::span<u32, 12 * 12> outBuf) { |                             const u32 blockHeight, std::span<u32, 12 * 12> outBuf) { | ||||||
|     InputBitStream strm(inBuf); |     InputBitStream strm(inBuf); | ||||||
|  |  | ||||||
|  | @ -9,117 +9,6 @@ | ||||||
| 
 | 
 | ||||||
| namespace Tegra::Texture::ASTC { | namespace Tegra::Texture::ASTC { | ||||||
| 
 | 
 | ||||||
| enum class IntegerEncoding { JustBits, Quint, Trit }; |  | ||||||
| 
 |  | ||||||
| struct IntegerEncodedValue { |  | ||||||
|     constexpr IntegerEncodedValue() = default; |  | ||||||
| 
 |  | ||||||
|     constexpr IntegerEncodedValue(IntegerEncoding encoding_, u32 num_bits_) |  | ||||||
|         : encoding{encoding_}, num_bits{num_bits_} {} |  | ||||||
| 
 |  | ||||||
|     constexpr bool MatchesEncoding(const IntegerEncodedValue& other) const { |  | ||||||
|         return encoding == other.encoding && num_bits == other.num_bits; |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     // Returns the number of bits required to encode num_vals values.
 |  | ||||||
|     u32 GetBitLength(u32 num_vals) const { |  | ||||||
|         u32 total_bits = num_bits * num_vals; |  | ||||||
|         if (encoding == IntegerEncoding::Trit) { |  | ||||||
|             total_bits += (num_vals * 8 + 4) / 5; |  | ||||||
|         } else if (encoding == IntegerEncoding::Quint) { |  | ||||||
|             total_bits += (num_vals * 7 + 2) / 3; |  | ||||||
|         } |  | ||||||
|         return total_bits; |  | ||||||
|     } |  | ||||||
| 
 |  | ||||||
|     IntegerEncoding encoding{}; |  | ||||||
|     u32 num_bits = 0; |  | ||||||
|     u32 bit_value = 0; |  | ||||||
|     union { |  | ||||||
|         u32 quint_value = 0; |  | ||||||
|         u32 trit_value; |  | ||||||
|     }; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| // Returns a new instance of this struct that corresponds to the
 |  | ||||||
| // can take no more than mav_value values
 |  | ||||||
| constexpr IntegerEncodedValue CreateEncoding(u32 mav_value) { |  | ||||||
|     while (mav_value > 0) { |  | ||||||
|         u32 check = mav_value + 1; |  | ||||||
| 
 |  | ||||||
|         // Is mav_value a power of two?
 |  | ||||||
|         if (!(check & (check - 1))) { |  | ||||||
|             return IntegerEncodedValue(IntegerEncoding::JustBits, std::popcount(mav_value)); |  | ||||||
|         } |  | ||||||
| 
 |  | ||||||
|         // Is mav_value of the type 3*2^n - 1?
 |  | ||||||
|         if ((check % 3 == 0) && !((check / 3) & ((check / 3) - 1))) { |  | ||||||
|             return IntegerEncodedValue(IntegerEncoding::Trit, std::popcount(check / 3 - 1)); |  | ||||||
|         } |  | ||||||
| 
 |  | ||||||
|         // Is mav_value of the type 5*2^n - 1?
 |  | ||||||
|         if ((check % 5 == 0) && !((check / 5) & ((check / 5) - 1))) { |  | ||||||
|             return IntegerEncodedValue(IntegerEncoding::Quint, std::popcount(check / 5 - 1)); |  | ||||||
|         } |  | ||||||
| 
 |  | ||||||
|         // Apparently it can't be represented with a bounded integer sequence...
 |  | ||||||
|         // just iterate.
 |  | ||||||
|         mav_value--; |  | ||||||
|     } |  | ||||||
|     return IntegerEncodedValue(IntegerEncoding::JustBits, 0); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| constexpr std::array<IntegerEncodedValue, 256> MakeEncodedValues() { |  | ||||||
|     std::array<IntegerEncodedValue, 256> encodings{}; |  | ||||||
|     for (std::size_t i = 0; i < encodings.size(); ++i) { |  | ||||||
|         encodings[i] = CreateEncoding(static_cast<u32>(i)); |  | ||||||
|     } |  | ||||||
|     return encodings; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| constexpr std::array<IntegerEncodedValue, 256> ASTC_ENCODINGS_VALUES = MakeEncodedValues(); |  | ||||||
| 
 |  | ||||||
| // Replicates low num_bits such that [(to_bit - 1):(to_bit - 1 - from_bit)]
 |  | ||||||
| // is the same as [(num_bits - 1):0] and repeats all the way down.
 |  | ||||||
| template <typename IntType> |  | ||||||
| constexpr IntType Replicate(IntType val, u32 num_bits, u32 to_bit) { |  | ||||||
|     if (num_bits == 0 || to_bit == 0) { |  | ||||||
|         return 0; |  | ||||||
|     } |  | ||||||
|     const IntType v = val & static_cast<IntType>((1 << num_bits) - 1); |  | ||||||
|     IntType res = v; |  | ||||||
|     u32 reslen = num_bits; |  | ||||||
|     while (reslen < to_bit) { |  | ||||||
|         u32 comp = 0; |  | ||||||
|         if (num_bits > to_bit - reslen) { |  | ||||||
|             u32 newshift = to_bit - reslen; |  | ||||||
|             comp = num_bits - newshift; |  | ||||||
|             num_bits = newshift; |  | ||||||
|         } |  | ||||||
|         res = static_cast<IntType>(res << num_bits); |  | ||||||
|         res = static_cast<IntType>(res | (v >> comp)); |  | ||||||
|         reslen += num_bits; |  | ||||||
|     } |  | ||||||
|     return res; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| constexpr std::size_t NumReplicateEntries(u32 num_bits) { |  | ||||||
|     return std::size_t(1) << num_bits; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| template <typename IntType, u32 num_bits, u32 to_bit> |  | ||||||
| constexpr auto MakeReplicateTable() { |  | ||||||
|     std::array<IntType, NumReplicateEntries(num_bits)> table{}; |  | ||||||
|     for (IntType value = 0; value < static_cast<IntType>(std::size(table)); ++value) { |  | ||||||
|         table[value] = Replicate(value, num_bits, to_bit); |  | ||||||
|     } |  | ||||||
|     return table; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| constexpr auto REPLICATE_6_BIT_TO_8_TABLE = MakeReplicateTable<u32, 6, 8>(); |  | ||||||
| constexpr auto REPLICATE_7_BIT_TO_8_TABLE = MakeReplicateTable<u32, 7, 8>(); |  | ||||||
| constexpr auto REPLICATE_8_BIT_TO_8_TABLE = MakeReplicateTable<u32, 8, 8>(); |  | ||||||
| 
 |  | ||||||
| void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, | void Decompress(std::span<const uint8_t> data, uint32_t width, uint32_t height, uint32_t depth, | ||||||
|                 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output); |                 uint32_t block_width, uint32_t block_height, std::span<uint8_t> output); | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue
	
	 bunnei
						bunnei