Add to_binary/from_binary functions #6848

lokax · 2023-03-24T02:49:06Z

to_binary(bin), from_binary(unbin)

src/function/scalar/string/hex.cpp

Tishj · 2023-04-09T10:37:20Z

src/function/scalar/string/hex.cpp

-			*ptr = Blob::HEX_TABLE[byte];
-			ptr++;
-			buffer_size++;
+		if (input == 0) {


I get why this is necessary, but I don't like how this is duplicated across every place where LeadingZeros is used.
And also I could see future issues crop up where auto target = StringVector::EmptyString(result, sizeof(INPUT_TYPE) * 2 - (num_leading_zero / 4)); is called without first handling the 0 case.

The issue originates from LeadingZeros returning 64 for 0, maybe instead we need a LeadingZerosLow and LeadingZerosHigh, so the High variant can still return 64 for 0, and the Low variant would return 0 instead.

Also can you move the CountZeros struct from chimp to its own file, like common/bit_utils.hpp or common/bin_utils.hpp ?

First calculate buffer_size, I do some special handle if buffer_size is zero. This special case is unavoidable.
Then D_ASSERT(buffer_size) > 0; auto target = StringVector::EmptyString(buffer_size);

What else is causing buffer_size to be 0?
I believe it's only the problem that LeadingZeros returns 64 for 0, no?

Yes. I'm not sure how LeadingZerosHigh and LeadingZerosLow solve the problem.

I wasn't sure if there was another blocker here, so I wrote it up.
It's passing the tests you added:

diff --git a/src/common/types/vector.cpp b/src/common/types/vector.cpp index 3eca8cc972..371afcb230 100644 --- a/src/common/types/vector.cpp +++ b/src/common/types/vector.cpp @@ -1544,6 +1544,7 @@ string_t StringVector::AddStringOrBlob(Vector &vector, string_t data) { } string_t StringVector::EmptyString(Vector &vector, idx_t len) { + D_ASSERT(len > 0); D_ASSERT(vector.GetType().InternalType() == PhysicalType::VARCHAR); if (len <= string_t::INLINE_LENGTH) { return string_t(len); diff --git a/src/function/scalar/string/hex.cpp b/src/function/scalar/string/hex.cpp index dfed722fd1..5ee2c08867 100644 --- a/src/function/scalar/string/hex.cpp +++ b/src/function/scalar/string/hex.cpp @@ -86,22 +86,12 @@ struct HexIntegralOperator { template <class INPUT_TYPE, class RESULT_TYPE> static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) { - idx_t num_leading_zero = CountZeros<uint64_t>::Leading(input); + idx_t num_leading_zero = CountZeros<uint64_t>::LeadingLow(input); idx_t num_bits_to_check = 64 - num_leading_zero; D_ASSERT(num_bits_to_check <= sizeof(INPUT_TYPE) * 8); idx_t buffer_size = (num_bits_to_check + 3) / 4; - // Special case: All bits are zero - if (buffer_size == 0) { - auto target = StringVector::EmptyString(result, 1); - auto output = target.GetDataWriteable(); - *output = '0'; - target.Finalize(); - return target; - } - - D_ASSERT(buffer_size > 0); auto target = StringVector::EmptyString(result, buffer_size); auto output = target.GetDataWriteable(); @@ -116,18 +106,9 @@ struct HexHugeIntOperator { template <class INPUT_TYPE, class RESULT_TYPE> static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) { - idx_t num_leading_zero = CountZeros<hugeint_t>::Leading(input); + idx_t num_leading_zero = CountZeros<hugeint_t>::LeadingLow(input); idx_t buffer_size = sizeof(INPUT_TYPE) * 2 - (num_leading_zero / 4); - // Special case: All bits are zero - if (buffer_size == 0) { - auto target = StringVector::EmptyString(result, 1); - auto output = target.GetDataWriteable(); - *output = '0'; - target.Finalize(); - return target; - } - D_ASSERT(buffer_size > 0); auto target = StringVector::EmptyString(result, buffer_size); auto output = target.GetDataWriteable(); @@ -174,21 +155,12 @@ struct BinaryIntegralOperator { template <class INPUT_TYPE, class RESULT_TYPE> static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) { - idx_t num_leading_zero = CountZeros<uint64_t>::Leading(input); + idx_t num_leading_zero = CountZeros<uint64_t>::LeadingLow(input); idx_t num_bits_to_check = 64 - num_leading_zero; D_ASSERT(num_bits_to_check <= sizeof(INPUT_TYPE) * 8); idx_t buffer_size = num_bits_to_check; - // Special case: All bits are zero - if (buffer_size == 0) { - auto target = StringVector::EmptyString(result, 1); - auto output = target.GetDataWriteable(); - *output = '0'; - target.Finalize(); - return target; - } - D_ASSERT(buffer_size > 0); auto target = StringVector::EmptyString(result, buffer_size); auto output = target.GetDataWriteable(); @@ -203,18 +175,9 @@ struct BinaryIntegralOperator { struct BinaryHugeIntOperator { template <class INPUT_TYPE, class RESULT_TYPE> static RESULT_TYPE Operation(INPUT_TYPE input, Vector &result) { - idx_t num_leading_zero = CountZeros<hugeint_t>::Leading(input); + idx_t num_leading_zero = CountZeros<hugeint_t>::LeadingLow(input); idx_t buffer_size = sizeof(INPUT_TYPE) * 8 - num_leading_zero; - // Special case: All bits are zero - if (buffer_size == 0) { - auto target = StringVector::EmptyString(result, 1); - auto output = target.GetDataWriteable(); - *output = '0'; - target.Finalize(); - return target; - } - auto target = StringVector::EmptyString(result, buffer_size); auto output = target.GetDataWriteable(); diff --git a/src/include/duckdb/common/bit_utils.hpp b/src/include/duckdb/common/bit_utils.hpp index 6243b48029..263599f27d 100644 --- a/src/include/duckdb/common/bit_utils.hpp +++ b/src/include/duckdb/common/bit_utils.hpp @@ -74,73 +74,124 @@ static inline int __builtin_clz(unsigned int value) { namespace duckdb { +namespace { + +template <int ZERO_RESULT> +// uint32_t +inline static int LeadingZerosUINT32(uint32_t value) { + if (!value) { + return ZERO_RESULT; + } + return __builtin_clz(value); +} +template <int ZERO_RESULT> +inline static int TrailingZerosUINT32(uint32_t value) { + if (!value) { + return ZERO_RESULT; + } + return __builtin_ctz(value); +} + +// uint64_t +template <int ZERO_RESULT> +inline static int LeadingZerosUINT64(uint64_t value) { + if (!value) { + return ZERO_RESULT; + } + return __builtin_clzll(value); +} +template <int ZERO_RESULT> +inline static int TrailingZerosUINT64(uint64_t value) { + if (!value) { + return ZERO_RESULT; + } + return __builtin_ctzll(value); +} + +// hugeint_t (int128_t) +template <int ZERO_RESULT> +inline static int LeadingZerosUINT128(hugeint_t value) { + if (value == 0) { + return ZERO_RESULT; + } + + uint64_t upper = (uint64_t)value.upper; + uint64_t lower = value.lower; + + int res = __builtin_clzll(upper); + if (res == 64) { + res += __builtin_clzll(lower); + } + + return res; +} +template <int ZERO_RESULT> +inline static int TrailingZerosUINT128(hugeint_t value) { + if (value == 0) { + return ZERO_RESULT; + } + + uint64_t upper = (uint64_t)value.upper; + uint64_t lower = value.lower; + + int res = __builtin_ctzll(lower); + if (res == 64) { + res += __builtin_ctzll(upper); + } + + return res; +} + +} // namespace + template <class T> struct CountZeros {}; template <> struct CountZeros<uint32_t> { - inline static int Leading(uint32_t value) { - if (!value) { - return 32; - } - return __builtin_clz(value); - } - inline static int Trailing(uint32_t value) { - if (!value) { - return 32; - } - return __builtin_ctz(value); + inline static int LeadingHigh(uint32_t value) { + return LeadingZerosUINT32<32>(value); + } + inline static int LeadingLow(uint32_t value) { + return LeadingZerosUINT32<(sizeof(value) * 8) - 1>(value); + } + inline static int TrailingHigh(uint32_t value) { + return TrailingZerosUINT32<32>(value); + } + inline static int TrailingLow(uint32_t value) { + return TrailingZerosUINT32<(sizeof(value) * 8) - 1>(value); } }; template <> struct CountZeros<uint64_t> { - inline static int Leading(uint64_t value) { - if (!value) { - return 64; - } - return __builtin_clzll(value); - } - inline static int Trailing(uint64_t value) { - if (!value) { - return 64; - } - return __builtin_ctzll(value); + inline static int LeadingHigh(uint64_t value) { + return LeadingZerosUINT64<64>(value); + } + inline static int LeadingLow(uint64_t value) { + return LeadingZerosUINT64<(sizeof(value) * 8) - 1>(value); + } + inline static int TrailingHigh(uint64_t value) { + return TrailingZerosUINT64<64>(value); + } + inline static int TrailingLow(uint64_t value) { + return TrailingZerosUINT64<(sizeof(value) * 8) - 1>(value); } }; template <> struct CountZeros<hugeint_t> { - inline static int Leading(hugeint_t value) { - if (value == 0) { - return 128; - } - - uint64_t upper = (uint64_t)value.upper; - uint64_t lower = value.lower; - - int res = __builtin_clzll(upper); - if (res == 64) { - res += __builtin_clzll(lower); - } - - return res; + inline static int LeadingHigh(hugeint_t value) { + return LeadingZerosUINT128<128>(value); } - - inline static int Trailing(hugeint_t value) { - if (value == 0) { - return 128; - } - - uint64_t upper = (uint64_t)value.upper; - uint64_t lower = value.lower; - - int res = __builtin_ctzll(lower); - if (res == 64) { - res += __builtin_ctzll(upper); - } - - return res; + inline static int LeadingLow(hugeint_t value) { + return LeadingZerosUINT128<(sizeof(value) * 8) - 1>(value); + } + inline static int TrailingHigh(hugeint_t value) { + return TrailingZerosUINT128<128>(value); + } + inline static int TrailingLow(hugeint_t value) { + return TrailingZerosUINT128<(sizeof(value) * 8) - 1>(value); } }; diff --git a/src/include/duckdb/storage/compression/chimp/algorithm/chimp128.hpp b/src/include/duckdb/storage/compression/chimp/algorithm/chimp128.hpp index 50c3a123ae..0fa16c2a6b 100644 --- a/src/include/duckdb/storage/compression/chimp/algorithm/chimp128.hpp +++ b/src/include/duckdb/storage/compression/chimp/algorithm/chimp128.hpp @@ -121,7 +121,7 @@ public: } auto reference_value = state.ring_buffer.Value(current_index % ChimpConstants::BUFFER_SIZE); CHIMP_TYPE tempxor_result = (CHIMP_TYPE)in ^ reference_value; - trailing_zeros = CountZeros<CHIMP_TYPE>::Trailing(tempxor_result); + trailing_zeros = CountZeros<CHIMP_TYPE>::TrailingHigh(tempxor_result); trailing_zeros_exceed_threshold = trailing_zeros > TRAILING_ZERO_THRESHOLD; if (trailing_zeros_exceed_threshold) { previous_index = current_index % ChimpConstants::BUFFER_SIZE; @@ -143,7 +143,7 @@ public: state.SetLeadingZeros(); } else { // Values are not identical - auto leading_zeros_raw = CountZeros<CHIMP_TYPE>::Leading(xor_result); + auto leading_zeros_raw = CountZeros<CHIMP_TYPE>::LeadingHigh(xor_result); uint8_t leading_zeros = ChimpConstants::Compression::LEADING_ROUND[leading_zeros_raw]; if (trailing_zeros_exceed_threshold) { diff --git a/src/include/duckdb/storage/compression/patas/algorithm/patas.hpp b/src/include/duckdb/storage/compression/patas/algorithm/patas.hpp index 6dcddcb2d9..1a78ed24af 100644 --- a/src/include/duckdb/storage/compression/patas/algorithm/patas.hpp +++ b/src/include/duckdb/storage/compression/patas/algorithm/patas.hpp @@ -96,8 +96,8 @@ struct PatasCompression { EXACT_TYPE xor_result = value ^ reference_value; // Figure out the trailing zeros (max 6 bits) - const uint8_t trailing_zero = CountZeros<EXACT_TYPE>::Trailing(xor_result); - const uint8_t leading_zero = CountZeros<EXACT_TYPE>::Leading(xor_result); + const uint8_t trailing_zero = CountZeros<EXACT_TYPE>::TrailingHigh(xor_result); + const uint8_t leading_zero = CountZeros<EXACT_TYPE>::LeadingHigh(xor_result); const bool is_equal = xor_result == 0;

Maybe the names are confusing, especially since "Low" ended up just returning "High" - 1.
I wanted to get rid of the extra handling needed everywhere for 0, but maybe we just want to make the ZERO_RESULT a template parameter, so the caller can explicitly decide what the result for 0 should be, instead of having a "High" and "Low" variant

Or, also possible - it's not really necessary to remove the need for this special handling, because it's only done in 3 places

Mytherin · 2023-04-12T14:41:53Z

@Tishj is this ready to be merged? Can you do another pass over this?

Tishj · 2023-04-12T19:59:37Z

@Tishj is this ready to be merged? Can you do another pass over this?

I wasn't a huge fan of the need for special handling for 0, but if you think it's fine I'm happy to merge it.

Mytherin · 2023-04-13T11:50:13Z

I think that's fine, I understand why that special case is required here and it's not too bad. Thanks for reviewing and thanks @lokax for the PR!

lokax added 2 commits March 24, 2023 00:20

Add to_binary/from_binary functions

907fd11

Add from_binary

248f308

Tishj reviewed Mar 24, 2023

View reviewed changes

src/function/scalar/string/hex.cpp Outdated Show resolved Hide resolved

Tishj reviewed Mar 24, 2023

View reviewed changes

src/function/scalar/string/hex.cpp Outdated Show resolved Hide resolved

lokax added 4 commits March 25, 2023 10:27

Merge branch 'master' of github.com:duckdb/duckdb into to_binary

c6fcb2d

count leading zero for to_hex

af40d0e

Merge branch 'master' of github.com:duckdb/duckdb into to_binary

0fc3767

Merge branch 'master' of github.com:duckdb/duckdb into to_binary

1b8576e

lokax requested a review from Tishj April 9, 2023 08:05

Tishj reviewed Apr 9, 2023

View reviewed changes

src/function/scalar/string/hex.cpp Show resolved Hide resolved

Tishj reviewed Apr 9, 2023

View reviewed changes

lokax added 4 commits April 10, 2023 20:05

Move CountZero to common/bit_utils.hpp

f35fa07

Fix underflow. Use num_bits_to_check instead of num_leading_zero

f10bea1

fix offset shift

2eca1da

fix tidy check

4137298

Mytherin merged commit b1f8b0b into duckdb:master Apr 13, 2023

Dtenwolde mentioned this pull request Apr 25, 2023

Add functions to_hex/from_hex/from_binary/to_binary #6511

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Add to_binary/from_binary functions #6848

Add to_binary/from_binary functions #6848

Uh oh!

lokax commented Mar 24, 2023

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Tishj Apr 9, 2023 •

edited

Loading

Uh oh!

lokax Apr 10, 2023

Uh oh!

Tishj Apr 10, 2023

Uh oh!

lokax Apr 10, 2023

Uh oh!

Tishj Apr 10, 2023

Uh oh!

Tishj Apr 10, 2023 •

edited

Loading

Uh oh!

Mytherin commented Apr 12, 2023

Uh oh!

Tishj commented Apr 12, 2023

Uh oh!

Mytherin commented Apr 13, 2023 •

edited

Loading

Uh oh!

Uh oh!

Add to_binary/from_binary functions #6848

Add to_binary/from_binary functions #6848

Uh oh!

Conversation

lokax commented Mar 24, 2023

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Tishj Apr 9, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

lokax Apr 10, 2023

Choose a reason for hiding this comment

Uh oh!

Tishj Apr 10, 2023

Choose a reason for hiding this comment

Uh oh!

lokax Apr 10, 2023

Choose a reason for hiding this comment

Uh oh!

Tishj Apr 10, 2023

Choose a reason for hiding this comment

Uh oh!

Tishj Apr 10, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Mytherin commented Apr 12, 2023

Uh oh!

Tishj commented Apr 12, 2023

Uh oh!

Mytherin commented Apr 13, 2023 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Tishj Apr 9, 2023 •

edited

Loading

Tishj Apr 10, 2023 •

edited

Loading

Mytherin commented Apr 13, 2023 •

edited

Loading