From 7e548295b2775749585e34ac74aac72d9ccec702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Joaqu=C3=ADn=20Atria?= Date: Wed, 26 Jun 2024 23:16:20 +0100 Subject: [PATCH] AVRO-1521 [Perl] Fix boolean encoding errors This change fixes a long-standing issue with the binary encoding of boolean values. In particular, that while several "smart" values were accepted as valid boolean values by Avro::Schema (eg. "true" and "no"), Avro::BinaryEncoder encoded them as true or false depending on their truth value for Perl. This resulted in both of those examples being encoded as true, because for Perl any non-empty string is true. This change makes it so that those values are accepted and properly handled, and handles other values that represent boolean values like JSON::PP::Boolean references and native Perl booleans (those that would be returned by eg. builtin::true). This also includes a small but possibly breaking bugfix for the detection of valid boolean values in Avro::Schema, which was using a non-anchored regular expression to filter values, meaning that eg. any value that had an "n" anywhere would be considered valid. This was most likely an involuntary error, so while breaking, it feels like we have to fix it. --- lang/perl/Changes | 12 ++++ lang/perl/lib/Avro/BinaryEncoder.pm | 24 +++++++- lang/perl/lib/Avro/Schema.pm | 4 +- lang/perl/t/02_bin_encode.t | 91 +++++++++++++++++++++-------- 4 files changed, 105 insertions(+), 26 deletions(-) diff --git a/lang/perl/Changes b/lang/perl/Changes index c1551566f5f..462f90e903a 100644 --- a/lang/perl/Changes +++ b/lang/perl/Changes @@ -15,6 +15,18 @@ Revision history for Perl extension Avro are encoded in fields of type 'byte' or 'fixed'. Errors resulting from this process will be raised as Avro::BinaryEncoder::Error exceptions + - Fixed a bug with the detection of valid boolean values + which was not using anchors in a regular expression. + Valid values are (case insensitively) 'yes', 'y', 'no', + 'n', 'true', 't', 'false', 'f', 0, 1, and anything that + is accepted by JSON::PP::is_bool. References rejected by + this last function are not valid + - Fixed some issues around the binary encoding of boolean + values. In particular, Avro::BinaryEncoder now correctly + encodes all the values that Avro::Schema accepts as + valid boolean values, which before were blindly evaluated + in boolean context resulting in strings like 'false' + being serialised as if they were true 1.00 Fri Jan 17 15:00:00 2014 - Relicense under apache license 2.0 diff --git a/lang/perl/lib/Avro/BinaryEncoder.pm b/lang/perl/lib/Avro/BinaryEncoder.pm index 47eb93116f0..51399a64c8d 100644 --- a/lang/perl/lib/Avro/BinaryEncoder.pm +++ b/lang/perl/lib/Avro/BinaryEncoder.pm @@ -23,6 +23,7 @@ use Config; use Encode(); use Error::Simple; use Regexp::Common qw(number); +use JSON::PP; # For is_bool our $VERSION = '++MODULE_VERSION++'; @@ -91,7 +92,28 @@ sub encode_null { sub encode_boolean { my $class = shift; my ($schema, $data, $cb) = @_; - $cb->( $data ? \"\x1" : \"\x0" ); + + throw Avro::BinaryEncoder::Error( " is not a valid boolean value") + unless defined $data; + + if ( my $type = ref $data ) { + throw Avro::BinaryEncoder::Error("cannot encode a '$type' reference as boolean") + unless $type eq 'JSON::PP::Boolean'; + } + else { + throw Avro::BinaryEncoder::Error( "'$data' is not a valid boolean value") + unless $data eq '' # For Perl versions without builtin::is_bool + || JSON::PP::is_bool($data) + || $data =~ /^(?:true|t|false|f|yes|y|no|n|0|1)$/i; + } + + # Some values might be false but evaluate to "truthy" for Perl, + # like the string 'false'. For these known exceptions, which + # would otherwise be false, we read a false value. For everything + # else, we rely on their value evaluated in a boolean context. + my $true = $data =~ /^(?:no|n|false|f)$/i ? 0 : !!$data; + + $cb->( $true ? \"\x1" : \"\x0" ); } sub encode_int { diff --git a/lang/perl/lib/Avro/Schema.pm b/lang/perl/lib/Avro/Schema.pm index 200d2947cd7..0543a90f062 100644 --- a/lang/perl/lib/Avro/Schema.pm +++ b/lang/perl/lib/Avro/Schema.pm @@ -321,9 +321,9 @@ sub is_data_valid { return 1 unless ref $data; } if ($type eq 'boolean') { + return 1 if JSON::PP::is_bool($data); return 0 if ref $data; # sometimes risky - return 1 if $data =~ m{yes|no|y|n|t|f|true|false}i; - return 0; + return $data =~ m{^(?:yes|no|y|n|t|f|true|false|0|1)$}i; } return 0; } diff --git a/lang/perl/t/02_bin_encode.t b/lang/perl/t/02_bin_encode.t index 4a3001fe978..5be10e7b7d6 100644 --- a/lang/perl/t/02_bin_encode.t +++ b/lang/perl/t/02_bin_encode.t @@ -24,18 +24,26 @@ use Config; use Test::More; use Test::Exception; use Math::BigInt; +use JSON::PP; # For booleans use_ok 'Avro::BinaryEncoder'; -sub primitive_ok { - my ($primitive_type, $primitive_val, $expected_enc) = @_; +sub primitive { + my ($type, $data) = @_; - my $data; - my $meth = "encode_$primitive_type"; - Avro::BinaryEncoder->$meth( - undef, $primitive_val, sub { $data = ${$_[0]} } + my $encoded; + my $method = "encode_$type"; + Avro::BinaryEncoder->$method( + undef, $data, sub { $encoded = ${$_[0]} } ); - is $data, $expected_enc, "primitive $primitive_type encoded correctly"; + return $encoded; +} + +sub primitive_ok { + my ($type, $have, $want) = @_; + my $data = primitive( $type => $have ); + is primitive( $type => $have ), $want, + "primitive $type encoded @{[ $have // '' ]} correctly"; return $data; } @@ -44,8 +52,48 @@ sub primitive_ok { primitive_ok null => undef, ''; primitive_ok null => 'whatev', ''; - primitive_ok boolean => 0, "\x0"; - primitive_ok boolean => 1, "\x1"; + primitive_ok boolean => 0, "\x0"; + primitive_ok boolean => 1, "\x1"; + primitive_ok boolean => 'false', "\x0"; + primitive_ok boolean => 'true', "\x1"; + primitive_ok boolean => 'f', "\x0"; + primitive_ok boolean => 't', "\x1"; + primitive_ok boolean => 'no', "\x0"; + primitive_ok boolean => 'yes', "\x1"; + primitive_ok boolean => 'n', "\x0"; + primitive_ok boolean => 'y', "\x1"; + primitive_ok boolean => 'FALSE', "\x0"; + primitive_ok boolean => 'TRUE', "\x1"; + primitive_ok boolean => 'F', "\x0"; + primitive_ok boolean => 'T', "\x1"; + primitive_ok boolean => 'NO', "\x0"; + primitive_ok boolean => 'YES', "\x1"; + primitive_ok boolean => 'N', "\x0"; + primitive_ok boolean => 'Y', "\x1"; + primitive_ok boolean => !!0, "\x0"; # Native false + primitive_ok boolean => !!1, "\x1"; # Native true + primitive_ok boolean => $JSON::PP::false, "\x0"; + primitive_ok boolean => $JSON::PP::true, "\x1"; + + throws_ok { + primitive boolean => undef; + } "Avro::BinaryEncoder::Error", " is not a valid boolean value"; + + throws_ok { + primitive boolean => 2; + } "Avro::BinaryEncoder::Error", "'2' is not a valid boolean value"; + + throws_ok { + primitive boolean => -1; + } "Avro::BinaryEncoder::Error", "'-1' is not a valid boolean value"; + + throws_ok { + primitive boolean => 'anything truthy'; + } "Avro::BinaryEncoder::Error", "'anything truthy' is not a valid boolean value"; + + throws_ok { + primitive boolean => {}; + } "Avro::BinaryEncoder::Error", "cannot encode a 'HASH' reference as boolean"; ## - high-bit of each byte should be set except for last one ## - rest of bits are: @@ -74,33 +122,30 @@ sub primitive_ok { primitive_ok long => -Math::BigInt->new(2)**63, "\xff\xff\xff\xff\xff\xff\xff\xff\xff\x01"; throws_ok { - primitive_ok int => Math::BigInt->new(2)**31; + primitive int => Math::BigInt->new(2)**31; } "Avro::BinaryEncoder::Error", "32-bit signed int overflow"; throws_ok { - primitive_ok int => -Math::BigInt->new(2)**31 - 1; + primitive int => -Math::BigInt->new(2)**31 - 1; } "Avro::BinaryEncoder::Error", "32-bit signed int underflow"; throws_ok { - primitive_ok int => Math::BigInt->new(2)**63; + primitive int => Math::BigInt->new(2)**63; } "Avro::BinaryEncoder::Error", "64-bit signed int overflow"; throws_ok { - primitive_ok long => -Math::BigInt->new(2)**63 - 1; + primitive long => -Math::BigInt->new(2)**63 - 1; } "Avro::BinaryEncoder::Error", "64-bit signed int underflow"; - for (qw(long int)) { - throws_ok { - primitive_ok $_ => "x", undef; - } 'Avro::BinaryEncoder::Error', 'numeric values only'; - }; + throws_ok { + primitive $_ => "x"; + } 'Avro::BinaryEncoder::Error', 'numeric values only' for qw(long int); + # In Unicode, there are decimals that aren't 0-9. # Make sure we handle non-ascii decimals cleanly. - for (qw(long int)) { - throws_ok { - primitive_ok $_ => "\N{U+0661}", undef; - } 'Avro::BinaryEncoder::Error', 'ascii decimals only'; - }; + throws_ok { + primitive $_ => "\N{U+0661}"; + } 'Avro::BinaryEncoder::Error', 'ascii decimals only' for qw(long int); } ## spec examples