Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AVRO-1517: [Perl] Encode UTF-8 strings as bytes #2979

Merged
merged 1 commit into from
Jun 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions lang/perl/Changes
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ Revision history for Perl extension Avro
for int and long types were off by one
- Silenced a spurious warning that was raised when
validating an undefined value for some data types
- Make sure Unicode strings are downgraded when they
are encoded in fields of type 'bytes' or 'fixed'.
Errors resulting from this process will be raised as
Avro::BinaryEncoder::Error exceptions

1.00 Fri Jan 17 15:00:00 2014
- Relicense under apache license 2.0
Expand Down
21 changes: 14 additions & 7 deletions lang/perl/lib/Avro/BinaryEncoder.pm
Original file line number Diff line number Diff line change
Expand Up @@ -138,15 +138,17 @@ sub encode_double {
# Encode $data as an Avro 'bytes' value: a length prefix followed by the
# raw payload.
#
# Arguments:
#   $class  - invocant, forwarded to encode_long
#   $schema - accepted for interface symmetry; not consulted here
#   $data   - the payload; every character must fit in one byte (<= 255)
#   $cb     - code ref; called with a reference to the payload, and also
#             handed to encode_long for the length prefix
#
# Throws Avro::BinaryEncoder::Error when $data contains a character
# above 255 and therefore cannot be downgraded to a byte string.
sub encode_bytes {
    my $class = shift;
    my ($schema, $data, $cb) = @_;

    # Downgrade before measuring: the length prefix must describe the bytes
    # that are actually passed to the callback, not the internal UTF-8
    # representation of an upgraded string. $data is a private copy, so the
    # caller's scalar is not modified.
    throw Avro::BinaryEncoder::Error("Invalid data given for 'bytes': Contains values >255")
        unless utf8::downgrade($data, 1);

    # Exactly one length prefix, taken after the downgrade.
    encode_long($class, undef, length($data), $cb);
    $cb->(\$data);
}

# Encode $data as an Avro 'string' value: a length prefix followed by the
# UTF-8 encoding of the string.
#
# Arguments:
#   $class  - invocant, forwarded to encode_long
#   $schema - accepted for interface symmetry; not consulted here
#   $data   - the character string to encode
#   $cb     - code ref; called with a reference to the encoded octets, and
#             also handed to encode_long for the length prefix
sub encode_string {
    my $class = shift;
    my ($schema, $data, $cb) = @_;

    # encode_utf8 yields an octet string, so plain length() already counts
    # bytes; no bytes::length is needed.
    my $bytes = Encode::encode_utf8($data);

    # Exactly one length prefix for the payload that follows.
    encode_long($class, undef, length($bytes), $cb);
    $cb->(\$bytes);
}

Expand Down Expand Up @@ -270,11 +272,16 @@ sub encode_union {
# Encode $data as an Avro 'fixed' value: the raw payload with no length
# prefix, which must be exactly $schema->size bytes long.
#
# Arguments:
#   $class  - invocant (not used in this sub)
#   $schema - object providing a size() method giving the required length
#   $data   - the payload; every character must fit in one byte (<= 255)
#   $cb     - code ref; called with a reference to the payload
#
# Throws Avro::BinaryEncoder::Error when $data contains a character above
# 255, or when its length differs from the schema size.
sub encode_fixed {
    my $class = shift;
    my ($schema, $data, $cb) = @_;

    # Downgrade first. Measuring an upgraded string by its internal
    # representation would wrongly reject e.g. an upgraded "\xff" for a
    # size-1 schema. $data is a private copy, so the caller is unaffected.
    throw Avro::BinaryEncoder::Error("Invalid data given for 'fixed': Contains values >255")
        unless utf8::downgrade($data, 1);

    my $length = length $data;
    my $size   = $schema->size;

    throw Avro::BinaryEncoder::Error("Fixed size doesn't match $length!=$size")
        unless $length == $size;

    $cb->(\$data);
}

Expand Down
15 changes: 9 additions & 6 deletions lang/perl/lib/Avro/Schema.pm
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,11 @@ sub is_data_valid {
if ($type eq 'float' or $type eq 'double') {
$data =~ /^$RE{num}{real}$/ ? return 1 : 0;
}
if ($type eq "bytes" or $type eq "string") {
if ($type eq 'bytes') {
return 0 if ref $data;
return 1 unless utf8::is_utf8($data) and $data =~ /[^\x00-\xFF]/;
}
if ($type eq 'string') {
return 1 unless ref $data;
}
if ($type eq 'boolean') {
Expand Down Expand Up @@ -807,11 +811,10 @@ sub new {
}

# Report whether $data is acceptable for this 'fixed' schema.
#
# Arguments:
#   $schema - the schema; its {size} entry gives the required length
#   $data   - the candidate value
#
# Returns 1 when $data is a defined, non-reference string whose characters
# all fit in one byte and whose length equals the schema size; 0 otherwise.
sub is_data_valid {
    my ( $schema, $data ) = @_;

    # Use defined rather than truthiness, so the one-byte payload "0" is
    # not mistaken for "no data".
    return 0 unless defined $data;

    # References are never byte payloads (same rule as the 'bytes' type).
    return 0 if ref $data;

    # An upgraded string is fine as long as no character exceeds 255.
    return 0 if utf8::is_utf8($data) && $data =~ /[^\x00-\xFF]/;

    # length() counts characters, which for the strings that reach this
    # point equals the byte count after a downgrade.
    return length($data) == $schema->{size} ? 1 : 0;
}

sub size {
Expand Down
28 changes: 27 additions & 1 deletion lang/perl/t/01_schema.t
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ use strict;
use warnings;

use Test::More;
plan tests => 137;
plan tests => 145;
use Test::Exception;
use_ok 'Avro::Schema';

Expand All @@ -42,6 +42,32 @@ isa_ok $s2, 'Avro::Schema::Primitive';
is $s2->type, "string", "type is string";
is $s, $s2, "string Schematas are singletons";

## Perl strings as bytes
{
    # Validation of plain Perl strings against a primitive 'bytes' schema.
    my $bytes_schema = Avro::Schema->parse(q({"type": "bytes"}));

    ok( $bytes_schema->is_data_valid(''),   'Empty string is valid as bytes' );
    ok( $bytes_schema->is_data_valid("\0"), 'Zero byte is valid as bytes' );
    ok( !$bytes_schema->is_data_valid("\x{100}"),
        'Values > 255 not valid as bytes' );

    # A string carrying the UTF-8 flag but no wide characters must still
    # be accepted.
    my $upgraded = '';
    utf8::upgrade($upgraded);

    ok( $bytes_schema->is_data_valid($upgraded),
        'Upgraded string valid as bytes' );
}

## Perl strings as fixed
{
    # Validation of plain Perl strings against a one-byte 'fixed' schema.
    my $fixed_schema
        = Avro::Schema->parse(q({"type": "fixed", "name": "foo", "size": 1 }));

    ok( !$fixed_schema->is_data_valid(''),  'Too few bytes vs. schema' );
    ok( $fixed_schema->is_data_valid("\0"), 'Zero byte is valid as fixed' );
    ok( !$fixed_schema->is_data_valid("\x{100}"),
        'Values > 255 not valid as fixed' );

    # One character <= 255 stored with the UTF-8 flag on: still one byte
    # once downgraded, so it must validate.
    my $upgraded = "\xff";
    utf8::upgrade($upgraded);

    ok( $fixed_schema->is_data_valid($upgraded),
        'Upgraded string valid as fixed' );
}

## Records
{
my $s3 = Avro::Schema::Record->new(
Expand Down