Skip to content

Commit

Permalink
Add character encoding examples and documentation, e.g. parsing UTF-8
Browse files Browse the repository at this point in the history
  • Loading branch information
cwbaker committed Jun 2, 2023
2 parents 1f85c9d + e346baf commit 6821f78
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 59 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -183,9 +183,9 @@ Parser<const char*, XmlUserData> parser( xml_parser_state_machine );
Reference a parse table as an `extern` variable for offline generated parse tables. See [lalr_calculator_example.cpp](#lalr_calculator_example.cpp) for an example of compiling a grammar to parse tables at runtime.
Create a `Parser` object with the parse table as the sole argument to the constructor. The `Parser` class template requires an iterator type template argument and optionally allows for user data, character type, character traits, and allocator to be overridden.
Create a `Parser` object with the parse table as the sole argument to the constructor. The `Parser` class template requires an iterator type template argument and optionally allows the user data type, character type, character traits, and allocator to be overridden. In the above example the iterator type is `const char*`, the user data is the custom `XmlUserData` type, and the character parameters default to those implied by the iterator.
In the above example the iterator type is `const char*` and the user data is the custom `XmlUserData` type.
Change the `Iterator` template parameter to read input from different sources and convert character encodings, e.g. from UTF-8 in a file to UTF-32 in memory. See [lalr_json_example.cpp](lalr/lalr_examples/lalr_json_example.cpp) for an example of reading a UTF-8 encoded file into UTF-32 (`char32_t`) in memory. To parse UTF-8 input and keep it UTF-8 encoded in memory it is usually sufficient to use an iterator templated on `unsigned char` or `uint8_t`; see [lalr_xml_example.cpp](lalr/lalr_examples/lalr_xml_example.cpp) for an example of doing so in practice.
**3. Bind lexer action handlers**
Expand Down
8 changes: 4 additions & 4 deletions src/lalr/Parser.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -494,7 +494,7 @@ void Parser<Iterator, UserData, Char, Traits, Allocator>::debug_shift( const Par
if ( debug_enabled_ )
{
const ParserSymbol* symbol = node.symbol();
const std::string& lexeme = node.lexeme();
const std::basic_string<Char, Traits, Allocator>& lexeme = node.lexeme();
int line = node.line();
int column = node.column();
fire_printf( "SHIFT: (%s %s %d:%d)\n", symbol ? symbol->identifier : "", lexeme.c_str(), line, column );
Expand All @@ -521,14 +521,14 @@ void Parser<Iterator, UserData, Char, Traits, Allocator>::debug_reduce( const Pa

if ( debug_enabled_ )
{
fire_printf( "REDUCE: %s <- ", reduced_symbol->identifier );
fire_printf( "REDUCE: %s <- ", reduced_symbol->identifier );

const ParserNode* node = nodes_.data() + start;
const ParserNode* node_end = nodes_.data() + finish;
if ( node != node_end )
{
const ParserSymbol* symbol = node->symbol();
const std::string& lexeme = node->lexeme();
const std::basic_string<Char, Traits, Allocator>& lexeme = node->lexeme();
int line = node->line();
int column = node->column();
fire_printf( "(%s %s %d:%d)", symbol ? symbol->identifier : "", lexeme.c_str(), line, column );
Expand All @@ -538,7 +538,7 @@ void Parser<Iterator, UserData, Char, Traits, Allocator>::debug_reduce( const Pa
while ( node != node_end )
{
const ParserSymbol* symbol = node->symbol();
const std::string& lexeme = node->lexeme();
const std::basic_string<Char, Traits, Allocator>& lexeme = node->lexeme();
int line = node->line();
int column = node->column();
fire_printf( " (%s %s %d:%d)", symbol ? symbol->identifier : "", lexeme.c_str(), line, column );
Expand Down
3 changes: 3 additions & 0 deletions src/lalr/lalr_examples/lalr_examples.forge
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ for _, cc in toolsets('^cc.*') do
libraries = libraries;
'${lib}/lalr_${platform}_${architecture}';
cc:Cxx '${obj}/%1' {
defines = {
([[LALR_EXAMPLES=\"%s/\"]]):format( pwd() );
};
"lalr_examples.cpp",
"lalr_error_handling_calculator_example.cpp",
"lalr_hello_world_example.cpp",
Expand Down
90 changes: 54 additions & 36 deletions src/lalr/lalr_examples/lalr_json_example.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,41 @@
#include <lalr/Parser.ipp>
#include <lalr/PositionIterator.hpp>
#include <lalr/string_literal.hpp>
#include <fstream>
#include <string.h>
#include <locale>
#include <clocale>
#include <codecvt>
#ifdef __APPLE__
#include <stdint.h>
#else
#include <uchar.h>
#endif

using namespace std;
using namespace lalr;

namespace
{

typedef std::basic_string<char32_t> String;

struct Attribute;

struct Value
{
std::string value_;
String value_;
std::vector<shared_ptr<Attribute>> attributes_;
std::vector<shared_ptr<Value>> elements_;

Value()
: value_()
, attributes_()
, elements_()
{
{
}

Value( const std::string& value )
Value( const String& value )
: value_( value )
, attributes_()
, elements_()
Expand All @@ -37,7 +48,7 @@ struct Value

struct Attribute
{
std::string name_;
String name_;
shared_ptr<Value> value_;

Attribute()
Expand All @@ -46,7 +57,7 @@ struct Attribute
{
}

Attribute( const std::string& name, const shared_ptr<Value>& value )
Attribute( const String& name, const shared_ptr<Value>& value )
: name_( name )
, value_( value )
{
Expand All @@ -55,7 +66,7 @@ struct Attribute

struct JsonUserData
{
std::string name_;
String name_;
shared_ptr<Value> value_;

JsonUserData()
Expand All @@ -70,66 +81,66 @@ struct JsonUserData
{
}

JsonUserData( const std::string& name, shared_ptr<Value> value )
JsonUserData( const String& name, shared_ptr<Value> value )
: name_( name )
, value_( value )
{
}
};

// Parser action handler registered under the name "document".
// Returns a copy of the user data at index 1 of `start`; `nodes` and `length`
// are not used.
// NOTE(review): presumably `start` holds one JsonUserData per symbol of the
// reduced production, so index 1 skips a leading delimiter -- confirm against
// the grammar.
static JsonUserData document( const JsonUserData* start, const ParserNode<char>* nodes, size_t length )
static JsonUserData document( const JsonUserData* start, const ParserNode<char32_t>* nodes, size_t length )
{
return start[1];
}

// Builds the user data for a single name/value pair.
// The name is the lexeme of the first node (`nodes[0]`) and the value is the
// `value_` carried by the user data at index 2 of `start`. `length` is not used.
// NOTE(review): index 2 presumably skips a separator between name and value --
// confirm against the grammar.
static JsonUserData attribute( const JsonUserData* start, const ParserNode<char>* nodes, size_t length )
static JsonUserData attribute( const JsonUserData* start, const ParserNode<char32_t>* nodes, size_t length )
{
const shared_ptr<Value>& attribute = start[2].value_;
return JsonUserData( nodes[0].lexeme(), attribute );
}

// Produces user data wrapping a freshly allocated, default-constructed Value.
// None of the parameters are read.
static JsonUserData null( const JsonUserData* start, const ParserNode<char>* nodes, size_t length )
static JsonUserData null( const JsonUserData* start, const ParserNode<char32_t>* nodes, size_t length )
{
shared_ptr<Value> null_value = make_shared<Value>();
return JsonUserData( null_value );
}

// Produces user data wrapping a new Value constructed from the lexeme of the
// first node (`nodes[0]`). `start` and `length` are not used.
static JsonUserData value( const JsonUserData* start, const ParserNode<char>* nodes, size_t length )
static JsonUserData value( const JsonUserData* start, const ParserNode<char32_t>* nodes, size_t length )
{
shared_ptr<Value> value = make_shared<Value>( nodes[0].lexeme() );
return JsonUserData( value );
}

// Returns a copy of the user data at index 1 of `start`; `nodes` and `length`
// are not used. This handler is also the one registered under the name "array".
// NOTE(review): index 1 presumably selects the contents between the opening
// and closing delimiters -- confirm against the grammar.
static JsonUserData object( const JsonUserData* start, const ParserNode<char>* nodes, size_t length )
static JsonUserData object( const JsonUserData* start, const ParserNode<char32_t>* nodes, size_t length )
{
return start[1];
}

// Parser action handler registered under the name "add_to_object".
// Appends a new Attribute, built from the name and value carried at index 2 of
// `start`, to the `attributes_` of the Value carried at index 0, then returns
// user data wrapping that same Value. `nodes` and `length` are not used.
// NOTE(review): `start[0].value_` is dereferenced without a null check --
// presumably always set by the preceding reduction; confirm.
static JsonUserData add_to_object( const JsonUserData* start, const ParserNode<char>* nodes, size_t length )
static JsonUserData add_to_object( const JsonUserData* start, const ParserNode<char32_t>* nodes, size_t length )
{
const shared_ptr<Value>& object = start[0].value_;
shared_ptr<Attribute> attribute = make_shared<Attribute>( start[2].name_, start[2].value_ );
object->attributes_.push_back( attribute );
return JsonUserData( object );
}

// Creates a new Value whose `attributes_` holds a single Attribute built from
// the name and value carried at index 0 of `start`, and returns user data
// wrapping that Value. `nodes` and `length` are not used.
static JsonUserData create_object( const JsonUserData* start, const ParserNode<char>* nodes, size_t length )
static JsonUserData create_object( const JsonUserData* start, const ParserNode<char32_t>* nodes, size_t length )
{
shared_ptr<Value> object = make_shared<Value>();
shared_ptr<Attribute> attribute = make_shared<Attribute>( start[0].name_, start[0].value_ );
object->attributes_.push_back( attribute );
return JsonUserData( object );
}

// Appends the Value carried at index 2 of `start` to the `elements_` of the
// Value carried at index 0, then returns user data wrapping that same Value.
// `nodes` and `length` are not used.
// NOTE(review): `start[0].value_` is dereferenced without a null check --
// presumably always set by the preceding reduction; confirm.
static JsonUserData add_to_array( const JsonUserData* start, const ParserNode<char>* nodes, size_t length )
static JsonUserData add_to_array( const JsonUserData* start, const ParserNode<char32_t>* nodes, size_t length )
{
const shared_ptr<Value>& array = start[0].value_;
const shared_ptr<Value>& element = start[2].value_;
array->elements_.push_back( element );
return JsonUserData( array );
}

static JsonUserData create_array( const JsonUserData* start, const ParserNode<char>* nodes, size_t length )
static JsonUserData create_array( const JsonUserData* start, const ParserNode<char32_t>* nodes, size_t length )
{
shared_ptr<Value> array = make_shared<Value>();
const shared_ptr<Value>& element = start[0].value_;
Expand All @@ -147,20 +158,28 @@ static void indent( int level )

static void print( const Value& value, int level )
{
std::setlocale( LC_ALL, "en_US.UTF-8" );
std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> utf8;

for ( const shared_ptr<Attribute>& attribute : value.attributes_ )
{
LALR_ASSERT( attribute );
const string& name = attribute->name_;
const String& name = attribute->name_;
const Value& value = *attribute->value_;
if ( value.attributes_.empty() && value.elements_.empty() )
{
indent( level + 1 );
printf( "%s='%s'\n", attribute->name_.c_str(), attribute->value_->value_.c_str() );
printf( "%s='%s'\n",
utf8.to_bytes(attribute->name_).c_str(),
utf8.to_bytes(attribute->value_->value_).c_str()
);
}
else
{
indent( level + 1 );
printf( "%s:\n", attribute->name_.c_str() );
printf( "%s:\n",
utf8.to_bytes(attribute->name_).c_str()
);
print( value, level + 1 );
}
}
Expand All @@ -173,7 +192,10 @@ static void print( const Value& value, int level )
if ( value.attributes_.empty() && value.elements_.empty() )
{
indent( level + 1 );
printf( "%d: '%s'\n", index, value.value_.c_str() );
printf( "%d: '%s'\n",
index,
utf8.to_bytes(value.value_).c_str()
);
}
else
{
Expand All @@ -190,8 +212,8 @@ static void print( const Value& value, int level )
void lalr_json_example()
{
extern const lalr::ParserStateMachine* json_parser_state_machine;
Parser<const char*, JsonUserData> parser( json_parser_state_machine );
parser.set_lexer_action_handler( "string", &string_literal<const char*> );
Parser<istreambuf_iterator<char32_t>, JsonUserData> parser( json_parser_state_machine );
parser.set_lexer_action_handler( "string", &string_literal<istreambuf_iterator<char32_t>> );
parser.parser_action_handlers()
( "document", &document )
( "add_to_object", &add_to_object )
Expand All @@ -205,20 +227,16 @@ void lalr_json_example()
( "array", &object )
;

const char* input =
"{\n"
" \"model\": {\n"
" \"format\": \"Model\",\n"
" \"version\": 1,\n"
" \"address\": \"0017FAB0\",\n"
" \"items\": {\n"
" \"name\": \"Albert\"\n"
" },\n"
" \"more_items\": ['one', 2, 3]\n"
" }\n"
"}\n";

parser.parse( input, input + strlen(input) );
using std::locale;
using std::codecvt;
using std::basic_ifstream;
using std::istreambuf_iterator;
std::basic_ifstream<char32_t> file( LALR_EXAMPLES "lalr_json_example.json", std::ios_base::binary );
file.imbue( locale(file.getloc(), new codecvt<char32_t, char, std::mbstate_t>) );
istreambuf_iterator<char32_t> input( file );
istreambuf_iterator<char32_t> input_end;

parser.parse( input, input_end );
LALR_ASSERT( parser.accepted() );
LALR_ASSERT( parser.full() );
print( *parser.user_data().value_, 0 );
Expand Down
11 changes: 11 additions & 0 deletions src/lalr/lalr_examples/lalr_json_example.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{
"model": {
"format": "Model",
"version": 1,
"address": "0017FAB0",
"items": {
"name": "Albert"
},
"more_items": ["one", 2, 3, "to prove that it's really UTF-8... 😁 😄!"]
}
}
Loading

0 comments on commit 6821f78

Please sign in to comment.