Skip to content

Commit

Permalink
feat: support HTML entities in JSX text/attributes
Browse files Browse the repository at this point in the history
JSX text and attributes support HTML character references (a.k.a.
entities), and don't support ECMAScript string escape sequences.

Although the [spec] calls it "historical" and threatens to change it,
it _is_ in the spec, and the spec is pretty stable at this point.

In changing this, I landed back on an idea that @maxbrunsfeld suggested
in a [PR review] some time ago: having separate `string` and
`jsx_string` nodes, and aliasing `jsx_string` to `string` for consumers'
convenience. At that time, having two different node types was deemed
unnecessary, but this adds a second, more substantive difference between
the two, so I've brought the idea back and stopped allowing unescaped
newlines in JS string literals, which are invalid in both JS and TS.

[spec]: https://facebook.github.io/jsx/#sec-jsx-string-characters
[PR review]: #140 (comment)
  • Loading branch information
cpmsmith authored and amaanq committed Feb 1, 2024
1 parent c2c2260 commit b16c69a
Show file tree
Hide file tree
Showing 2 changed files with 72 additions and 26 deletions.
48 changes: 38 additions & 10 deletions grammar.js
Original file line number Diff line number Diff line change
Expand Up @@ -607,10 +607,15 @@ module.exports = grammar({

// Should not contain new lines and should not start or end with a space
jsx_text: _ => choice(
/[^{}<>\n ]([^{}<>\n]*[^{}<>\n ])?/,
/[^{}<>\n& ]([^{}<>\n&]*[^{}<>\n& ])?/,
/\/\/[^\n]*/,
),

// An entity can be named, numeric (decimal), or numeric (hexadecimal). The
// longest entity name is 29 characters long, and the HTML spec says that
// no more will ever be added.
html_character_reference: _ => /&(#([xX][0-9a-fA-F]{1,6}|[0-9]{1,5})|[A-Za-z]{1,30});/,

jsx_expression: $ => seq(
'{',
optional(choice(
Expand All @@ -623,6 +628,7 @@ module.exports = grammar({

// A single JSX child: a run of plain text, an HTML character reference
// (e.g. `&nbsp;`), a nested JSX element, or a `{...}` expression.
_jsx_child: $ => choice(
$.jsx_text,
$.html_character_reference,
$._jsx_element,
$.jsx_expression,
),
Expand Down Expand Up @@ -682,8 +688,36 @@ module.exports = grammar({
)),
),

_jsx_string: $ => choice(
seq(
'"',
repeat(choice(
alias($.unescaped_double_jsx_string_fragment, $.string_fragment),
$.html_character_reference,
)),
'"',
),
seq(
'\'',
repeat(choice(
alias($.unescaped_single_jsx_string_fragment, $.string_fragment),
$.html_character_reference,
)),
'\'',
),
),

// Workaround to https://github.com/tree-sitter/tree-sitter/issues/1156
// We give names to the token() constructs containing a regexp
// so as to obtain a node in the CST.
//
unescaped_double_jsx_string_fragment: _ => token.immediate(prec(1, /[^"&]+/)),

// same here
unescaped_single_jsx_string_fragment: _ => token.immediate(prec(1, /[^'&]+/)),

_jsx_attribute_value: $ => choice(
$.string,
alias($._jsx_string, $.string),
$.jsx_expression,
$._jsx_element,
),
Expand Down Expand Up @@ -909,12 +943,6 @@ module.exports = grammar({
// Primitives
//

// Here we tolerate unescaped newlines in double-quoted and
// single-quoted string literals.
// This is legal in typescript as jsx/tsx attribute values (as of
// 2020), and perhaps will be valid in javascript as well in the
// future.
//
string: $ => choice(
seq(
'"',
Expand All @@ -938,10 +966,10 @@ module.exports = grammar({
// We give names to the token() constructs containing a regexp
// so as to obtain a node in the CST.
//
unescaped_double_string_fragment: _ => token.immediate(prec(1, /[^"\\]+/)),
unescaped_double_string_fragment: _ => token.immediate(prec(1, /[^"\\\r\n]+/)),

// same here
unescaped_single_string_fragment: _ => token.immediate(prec(1, /[^'\\]+/)),
unescaped_single_string_fragment: _ => token.immediate(prec(1, /[^'\\\r\n]+/)),

escape_sequence: _ => token.immediate(seq(
'\\',
Expand Down
50 changes: 34 additions & 16 deletions test/corpus/literals.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,22 +108,6 @@ world';
(expression_statement
(string (string_fragment) (escape_sequence) (string_fragment))))

============================================================
Non-standard unescaped newlines legal in TSX attributes
============================================================

"hello
world";

'hello
world';

---

(program
(expression_statement (string (string_fragment)))
(expression_statement (string (string_fragment))))

=========================================================
JSX strings with unescaped newlines for TSX attributes
=========================================================
Expand Down Expand Up @@ -151,3 +135,37 @@ JSX strings with unescaped newlines for TSX attributes
(jsx_attribute (property_identifier) (string (string_fragment))))
(jsx_closing_element
(identifier)))))

===============================================
JSX with HTML character references (entities)
===============================================

<a>foo &nbsp; bar</a>;

<abbr title="foo &nbsp; \n bar">foo</abbr>;

----

(program
(expression_statement
(jsx_element
(jsx_opening_element
(identifier))
(jsx_text)
(html_character_reference)
(jsx_text)
(jsx_closing_element
(identifier))))
(expression_statement
(jsx_element
(jsx_opening_element
(identifier)
(jsx_attribute
(property_identifier)
(string
(string_fragment)
(html_character_reference)
(string_fragment))))
(jsx_text)
(jsx_closing_element
(identifier)))))

0 comments on commit b16c69a

Please sign in to comment.