Skip to content

Commit

Permalink
sourcepos: remove and document unsound sourcepos calculations
Browse files Browse the repository at this point in the history
  • Loading branch information
kivikakk committed Mar 31, 2023
1 parent 44f501f commit 586f179
Show file tree
Hide file tree
Showing 6 changed files with 112 additions and 103 deletions.
4 changes: 3 additions & 1 deletion src/html.rs
Original file line number Diff line number Diff line change
Expand Up @@ -988,7 +988,9 @@ impl<'o> HtmlFormatter<'o> {
fn render_sourcepos<'a>(&mut self, node: &'a AstNode<'a>) -> io::Result<()> {
if self.options.render.sourcepos {
let ast = node.data.borrow();
write!(self.output, " data-sourcepos=\"{}\"", ast.sourcepos)?;
if ast.sourcepos.start.line > 0 {
write!(self.output, " data-sourcepos=\"{}\"", ast.sourcepos)?;
}
}
Ok(())
}
Expand Down
66 changes: 11 additions & 55 deletions src/parser/autolink.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use crate::ctype::{isalnum, isalpha, isspace};
use crate::nodes::{AstNode, LineColumn, NodeLink, NodeValue, Sourcepos};
use crate::nodes::{AstNode, NodeLink, NodeValue};
use crate::parser::inlines::make_inline;
use once_cell::sync::Lazy;
use std::str;
Expand All @@ -10,7 +10,6 @@ pub(crate) fn process_autolinks<'a>(
arena: &'a Arena<AstNode<'a>>,
node: &'a AstNode<'a>,
contents_str: &mut String,
sourcepos: &mut Sourcepos,
) {
let contents = contents_str.as_bytes();
let len = contents.len();
Expand All @@ -22,22 +21,19 @@ pub(crate) fn process_autolinks<'a>(
while i < len {
match contents[i] {
b':' => {
post_org =
url_match(arena, contents, i, sourcepos.start.column_add(i as isize));
post_org = url_match(arena, contents, i);
if post_org.is_some() {
break;
}
}
b'w' => {
post_org =
www_match(arena, contents, i, sourcepos.start.column_add(i as isize));
post_org = www_match(arena, contents, i);
if post_org.is_some() {
break;
}
}
b'@' => {
post_org =
email_match(arena, contents, i, sourcepos.start.column_add(i as isize));
post_org = email_match(arena, contents, i);
if post_org.is_some() {
break;
}
Expand All @@ -56,17 +52,10 @@ pub(crate) fn process_autolinks<'a>(
post.insert_after(make_inline(
arena,
NodeValue::Text(remain.to_string()),
(
sourcepos.start.line,
sourcepos.start.column + i + skip,
sourcepos.end.line,
sourcepos.end.column,
)
.into(),
(0, 1, 0, 1).into(),
));
}
contents_str.truncate(i);
sourcepos.end.column -= len - i;
return;
}
}
Expand All @@ -76,7 +65,6 @@ fn www_match<'a>(
arena: &'a Arena<AstNode<'a>>,
contents: &[u8],
i: usize,
start: LineColumn,
) -> Option<(&'a AstNode<'a>, usize, usize)> {
static WWW_DELIMS: Lazy<[bool; 256]> = Lazy::new(|| {
let mut sc = [false; 256];
Expand Down Expand Up @@ -114,13 +102,7 @@ fn www_match<'a>(
url,
title: String::new(),
}),
(
start.line,
start.column,
start.line,
start.column + link_end,
)
.into(),
(0, 1, 0, 1).into(),
);

inl.append(make_inline(
Expand All @@ -130,7 +112,7 @@ fn www_match<'a>(
.unwrap()
.to_string(),
),
(start.line, start.column, start.line, start.column + i).into(),
(0, 1, 0, 1).into(),
));
Some((inl, 0, link_end))
}
Expand Down Expand Up @@ -231,7 +213,6 @@ fn url_match<'a>(
arena: &'a Arena<AstNode<'a>>,
contents: &[u8],
i: usize,
start: LineColumn,
) -> Option<(&'a AstNode<'a>, usize, usize)> {
const SCHEMES: [&[u8]; 3] = [b"http", b"https", b"ftp"];

Expand Down Expand Up @@ -271,25 +252,13 @@ fn url_match<'a>(
url: url.clone(),
title: String::new(),
}),
(
start.line,
start.column - rewind,
start.line,
start.column + link_end,
)
.into(),
(0, 1, 0, 1).into(),
);

inl.append(make_inline(
arena,
NodeValue::Text(url),
(
start.line,
start.column - rewind,
start.line,
start.column + link_end,
)
.into(),
(0, 1, 0, 1).into(),
));
Some((inl, rewind, rewind + link_end))
}
Expand All @@ -298,7 +267,6 @@ fn email_match<'a>(
arena: &'a Arena<AstNode<'a>>,
contents: &[u8],
i: usize,
start: LineColumn,
) -> Option<(&'a AstNode<'a>, usize, usize)> {
static EMAIL_OK_SET: Lazy<[bool; 256]> = Lazy::new(|| {
let mut sc = [false; 256];
Expand Down Expand Up @@ -368,25 +336,13 @@ fn email_match<'a>(
url,
title: String::new(),
}),
(
start.line,
start.column - rewind,
start.line,
start.column + link_end - 1,
)
.into(),
(0, 1, 0, 1).into(),
);

inl.append(make_inline(
arena,
NodeValue::Text(text.to_string()),
(
start.line,
start.column - rewind,
start.line,
start.column + link_end - 1,
)
.into(),
(0, 1, 0, 1).into(),
));
Some((inl, rewind, rewind + link_end))
}
5 changes: 4 additions & 1 deletion src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1900,7 +1900,7 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {
}

if self.options.extension.autolink {
autolink::process_autolinks(self.arena, node, text, sourcepos);
autolink::process_autolinks(self.arena, node, text);
}
}

Expand Down Expand Up @@ -1936,6 +1936,9 @@ impl<'a, 'o, 'c> Parser<'a, 'o, 'c> {

text.drain(..end);

// These are sound only because the exact text that we've matched and
// the count thereof (i.e. "end") will precisely map to characters in
// the source document.
sourcepos.start.column += end;
parent.data.borrow_mut().sourcepos.start.column += end;

Expand Down
54 changes: 49 additions & 5 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,42 @@ macro_rules! html_opts {
html_opts!([$($optclass.$optname),*], $lhs, $rhs)
};
([$($optclass:ident.$optname:ident),*], $lhs:expr, $rhs:expr) => {
crate::tests::html_opts_i($lhs, $rhs, |opts| {
$crate::tests::html_opts_i($lhs, $rhs, |opts| {
$(opts.$optclass.$optname = true;)*
});
};
([all], $lhs:expr, $rhs:expr) => {
$crate::tests::html_opts_w($lhs, $rhs, &$crate::ComrakOptions {
extension: $crate::ComrakExtensionOptions {
strikethrough: true,
tagfilter: true,
table: true,
autolink: true,
tasklist: true,
superscript: true,
header_ids: Some("user-content-".to_string()),
footnotes: true,
description_lists: true,
front_matter_delimiter: Some("---".to_string()),
shortcodes: true,
},
parse: $crate::ComrakParseOptions {
smart: true,
default_info_string: Some("rust".to_string()),
relaxed_tasklist_matching: true,
},
render: $crate::ComrakRenderOptions {
hardbreaks: true,
github_pre_lang: true,
full_info_string: true,
width: 80,
unsafe_: true,
escape: true,
list_style: $crate::ListStyleType::Star,
sourcepos: true,
},
});
}
}

pub(crate) use html_opts;
Expand Down Expand Up @@ -180,14 +212,25 @@ fn asssert_node_eq<'a>(node: &'a AstNode<'a>, location: &[usize], expected: &Nod
compare_strs(&actual, &expected, "ast comparison");
}

macro_rules! sourcepos {
(($spsl:literal:$spsc:literal-$spel:literal:$spec:literal)) => {
($spsl, $spsc, $spel, $spec).into()
};
((XXX)) => {
(0, 1, 0, 1).into()
};
}

pub(crate) use sourcepos;

macro_rules! ast {
(($name:tt ($spsl:literal:$spsc:literal-$spel:literal:$spec:literal))) => {
ast!(($name ($spsl:$spsc-$spel:$spec) []))
(($name:tt $sp:tt)) => {
ast!(($name $sp []))
};
(($name:tt ($spsl:literal:$spsc:literal-$spel:literal:$spec:literal) $content:tt)) => {
(($name:tt $sp:tt $content:tt)) => {
AstMatchTree {
name: stringify!($name).to_string(),
sourcepos: ($spsl, $spsc, $spel, $spec).into(),
sourcepos: sourcepos!($sp),
content: ast!($content),
}
};
Expand Down Expand Up @@ -259,6 +302,7 @@ enum AstMatchContent {
}

impl AstMatchTree {
#[track_caller]
fn assert_match<'a>(&self, node: &'a AstNode<'a>) {
let ast = node.data.borrow();
assert_eq!(self.name, ast.value.xml_node_name(), "node type matches");
Expand Down
31 changes: 27 additions & 4 deletions src/tests/autolink.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,29 @@ fn autolink_no_link_bad() {

#[test]
fn sourcepos_correctly_restores_context() {
// There's unsoundness in trying to maintain and adjust sourcepos
// when doing autolinks in the light of:
//
// a) Some source elements introducing a different number of characters
// to the content text than they take in source, i.e. smart
// punctuation.
//
// b) Text node consolidation happening before autolinking.
//
// (b) is obviously non-optional, but it means we end up with Text
// nodes with different byte counts than their sourcepos span lengths.
//
// One possible solution would be to actually accumulate multiple
// sourcepos spans per Text node, each also tracking the number of
// bytes of content text it's responsible for. This would work well
// enough as long as we never had to adjust a sourcepos into a spot
// within a sourcepos span that had a target text width where it
// wasn't equal. That probably wouldn't happen, though -- i.e. we're
// never autolinking into the middle of a rendered smart punctuation.
//
// For now the desired sourcepos is documented in comment. What we
// have currently (after backing out the adjustments, having hit the
// above case) matches cmark-gfm.
assert_ast_match!(
[],
"ab _cde_ [email protected] h*ijklm* n",
Expand Down Expand Up @@ -81,11 +104,11 @@ fn sourcepos_correctly_restores_context() {
(emph (1:4-1:8) [
(text (1:5-1:7) "cde")
])
(text (1:9-1:9) " ")
(link (1:10-1:15) [
(text (1:10-1:15) "[email protected]")
(text (1:9-1:17) " ") // (text (1:9-1:9) " ")
(link (XXX) [ // (link (1:10-1:15) [
(text (XXX) "[email protected]") // (text (1:10-1:15) "[email protected]")
])
(text (1:16-1:17) " h")
(text (XXX) " h") // (text (1:16-1:17) " h")
(emph (1:18-1:24) [
(text (1:19-1:23) "ijklm")
])
Expand Down
55 changes: 18 additions & 37 deletions src/tests/fuzz.rs
Original file line number Diff line number Diff line change
@@ -1,42 +1,5 @@
use super::{html, html_opts};

/// To be used when moving from the all_options fuzz target to here.
/// Reduce to the actual necessary options to reproduce.
#[allow(dead_code)]
#[cfg(feature = "shortcodes")]
fn all_options() -> crate::ComrakOptions {
crate::ComrakOptions {
extension: crate::ComrakExtensionOptions {
strikethrough: true,
tagfilter: true,
table: true,
autolink: true,
tasklist: true,
superscript: true,
header_ids: Some("user-content-".to_string()),
footnotes: true,
description_lists: true,
front_matter_delimiter: Some("---".to_string()),
shortcodes: true,
},
parse: crate::ComrakParseOptions {
smart: true,
default_info_string: Some("rust".to_string()),
relaxed_tasklist_matching: true,
},
render: crate::ComrakRenderOptions {
hardbreaks: true,
github_pre_lang: true,
full_info_string: true,
width: 80,
unsafe_: true,
escape: true,
list_style: crate::ListStyleType::Star,
sourcepos: true,
},
}
}

#[test]
fn pointy_brace_open() {
html("<!-", "<p>&lt;!-</p>\n");
Expand Down Expand Up @@ -97,3 +60,21 @@ fn line_end() {
fn bracket_match() {
html("[;\0V\n]::g\n[;\0V\n]", "<p><a href=\":g\">;�V\n</a></p>\n");
}

#[test]
fn trailing_hyphen() {
html_opts!(
[extension.autolink, parse.smart, render.sourcepos],
"[email protected]",
"<p data-sourcepos=\"1:1-1:5\">[email protected]</p>\n"
);
}

#[test]
fn trailing_hyphen_matches() {
html_opts!(
[extension.autolink, parse.smart, render.sourcepos],
"[email protected]",
"<p data-sourcepos=\"1:1-1:6\"><a href=\"mailto:[email protected]\">[email protected]</a>–</p>\n"
);
}

0 comments on commit 586f179

Please sign in to comment.